之前学习爬虫的时候,一直了解、学习的是基于PhantomJS的方案。虽然Chrome的headless模式更加优秀,比PhantomJS更快、占用内存更少,而且还有个强大的爸爸,但是也不能把之前学的东西荒废了,先实践一下再说,况且这类工具的大部分思路应该都是互通的。
PhantomJS 是无界面的 Webkit 解析器,提供了 JavaScript API 。由于去除了可视化界面,速度比一般 Webkit 浏览器要快很多。同时提供了很多监控和事件接口,可以方便的操作 DOM 节点,模拟用户操作等。
1. 爬虫功能
爬虫主要需要具备基本功能:
- javascript动态解析能力
- hook所有的网络请求
- 静态页面链接、表单自动分析能力
- 自动交互能力
1.1 静态页面链接和表单自动分析
phantomjs提供page.evaluate,这个方法会创建一个“沙盒”来解析javascript。所以,我们可以在沙盒中执行javascript代码,以此获得静态页面链接和表单
1.2 javascript动态解析
phantomjs在打开url的时候就会自动使用自己的webkit内核去执行对应的javascript代码,从而可以实现js动态解析
1.3 hook所有的网络请求
phantomjs使用page.onResourceRequested方法来hook所有的网络请求,所以可以在这个函数里面截获ajax请求,获取url和对应的参数
1.4 自动交互
爬虫的自动交互能力就是需要获取页面所有事件,并想办法触发事件,最后获取事件触发的结果。
页面中的事件一般分两种:
- 内联事件:比如on开头的事件onxxx、javascript:xxx
- 绑定事件:比如document.addEventListener、jQuery中的$('dom').xxx
自动交互事件即用户交互事件,而用户操作的本质,实际上是触发了绑定在DOM节点的事件。所以自动触发问题,可以简化为触发节点事件。
分解下自动触发事件的三个步骤,逐步实现
1、获取事件 2、触发事件 3、获取事件触发结果
2.1 内联事件
获取内联事件: 遍历节点,获取所有节点内的onxxx属性和javascript:属性值
触发内联事件: 调用eval执行所有onxxx属性和javascript:属性的值
2.2 绑定事件:
获取绑定事件: JavaScript中绑定事件,都会调用addEventListener函数,所以我们可以在page.onInitialized回调中hook addEventListener,以此记录页面绑定的所有事件
触发绑定事件: JavaScript中提供了dispatchEvent函数,可以触发指定DOM节点的指定事件
2.3 获取事件触发结果
等待页面加载完之后,需要获取所有的<a>、<iframe>、<form>等标签,开启页面DOM节点监听,并且触发所有的事件。
获取form表单属性和值
获取a标签的href值
获取link标签的href值
获取area标签的href值
获取img标签的src值
获取embed标签的src值
获取video标签的src值
获取audio标签的src值
2. 代码实现
具体代码如下:
/**
 * PhantomJS crawler probe.
 *
 * Opens one URL, hooks every network request, triggers inline and bound DOM
 * event handlers, and prints everything it discovers to stdout:
 *
 *   "> request : {json}"        a request issued by the main page
 *   "> open : hook_url:{json}"  the first request of a window the page opened
 *   "{json}"                    a link / form / dynamic URL found in the DOM
 *
 * Usage: phantomjs <script> <url> [cookie] [auth] [post] [timeout-seconds]
 *
 * NOTE: PhantomJS only understands ES5, so this file deliberately sticks to
 * `var` and function expressions.
 */
var system = require('system');
var page = require('webpage').create();

var USAGE = 'Usage: phantom.js <url> [cookie] [auth] [post] [timeout]';
var DEFAULT_RESOURCE_TIMEOUT_MS = 30000;
// Hard upper bound on the whole run.
var WATCHDOG_MS = 20000;
// Delay between the page finishing loading and the collection pass.
var CALLBACK_DELAY_MS = 1000;
// Time left for requests fired by triggered handlers before exiting.
var SETTLE_DELAY_MS = 3000;

// Last Referer header seen; reused for requests that carry none.
var referer = "https://www.baidu.com/";
var cookie = "";
var auth = "";
var post = "";

/**
 * Case-insensitive lookup in PhantomJS' header list.
 *
 * @param {Array} headers - array of {name, value} objects (may be missing).
 * @param {string} name - header name to look for.
 * @returns {string|undefined} the header value, or undefined when absent.
 */
function findHeader(headers, name) {
    var wanted = name.toLowerCase();
    var list = headers || [];
    for (var i = 0; i < list.length; i++) {
        if (String(list[i].name).toLowerCase() === wanted) {
            return list[i].value;
        }
    }
    return undefined;
}

/**
 * Serialises a request as the JSON record printed on stdout.
 * Side effect: remembers the request's Referer header in `referer`.
 *
 * @param {Object} requestData - metadata object passed to onResourceRequested.
 * @returns {string} JSON with keys url, method, cookie, post, referer.
 */
function describeRequest(requestData) {
    var seenReferer = findHeader(requestData.headers, 'Referer');
    if (seenReferer !== undefined) {
        referer = seenReferer;
    }
    // JSON.stringify escapes quotes, backslashes and control characters;
    // the key order is the order the records have always been printed in.
    return JSON.stringify({
        url: requestData.url,
        method: requestData.method,
        cookie: cookie,
        post: requestData.postData ? requestData.postData : "",
        referer: referer
    });
}

/**
 * Runs INSIDE the target page (via page.evaluate) before any page script.
 * Must be self-contained: it is serialised, so it cannot see outer variables.
 *
 * Records every addEventListener call in window.EVENT_LIST and neutralises
 * window.showModalDialog / window.close.
 */
function installPageHooks() {
    if (window.EVENT_LIST) {
        return;
    }
    window.EVENT_LIST = [];

    var nativeAddEventListener = Element.prototype.addEventListener;
    Element.prototype.addEventListener = function (type) {
        window.EVENT_LIST.push({"event": type, "element": this});
        return nativeAddEventListener.apply(this, arguments);
    };

    window.showModalDialog = function (url) {
        console.log("Show Modal Dialog" + url);
        return true;
    };

    window.close = function () {
        console.log("This page is closing, I stopped it. ");
    };
}

/**
 * Runs INSIDE the target page (via page.evaluate) once it has loaded.
 * Must be self-contained for the same reason as installPageHooks.
 *
 * Triggers inline and bound event handlers, prints one JSON record per
 * form / link / media URL, then installs hooks that report nodes inserted
 * later and XMLHttpRequest calls. Records are printed with console.log,
 * which the controller forwards to stdout through page.onConsoleMessage.
 */
function collectFromPage() {
    var SKIPPED_SCHEME = /^\s*(javascript|mailto):/i;
    var LINK_SOURCES = [
        ["a", "href"],
        ["link", "href"],
        ["area", "href"],
        ["img", "src"],
        ["embed", "src"],
        ["video", "src"],
        ["audio", "src"]
    ];
    var seen = {};
    var resolver = document.createElement('a');

    // Resolves a possibly relative URL against the document.
    function toAbsolute(raw) {
        resolver.href = raw;
        return resolver.href;
    }

    // Prints a record once; duplicates are suppressed.
    function emit(url, method, body) {
        var line = JSON.stringify({
            url: url,
            method: method,
            post: body,
            cookie: "",
            referer: window.location.href
        });
        if (!Object.prototype.hasOwnProperty.call(seen, line)) {
            seen[line] = true;
            console.log(line);
        }
    }

    // SECURITY: evaluates code taken from the crawled page. That is the
    // purpose of this crawler, but it must only run in a throw-away sandbox.
    // Everything from the first "return" on is dropped because a bare return
    // is illegal in eval'd code. window.eval (indirect eval) runs the code in
    // global scope so it cannot touch this function's locals.
    function runInline(code) {
        try {
            window.eval(code.split('return')[0] + ';');
        } catch (e) {
            console.log("inline handler failed: " + e);
        }
    }

    // Executes every on* attribute and every javascript: src/href.
    function triggerInlineHandlers() {
        // Snapshots: handlers may add or remove nodes while we iterate.
        var nodes = Array.prototype.slice.call(document.getElementsByTagName('*'));
        for (var n = 0; n < nodes.length; n++) {
            var attrs = Array.prototype.slice.call(nodes[n].attributes);
            for (var k = 0; k < attrs.length; k++) {
                var name = attrs[k].name;
                var value = attrs[k].value;
                if (name.substr(0, 2) == "on") {
                    runInline(value);
                }
                if ((name === "src" || name === "href") && value.substr(0, 11) == "javascript:") {
                    runInline(value.substr(11));
                }
            }
        }
    }

    // Dispatches a synthetic event for every listener recorded by
    // installPageHooks.
    function triggerBoundHandlers() {
        // Snapshot: a handler may register further listeners.
        var bound = (window.EVENT_LIST || []).slice();
        for (var b = 0; b < bound.length; b++) {
            try {
                var evt = document.createEvent('CustomEvent');
                evt.initCustomEvent(bound[b]["event"], true, true, null);
                bound[b]["element"].dispatchEvent(evt);
            } catch (e) {
                console.log("bound handler failed: " + e);
            }
        }
    }

    // Emits one "post" record per form; unnamed controls are skipped and
    // empty values are replaced by the placeholder 123456.
    function collectForms() {
        var forms = document.getElementsByTagName("form");
        for (var f = 0; f < forms.length; f++) {
            // getAttribute, not .action: a control named "action" shadows
            // the property.
            var action = forms[f].getAttribute("action");
            if (action !== null && SKIPPED_SCHEME.test(action)) {
                continue;
            }
            // A form without an action submits to the current document.
            var target = action ? toAbsolute(action) : window.location.href;

            var pairs = [];
            var controls = forms[f].getElementsByTagName('*');
            for (var c = 0; c < controls.length; c++) {
                var control = controls[c];
                if (typeof control.name !== "string" || control.name === "") {
                    continue;
                }
                var hasValue = control.value !== undefined && control.value !== null &&
                    String(control.value) !== "";
                pairs.push(control.name + "=" + (hasValue ? control.value : "123456"));
            }
            emit(target, "post", pairs.join("&"));
        }
    }

    // Emits one "get" record per href/src of the tags in LINK_SOURCES.
    function collectLinks() {
        for (var s = 0; s < LINK_SOURCES.length; s++) {
            var elements = document.getElementsByTagName(LINK_SOURCES[s][0]);
            for (var e = 0; e < elements.length; e++) {
                var raw = elements[e].getAttribute(LINK_SOURCES[s][1]);
                // Elements without the attribute carry no URL at all.
                if (raw === null || SKIPPED_SCHEME.test(raw)) {
                    continue;
                }
                emit(toAbsolute(raw), "get", "");
            }
        }
    }

    // Reports URLs of nodes inserted after the collection pass.
    function watchInsertedNodes() {
        document.addEventListener('DOMNodeInserted', function (e) {
            var node = e.target;
            // .src / .href are already absolute; non-string values (e.g. SVG
            // animated strings) are ignored.
            var link = node.src || node.href;
            if (typeof link === "string" && link !== "" && !SKIPPED_SCHEME.test(link)) {
                emit(link, "get", "");
            }
        }, true);
    }

    // Reports XMLHttpRequest traffic without ever breaking the request.
    function hookXhr() {
        var nativeOpen = XMLHttpRequest.prototype.open;
        var nativeSend = XMLHttpRequest.prototype.send;

        XMLHttpRequest.prototype.open = function (method, url) {
            if (!this._url) {
                this._url = url;
                this._method = method;
            }
            return nativeOpen.apply(this, arguments);
        };

        XMLHttpRequest.prototype.send = function (data) {
            try {
                // window.$Result$ is not defined anywhere in this script; it
                // is used only if something else provides it.
                var sink = window.$Result$;
                if (sink && typeof sink.add_ajax === "function") {
                    sink.add_ajax(this._url, this._method, data);
                } else if (this._url) {
                    emit(toAbsolute(this._url), String(this._method).toLowerCase(),
                        typeof data === "string" ? data : "");
                }
            } catch (e) {
                console.log("ajax hook failed: " + e);
            }
            return nativeSend.apply(this, arguments);
        };
    }

    triggerInlineHandlers();
    triggerBoundHandlers();
    collectForms();
    collectLinks();
    watchInsertedNodes();
    hookXhr();
}

/**
 * Configures the page, wires every hook and starts loading the target.
 *
 * @param {Array} args - system.args: script name, url and the optional
 *     cookie, auth, post body and resource timeout in seconds.
 */
function main(args) {
    var address = args[1];
    cookie = args[2] || "";
    auth = args[3] || "";
    post = args[4] || "";
    var timeoutSeconds = parseInt(args[5], 10);

    page.settings.resourceTimeout = timeoutSeconds > 0 ?
        timeoutSeconds * 1000 : DEFAULT_RESOURCE_TIMEOUT_MS;
    // The setting is spelled "loadImages"; "loadImage" is silently ignored.
    page.settings.loadImages = false;
    page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';

    var headers = {
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6"
    };
    if (cookie) {
        headers["Cookie"] = cookie;
    }
    if (auth) {
        headers["authorization"] = auth;
    }
    page.customHeaders = headers;
    page.viewportSize = {
        width: 1024,
        height: 768
    };

    // Hooks must live in the page's own JavaScript context, hence evaluate.
    page.onInitialized = function () {
        page.evaluate(installPageHooks);
    };

    page.onLoadStarted = function () {
        console.log("page.onLoadStarted");
    };

    page.onLoadFinished = function () {
        console.log("page.onLoadFinished");
    };

    page.onConsoleMessage = function (msg) {
        console.log(msg);
    };

    // abort() lives on the second argument, not on the request metadata.
    page.onResourceRequested = function (requestData, networkRequest) {
        if (!(/^https?:\/\/./).test(requestData.url)) {
            networkRequest.abort();
            return;
        }
        var contentType = findHeader(requestData.headers, 'Content-Type');
        var isStatic = contentType !== undefined &&
            (/(image\/(png|jpeg|gif)|text\/css)$/).test(contentType);
        // Never actually perform requests that could end the session or
        // destroy data; they are still reported below.
        var isDestructive = (/logout|delete|signout/).test(requestData.url.toLowerCase());
        if (isStatic || isDestructive) {
            networkRequest.abort();
        }
        console.log("> request : " + describeRequest(requestData));
    };

    // Dialogs are swallowed so they cannot block the crawl.
    page.onPrompt = function () {};
    page.onAlert = function () {};
    page.onConfirm = function () {};

    // Watchdog: terminate even if the page never finishes.
    window.setTimeout(function () {
        phantom.exit();
    }, WATCHDOG_MS);

    // window.open(): report the first request of the new window, then close it.
    page.onPageCreated = function (newPage) {
        newPage.onResourceRequested = function (requestData) {
            console.log("> open : " + "hook_url:" + describeRequest(requestData));
            newPage.close();
        };
    };

    page.onError = function (msg, trace) {
        var msgStack = ['ERROR: ' + msg];
        if (trace && trace.length) {
            msgStack.push('TRACE:');
            trace.forEach(function (t) {
                msgStack.push(' -> ' + t.file + ': ' + t.line + (t.function ? ' (in function "' + t.function + '")' : ''));
            });
        }
        console.error(msgStack.join('\n'));
    };

    // Fired by window.callPhantom() from the page once it has loaded.
    page.onCallback = function () {
        page.evaluate(collectFromPage);
        // Give requests started by the triggered handlers time to reach
        // onResourceRequested before shutting down.
        window.setTimeout(function () {
            phantom.exit();
        }, SETTLE_DELAY_MS);
    };

    page.open(address, {
        operation: post ? "POST" : "GET",
        data: post
    }, function (status) {
        if (status !== 'success') {
            console.log('unable to access target');
            phantom.exit();
        } else {
            page.evaluateAsync(function () {
                if (typeof window.callPhantom === 'function') {
                    window.callPhantom();
                }
            }, CALLBACK_DELAY_MS);
        }
    });
}

if (system.args.length < 2 || system.args.length > 6) {
    console.log(USAGE);
    phantom.exit(1);
} else {
    // phantom.exit() does not stop the script immediately, so the crawl is
    // only started in the else branch.
    main(system.args);
}