反反爬虫系列(一)
这篇文章来自知乎大佬——不吃夹生饭,一位Python爬虫工程师。
前言
笔者决定写一个系列反反爬虫,目的是站在生产角度如何绕过各类网站的反爬虫,提供反反爬虫思路。
关于工程化,这里笔者暂不提及。希望各位看官能复现我的思路来完成反反爬虫过程,既提升了自己的技术和思路,同时也促使网站迭代自己的反爬虫策略(手动狗头)。
首先我们要解决的网站是同程旅游的酒店部分。这里涉及的反爬虫是 antitoken,一个全局的token。
难度:中等
开发环境:
MacOS 10.14.2
python: python3.5.2
Sublime 支持JavaScript
假设当前需求是:获取该酒店的评论数据
那么我们需要做的事情:
调研
开发
部署并维护
我们着重研究调研部分,进入任意酒店页面可见长这个样的
然后评论部分长这个样的,通过ajax加载
接下来应该做的是观察这个api
确定:
url
请求方式 get/post
参数及格式 params/payloads
请求头 headers
然后这个api长酱色的:
提取出信息
# Comment-list endpoint observed in the browser's network panel.
url = 'https://www.ly.com/hotel/api/tmapi/comment/list'

# Request headers copied from the captured XHR request.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Referer': 'https://www.ly.com/HotelInfo-50101461.html?spm0=10002.2001.1.0.1.4.11',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

# Query-string parameters of the same request; 'antitoken' is the value the
# rest of this walkthrough learns how to generate.
params = {
    'hotelid': '50101461',
    'page': '2',
    'pageSize': '10',
    'commentType': '0',
    'roomTypeId': '',
    'tripPurposeId': '',
    'RankType': '1',
    'mainTagId': '',
    'subTagId': '',
    'antitoken': '9f5c5f8c288f4687e965d600e5115808',
}
可以观察到参数里有个antitoken 特别碍眼,接下来我们要开始撸这个antitoken
在调试的时候首先遇到的问题就是断点问题
同程的反爬虫策略:在打开开发者工具时候 debug,同时不断向内存写东西, 一会儿浏览器就卡的不行。大概长这个样
追踪源码可以看到是 leonid-tq-jq-v3-min.js 在耍怪:
好了,各位看官看好了,接下来我们要做的,就是将这个网页保存到本地,然后在 leonid-tq-jq-v3-min.js 里把上图红框里的这个函数内容全部删掉
注意,不要给这段js重新排版
好了,再打开本地的html,再打开开发者工具,不会再debug
接下来,咱们开始研究这个antitoken
首先是找到antitoken,我们先搜索
这里笔者打开的是本地保存后的html页面哈
然后我们可以看见 antitoken 是在 last.js这个文件里,那我们接下来去last.js里看看
嗯,起手就看到antitoken 的生成方法了
接下来是打断点调试
嗯?什么是打断点调试?呃,这是爬虫工程师的基本功。
我们可以看到先从cookie拿到一个参数的值,没错就是这个 wangba ~ 网吧?王八?
再看下面:如果wangba为空,则重新创建一个
var e = $.cookie("wangba");
e && void 0 !== e || (e = (new Date).getTime().toString()
其实这个e就是个时间戳
好了,我打断点的地方就是这个antitoken的值,现在我们要去看看这个antitoken的生成过程
首先把框里的函数重写,自己定一个e,然后长酱色的
然后调试,两步转跳入antitoken的生成函数
function(e, t, a) {
var n, i, o, s, r;
n = a(117),
i = a(56).utf8,
o = a(118),
s = a(56).bin,
(r = function(e, t) {
e.constructor == String ? e = t && "binary" === t.encoding ? s.stringToBytes(e) : i.stringToBytes(e) : o(e) ? e = Array.prototype.slice.call(e, 0) : Array.isArray(e) || (e = e.toString());
for (var a = n.bytesToWords(e), l = 8 * e.length, c = 1732584193, d = -271733879, p = -1732584194, u = 271733878, m = 0; m < a.length; m++)
a = 16711935 & (a << 8 | a >>> 24) | 4278255360 & (a << 24 | a >>> 8);
a |= 128 << l % 32,
a = l;
var f = r._ff
, h = r._gg
, v = r._hh
, g = r._ii;
for (m = 0; m < a.length; m += 16) {
var y = c
, _ = d
, b = p
, $ = u;
d = g(d = g(d = g(d = g(d = v(d = v(d = v(d = v(d = h(d = h(d = h(d = h(d = f(d = f(d = f(d = f(d, p = f(p, u = f(u, c = f(c, d, p, u, a, 7, -680876936), d, p, a, 12, -389564586), c, d, a, 17, 606105819), u, c, a, 22, -1044525330), p = f(p, u = f(u, c = f(c, d, p, u, a, 7, -176418897), d, p, a, 12, 1200080426), c, d, a, 17, -1473231341), u, c, a, 22, -45705983), p = f(p, u = f(u, c = f(c, d, p, u, a, 7, 1770035416), d, p, a, 12, -1958414417), c, d, a, 17, -42063), u, c, a, 22, -1990404162), p = f(p, u = f(u, c = f(c, d, p, u, a, 7, 1804603682), d, p, a, 12, -40341101), c, d, a, 17, -1502002290), u, c, a, 22, 1236535329), p = h(p, u = h(u, c = h(c, d, p, u, a, 5, -165796510), d, p, a, 9, -1069501632), c, d, a, 14, 643717713), u, c, a, 20, -373897302), p = h(p, u = h(u, c = h(c, d, p, u, a, 5, -701558691), d, p, a, 9, 38016083), c, d, a, 14, -660478335), u, c, a, 20, -405537848), p = h(p, u = h(u, c = h(c, d, p, u, a, 5, 568446438), d, p, a, 9, -1019803690), c, d, a, 14, -187363961), u, c, a, 20, 1163531501), p = h(p, u = h(u, c = h(c, d, p, u, a, 5, -1444681467), d, p, a, 9, -51403784), c, d, a, 14, 1735328473), u, c, a, 20, -1926607734), p = v(p, u = v(u, c = v(c, d, p, u, a, 4, -378558), d, p, a, 11, -2022574463), c, d, a, 16, 1839030562), u, c, a, 23, -35309556), p = v(p, u = v(u, c = v(c, d, p, u, a, 4, -1530992060), d, p, a, 11, 1272893353), c, d, a, 16, -155497632), u, c, a, 23, -1094730640), p = v(p, u = v(u, c = v(c, d, p, u, a, 4, 681279174), d, p, a, 11, -358537222), c, d, a, 16, -722521979), u, c, a, 23, 76029189), p = v(p, u = v(u, c = v(c, d, p, u, a, 4, -640364487), d, p, a, 11, -421815835), c, d, a, 16, 530742520), u, c, a, 23, -995338651), p = g(p, u = g(u, c = g(c, d, p, u, a, 6, -198630844), d, p, a, 10, 1126891415), c, d, a, 15, -1416354905), u, c, a, 21, -57434055), p = g(p, u = g(u, c = g(c, d, p, u, a, 6, 1700485571), d, p, a, 10, -1894986606), c, d, a, 15, -1051523), u, c, a, 21, -2054922799), p = g(p, u = g(u, c = g(c, d, p, u, a, 6, 
1873313359), d, p, a, 10, -30611744), c, d, a, 15, -1560198380), u, c, a, 21, 1309151649), p = g(p, u = g(u, c = g(c, d, p, u, a, 6, -145523070), d, p, a, 10, -1120210379), c, d, a, 15, 718787259), u, c, a, 21, -343485551),
c = c + y >>> 0,
d = d + _ >>> 0,
p = p + b >>> 0,
u = u + $ >>> 0
}
return n.endian()
}
)._ff = function(e, t, a, n, i, o, s) {
var r = e + (t & a | ~t & n) + (i >>> 0) + s;
return (r << o | r >>> 32 - o) + t
}
,
r._gg = function(e, t, a, n, i, o, s) {
var r = e + (t & n | a & ~n) + (i >>> 0) + s;
return (r << o | r >>> 32 - o) + t
}
,
r._hh = function(e, t, a, n, i, o, s) {
var r = e + (t ^ a ^ n) + (i >>> 0) + s;
return (r << o | r >>> 32 - o) + t
}
,
r._ii = function(e, t, a, n, i, o, s) {
var r = e + (a ^ (t | ~n)) + (i >>> 0) + s;
return (r << o | r >>> 32 - o) + t
}
,
r._blocksize = 16,
r._digestsize = 16,
e.exports = function(e, t) {
if (e === undefined || null === e)
throw new Error("Illegal argument " + e);
var a = n.wordsToBytes(r(e, t));
return t && t.asBytes ? a : t && t.asString ? s.bytesToString(a) : n.bytesToHex(a)
}
}
不可避免的,我们需要知道这个函数里各个参数(n,i,o,s)的含义,那只有继续打断点调试咯
一个一个看
首先是 n
n = a(117)
这个a又是啥,在该处打断点,刷新,console看
原来 a(117)是一个函数,那么我们就去回到last.js拿到这个函数
然后n这个参数长这个样
/**
 * Stand-in for the site's a(117) module: 32-bit word / byte-array helpers used
 * by the digest function.
 *
 * The subscripts that had been stripped from this listing (e[t], e[a], ...) are
 * restored, and the Base64 alphabet that the undefined `a` stood for is defined
 * locally. Declared with `var` (not `const`) so the snippet can sit next to the
 * `var n, i, o, s, r;` declaration used elsewhere in this walkthrough.
 */
var n = (function () {
  const BASE64_ALPHABET =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  const helpers = {
    /** Rotate a 32-bit value left by `bits`. */
    rotl(value, bits) {
      return (value << bits) | (value >>> (32 - bits));
    },

    /** Rotate a 32-bit value right by `bits`. */
    rotr(value, bits) {
      return (value << (32 - bits)) | (value >>> bits);
    },

    /**
     * Swap the byte order of one 32-bit number, or of every element of an
     * array (the array is modified in place and returned).
     */
    endian(value) {
      if (value.constructor === Number) {
        return (
          (0x00ff00ff & helpers.rotl(value, 8)) |
          (0xff00ff00 & helpers.rotl(value, 24))
        );
      }
      for (let index = 0; index < value.length; index++) {
        value[index] = helpers.endian(value[index]);
      }
      return value;
    },

    /** Return `count` pseudo-random bytes (Math.random: not for secrets). */
    randomBytes(count) {
      const bytes = [];
      for (let remaining = count; remaining > 0; remaining--) {
        bytes.push(Math.floor(256 * Math.random()));
      }
      return bytes;
    },

    /** Pack bytes into 32-bit words, most significant byte first. */
    bytesToWords(bytes) {
      const words = [];
      for (let index = 0, bit = 0; index < bytes.length; index++, bit += 8) {
        words[bit >>> 5] |= bytes[index] << (24 - (bit % 32));
      }
      return words;
    },

    /** Unpack 32-bit words into bytes, most significant byte first. */
    wordsToBytes(words) {
      const bytes = [];
      for (let bit = 0; bit < 32 * words.length; bit += 8) {
        bytes.push((words[bit >>> 5] >>> (24 - (bit % 32))) & 255);
      }
      return bytes;
    },

    /** Lower-case hex string, two digits per byte. */
    bytesToHex(bytes) {
      const digits = [];
      for (let index = 0; index < bytes.length; index++) {
        digits.push((bytes[index] >>> 4).toString(16));
        digits.push((15 & bytes[index]).toString(16));
      }
      return digits.join("");
    },

    /** Parse a hex string (two digits per byte) into bytes. */
    hexToBytes(hex) {
      const bytes = [];
      for (let index = 0; index < hex.length; index += 2) {
        bytes.push(Number.parseInt(hex.slice(index, index + 2), 16));
      }
      return bytes;
    },

    /** Base64-encode bytes, padding the last group with "=". */
    bytesToBase64(bytes) {
      const chars = [];
      for (let index = 0; index < bytes.length; index += 3) {
        // Bytes missing past the end coerce to 0 under the bitwise operators.
        const triplet =
          (bytes[index] << 16) | (bytes[index + 1] << 8) | bytes[index + 2];
        for (let sextet = 0; sextet < 4; sextet++) {
          if (8 * index + 6 * sextet <= 8 * bytes.length) {
            chars.push(BASE64_ALPHABET.charAt((triplet >>> (6 * (3 - sextet))) & 63));
          } else {
            chars.push("=");
          }
        }
      }
      return chars.join("");
    },

    /** Decode Base64 text; characters outside [A-Za-z0-9+/] are ignored. */
    base64ToBytes(text) {
      const clean = text.replace(/[^A-Z0-9+\/]/gi, "");
      const bytes = [];
      for (let index = 1; index < clean.length; index++) {
        const phase = index % 4;
        if (phase === 0) {
          continue; // the first character of each group completes no byte
        }
        const highMask = Math.pow(2, 8 - 2 * phase) - 1;
        const high =
          (BASE64_ALPHABET.indexOf(clean.charAt(index - 1)) & highMask) << (2 * phase);
        const low = BASE64_ALPHABET.indexOf(clean.charAt(index)) >>> (6 - 2 * phase);
        bytes.push(high | low);
      }
      return bytes;
    },
  };

  return helpers;
})();
然后是 i和s这两个参数
i = a(56).utf8,
s = a(56).bin,
我们需要去看看a(56)又是啥玩意
可见a(56)也是一个函数
这时候拿出a(56),然后我们单独定义一个a56
/**
 * Stand-in for the site's a(56) module: string <-> byte-array conversion.
 *
 * - `bin` treats every character as one byte (char code & 255).
 * - `utf8` converts through UTF-8, reusing `bin` for the byte step.
 *
 * Fixes over the listing as printed: the utf8 helpers referenced an undefined
 * `a` instead of this object, and bin.bytesToString had lost its per-element
 * subscript (it passed the whole array to String.fromCharCode).
 */
var a56 = {
  utf8: {
    /** Encode text as an array of UTF-8 bytes. */
    stringToBytes: function (text) {
      // encodeURIComponent produces %XX UTF-8 escapes; unescape turns each
      // escape into a single char whose code is that byte.
      return a56.bin.stringToBytes(unescape(encodeURIComponent(text)));
    },
    /** Decode an array of UTF-8 bytes back to text. */
    bytesToString: function (bytes) {
      return decodeURIComponent(escape(a56.bin.bytesToString(bytes)));
    },
  },
  bin: {
    /** One byte per character: the low 8 bits of each char code. */
    stringToBytes: function (text) {
      var bytes = [];
      for (var index = 0; index < text.length; index++) {
        bytes.push(255 & text.charCodeAt(index));
      }
      return bytes;
    },
    /** One character per byte. */
    bytesToString: function (bytes) {
      var chars = [];
      for (var index = 0; index < bytes.length; index++) {
        chars.push(String.fromCharCode(bytes[index]));
      }
      return chars.join("");
    },
  },
};
// The two names the digest function expects.
var i = a56.utf8;
var s = a56.bin;
好了,接下来就是o这个参数
o = a(118)
嗯,我们再看 a(118)
可见 o 只有在传入的 e 不是字符串时才会被调用(见上面函数里的 o(e) 分支),而我们传入的时间戳始终是字符串,所以 o这个参数值为null就行了
var o = null
接下来,我们把这几个参数替换回去
笔者这里是写到Sublime里 然后运行
Sublime得自己去配置支持JavaScript哈
报错了诶
还有个t参数
那么t参数又是个啥呢
打断点调试咯
可见
t = undefined
那么在脚本里添加一个
var t = null
再运行
OK!,拿到antitoken了
接下来,我们去做个验证
随便打开一个同程的酒店,随便看一页评论,在cookie拿到 wangba 的值,同时看antitoken 的值
然后把wangba=1547187485089里的值带入改写后的脚本里运行
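好了,咱们就这么一起把同程的反爬虫攻克了。顺带一提:从初始常量(1732584193 等)可以看出,这段代码其实就是标准的MD5,也就是说 antitoken 就是 wangba 这个时间戳字符串的MD5值,在Python里用 hashlib.md5(wangba.encode()).hexdigest() 就能直接算出来。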
脚本代码在文末
// First obtain e: the value of the "wangba" cookie, which is just a millisecond
// timestamp string, so a fresh one can be generated locally.
// To reproduce a captured request, use the cookie value instead,
// e.g. "1547187485089".
// Declared explicitly: a bare `e = ...` creates an implicit global and throws
// a ReferenceError in strict mode / ES modules.
const e = String(Date.now());
//定义antitoken
function antitoken(e){
var a56 = {
utf8: {
stringToBytes: function(e) {
return a56.bin.stringToBytes(unescape(encodeURIComponent(e)))
},
bytesToString: function(e) {
return decodeURIComponent(escape(a.bin.bytesToString(e)))
}
},
bin: {
stringToBytes: function(e) {
for (var t = [], a = 0; a < e.length; a++)
t.push(255 & e.charCodeAt(a));
return t
},
bytesToString: function(e) {
for (var t = [], a = 0; a < e.length; a++)
t.push(String.fromCharCode(e));
return t.join("")
}
}
};
// 这里t取任意值都行
// var t = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
var t = null;
var n, i, o, s, r;
// n = a117,
n = {
rotl: function(e, t) {
return e << t | e >>> 32 - t
},
rotr: function(e, t) {
return e << 32 - t | e >>> t
},
endian: function(e) {
if (e.constructor == Number)
return 16711935 & n.rotl(e, 8) | 4278255360 & n.rotl(e, 24);
for (var t = 0; t < e.length; t++)
e = n.endian(e);
return e
},
randomBytes: function(e) {
for (var t = []; e > 0; e--)
t.push(Math.floor(256 * Math.random()));
return t
},
bytesToWords: function(e) {
for (var t = [], a = 0, n = 0; a < e.length; a++,
n += 8)
t |= e << 24 - n % 32;
return t
},
wordsToBytes: function(e) {
for (var t = [], a = 0; a < 32 * e.length; a += 8)
t.push(e >>> 24 - a % 32 & 255);
return t
},
bytesToHex: function(e) {
for (var t = [], a = 0; a < e.length; a++)
t.push((e >>> 4).toString(16)),
t.push((15 & e).toString(16));
return t.join("")
},
hexToBytes: function(e) {
for (var t = [], a = 0; a < e.length; a += 2)
t.push(parseInt(e.substr(a, 2), 16));
return t
},
bytesToBase64: function(e) {
for (var t = [], n = 0; n < e.length; n += 3)
for (var i = e << 16 | e << 8 | e, o = 0; o < 4; o++)
8 * n + 6 * o <= 8 * e.length ? t.push(a.charAt(i >>> 6 * (3 - o) & 63)) : t.push("=");
return t.join("")
},
base64ToBytes: function(e) {
e = e.replace(/[^A-Z0-9+\/]/gi, "");
for (var t = [], n = 0, i = 0; n < e.length; i = ++n % 4)
0 != i && t.push((a.indexOf(e.charAt(n - 1)) & Math.pow(2, -2 * i + 8) - 1) << 2 * i | a.indexOf(e.charAt(n)) >>> 6 - 2 * i);
return t
}
},
i = a56.utf8,
o = null,
s = a56.bin,
(r = function(e, t) {
e.constructor == String ? e = t && "binary" === t.encoding ? s.stringToBytes(e) : i.stringToBytes(e) : o(e) ? e = Array.prototype.slice.call(e, 0) : Array.isArray(e) || (e = e.toString());
for (var a = n.bytesToWords(e), l = 8 * e.length, c = 1732584193, d = -271733879, p = -1732584194, u = 271733878, m = 0; m < a.length; m++)
a = 16711935 & (a << 8 | a >>> 24) | 4278255360 & (a << 24 | a >>> 8);
a |= 128 << l % 32,
a = l;
var f = r._ff
, h = r._gg
, v = r._hh
, g = r._ii;
for (m = 0; m < a.length; m += 16) {
var y = c
, _ = d
, b = p
, $ = u;
d = g(d = g(d = g(d = g(d = v(d = v(d = v(d = v(d = h(d = h(d = h(d = h(d = f(d = f(d = f(d = f(d, p = f(p, u = f(u, c = f(c, d, p, u, a, 7, -680876936), d, p, a, 12, -389564586), c, d, a, 17, 606105819), u, c, a, 22, -1044525330), p = f(p, u = f(u, c = f(c, d, p, u, a, 7, -176418897), d, p, a, 12, 1200080426), c, d, a, 17, -1473231341), u, c, a, 22, -45705983), p = f(p, u = f(u, c = f(c, d, p, u, a, 7, 1770035416), d, p, a, 12, -1958414417), c, d, a, 17, -42063), u, c, a, 22, -1990404162), p = f(p, u = f(u, c = f(c, d, p, u, a, 7, 1804603682), d, p, a, 12, -40341101), c, d, a, 17, -1502002290), u, c, a, 22, 1236535329), p = h(p, u = h(u, c = h(c, d, p, u, a, 5, -165796510), d, p, a, 9, -1069501632), c, d, a, 14, 643717713), u, c, a, 20, -373897302), p = h(p, u = h(u, c = h(c, d, p, u, a, 5, -701558691), d, p, a, 9, 38016083), c, d, a, 14, -660478335), u, c, a, 20, -405537848), p = h(p, u = h(u, c = h(c, d, p, u, a, 5, 568446438), d, p, a, 9, -1019803690), c, d, a, 14, -187363961), u, c, a, 20, 1163531501), p = h(p, u = h(u, c = h(c, d, p, u, a, 5, -1444681467), d, p, a, 9, -51403784), c, d, a, 14, 1735328473), u, c, a, 20, -1926607734), p = v(p, u = v(u, c = v(c, d, p, u, a, 4, -378558), d, p, a, 11, -2022574463), c, d, a, 16, 1839030562), u, c, a, 23, -35309556), p = v(p, u = v(u, c = v(c, d, p, u, a, 4, -1530992060), d, p, a, 11, 1272893353), c, d, a, 16, -155497632), u, c, a, 23, -1094730640), p = v(p, u = v(u, c = v(c, d, p, u, a, 4, 681279174), d, p, a, 11, -358537222), c, d, a, 16, -722521979), u, c, a, 23, 76029189), p = v(p, u = v(u, c = v(c, d, p, u, a, 4, -640364487), d, p, a, 11, -421815835), c, d, a, 16, 530742520), u, c, a, 23, -995338651), p = g(p, u = g(u, c = g(c, d, p, u, a, 6, -198630844), d, p, a, 10, 1126891415), c, d, a, 15, -1416354905), u, c, a, 21, -57434055), p = g(p, u = g(u, c = g(c, d, p, u, a, 6, 1700485571), d, p, a, 10, -1894986606), c, d, a, 15, -1051523), u, c, a, 21, -2054922799), p = g(p, u = g(u, c = g(c, d, p, u, a, 6, 
1873313359), d, p, a, 10, -30611744), c, d, a, 15, -1560198380), u, c, a, 21, 1309151649), p = g(p, u = g(u, c = g(c, d, p, u, a, 6, -145523070), d, p, a, 10, -1120210379), c, d, a, 15, 718787259), u, c, a, 21, -343485551),
c = c + y >>> 0,
d = d + _ >>> 0,
p = p + b >>> 0,
u = u + $ >>> 0
}
return n.endian()
}
)._ff = function(e, t, a, n, i, o, s) {
var r = e + (t & a | ~t & n) + (i >>> 0) + s;
return (r << o | r >>> 32 - o) + t
}
,
r._gg = function(e, t, a, n, i, o, s) {
var r = e + (t & n | a & ~n) + (i >>> 0) + s;
return (r << o | r >>> 32 - o) + t
}
,
r._hh = function(e, t, a, n, i, o, s) {
var r = e + (t ^ a ^ n) + (i >>> 0) + s;
return (r << o | r >>> 32 - o) + t
}
,
r._ii = function(e, t, a, n, i, o, s) {
var r = e + (a ^ (t | ~n)) + (i >>> 0) + s;
return (r << o | r >>> 32 - o) + t
}
,
r._blocksize = 16,
r._digestsize = 16;
var a = n.wordsToBytes(r(e, t));
return t && t.asBytes ? a : t && t.asString ? s.bytesToString(a) : n.bytesToHex(a);
};
// Run it: print the antitoken computed for e.
console.log(antitoken(e))
如果你是新同学,长按下面二维码 - 识别图中二维码 - 关注,就可以每天一起学Python了。
页:
[1]