python爬虫简单js逆向(破译js)

很多同学由于学习任务需要用爬虫获取数据,学习了python爬虫的基础知识。但是在开始写爬虫程序的时候就出现了问题。我现将一些同学的爬虫获取数据的解决方式记录下来,供大家参考。

希望本文对你有所帮助~~如果对软件测试、接口测试、自动化测试、面试经验交流感兴趣可以私聊我。免费领取最新软件测试大厂面试资料和Python自动化、接口、框架搭建学习资料!技术大牛解惑答疑,同行一起交流。


网站分析

既然选定了目标,那就开始抓包,分析网页。



这是抓包后的情况,通过对多个包进行比较发现,实际变化的参数只有lastId、nonce、timestamp、pageNum。具体分析了一下,lastId:前一页最后一个发言用户的tid;pageNum:当前页码;timestamp:13位时间戳;nonce:不知道是什么,但是看他长了一副加密的脸。至此,目标就很清晰了,重点针对nonce。

js逆向分析



通过全局搜索,断点定位,最终将目标锁定在这一行代码上。

e.params.nonce = Object(u["md5"])(t + "" + parseInt(1e7 * Math.random(), 10) + 1, 32)

通过观察发现,这一行代码的最终输出结果正是我们今天的目标。

分析一下代码:

“t”:13位的时间戳

“+ "" +”:为将时间戳转为字符串

“1e7” :10000000,固定值

“Math.random()”:随机数

“parseInt”:取整

“t + "" + parseInt(1e7 * Math.random(), 10) + 1”,这里的意思就很明显了:由于 t 已经先和空字符串拼接成了字符串,后面的两个“+”都是字符串拼接而不是数值相加,即 时间戳 + 取整后的随机数 + 字符"1"。随机数取整后最多7位,所以最终结果是一个最长21位的数字字符串(随机数较小时会不足21位)。

继续分析Object(u["md5"])和32发现,这里是调用了u的[md5]方法,将前面的21位字符串和32作为参数,传给了MD5,网页源代码如下:

e.md5 = function(e, t) {
function n(e, t) {
return e << t | e >>> 32 - t
}
function i(e, t) {
var n, i, a, r, o;
return a = 2147483648 & e,
r = 2147483648 & t,
n = 1073741824 & e,
i = 1073741824 & t,
o = (1073741823 & e) + (1073741823 & t),
n & i ? 2147483648 ^ o ^ a ^ r : n | i ? 1073741824 & o ? 3221225472 ^ o ^ a ^ r : 1073741824 ^ o ^ a ^ r : o ^ a ^ r
}
…………//此处省略

目前为止,思路已经很清晰了,这里我们可以通过Python实现MD5加密,也可以抠源代码改写。为了保证百分百不出错,这里我选择了抠代码。


js代码改写

首先是源代码:

// MD5 routine as quoted from the site's bundle; it is attached as the `md5` property of an
// outer object `e` that is defined outside this excerpt.
// Parameters: e - the message string to hash; t - output selector (see the final return).
// Returns a lowercase hexadecimal string.
e.md5 = function(e, t) {
// Rotate the 32-bit value e left by t bits.
function n(e, t) {
return e << t | e >>> 32 - t
}
// Add e and t modulo 2^32. Only the low 30 bits are summed arithmetically; bits 30 and 31
// (masks 1073741824 and 2147483648) are folded back in with XOR so the sum never leaves
// the 32-bit range that the bitwise operators work on.
function i(e, t) {
var n, i, a, r, o;
return a = 2147483648 & e,
r = 2147483648 & t,
n = 1073741824 & e,
i = 1073741824 & t,
o = (1073741823 & e) + (1073741823 & t),
n & i ? 2147483648 ^ o ^ a ^ r : n | i ? 1073741824 & o ? 3221225472 ^ o ^ a ^ r : 1073741824 ^ o ^ a ^ r : o ^ a ^ r
}
// Round-1 step: e += (t & a | ~t & r) + o + l, then rotate left by s and add t.
// o is a message word, l a per-step constant, s the rotation amount.
function a(e, t, a, r, o, s, l) {
return e = i(e, i(i(function(e, t, n) {
return e & t | ~e & n
}(t, a, r), o), l)),
i(n(e, s), t)
}
// Round-2 step: same shape as above with the mix (t & r | a & ~r).
function r(e, t, a, r, o, s, l) {
return e = i(e, i(i(function(e, t, n) {
return e & n | t & ~n
}(t, a, r), o), l)),
i(n(e, s), t)
}
// Round-3 step: same shape with the mix (t ^ a ^ r).
function o(e, t, a, r, o, s, l) {
return e = i(e, i(i(function(e, t, n) {
return e ^ t ^ n
}(t, a, r), o), l)),
i(n(e, s), t)
}
// Round-4 step: same shape with the mix (a ^ (t | ~r)).
function s(e, t, a, r, o, s, l) {
return e = i(e, i(i(function(e, t, n) {
return t ^ (e | ~n)
}(t, a, r), o), l)),
i(n(e, s), t)
}
// Format the 32-bit word e as 8 hex characters, lowest byte first, each byte zero-padded
// to two digits.
function l(e) {
var t, n = "", i = "";
for (t = 0; t <= 3; t++)
n += (i = "0" + (e >>> 8 * t & 255).toString(16)).substr(i.length - 2, 2);
return n
}
// h, f, v, y hold the four 32-bit state words; u, p, m, d keep their values from the start
// of the current block; b is the padded message as an array of 32-bit words.
var c, u, p, m, d, h, f, v, y, g = e, b = Array();
// The for-initialiser builds b with the inline function below and seeds the state words;
// the loop then consumes b sixteen words (one 64-byte block) at a time.
for (b = function(e) {
// Pack the characters into words, four per word with the first character in the low byte.
// a is the padded length in words (a multiple of 16 that leaves room for the marker and
// the length). charCodeAt values are OR-ed in as-is with no UTF-8 encoding, so code units
// above 255 spill into neighbouring byte positions.
for (var t, n = e.length, i = n + 8, a = 16 * ((i - i % 64) / 64 + 1), r = Array(a - 1), o = 0, s = 0; s < n; )
o = s % 4 * 8,
r[t = (s - s % 4) / 4] = r[t] | e.charCodeAt(s) << o,
s++;
// Append the 0x80 marker right after the last character, then store the message length in
// bits in the final two words (low word first).
return t = (s - s % 4) / 4,
o = s % 4 * 8,
r[t] = r[t] | 128 << o,
r[a - 2] = n << 3,
r[a - 1] = n >>> 29,
r
}(g),
h = 1732584193,
f = 4023233417,
v = 2562383102,
y = 271733878,
c = 0; c < b.length; c += 16)
// Loop body (one comma expression): save the state, run the 64 steps, add the saved state back.
u = h,
p = f,
m = v,
d = y,
// The 64 steps are nested as call arguments. Each inner assignment (h = ..., y = ..., v = ...,
// f = ...) must complete before the arguments to its right are read, so the result depends
// on JavaScript's left-to-right argument evaluation order.
f = s(f = s(f = s(f = s(f = o(f = o(f = o(f = o(f = r(f = r(f = r(f = r(f = a(f = a(f = a(f = a(f, v = a(v, y = a(y, h = a(h, f, v, y, b[c + 0], 7, 3614090360), f, v, b[c + 1], 12, 3905402710), h, f, b[c + 2], 17, 606105819), y, h, b[c + 3], 22, 3250441966), v = a(v, y = a(y, h = a(h, f, v, y, b[c + 4], 7, 4118548399), f, v, b[c + 5], 12, 1200080426), h, f, b[c + 6], 17, 2821735955), y, h, b[c + 7], 22, 4249261313), v = a(v, y = a(y, h = a(h, f, v, y, b[c + 8], 7, 1770035416), f, v, b[c + 9], 12, 2336552879), h, f, b[c + 10], 17, 4294925233), y, h, b[c + 11], 22, 2304563134), v = a(v, y = a(y, h = a(h, f, v, y, b[c + 12], 7, 1804603682), f, v, b[c + 13], 12, 4254626195), h, f, b[c + 14], 17, 2792965006), y, h, b[c + 15], 22, 1236535329), v = r(v, y = r(y, h = r(h, f, v, y, b[c + 1], 5, 4129170786), f, v, b[c + 6], 9, 3225465664), h, f, b[c + 11], 14, 643717713), y, h, b[c + 0], 20, 3921069994), v = r(v, y = r(y, h = r(h, f, v, y, b[c + 5], 5, 3593408605), f, v, b[c + 10], 9, 38016083), h, f, b[c + 15], 14, 3634488961), y, h, b[c + 4], 20, 3889429448), v = r(v, y = r(y, h = r(h, f, v, y, b[c + 9], 5, 568446438), f, v, b[c + 14], 9, 3275163606), h, f, b[c + 3], 14, 4107603335), y, h, b[c + 8], 20, 1163531501), v = r(v, y = r(y, h = r(h, f, v, y, b[c + 13], 5, 2850285829), f, v, b[c + 2], 9, 4243563512), h, f, b[c + 7], 14, 1735328473), y, h, b[c + 12], 20, 2368359562), v = o(v, y = o(y, h = o(h, f, v, y, b[c + 5], 4, 4294588738), f, v, b[c + 8], 11, 2272392833), h, f, b[c + 11], 16, 1839030562), y, h, b[c + 14], 23, 4259657740), v = o(v, y = o(y, h = o(h, f, v, y, b[c + 1], 4, 2763975236), f, v, b[c + 4], 11, 1272893353), h, f, b[c + 7], 16, 4139469664), y, h, b[c + 10], 23, 3200236656), v = o(v, y = o(y, h = o(h, f, v, y, b[c + 13], 4, 681279174), f, v, b[c + 0], 11, 3936430074), h, f, b[c + 3], 16, 3572445317), y, h, b[c + 6], 23, 76029189), v = o(v, y = o(y, h = o(h, f, v, y, b[c + 9], 4, 3654602809), f, v, b[c + 12], 11, 3873151461), h, f, b[c + 15], 16, 
530742520), y, h, b[c + 2], 23, 3299628645), v = s(v, y = s(y, h = s(h, f, v, y, b[c + 0], 6, 4096336452), f, v, b[c + 7], 10, 1126891415), h, f, b[c + 14], 15, 2878612391), y, h, b[c + 5], 21, 4237533241), v = s(v, y = s(y, h = s(h, f, v, y, b[c + 12], 6, 1700485571), f, v, b[c + 3], 10, 2399980690), h, f, b[c + 10], 15, 4293915773), y, h, b[c + 1], 21, 2240044497), v = s(v, y = s(y, h = s(h, f, v, y, b[c + 8], 6, 1873313359), f, v, b[c + 15], 10, 4264355552), h, f, b[c + 6], 15, 2734768916), y, h, b[c + 13], 21, 1309151649), v = s(v, y = s(y, h = s(h, f, v, y, b[c + 4], 6, 4149444226), f, v, b[c + 11], 10, 3174756917), h, f, b[c + 2], 15, 718787259), y, h, b[c + 9], 21, 3951481745),
h = i(h, u),
f = i(f, p),
v = i(v, m),
y = i(y, d);
// `32 == t` is a loose comparison, so both the number 32 and the string "32" select the full
// 32-character digest; any other t returns only the two middle words (16 hex characters).
return 32 == t ? l(h) + l(f) + l(v) + l(y) : l(f) + l(v)
}

其次是改写后的代码,这里遵循的是改得越少越好的原则:

/**
 * Standalone copy of the site's MD5 routine, declared as a plain function so it can be
 * loaded from a .js file and called by name.
 *
 * @param {string} e - Message to hash. Characters are read with charCodeAt and are not
 *   UTF-8 encoded.
 * @param {number|string} t - Output selector: a value loosely equal to 32 returns the full
 *   32-character digest, anything else returns the middle 16 characters.
 * @returns {string} Lowercase hexadecimal digest.
 */
function MD5 (e, t) {
// Rotate the 32-bit value e left by t bits.
function n(e, t) {
return e << t | e >>> 32 - t
}

// Add e and t modulo 2^32. Only the low 30 bits are summed arithmetically; bits 30 and 31
// (masks 1073741824 and 2147483648) are folded back in with XOR so the sum never leaves
// the 32-bit range that the bitwise operators work on.
function i(e, t) {
var n, i, a, r, o;
return a = 2147483648 & e,
r = 2147483648 & t,
n = 1073741824 & e,
i = 1073741824 & t,
o = (1073741823 & e) + (1073741823 & t),
n & i ? 2147483648 ^ o ^ a ^ r : n | i ? 1073741824 & o ? 3221225472 ^ o ^ a ^ r : 1073741824 ^ o ^ a ^ r : o ^ a ^ r
}

// Round-1 step: e += (t & a | ~t & r) + o + l, then rotate left by s and add t.
// o is a message word, l a per-step constant, s the rotation amount.
function a(e, t, a, r, o, s, l) {
return e = i(e, i(i(function(e, t, n) {
return e & t | ~e & n
}(t, a, r), o), l)),
i(n(e, s), t)
}

// Round-2 step: same shape as above with the mix (t & r | a & ~r).
function r(e, t, a, r, o, s, l) {
return e = i(e, i(i(function(e, t, n) {
return e & n | t & ~n
}(t, a, r), o), l)),
i(n(e, s), t)
}

// Round-3 step: same shape with the mix (t ^ a ^ r).
function o(e, t, a, r, o, s, l) {
return e = i(e, i(i(function(e, t, n) {
return e ^ t ^ n
}(t, a, r), o), l)),
i(n(e, s), t)
}

// Round-4 step: same shape with the mix (a ^ (t | ~r)).
function s(e, t, a, r, o, s, l) {
return e = i(e, i(i(function(e, t, n) {
return t ^ (e | ~n)
}(t, a, r), o), l)),
i(n(e, s), t)
}

// Format the 32-bit word e as 8 hex characters, lowest byte first, each byte zero-padded
// to two digits.
function l(e) {
var t, n = "",
i = "";
for (t = 0; t <= 3; t++)
n += (i = "0" + (e >>> 8 * t & 255).toString(16)).substr(i.length - 2, 2);
return n
}
// h, f, v, y hold the four 32-bit state words; u, p, m, d keep their values from the start
// of the current block; b is the padded message as an array of 32-bit words.
var c, u, p, m, d, h, f, v, y, g = e,
b = Array();
// The for-initialiser builds b with the inline function below and seeds the state words;
// the loop then consumes b sixteen words (one 64-byte block) at a time.
for (b = function(e) {
// Pack the characters into words, four per word with the first character in the low byte.
// a is the padded length in words (a multiple of 16 that leaves room for the marker and
// the length). Code units above 255 spill into neighbouring byte positions.
for (var t, n = e.length, i = n + 8, a = 16 * ((i - i % 64) / 64 + 1), r = Array(a - 1), o = 0, s = 0; s < n;)
o = s % 4 * 8,
r[t = (s - s % 4) / 4] = r[t] | e.charCodeAt(s) << o,
s++;
// Append the 0x80 marker right after the last character, then store the message length in
// bits in the final two words (low word first).
return t = (s - s % 4) / 4,
o = s % 4 * 8,
r[t] = r[t] | 128 << o,
r[a - 2] = n << 3,
r[a - 1] = n >>> 29,
r
}(g),
h = 1732584193,
f = 4023233417,
v = 2562383102,
y = 271733878,
c = 0; c < b.length; c += 16)
// Loop body (one comma expression): save the state, run the 64 steps, add the saved state back.
u = h,
p = f,
m = v,
d = y,
// The 64 steps are nested as call arguments. Each inner assignment (h = ..., y = ..., v = ...,
// f = ...) must complete before the arguments to its right are read, so the result depends
// on JavaScript's left-to-right argument evaluation order.
f = s(f = s(f = s(f = s(f = o(f = o(f = o(f = o(f = r(f = r(f = r(f = r(f = a(f = a(f = a(f = a(f, v = a(v, y = a(y, h = a(h, f, v, y, b[c + 0], 7, 3614090360), f, v, b[c + 1], 12, 3905402710), h, f, b[c + 2], 17, 606105819), y, h, b[c + 3], 22, 3250441966), v = a(v, y = a(y, h = a(h, f, v, y, b[c + 4], 7, 4118548399), f, v, b[c + 5], 12, 1200080426), h, f, b[c + 6], 17, 2821735955), y, h, b[c + 7], 22, 4249261313), v = a(v, y = a(y, h = a(h, f, v, y, b[c + 8], 7, 1770035416), f, v, b[c + 9], 12, 2336552879), h, f, b[c + 10], 17, 4294925233), y, h, b[c + 11], 22, 2304563134), v = a(v, y = a(y, h = a(h, f, v, y, b[c + 12], 7, 1804603682), f, v, b[c + 13], 12, 4254626195), h, f, b[c + 14], 17, 2792965006), y, h, b[c + 15], 22, 1236535329), v = r(v, y = r(y, h = r(h, f, v, y, b[c + 1], 5, 4129170786), f, v, b[c + 6], 9, 3225465664), h, f, b[c + 11], 14, 643717713), y, h, b[c + 0], 20, 3921069994), v = r(v, y = r(y, h = r(h, f, v, y, b[c + 5], 5, 3593408605), f, v, b[c + 10], 9, 38016083), h, f, b[c + 15], 14, 3634488961), y, h, b[c + 4], 20, 3889429448), v = r(v, y = r(y, h = r(h, f, v, y, b[c + 9], 5, 568446438), f, v, b[c + 14], 9, 3275163606), h, f, b[c + 3], 14, 4107603335), y, h, b[c + 8], 20, 1163531501), v = r(v, y = r(y, h = r(h, f, v, y, b[c + 13], 5, 2850285829), f, v, b[c + 2], 9, 4243563512), h, f, b[c + 7], 14, 1735328473), y, h, b[c + 12], 20, 2368359562), v = o(v, y = o(y, h = o(h, f, v, y, b[c + 5], 4, 4294588738), f, v, b[c + 8], 11, 2272392833), h, f, b[c + 11], 16, 1839030562), y, h, b[c + 14], 23, 4259657740), v = o(v, y = o(y, h = o(h, f, v, y, b[c + 1], 4, 2763975236), f, v, b[c + 4], 11, 1272893353), h, f, b[c + 7], 16, 4139469664), y, h, b[c + 10], 23, 3200236656), v = o(v, y = o(y, h = o(h, f, v, y, b[c + 13], 4, 681279174), f, v, b[c + 0], 11, 3936430074), h, f, b[c + 3], 16, 3572445317), y, h, b[c + 6], 23, 76029189), v = o(v, y = o(y, h = o(h, f, v, y, b[c + 9], 4, 3654602809), f, v, b[c + 12], 11, 3873151461), h, f, b[c + 15], 16, 
530742520), y, h, b[c + 2], 23, 3299628645), v = s(v, y = s(y, h = s(h, f, v, y, b[c + 0], 6, 4096336452), f, v, b[c + 7], 10, 1126891415), h, f, b[c + 14], 15, 2878612391), y, h, b[c + 5], 21, 4237533241), v = s(v, y = s(y, h = s(h, f, v, y, b[c + 12], 6, 1700485571), f, v, b[c + 3], 10, 2399980690), h, f, b[c + 10], 15, 4293915773), y, h, b[c + 1], 21, 2240044497), v = s(v, y = s(y, h = s(h, f, v, y, b[c + 8], 6, 1873313359), f, v, b[c + 15], 10, 4264355552), h, f, b[c + 6], 15, 2734768916), y, h, b[c + 13], 21, 1309151649), v = s(v, y = s(y, h = s(h, f, v, y, b[c + 4], 6, 4149444226), f, v, b[c + 11], 10, 3174756917), h, f, b[c + 2], 15, 718787259), y, h, b[c + 9], 21, 3951481745),
h = i(h, u),
f = i(f, p),
v = i(v, m),
y = i(y, d);
// `32 == t` is a loose comparison, so both the number 32 and the string "32" select the full
// 32-character digest; any other t returns only the two middle words (16 hex characters).
return 32 == t ? l(h) + l(f) + l(v) + l(y) : l(f) + l(v)
}

经过测试,代码能完美实现我想要的功能。



然后将代码保存为.js文件(后面的Python代码通过 open("1.js") 读取它,所以文件名取 1.js,并放在运行Python脚本时的当前目录下)。

Python代码编写

常规操作

import requests
import random
import execjs
import json
import pandas as pd
import time

# Endpoint that main() sends its POST request to.
url = 'https://bbs.vivo.com.cn/api/community/forum/threads'
# Request headers sent with that POST.
# NOTE(review): 'sec-ch-ua' advertises Chromium v="21" while 'user-agent' says Chrome/95 --
# confirm whether the mismatch matters to the server.
# NOTE(review): the 'cookie' value is hard-coded; presumably it was copied from a captured
# browser session and needs replacing with your own -- confirm before reuse.
headers = {
'accept': 'application/json, text/plain, */*',
'content-type': 'application/json;charset=UTF-8',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4621.0 Safari/537.36',
'sec-ch-ua': '"Chromium";v="21", " Not;A Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'origin': 'https://bbs.vivo.com.cn',
'referer': 'https://bbs.vivo.com.cn/newbbs/forum/9',
'cookie': 'cookieId=e1c6727a-9b29-1c13-a417-1b74440b9d521639290997482; KL9d_2132_saltkey=pU2Rr4AV; KL9d_2132_lastvisit=1639287439; Hm_lvt_9ef7debb81babe8b94af7f2c274869fd=1639291140,1639713347; Hm_lvt_a7471116b9007c038d41873ab9121a9e=1639291040,1639713440; sessionId=b6c66b37-b88e-f74d-fa6b-b7e526d5e5f7'
}

这里虽然导入了好多包,但其实都是根据使用需要一个个导入的。


Python生成js需要的参数

def get_timestamp():
    """Return the current Unix time in whole milliseconds.

    Returns:
        int: ``time.time()`` scaled to milliseconds and truncated; 13 digits
        for present-day dates.
    """
    return int(time.time() * 1000)


def get_str_():
    """Build the string that is hashed into the ``nonce`` request parameter.

    Mirrors the site's ``t + "" + parseInt(1e7 * Math.random(), 10) + 1``:
    millisecond timestamp, then a random integer in [0, 10000000), then the
    character ``'1'``, all concatenated as text.

    Returns:
        str: a digit string of at most 21 characters.
    """
    # Truncate numerically. Slicing the float's text form breaks when the
    # value prints in scientific notation (e.g. '1.2345678e-05'[:10] is
    # '1.2345678e', which float() rejects).
    num = int(random.random() * 10000000)
    return str(get_timestamp()) + str(num) + '1'

这里分别生成时间戳和21位拼接字符串

导入js文件,获取最重要的参数nonce

def get_cxt():
    """Load ``1.js`` from the current working directory and compile it with execjs.

    Returns:
        The compiled execjs context, whose ``call`` method invokes functions
        defined in the file.

    Raises:
        OSError: if ``1.js`` cannot be opened.
    """
    # Explicit encoding so the read does not depend on the platform's locale default.
    with open("1.js", encoding="utf-8") as file:
        cxt = execjs.compile(file.read())
    return cxt


def get_nonce():
    """Compute the ``nonce`` request parameter.

    Hashes the string from ``get_str_()`` with the JavaScript MD5 routine
    loaded by ``get_cxt()``.

    Returns:
        str: the 32-character hexadecimal digest.
    """
    # The function saved in the .js file is declared as `MD5` (upper case), and
    # JavaScript names are case-sensitive, so calling 'md5' fails with an
    # undefined-name error. 32 selects the full-length digest, as in the
    # site's own call.
    nonce = get_cxt().call('MD5', get_str_(), 32)
    return nonce

获取data:万事俱备,下一步生成data,这里我选择了第一页作为测试。

def get_data():
    """Build the JSON request body for page 1 of forum 9's thread list.

    ``lastId`` is left empty because there is no previous page.

    Returns:
        dict: payload to be serialised with ``json.dumps`` and POSTed.
    """
    # NOTE(review): nonce and timestamp come from two separate clock reads
    # here, whereas the site's JS builds the nonce from a timestamp variable
    # `t`; confirm whether the server expects them to match.
    data = {
        'forumId': "9",
        'imgSpecs': ["t577x324", "t577x4096"],
        'lastId': "",
        'nonce': get_nonce(),
        'order': '1',
        'pageNum': '1',
        'pageSize': '10',
        'timestamp': get_timestamp(),
        'topicId': ""
    }
    return data

发起请求,拿到数据

def main():
    """Request page 1 of the thread list and extract the fields of interest.

    Returns:
        list[dict]: one dict per thread with the keys ``'bbsname'``,
        ``'name'``, ``'summary'`` and ``'tid'``.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
        KeyError: if the response JSON lacks the expected fields.
    """
    # A timeout keeps a stalled connection from hanging the script, and
    # raise_for_status reports an HTTP error as such rather than as a
    # confusing JSON or key error further down.
    response = requests.post(url, headers=headers, data=json.dumps(get_data()), timeout=10)
    response.raise_for_status()
    datss = json.loads(response.text)['data']['list']
    return [
        {
            'bbsname': data['author']['bbsName'],
            'name': data['forum']['name'],
            'summary': data['summary'],
            'tid': data['tid']
        }
        for data in datss
    ]


if __name__ == '__main__':
    # Tabulate the scraped threads, print the table, and save it as an Excel workbook.
    df = pd.DataFrame(main())
    # Optional: uncomment to number the rows from 1 instead of 0.
    # df.index = df.index + 1
    print(df)
    df.to_excel('手机圈子0.xlsx')

全部代码展示

import requests
import random
import execjs
import json
import pandas as pd
import time

# Endpoint that main() sends its POST request to.
url = 'https://bbs.vivo.com.cn/api/community/forum/threads'
# Request headers sent with that POST.
# NOTE(review): 'sec-ch-ua' advertises Chromium v="21" while 'user-agent' says Chrome/95 --
# confirm whether the mismatch matters to the server.
# NOTE(review): the 'cookie' value is hard-coded; presumably it was copied from a captured
# browser session and needs replacing with your own -- confirm before reuse.
headers = {
'accept': 'application/json, text/plain, */*',
'content-type': 'application/json;charset=UTF-8',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4621.0 Safari/537.36',
'sec-ch-ua': '"Chromium";v="21", " Not;A Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'origin': 'https://bbs.vivo.com.cn',
'referer': 'https://bbs.vivo.com.cn/newbbs/forum/9',
'cookie': 'cookieId=e1c6727a-9b29-1c13-a417-1b74440b9d521639290997482; KL9d_2132_saltkey=pU2Rr4AV; KL9d_2132_lastvisit=1639287439; Hm_lvt_9ef7debb81babe8b94af7f2c274869fd=1639291140,1639713347; Hm_lvt_a7471116b9007c038d41873ab9121a9e=1639291040,1639713440; sessionId=b6c66b37-b88e-f74d-fa6b-b7e526d5e5f7'
}


def get_timestamp():
    """Return the current Unix time in whole milliseconds.

    Returns:
        int: ``time.time()`` scaled to milliseconds and truncated; 13 digits
        for present-day dates.
    """
    return int(time.time() * 1000)


def get_str_():
    """Build the string that is hashed into the ``nonce`` request parameter.

    Mirrors the site's ``t + "" + parseInt(1e7 * Math.random(), 10) + 1``:
    millisecond timestamp, then a random integer in [0, 10000000), then the
    character ``'1'``, all concatenated as text.

    Returns:
        str: a digit string of at most 21 characters.
    """
    # Truncate numerically. Slicing the float's text form breaks when the
    # value prints in scientific notation (e.g. '1.2345678e-05'[:10] is
    # '1.2345678e', which float() rejects).
    num = int(random.random() * 10000000)
    return str(get_timestamp()) + str(num) + '1'


def get_cxt():
    """Load ``1.js`` from the current working directory and compile it with execjs.

    Returns:
        The compiled execjs context, whose ``call`` method invokes functions
        defined in the file.

    Raises:
        OSError: if ``1.js`` cannot be opened.
    """
    # Explicit encoding so the read does not depend on the platform's locale default.
    with open("1.js", encoding="utf-8") as file:
        cxt = execjs.compile(file.read())
    return cxt


def get_nonce():
    """Compute the ``nonce`` request parameter.

    Hashes the string from ``get_str_()`` with the JavaScript MD5 routine
    loaded by ``get_cxt()``.

    Returns:
        str: the 32-character hexadecimal digest.
    """
    # The function saved in the .js file is declared as `MD5` (upper case), and
    # JavaScript names are case-sensitive, so calling 'md5' fails with an
    # undefined-name error. 32 selects the full-length digest, as in the
    # site's own call.
    nonce = get_cxt().call('MD5', get_str_(), 32)
    return nonce


def get_data():
    """Build the JSON request body for page 1 of forum 9's thread list.

    ``lastId`` is left empty because there is no previous page.

    Returns:
        dict: payload to be serialised with ``json.dumps`` and POSTed.
    """
    # NOTE(review): nonce and timestamp come from two separate clock reads
    # here, whereas the site's JS builds the nonce from a timestamp variable
    # `t`; confirm whether the server expects them to match.
    data = {
        'forumId': "9",
        'imgSpecs': ["t577x324", "t577x4096"],
        'lastId': "",
        'nonce': get_nonce(),
        'order': '1',
        'pageNum': '1',
        'pageSize': '10',
        'timestamp': get_timestamp(),
        'topicId': ""
    }
    return data


def main():
    """Request page 1 of the thread list and extract the fields of interest.

    Returns:
        list[dict]: one dict per thread with the keys ``'bbsname'``,
        ``'name'``, ``'summary'`` and ``'tid'``.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
        KeyError: if the response JSON lacks the expected fields.
    """
    # A timeout keeps a stalled connection from hanging the script, and
    # raise_for_status reports an HTTP error as such rather than as a
    # confusing JSON or key error further down.
    response = requests.post(url, headers=headers, data=json.dumps(get_data()), timeout=10)
    response.raise_for_status()
    datss = json.loads(response.text)['data']['list']
    return [
        {
            'bbsname': data['author']['bbsName'],
            'name': data['forum']['name'],
            'summary': data['summary'],
            'tid': data['tid']
        }
        for data in datss
    ]


if __name__ == '__main__':
    # Tabulate the scraped threads, print the table, and save it as an Excel workbook.
    df = pd.DataFrame(main())
    # Optional: uncomment to number the rows from 1 instead of 0.
    # df.index = df.index + 1
    print(df)
    df.to_excel('手机圈子0.xlsx')


成果展示



本文参考整理自博客:爬遍天下无敌手,如有侵权,请联系删除。

希望本文对你有所帮助~~如果对软件测试、接口测试、自动化测试、面试经验交流感兴趣可以私聊我。免费领取最新软件测试大厂面试资料和Python自动化、接口、框架搭建学习资料!技术大牛解惑答疑,同行一起交流。

代做工资流水公司泉州工资流水app截图代开常德开企业对公流水滁州企业对公流水报价邯郸工作收入证明开具深圳打印房贷银行流水邢台银行流水图片海口工资代付流水费用潮州查银行流水电子版潮州做企业流水打印银行流水报价广州自存银行流水开具徐州对公账户流水图片赣州对公流水绍兴入职银行流水查询济宁打背调工资流水湘潭打收入证明湖州办理工资流水app截图湛江办流水滁州在职证明报价肇庆开购房银行流水潍坊银行流水账单代办揭阳房贷工资流水 模板广州企业流水打印费用太原公司流水办理宁波制作薪资流水单查流水账单菏泽查自存流水洛阳转账银行流水报价阜阳代开车贷银行流水邢台工资流水单公司香港通过《维护国家安全条例》两大学生合买彩票中奖一人不认账让美丽中国“从细节出发”19岁小伙救下5人后溺亡 多方发声卫健委通报少年有偿捐血浆16次猝死汪小菲曝离婚始末何赛飞追着代拍打雅江山火三名扑火人员牺牲系谣言男子被猫抓伤后确诊“猫抓病”周杰伦一审败诉网易中国拥有亿元资产的家庭达13.3万户315晚会后胖东来又人满为患了高校汽车撞人致3死16伤 司机系学生张家界的山上“长”满了韩国人?张立群任西安交通大学校长手机成瘾是影响睡眠质量重要因素网友洛杉矶偶遇贾玲“重生之我在北大当嫡校长”单亲妈妈陷入热恋 14岁儿子报警倪萍分享减重40斤方法杨倩无缘巴黎奥运考生莫言也上北大硕士复试名单了许家印被限制高消费奥巴马现身唐宁街 黑色着装引猜测专访95后高颜值猪保姆男孩8年未见母亲被告知被遗忘七年后宇文玥被薅头发捞上岸郑州一火锅店爆改成麻辣烫店西双版纳热带植物园回应蜉蝣大爆发沉迷短剧的人就像掉进了杀猪盘当地回应沈阳致3死车祸车主疑毒驾开除党籍5年后 原水城县长再被查凯特王妃现身!外出购物视频曝光初中生遭15人围殴自卫刺伤3人判无罪事业单位女子向同事水杯投不明物质男子被流浪猫绊倒 投喂者赔24万外国人感慨凌晨的中国很安全路边卖淀粉肠阿姨主动出示声明书胖东来员工每周单休无小长假王树国卸任西安交大校长 师生送别小米汽车超级工厂正式揭幕黑马情侣提车了妈妈回应孩子在校撞护栏坠楼校方回应护栏损坏小学生课间坠楼房客欠租失踪 房东直发愁专家建议不必谈骨泥色变老人退休金被冒领16年 金额超20万西藏招商引资投资者子女可当地高考特朗普无法缴纳4.54亿美元罚金浙江一高校内汽车冲撞行人 多人受伤

代做工资流水公司 XML地图 TXT地图 虚拟主机 SEO 网站制作 网站优化