网址:融资总额近3亿美元、药明康德押注,这家抗衰老明星公司有何过人之处-36氪
想要抓取文章内容,但在页面源码里找不到,可能是文字格式的问题,也可能是文章内容被加密了。
在元素中查看,window.initialState返回的就是加密的内容。那么现在又要开始解密了,准备好了吗。
先进行关键字搜索,打上断点,刷新页面。
哇撒,很明显是AES解密。在控制台打印参数:ne是密钥,re才是密文内容(即window.initialState.state)。
复制代码:
// Excerpt quoted from the site's script; `ee.a` is the crypto object used by that script.
// `ne` is the AES key: the literal string parsed as UTF-8.
var ne = ee.a.enc.Utf8.parse("efabccee-b754-4c");
// `oe` starts as the injected page state, or an empty object when it is absent.
var re, oe = window.initialState || {};
// When the state is flagged as encrypted: `re` takes the ciphertext (`state`), which is
// AES-decrypted with `ne` (ECB mode, PKCS#7 padding), decoded as UTF-8 and parsed as JSON.
oe.isEncrypt && (oe = JSON.parse((re = window.initialState.state,ee.a.AES.decrypt(re, ne, {mode: ee.a.mode.ECB,padding: ee.a.pad.Pkcs7}).toString(ee.a.enc.Utf8).toString())))
看一下这个ee.a,果然是标准库的解密算法,导入标准库,替换掉ee.a即可,泪目,竟然这么简单!
window是啥呢?在浏览器里它是全局对象;在Node里并不存在,所以我们手动构造一个同样结构的对象,把密文放进initialState.state即可!
究极改进代码
const CryptoJS = require('crypto-js');

// Stand-in for the browser's `window`: only `initialState` is needed.
// Paste the ciphertext taken from the page into `state`.
// `window.initialState.state` is the same kind of lookup as the Python
// dict access window['initialState']['state'].
const window = { initialState: { state: "", isEncrypt: true } };

// AES key used by the page (16 UTF-8 characters).
const key = CryptoJS.enc.Utf8.parse("efabccee-b754-4c");
const cipherText = window.initialState.state;

// Decrypt with AES in ECB mode and PKCS#7 padding, then decode the bytes as UTF-8 text.
const plainText = CryptoJS.AES.decrypt(cipherText, key, {
  mode: CryptoJS.mode.ECB,
  padding: CryptoJS.pad.Pkcs7,
}).toString(CryptoJS.enc.Utf8);

console.log(plainText);
结果为:
为了与python代码进行交互,我们对代码稍微修改一下:
const CryptoJS = require('crypto-js');
/**
 * Decrypt the encrypted page state and return it as a parsed object.
 *
 * Called from Python through execjs, so the name and single-argument
 * signature must stay as they are.
 *
 * @param {string} state - Ciphertext taken from `window.initialState.state` on the page.
 * @returns {*} The value produced by `JSON.parse` on the decrypted text.
 * @throws {SyntaxError} If the decrypted text is not valid JSON.
 */
function get_content(state) {
  // AES key used by the page.
  const key = CryptoJS.enc.Utf8.parse("efabccee-b754-4c");

  // AES / ECB / PKCS#7, decoded as UTF-8 text.
  const plainText = CryptoJS.AES.decrypt(state, key, {
    mode: CryptoJS.mode.ECB,
    padding: CryptoJS.pad.Pkcs7,
  }).toString(CryptoJS.enc.Utf8);

  return JSON.parse(plainText);
}
接着写一个python代码获取密文,与js交互,再取出内容。
import requests
import re
import execjs


def GetResponse(url):
    """Send a GET request to ``url`` with a browser user-agent and return the response.

    :param url: address to fetch.
    :return: the ``requests.Response`` object.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    # A timeout keeps the script from hanging forever on an unresponsive server.
    response = requests.get(url=url, headers=headers, timeout=10)
    return response


def GetContent(link='https://www.36kr.com/p/2729715501638664', js_path='demo11.js'):
    """Fetch an article page, decrypt its state via the JS helper and return the article.

    :param link: article URL to fetch.
    :param js_path: path of the JavaScript file that defines ``get_content``.
    :return: ``(title, content)`` taken from the decrypted state.
    :raises ValueError: if the encrypted state cannot be found in the page.
    """
    html = GetResponse(link).text
    # Parse the page and pull out the ciphertext assigned to window.initialState.
    match = re.search(r'window\.initialState=\{"state":"(.*?)"', html)
    if match is None:
        raise ValueError('encrypted window.initialState not found in ' + link)
    encrypted_data = match.group(1)
    # Read the JS source with a context manager so the file handle is closed.
    with open(js_path, 'r', encoding='utf-8') as js_file:
        json_code = execjs.compile(js_file.read())
    result = json_code.call('get_content', encrypted_data)
    data = result['articleDetail']['articleDetailData']['data']
    title = data['widgetTitle']
    content = data['widgetContent']
    return title, content


if __name__ == '__main__':
    title, content = GetContent()
    print(title)
    print(content)
结果展现:
好了,获取到数据后,我们的目标是保存到pdf的格式,怎么做呢?
首先,把文章内容保存成html
然后,把html文件转成pdf
先要有一个前端的模板
# Minimal HTML page skeleton; the article body replaces {article} via str.format.
html_str = (
    '<!doctype html><html lang="en"><head><meta charset="utf-8">'
    '<title>Document</title></head><body>{article}</body></html>'
)
保存成html代码
import requests
import re
import execjs


def GetResponse(url):
    """Send a GET request to ``url`` with a browser user-agent and return the response.

    :param url: address to fetch.
    :return: the ``requests.Response`` object.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    # A timeout keeps the script from hanging forever on an unresponsive server.
    response = requests.get(url=url, headers=headers, timeout=10)
    return response


def GetContent(link='https://www.36kr.com/p/2729715501638664', js_path='demo11.js'):
    """Fetch an article page, decrypt its state via the JS helper and return the article.

    :param link: article URL to fetch.
    :param js_path: path of the JavaScript file that defines ``get_content``.
    :return: ``(title, content)`` taken from the decrypted state.
    :raises ValueError: if the encrypted state cannot be found in the page.
    """
    html = GetResponse(link).text
    # Parse the page and pull out the ciphertext assigned to window.initialState.
    match = re.search(r'window\.initialState=\{"state":"(.*?)"', html)
    if match is None:
        raise ValueError('encrypted window.initialState not found in ' + link)
    encrypted_data = match.group(1)
    # Read the JS source with a context manager so the file handle is closed.
    with open(js_path, 'r', encoding='utf-8') as js_file:
        json_code = execjs.compile(js_file.read())
    result = json_code.call('get_content', encrypted_data)
    data = result['articleDetail']['articleDetailData']['data']
    title = data['widgetTitle']
    content = data['widgetContent']
    return title, content
# Characters that are not allowed in Windows file names are replaced by '_'.
_FORBIDDEN_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def _safe_filename(title):
    """Return ``title`` with characters that are invalid in file names replaced by '_'."""
    return title.translate(_FORBIDDEN_FILENAME_CHARS)


def Save(title, content):
    """Wrap ``content`` in a minimal HTML page and write it to ``html/<title>.html``.

    :param title: article title, used (after sanitising) as the file name.
    :param content: HTML fragment placed inside ``<body>``.
    """
    import os  # local import: the surrounding script does not import os

    html_str = '''<!doctype html><html lang="en"><head><meta charset="utf-8"><title>Document</title></head><body>{article}</body></html>'''
    string = html_str.format(article=content)
    # Create the output directory on first use instead of failing when it is missing.
    os.makedirs('html', exist_ok=True)
    html_file = os.path.join('html', _safe_filename(title) + '.html')
    with open(html_file, mode='w', encoding='utf-8') as f:
        f.write(string)


if __name__ == '__main__':
    title, content = GetContent()
    Save(title, content)
    print(title)
    print(content)
结果展现:
现在开始转变成pdf
import requests
import re
import execjs
import pdfkit


def GetResponse(url):
    """Send a GET request to ``url`` with a browser user-agent and return the response.

    :param url: address to fetch.
    :return: the ``requests.Response`` object.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    # A timeout keeps the script from hanging forever on an unresponsive server.
    response = requests.get(url=url, headers=headers, timeout=10)
    return response


def GetContent(link='https://www.36kr.com/p/2729715501638664', js_path='demo11.js'):
    """Fetch an article page, decrypt its state via the JS helper and return the article.

    :param link: article URL to fetch.
    :param js_path: path of the JavaScript file that defines ``get_content``.
    :return: ``(title, content)`` taken from the decrypted state.
    :raises ValueError: if the encrypted state cannot be found in the page.
    """
    html = GetResponse(link).text
    # Parse the page and pull out the ciphertext assigned to window.initialState.
    match = re.search(r'window\.initialState=\{"state":"(.*?)"', html)
    if match is None:
        raise ValueError('encrypted window.initialState not found in ' + link)
    encrypted_data = match.group(1)
    # Read the JS source with a context manager so the file handle is closed.
    with open(js_path, 'r', encoding='utf-8') as js_file:
        json_code = execjs.compile(js_file.read())
    result = json_code.call('get_content', encrypted_data)
    data = result['articleDetail']['articleDetailData']['data']
    title = data['widgetTitle']
    content = data['widgetContent']
    return title, content
# Characters that are not allowed in Windows file names are replaced by '_'.
_FORBIDDEN_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def _safe_filename(title):
    """Return ``title`` with characters that are invalid in file names replaced by '_'."""
    return title.translate(_FORBIDDEN_FILENAME_CHARS)


def Save(title, content,
         wkhtmltopdf_path=r'C:\Users\86187\PycharmProjects\pythonProject2\wkhtmltoodf\wkhtmltopdf\bin\wkhtmltopdf.exe'):
    """Write the article to ``html/<title>.html`` and convert it to ``pdf/<title>.pdf``.

    :param title: article title, used (after sanitising) as the file name.
    :param content: HTML fragment placed inside ``<body>``.
    :param wkhtmltopdf_path: location of the wkhtmltopdf executable handed to pdfkit.
    """
    import os  # local import: the surrounding script does not import os

    html_str = '''<!doctype html><html lang="en"><head><meta charset="utf-8"><title>Document</title></head><body>{article}</body></html>'''
    string = html_str.format(article=content)
    file_stem = _safe_filename(title)

    # Create the output directories on first use instead of failing when they are missing.
    os.makedirs('html', exist_ok=True)
    html_file = os.path.join('html', file_stem + '.html')
    with open(html_file, mode='w', encoding='utf-8') as f:
        f.write(string)

    os.makedirs('pdf', exist_ok=True)
    pdf_path = os.path.join('pdf', file_stem + '.pdf')
    config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    pdfkit.from_file(html_file, pdf_path, configuration=config)


if __name__ == '__main__':
    title, content = GetContent()
    Save(title, content)
    print(title)
    print(content)
最终成功了
如果想多爬取页面,只要获取各个页面的url就行了。也就是获取每个文章的id,交给你自己去想办法。