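# Forum notice (not part of the program): this post was flagged as a possible violation and collapsed by the system; hide / view this post.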
import requests
from multiprocessing import Pool
import re
import json
from requests.exceptions import ConnectionError
def get_to_html(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded page text when the server answers with status 200,
        otherwise ``None`` (non-200 status, connection failure or timeout).
    """
    # NOTE(review): the User-Agent value looks truncated (unbalanced
    # parenthesis, trailing ';'). It is kept exactly as it was; confirm
    # whether the full browser string was intended.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except (ConnectionError, requests.exceptions.Timeout):
        # A read timeout is not a ConnectionError in requests, so it is
        # listed explicitly now that a timeout is being passed.
        return None
    if response.status_code == 200:
        return response.content.decode("utf-8")
    return None
# One ranked entry of the board page, captured as seven groups:
# index, image URL, name, star line, release line, score integer part,
# score fractional part.
#
# NOTE(review): the delimiting tags (<dd>, </i>, <a ...>, </a>, </p>, </dd>)
# were missing from the pasted pattern and are reconstructed here on the
# assumption that every entry is a <dd> element laid out in this order --
# confirm against the live page. The optional characters ("#?", "start?",
# "fract?ion") accept both the spellings found in the pasted pattern and the
# ones without "#", with "star" and with "fraction".
_BOARD_ITEM_RE = re.compile(
    r'<dd>.*?board-index.*?>#?(\d+)</i>'
    r'.*?data-src="(.*?)"'
    r'.*?name"><a.*?>(.*?)</a>'
    r'.*?start?">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fract?ion">(.*?)</i>'
    r'.*?</dd>',
    re.S,  # entries span several lines, so "." must also match newlines
)


def parse_index(html):
    """Yield one dict per board entry found in *html*.

    Args:
        html: Page text to scan. ``None`` or an empty string yields nothing
            instead of raising inside the regex engine.

    Yields:
        Dicts with the keys ``index``, ``image``, ``name``, ``actor``,
        ``releasetime`` and ``count`` (all string values).
    """
    if not html:
        return
    for match in _BOARD_ITEM_RE.finditer(html):
        index, image, name, star, release, integer, fraction = match.groups()
        yield {
            "index": index,
            "image": image,
            "name": name,
            # Drops the first 3 characters of the trimmed text -- presumably
            # a fixed label in front of the cast list; verify on the page.
            "actor": star.strip()[3:],
            # Drops the first 5 characters of the trimmed text -- presumably
            # a fixed label in front of the date; verify on the page.
            "releasetime": release.strip()[5:],
            # The score is split over two elements; join the two halves.
            "count": integer + fraction,
        }
def write_to_file(content, path="maoyan.txt"):
    """Append *content* to *path* as one JSON document per line.

    Args:
        content: JSON-serialisable object to store.
        path: File to append to; defaults to ``maoyan.txt`` in the current
            working directory.
    """
    # The encoding has to be given to open(): the file object's write()
    # accepts only the text to write and rejects keyword arguments.
    with open(path, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output
        # instead of turning it into \uXXXX escapes.
        f.write(json.dumps(content, ensure_ascii=False) + "\n")
def main():
    """Fetch the board page, parse its entries and append each to the output file."""
    url = "https://maoyan.com/board/4"
    html = get_to_html(url)
    if html is None:
        # get_to_html returns None on a non-200 status or a connection
        # failure; stop here rather than handing None to the parser.
        print("Failed to fetch " + url)
        return
    for item in parse_index(html):
        write_to_file(item)
if __name__ == "__main__":
    # NOTE: a multiprocessing Pool was sketched here but left disabled.
    # Pool.map needs an iterable of arguments to distribute, and main()
    # takes none, so the script simply runs once in this process.
    main()