News pagination URL: https://news.cnblogs.com/n/page/10/; the last number in the URL is the page number.
from concurrent.futures import ThreadPoolExecutor
import threading
import time
from queue import Queue
import logging
import requests
from bs4 import BeautifulSoup

# Logging configuration
FORMAT = "%(asctime)s %(threadName)s %(thread)d %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO)

# Event used to signal every worker thread to stop
event = threading.Event()

# URL prefix and User-Agent settings
base_url = 'https://news.cnblogs.com'
page_path = '/n/page/'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Queues
urls = Queue()     # URLs waiting to be crawled (the "already crawled" bookkeeping is omitted)
htmls = Queue()    # Raw HTML fetched from each page: large and mostly noise, so it is not persisted
outputs = Queue()  # Extracted data, i.e. the result output queue

# 1. Generate the URLs to crawl; start is the first page, stop is the last page
def create_urls(start, stop, step=1):
    for i in range(start, stop + 1, step):
        url = "{}{}{}/".format(base_url, page_path, i)
        # print(url)
        urls.put(url)  # Put each generated URL into the pending queue
    print('URL creation finished')

# create_urls(1, 10)   # Create the URLs for page 1 to page 10
# print(urls.qsize())  # The queue size would then be 10

# 2. Take a URL from the queue, send the request, and store the response body (runs in multiple threads)
def crawler():
    while not event.is_set():
        try:
            url = urls.get(True, 1)  # Block for at most 1 second so the event can be re-checked
            response = requests.get(url, headers={'User-agent': ua})
            with response:
                html = response.text  # Response body as text
                htmls.put(html)       # Store the page content in the htmls queue
                print('url:', url)
        # Catch the Empty exception raised when the get times out (and any request errors)
        except Exception as e:
            print(e)
            # logging.error(e)

# 3. Parse the HTML and extract the useful data
def parse():
    while not event.is_set():
        try:
            html = htmls.get(True, 1)
            soup = BeautifulSoup(html, 'lxml')     # Parse the HTML with lxml
            news = soup.select('h2.news_entry a')  # Select the news entry links
            for n in news:
                title = n.text
                ref = base_url + n.attrs.get('href')
                print('get_title:', title, 'get_ref:', ref)
                outputs.put((title, ref))  # Put the extracted title and link into the output queue
        except Exception as e:
            print(e)
            # logging.error(e)

# 4. Persist the results: save them to a file
def save(path):
    with open(path, 'a+', encoding='utf-8') as f:
        while not event.is_set():
            try:
                title, ref = outputs.get(True, 1)  # Each item is a (title, ref) tuple
                print('save_title:', title, 'save_ref:', ref)
                f.write('{}_{}\n'.format(title, ref))
                f.flush()  # Flush so the crawled content reaches the file immediately
            except Exception as e:
                print(e)
                # logging.error(e)

# Start the threads from a thread pool (at most 10 workers)
executor = ThreadPoolExecutor(max_workers=10)
executor.submit(create_urls, 1, 10)  # Seed URLs; useful URLs found by parse could also be added to the queue later
executor.submit(parse)
executor.submit(save, 'news.txt')
for i in range(7):
    executor.submit(crawler)

while True:
    cmd = input('>>>')
    if cmd.strip() == 'q':  # Typing q at the console stops all the threads after about a second
        event.set()
        executor.shutdown()
        print('closing')
        time.sleep(1)
        break
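The queue comment above notes that the "already crawled" bookkeeping is omitted, and the submit comment notes that useful URLs found by parse could later be fed back into the pending queue. The sketch below shows one way to do both; it assumes the globals urls and base_url from the script above, and seen, seen_lock and enqueue are hypothetical names added here, not part of the original code.

# Sketch only: deduplicated re-enqueueing of links discovered by parse().
# Assumes the globals urls and base_url defined in the script above;
# seen / seen_lock / enqueue are hypothetical additions for the omitted bookkeeping.
import threading

seen = set()                  # URLs that have already been queued or crawled
seen_lock = threading.Lock()  # a plain set is not safe for concurrent mutation, so guard it

def enqueue(url):
    """Put url into the pending queue unless it has been seen before."""
    with seen_lock:
        if url in seen:
            return False
        seen.add(url)
    urls.put(url)
    return True

# Inside parse(), each extracted link could then be fed back into the crawl:
#     for n in news:
#         ref = base_url + n.attrs.get('href')
#         enqueue(ref)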
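The clean shutdown depends on two details of the code above working together: every worker loops on while not event.is_set(), and every Queue.get uses a 1-second timeout, so a thread blocked on an empty queue still returns regularly to check the event. A plain get() with no timeout would block forever and never see the stop signal. Below is a minimal standalone sketch of just this pattern; stop, jobs and worker are hypothetical names, independent of the crawler.

# Minimal standalone sketch of the stop pattern used above:
# a worker polls a queue with a timeout so it can notice the Event promptly.
import threading
import time
from queue import Queue, Empty

stop = threading.Event()
jobs = Queue()

def worker():
    while not stop.is_set():
        try:
            item = jobs.get(True, 1)  # wait at most 1 second, then re-check the event
        except Empty:
            continue                  # nothing to do yet; loop back and test the event
        print('processing', item)

t = threading.Thread(target=worker, name='worker-1')
t.start()

jobs.put('task-1')
time.sleep(2)  # let the worker drain the queue
stop.set()     # signal shutdown; the worker exits within about one second
t.join()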