Using a queue to implement the producer-consumer pattern, and creating threads with a Thread subclass to speed up the crawler
'''
https://sc.chinaz.com/tupian/index.html
https://sc.chinaz.com/tupian/index_2.html
https://sc.chinaz.com/tupian/index_3.html
'''
from threading import Thread
from queue import Queue, Empty
import requests
from bs4 import BeautifulSoup
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
}
class Put_Thread(Thread):
    '''Producer: takes page URLs from url_queue, parses out image info, and puts it on img_queue.'''
    def __init__(self, url_queue, img_queue):
        super().__init__()
        self.url_queue = url_queue
        self.img_queue = img_queue

    def run(self):
        # run() executes when the thread's start() is called
        while not self.url_queue.empty():
            url = self.url_queue.get()
            self.fetch_url(url)

    def fetch_url(self, url):
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')
        data_list = soup.find_all('img', class_='lazy')
        for i in data_list:
            title = i.get('alt')
            # drop the '_s' suffix to get the full-size image URL
            href = 'https:' + i.get('data-original').replace('_s', '')
            self.img_queue.put((title, href))


class Get_Thread(Thread):
    '''Consumer: takes (title, url) pairs from img_queue and saves the images to disk.'''
    def __init__(self, img_queue):
        super().__init__()
        self.img_queue = img_queue

    def run(self):
        while True:
            try:
                img_data = self.img_queue.get(timeout=3)
            except Empty:
                # the queue has been empty for 3 seconds, assume the producers are done
                break
            else:
                title, href = img_data
                if not os.path.exists('./image'):
                    os.mkdir('./image')
                with open('./image/' + title + '.jpg', 'wb') as f:
                    resp = requests.get(href, headers=headers).content
                    f.write(resp)
                    print(title, 'saved!')


def main():
    url_queue = Queue()  # holds the page URLs
    img_queue = Queue()  # holds (title, image URL) pairs
    url_queue.put('https://sc.chinaz.com/tupian/index.html')
    for i in range(2, 11):  # paginated pages start at index_2.html
        url = 'https://sc.chinaz.com/tupian/index_{}.html'.format(i)
        url_queue.put(url)
    for i in range(41):
        t1 = Put_Thread(url_queue, img_queue)
        t1.start()
        t2 = Get_Thread(img_queue)
        t2.start()


if __name__ == '__main__':
    main()
    print('\n************ Main thread finished ************\n')
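One design note on main() above: the producer and consumer threads are started but never joined, so the "Main thread finished" message prints right away while downloads are still running; the consumers simply exit once img_queue has been empty for three seconds. If you want main() to return only after all downloads complete, one possible variant (my own sketch, not part of the original code) is to collect the threads and join() them:

    threads = []
    for i in range(41):
        t1 = Put_Thread(url_queue, img_queue)
        t1.start()
        threads.append(t1)
        t2 = Get_Thread(img_queue)
        t2.start()
        threads.append(t2)
    for t in threads:
        t.join()  # block until every producer and consumer has finished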
A queue lets threads communicate with each other. A class that inherits from Thread creates a thread, and its run() method executes when the thread's start() is called. Running this many threads consumes a lot of CPU.
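For reference, here is the same pattern in isolation, stripped of the crawler details (a minimal sketch; the Producer/Consumer names and the item counts are purely illustrative): a producer Thread subclass puts items on a Queue, and a consumer Thread subclass takes them off, exiting when the queue stays empty.

from threading import Thread
from queue import Queue, Empty

class Producer(Thread):
    def __init__(self, q):
        super().__init__()
        self.q = q

    def run(self):  # executes when start() is called
        for i in range(5):
            self.q.put(i)  # Queue is thread-safe, no explicit lock needed

class Consumer(Thread):
    def __init__(self, q):
        super().__init__()
        self.q = q

    def run(self):
        while True:
            try:
                item = self.q.get(timeout=1)
            except Empty:  # nothing arrived for 1 second, assume producer is done
                break
            print('consumed', item)

q = Queue()
p, c = Producer(q), Consumer(q)
p.start()
c.start()
p.join()
c.join()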