项目地址:https://gitee.com/wyu_001/myscrapy
接下来接着说明如何多线程运行多个爬虫脚本:
项目的根目录下有个batch.py文件,这个就是批量运行多个爬虫的脚本,这里使用了线程池,同时运行spider下的多个爬虫类,也可以在setting.py文件中设置运行的爬虫文件:
#batch
#批量运行:默认情况下运行 spider 下继承 MySpider 类的子类
#批量运行脚本参数定义,一次并发线程数
BATCH_THREADS = 10
#batch run files in list:自定义运行 spider 下脚本文件
BATCH_FILES = ['dxyqueryhospital.py', 'haodfqueryhospital.py']
下面是batch.py脚本代码:
"""Batch-run every spider under the ``spider`` package in a thread pool.

Discovers all module files in the ``spider`` directory (optionally
restricted to the ``BATCH_FILES`` whitelist from the settings), imports
each one, instantiates every class whose direct base class is
``MySpider``, and submits each spider's ``start_request`` to a thread
pool bounded by ``BATCH_THREADS``.
"""
import importlib
import inspect
from concurrent.futures import ThreadPoolExecutor, as_completed
from os import getcwd, listdir
from os.path import isfile, join

from config.setting import BATCH_FILES, BATCH_THREADS

lib_dir = "spider"
file_path = join(getcwd(), lib_dir)

# Every plain file in the spider directory is a candidate spider module.
crawl_files = [f for f in listdir(file_path) if isfile(join(file_path, f))]

# A non-empty BATCH_FILES restricts the run to the listed files only.
if BATCH_FILES:
    crawl_files = list(set(crawl_files).intersection(BATCH_FILES))

crawls = []
for file in crawl_files:
    if file == "__init__.py":
        continue
    # Relative import (".modulename") resolved against the spider package.
    module = importlib.import_module(f'.{file.split(".")[0]}', lib_dir)
    for name, obj in inspect.getmembers(module, inspect.isclass):
        # Only instantiate classes that directly subclass MySpider, so
        # imported base/helper classes are skipped.
        if obj.__base__.__name__ == "MySpider":
            crawls.append(obj())

# The executor already caps concurrency at BATCH_THREADS, so the original
# manual thread_num throttling is unnecessary.  It was also buggy: `tasks`
# was never cleared between rounds, so every as_completed() pass re-waited
# (and re-fetched results of) futures that had already been consumed.
# Submit everything once, then wait once; result() re-raises any exception
# a spider thread raised instead of silently dropping it.
with ThreadPoolExecutor(max_workers=BATCH_THREADS) as tp:
    tasks = [tp.submit(crawl.start_request) for crawl in crawls]
    for future in as_completed(tasks):
        future.result()