这是一个异步爬虫,上代码
# Browser-like User-Agent sent with every request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# Placeholder page returned in place of a real body when a request fails.
_ERROR_PAGE = """<html><head><title>Error</title></head></html>"""


async def fetch(url, semaphore):
    """Download ``url`` and return a ``(body_text, url)`` tuple.

    Args:
        url: Address of the page to download.
        semaphore: ``asyncio.Semaphore`` that bounds how many downloads run
            at the same time.

    Returns:
        ``(text, url)`` on success, or ``(error_page, url)`` when the request
        or the decoding of the body fails. Failures are deliberately reported
        as a placeholder page so that one bad URL does not abort the batch.
    """
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                # To go through a proxy, aiohttp expects ONE proxy URL string
                # (not a requests-style dict), e.g.:
                #   session.get(url, headers=headers, timeout=10,
                #               proxy="http://10.10.1.10:3128")
                async with session.get(url, headers=headers, timeout=10) as response:
                    return await response.text(), url
            except asyncio.CancelledError:
                # Cancellation must propagate; on Python < 3.8 it is a
                # subclass of Exception and would otherwise be swallowed.
                raise
            except Exception:
                # Best effort: any request/decoding failure yields the
                # placeholder page, but KeyboardInterrupt/SystemExit still
                # propagate (a bare ``except:`` would hide them).
                return _ERROR_PAGE, url


async def main():
    """Fetch every configured URL concurrently and print each result."""
    urls = []  # fill in the URLs to crawl
    # Allow at most 500 downloads in flight. Author's note: the ceiling is
    # 512 on Windows and 1024 on Linux (presumably the select()/open-file
    # limits -- verify for your platform).
    semaphore = asyncio.Semaphore(500)
    tasks = [fetch(url, semaphore) for url in urls]
    responses = await asyncio.gather(*tasks)
    for body, url in responses:
        print(body, url)  # process the downloaded page here


if __name__ == "__main__":
    # Works in the main thread only: in any other thread there is no current
    # event loop, so get_event_loop() raises RuntimeError.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
将其放进 threading.Thread 线程中运行时会报错:
- 错误提示1:RuntimeError: There is no current event loop in thread 'Thread-1'.
- 网上查到的方法是在代码中添加:
# Select the proactor-based event loop policy (a Windows-only asyncio class).
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
# Create a fresh event loop and install it as the current event loop.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
https://www.codenong.com/48725890/
- 但这样也会报错:AttributeError: module 'asyncio' has no attribute 'WindowsSelectorEventLoopPolicy'。
- 这是版本问题:WindowsSelectorEventLoopPolicy 和 WindowsProactorEventLoopPolicy 都是从 Python 3.7 才加入 asyncio 的,而这里使用的是 Python 3.6.7。
解决方法
将调用代码
loop = asyncio.get_event_loop()  # get the current thread's event loop
loop.run_until_complete(main())  # run the crawler until it finishes
替换如下:
loop = asyncio.new_event_loop()  # create a new event loop for this thread
asyncio.set_event_loop(loop)  # install it as this thread's current event loop
loop.run_until_complete(main())  # run the crawler until it finishes
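原因分析:asyncio.get_event_loop() 只会在主线程中自动创建事件循环;在 threading.Thread 子线程里没有默认的事件循环,所以直接调用会报错,必须先用 asyncio.new_event_loop() 新建一个事件循环,再用 asyncio.set_event_loop() 把它设置为当前线程的事件循环。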
# Start the crawler in a background thread; the main thread continues at once.
t1 = Reptile_Thread()
t1.start()
print("运行")
class Reptile_Thread(threading.Thread):
    """Worker thread that runs the asynchronous crawler on its own event loop."""

    def __init__(self, parent=None):
        """Create the thread.

        Args:
            parent: Optional owner object. It is only stored on the instance
                as ``self.parent``.
        """
        # threading.Thread's first positional parameter is ``group`` and it
        # must be None, so ``parent`` must not be forwarded to the base class.
        super().__init__()
        self.parent = parent

    def run(self):
        """Thread body: crawl every configured URL, then close the event loop."""
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        error_page = """<html><head><title>Error</title></head></html>"""

        async def fetch(url, semaphore):
            """Download ``url``; return ``(text, url)`` or ``(error_page, url)``."""
            async with semaphore:
                async with aiohttp.ClientSession() as session:
                    try:
                        # To go through a proxy, aiohttp expects ONE proxy URL
                        # string (not a requests-style dict), e.g.:
                        #   session.get(url, headers=headers, timeout=10,
                        #               proxy="http://10.10.1.10:3128")
                        async with session.get(url, headers=headers, timeout=10) as response:
                            return await response.text(), url
                    except asyncio.CancelledError:
                        # Cancellation must propagate; on Python < 3.8 it is
                        # a subclass of Exception.
                        raise
                    except Exception:
                        # Best effort: a failed request yields the placeholder
                        # page instead of aborting the whole batch.
                        return error_page, url

        async def main():
            """Fetch all URLs concurrently and print each result."""
            urls = []  # fill in the URLs to crawl
            # Allow at most 500 downloads in flight. Author's note: the
            # ceiling is 512 on Windows and 1024 on Linux (presumably the
            # select()/open-file limits -- verify for your platform).
            semaphore = asyncio.Semaphore(500)
            tasks = [fetch(url, semaphore) for url in urls]
            responses = await asyncio.gather(*tasks)
            for body, url in responses:
                print(body, url)  # process the downloaded page here

        # A non-main thread has no current event loop, so create one and
        # install it instead of calling asyncio.get_event_loop().
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(main())
        finally:
            # Release the loop's resources once the crawl ends or fails.
            loop.close()