Adding scheduled execution
Let's go straight to the code:
# -*- coding:UTF-8 -*-
import json
import random
import re
import threading
import time
from datetime import timedelta

import requests
from bs4 import BeautifulSoup
from loguru import logger

uid = "qq_17328759"  # CSDN user ID
host = "https://blog.csdn.net"
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Referer': f'https://blog.csdn.net/{uid}',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin'
}
user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
]


def parseIPList(url="https://www.kuaidaili.com/free/inha/"):
    """Fetch a list of free proxies.
    :param url: free-proxy listing site  # alternatives: "https://www.beesproxy.com/free", "https://proxy.mimvp.com/freeopen"
    :return: list of [ip, port] pairs
    """

    def search_by_re(string):
        # Newer pages embed the proxy table as JSON inside a JS constant
        IPs = []
        pattern = re.compile(r'const fpsList = (.*?);')
        re_result = pattern.search(string)
        if not re_result:
            return IPs
        ips = json.loads(re_result.groups()[0])
        for ip_info in ips:
            IPs.append([ip_info['ip'], ip_info['port']])
        return IPs

    def search_by_bs(string):
        # Older pages render the proxies as a plain HTML table
        IPs = []
        soup = BeautifulSoup(string, "html.parser")
        tds = soup.find_all("td")
        ip = ''
        port = ''
        for td in tds:
            if 'data-title' not in td.attrs:
                continue
            if 'IP' == td.attrs['data-title']:
                ip = td.text
            if "PORT" == td.attrs['data-title']:
                port = td.text
                IPs.append([ip, port])
        return IPs

    url += f'{random.randint(1, 300)}/'  # pick a random page of the free-proxy list
    headers_proxy = headers.copy()
    headers_proxy.update({"User-Agent": random.choice(user_agent)})
    del headers_proxy['Referer']
    response = requests.get(url, headers=headers_proxy)
    IPs = []
    IPs.extend(search_by_bs(response.text))
    IPs.extend(search_by_re(response.text))
    return IPs


def articleId(uid):
    """Scrape the user's article list from the blog home page.
    :param uid: CSDN user ID
    :return: list of article URLs
    """
    articleIds = []
    articleList = host + "/" + uid
    headers.update({"User-Agent": random.choice(user_agent)})
    response = requests.get(articleList, headers=headers).content
    soup = BeautifulSoup(response, "html.parser")
    articles = soup.find_all("article", attrs={"class": "blog-list-box"})
    for article in articles:
        art_info = article.find_all('a', attrs={"target": "_blank", 'href': True})
        # print(art_info[0].attrs['href'])
        articleIds.append(art_info[0].attrs['href'])
    return articleIds


def articleIdByApi(uid, page=1, size=20):
    """Fetch the user's article list through the community API, page by page.
    :param uid: CSDN user ID
    :return: list of article URLs
    """
    blogListUrl = f'{host}/community/home-api/v1/get-business-list'
    params = {
        "page": page,
        "size": size,
        "businessType": "lately",
        "orderby": "",
        "noMore": False,
        "year": "",
        "month": "",
        "username": uid
    }
    articleIds = []
    headers.update({"User-Agent": random.choice(user_agent)})
    currentPage = page
    while True:
        try:
            response = requests.get(blogListUrl, params=params, headers=headers).json()
            articleList = response.get("data", {}).get('list', [])
            logger.debug(f"Page {currentPage}: fetched {len(articleList)} articles")
            for article_info in articleList:
                articleIds.append(article_info.get('url'))
            # Stop on the last (short) page, or after 30 pages as a safety cap
            if len(articleList) < size or currentPage > 30:
                break
            currentPage += 1
            params.update({'page': currentPage})
        except Exception as e:
            logger.error(f'Failed to fetch the article list of {uid}: {e}')
            break
    logger.debug(f"{uid} has {len(articleIds)} articles")
    return articleIds


def PV(IPs, uid, codes):
    s = requests.Session()
    count = 0
    url = host + "/{}/article/details/{}"
    while True:
        count += 1
        logger.info("Visit round {}".format(count))
        proxie = random.choice(IPs)
        ua = random.choice(user_agent)
        logger.debug("{} -- {}".format(proxie, ua))
        s.proxies = {"http": "{}:{}".format(proxie[0], proxie[1])}
        s.headers = {**headers, "User-Agent": ua}
        for code in codes:
            articleUrl = code if 'http' in code else url.format(uid, code)
            html = s.get(articleUrl).text
            if not html:
                # Empty response: rotate to another proxy and skip this article
                proxie = random.choice(IPs)
                s.proxies = {"http": "{}:{}".format(proxie[0], proxie[1])}
                continue
            soup = BeautifulSoup(html, "html.parser")
            spans = soup.find_all(name="span", attrs={"class": "read-count"})
            if len(spans) != 0:
                logger.debug(f"{code} current read count: {spans[0].text}")
            time.sleep(random.randint(1, 35))


class addReadNum(threading.Thread):
    def __init__(self, IPs, uid, articleIds):
        threading.Thread.__init__(self)
        self.IPs = IPs
        self.uid = uid
        self.articleIds = articleIds

    def run(self):
        PV(self.IPs, self.uid, self.articleIds)


def demo_schedule():
    articleIds = articleIdByApi(uid)
    IPs = parseIPList()
    PV(IPs, uid, articleIds)


import schedule

# Start at the top of every hour and stop three minutes after starting
schedule.every().hours.at(':00').do(demo_schedule).until(timedelta(minutes=3))
schedule.run_all()
while True:
    schedule.run_pending()  # run any jobs that are due
    time.sleep(30)
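One detail worth noting: the addReadNum thread class is defined but never started anywhere in the script. As a rough sketch that is not part of the original post, the article list could be split across a few such worker threads, each cycling through its own slice with its own proxy and User-Agent; the thread count of 3 and the round-robin split below are illustrative assumptions.

# Hypothetical wiring of the addReadNum class defined above; the thread
# count (3) and the round-robin chunking are assumptions, not part of the
# original script.
if __name__ == "__main__":
    ids = articleIdByApi(uid)
    proxies = parseIPList()
    workers = []
    for chunk in (ids[i::3] for i in range(3)):  # split articles over 3 workers
        worker = addReadNum(proxies, uid, chunk)
        worker.daemon = True  # PV() loops forever, so let Ctrl+C end the program
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()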
The new scheduled-task configuration, shown again on its own:
def demo_schedule():
    articleIds = articleIdByApi(uid)
    IPs = parseIPList()
    PV(IPs, uid, articleIds)


import schedule

# Start at the top of every hour and stop three minutes after starting
schedule.every().hours.at(':00').do(demo_schedule).until(timedelta(minutes=3))
schedule.run_all()
while True:
    schedule.run_pending()  # run any jobs that are due
    time.sleep(30)
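A caveat about schedule's until(): when it is given a timedelta, the deadline is computed once, at the moment the job is registered, so the job above is cancelled roughly three minutes after the script starts rather than three minutes into every hourly run. The minimal standalone sketch below makes that easy to observe; the two-second interval and ten-second deadline are just illustrative values, not anything from the original script.

import time
from datetime import timedelta

import schedule


def tick():
    print("tick", time.strftime("%H:%M:%S"))


# Deadline = now + 10 seconds, fixed at the moment the job is scheduled.
schedule.every(2).seconds.until(timedelta(seconds=10)).do(tick)

while schedule.get_jobs():  # the job cancels itself once the deadline passes
    schedule.run_pending()
    time.sleep(0.5)
print("deadline passed, job cancelled")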