由于爬虫代码都不多,
所以我决定在这篇博文上更新所有我觉得比较实用的python代码
方便以后自己调用
环境:python3.7
百度图片爬虫
二次元图片爬取
唐三小说爬取
文件格式命名
百度图片爬虫
百度图片网站
# -- Baidu image crawler ------------------------------------------------------
# Scrapes the "objURL" fields embedded in Baidu image-search result pages and
# downloads up to a user-chosen number of pictures into a new folder.
import os
import re
from urllib import error

import requests
from bs4 import BeautifulSoup

num = 0          # images downloaded so far (shared by dowmloadPicture)
numPicture = 0   # how many images the user asked for
file = ''        # destination folder name (note: shadows the builtin `file` name)
List = []        # per-page lists of image URLs collected by Find()


def Find(url):
    """Probe result pages (60 results per page, at most 1000 results) and
    return the total number of image URLs found; pages accumulate in `List`."""
    global List
    print('正在检测图片总数,请稍等.....')
    t = 0
    s = 0
    while t < 1000:
        Url = url + str(t)
        try:
            Result = requests.get(Url, timeout=7)
        except BaseException:
            # network hiccup: skip this page offset and try the next one
            t = t + 60
            continue
        else:
            result = Result.text
            # 先利用正则表达式找到图片url
            pic_url = re.findall('"objURL":"(.*?)",', result, re.S)
            s += len(pic_url)
            if len(pic_url) == 0:
                break
            else:
                List.append(pic_url)
                t = t + 60
    return s


def recommend(url):
    """Return related-search keywords scraped from the page's `topRS` div.

    Always returns a list (empty on HTTP error) so callers can iterate it.
    """
    Re = []
    try:
        html = requests.get(url)
    except error.HTTPError:
        # fix: original returned None here, which crashed the caller's loop
        return Re
    else:
        html.encoding = 'utf-8'
        bsObj = BeautifulSoup(html.text, 'html.parser')
        div = bsObj.find('div', id='topRS')
        if div is not None:
            listA = div.findAll('a')
            for i in listA:
                if i is not None:
                    Re.append(i.get_text())
        return Re


def dowmloadPicture(html, keyword):
    """Download every objURL found in `html` until `numPicture` is reached.

    (Function name keeps the original "dowmload" typo — it is the public name.)
    """
    global num
    # 先利用正则表达式找到图片url
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    print('找到关键词:' + keyword + '的图片,即将开始下载图片...')
    for each in pic_url:
        print('正在下载第' + str(num + 1) + '张图片,图片地址:' + str(each))
        try:
            if each is not None:
                pic = requests.get(each, timeout=7)
            else:
                continue
        except BaseException:
            print('错误,当前图片无法下载')
            continue
        else:
            # fix: original built the path as file + r'\\' (two literal
            # backslashes); os.path.join is correct on every platform
            string = os.path.join(file, keyword + '_' + str(num) + '.jpg')
            with open(string, 'wb') as fp:
                fp.write(pic.content)
            num += 1
            if num >= numPicture:
                return


if __name__ == '__main__':  # 主函数入口
    word = input("请输入搜索关键词(可以是人名,地名等): ")
    url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn='
    tot = Find(url)
    Recommend = recommend(url)  # 记录相关推荐
    print('经过检测%s类图片共有%d张' % (word, tot))
    numPicture = int(input('请输入想要下载的图片数量 '))
    file = input('请建立一个存储图片的文件夹,输入文件夹名称即可')
    if os.path.exists(file):
        # ask once more for a fresh name if the folder already exists
        print('该文件已存在,请重新输入')
        file = input('请建立一个存储图片的文件夹,输入文件夹名称即可')
    os.mkdir(file)
    t = 0
    tmp = url
    while t < numPicture:
        try:
            url = tmp + str(t)
            result = requests.get(url, timeout=10)
            print(url)
        except error.HTTPError:
            print('网络错误,请调整网络后重试')
            t = t + 60
        else:
            dowmloadPicture(result.text, word)
            t = t + 60
    print('当前搜索结束,感谢使用')
    print('猜你喜欢')
    # fix: the original loop variable was named `re`, shadowing the re module
    for rec in Recommend:
        print(rec, end=' ')
二次元图片爬取
二次元图片网站
# -- "二次元" gallery crawler -------------------------------------------------
# Downloads every large picture of a multi-page gallery: the first page is
# <base>.html and the follow-up pages are <base>_2.html ... <base>_8.html.
import urllib.request
import re


def open_url(url):
    """Fetch `url` and return the response body decoded as UTF-8."""
    response = urllib.request.urlopen(url)
    html = response.read()
    return html.decode("utf-8")


def get_img(html):
    """Find every <img class="pic-large"> source URL in `html` and download
    it, saving each file under the last path component of its URL."""
    par = r'<img class="pic-large" src="(.*?)"'
    urls = re.findall(par, html)
    for each in urls:
        print(each)
        filename = each.split("/")[-1]
        urllib.request.urlretrieve(each, filename, None)


if __name__ == '__main__':
    while 1:
        word = input("请输入所要图片的首页网址:")
        url1 = word[:-5]          # strip the trailing ".html"
        url = url1 + ".html"
        get_img(open_url(url))
        # the site paginates galleries as <base>_2.html ... <base>_8.html
        for page in range(2, 9):
            text = url1 + "_" + str(page) + ".html"
            print(text)
            get_img(open_url(text))
唐三小说爬取
# coding=utf-8
# -- Web-novel scraper --------------------------------------------------------
from bs4 import BeautifulSoup
import requests


class book(object):
    """Scrape a web novel: collect chapter titles and URLs from the index
    page, then fetch each chapter and append it to a local text file."""

    def __init__(self):
        self.target = "http://www.qiushuge.net/daomubiji2019/"  # 目录网址 (index page)
        self.names = []  # 存放章节名字 (chapter titles)
        self.urls = []   # 存放url (chapter URLs)
        self.nums = 0    # 章节数 (chapter count)

    def getmessage(self):
        """Fill self.names / self.urls / self.nums from the index page."""
        req = requests.get(url=self.target)
        req.encoding = ('utf-8')  # force UTF-8 so Chinese text is not garbled
        content = req.text
        bf_content = BeautifulSoup(content, "lxml")
        # chapter links live inside <span> tags; filter twice to keep only
        # the <a> tags found under them
        bf_list = bf_content.find_all('span')
        bf_content2 = BeautifulSoup(str(bf_list), "lxml")
        bf_list2 = bf_content2.find_all('a')
        for value in bf_list2:
            self.names.append(value.text)           # link text = chapter title
            self.urls.append(value.get('href'))     # href = chapter URL
        self.nums = len(self.names)

    def gettext(self, target):
        """Return the chapter text at `target`: every <p> paragraph,
        indented and separated by blank lines."""
        req = requests.get(url=target)
        req.encoding = ('utf-8')
        content = req.text
        bf_content = BeautifulSoup(content, "lxml")
        bf_list = bf_content.find_all('p')
        val = ''
        for value in bf_list:
            val += (' ' + value.text + '\n\n')
        return val

    def write(self, name, path, text):
        """Append one chapter (title line + body) to the file at `path`."""
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.write(text + '\n\n')


if __name__ == "__main__":
    d1 = book()
    d1.getmessage()
    print('正在下载《盗墓笔记》...')
    for value in range(d1.nums):
        # fix: original passed the name as a second print() argument
        # instead of %-formatting it into the message
        print('正在下载:%s' % d1.names[value])
        d1.write(d1.names[value], '盗墓笔记.txt', d1.gettext(d1.urls[value]))
文件格式命名
#coding=gbk
# -- Batch file renamer -------------------------------------------------------
import os
import sys


def rename():
    """Interactively rename every regular file in a directory to
    <name><N><ext>, numbering from a user-chosen start value.

    Sub-directories are skipped; prints how many files were renamed.
    """
    path = input("请输入路径(例如D:\\\\picture):")
    name = input("请输入开头名:")
    startNumber = input("请输入开始数:")
    fileType = input("请输入后缀名(如 .jpg、.txt等等):")
    print("正在生成以" + name + startNumber + fileType + "迭代的文件名")
    count = 0
    filelist = os.listdir(path)
    for files in filelist:
        Olddir = os.path.join(path, files)
        if os.path.isdir(Olddir):  # leave sub-directories untouched
            continue
        Newdir = os.path.join(path, name + str(count + int(startNumber)) + fileType)
        os.rename(Olddir, Newdir)
        count += 1
    print("一共修改了" + str(count) + "个文件")


# fix: the original called rename() unconditionally at module level, which
# triggered interactive input() on a mere import; guard the entry point
if __name__ == '__main__':
    rename()