Preparation:
Fetch all of the tags and their links, and store them in a JSON file.
main.py
from bs4 import BeautifulSoup
import requests
import extract
import Into_Tag
import read
import json
def get_info(filepath):
    """Read a locally cached HTML file."""
    content = ''  # fallback so the return below never hits an undefined name
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"{e}")
    return content

# content = read.get_HTML()

def main(url):
    """Download the tag page and parse it."""
    try:
        response = requests.get(url)
        content = response.text
        read.trace_web(content)
    except Exception:
        print('WLAN Error')
    return

def clear_f(filename):
    """Truncate (empty) a file."""
    with open(filename, 'w', encoding='utf-8') as file:
        pass

def store_tag(content):
    """Parse the tag page and write one JSON object per line to Tag_file."""
    all_tag = read.trace_web(content)
    with open(Tag_file, 'w', encoding='utf-8') as file:
        for data in all_tag:
            json.dump(data, file)
            file.write('\n')
    return

fp = r"E:\Procedure\Python\Experiment\webpage.txt"
url = 'https://huggingface.co/datasets'
Tag_file = r'E:\Procedure\Python\Experiment\url_queue.json'
f = r"E:\Procedure\Python\Experiment\memo.json"
print('Succeed')
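Each line of url_queue.json written by store_tag is a single JSON object with the keys link, mainclass and subclass. Below is a minimal, self-contained sketch of that JSON-lines format; the sample values reuse the Image Classification tag that appears later in Into_Tag.py, and the file name url_queue.sample.json is only for illustration.

import json

# Illustrative record in the same shape that read.trace_web collects;
# not real crawl output.
sample = [
    {"link": "/datasets?task_categories=task_categories%3Aimage-classification",
     "mainclass": "Computer Vision",
     "subclass": "Image Classification"},
]

# Write one JSON object per line, as store_tag does.
with open("url_queue.sample.json", "w", encoding="utf-8") as file:
    for record in sample:
        json.dump(record, file)
        file.write('\n')

# Reading it back line by line mirrors Into_Tag.obtain_url.
with open("url_queue.sample.json", "r", encoding="utf-8") as file:
    records = [json.loads(line) for line in file]
print(records)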
Utility function definitions:
read.py
import requests
from bs4 import BeautifulSoup
import Into_Tag

def get_HTML():
    """Fetch the datasets landing page and cache it locally."""
    web = 'https://huggingface.co/datasets'
    try:
        response = requests.get(web, timeout=30)
        source_path = r"E:\Procedure\Python\Experiment\webpage.txt"
        with open(source_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        return response.text
    except Exception:
        return None

def trace_web(content):
    """Collect every (main tag, sub tag, link) triple on the page."""
    soup = BeautifulSoup(content, 'html.parser')
    tag_divs = soup.find_all('div', class_='mb-3')
    all_tag = []
    for tag_div in tag_divs:
        # Extract the main tag text
        try:
            tag_text = tag_div.find('div', class_='mb-3 text-sm font-medium text-gray-500').get_text(strip=True)
            # print("tag:", tag_text)
            # Extract the sub tags and their links
            small_tags = tag_div.find_all('a', class_='tag tag-white')
            for small_tag_element in small_tags:
                try:
                    small_tag_text = small_tag_element.span.get_text(strip=True)
                    small_tag_link = small_tag_element['href']
                    print("sub tag:", small_tag_text)
                    print("sub tag link:", small_tag_link)
                    all_tag.append({'link': small_tag_link,
                                    'mainclass': tag_text,
                                    'subclass': small_tag_text})
                except AttributeError:
                    continue
            print("-" * 50)
        except AttributeError:
            continue
    return all_tag
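As a quick sanity check of the selectors trace_web relies on, the snippet below runs the same find_all/find calls against a hand-written fragment that only mirrors those class names; the real Hugging Face markup is richer and may change at any time, so treat this purely as an illustration of the structure the parser expects.

from bs4 import BeautifulSoup

# Hand-written fragment that only mirrors the class names trace_web looks for;
# the real page markup is more complex and may change at any time.
snippet = """
<div class="mb-3">
  <div class="mb-3 text-sm font-medium text-gray-500">Computer Vision</div>
  <a class="tag tag-white" href="/datasets?task_categories=task_categories%3Aimage-classification">
    <span>Image Classification</span>
  </a>
</div>
"""

soup = BeautifulSoup(snippet, 'html.parser')
for tag_div in soup.find_all('div', class_='mb-3'):
    heading = tag_div.find('div', class_='mb-3 text-sm font-medium text-gray-500')
    if heading is None:  # the heading div itself also carries class mb-3
        continue
    for a in tag_div.find_all('a', class_='tag tag-white'):
        print({'link': a['href'],
               'mainclass': heading.get_text(strip=True),
               'subclass': a.span.get_text(strip=True)})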
Start crawling:
Into_Tag.py
import requests
from bs4 import BeautifulSoup
import json
import extract

base = 'https://huggingface.co'
tbase = 'https://huggingface.co/datasets'

def obtain_url():
    """Load the tag records written during preparation (one JSON object per line)."""
    filename = r"E:\Procedure\Python\Experiment\url_queue.json"
    url_list = []
    try:
        with open(filename, 'r') as file:
            for line in file:
                data = json.loads(line)
                url_list.append(data)
    except Exception as e:
        print('Did not read:', e)
    return url_list

def store_Web(result):
    """Append the extracted dataset records to the result file, one JSON object per line."""
    filename = r"E:\Procedure\Python\Experiment\res1.json"
    if result is None:
        return
    print(filename)
    with open(filename, 'a') as file:
        for data in result:
            json.dump(data, file)
            file.write('\n')
    return

def judge(links, tag, s_tag):
    """Visit every link not yet recorded in the memo file and extract its info."""
    filename = r"E:\Procedure\Python\Experiment\memo1.json"
    print(filename)
    dt = {}
    result = []
    try:
        with open(filename, 'r') as file:
            dt = json.load(file)
    except FileNotFoundError:
        dt = {}
    except json.decoder.JSONDecodeError:
        dt = {}
    if links is None or len(links) == 0:
        print("links is empty")
        return
    for lk in links:
        if lk is None:
            continue
        link = lk.get('href')
        if link is None:
            continue
        if link in dt:
            continue
        try:
            response = requests.get(base + link, timeout=20)
            content = response.text
            res = extract.extract_info(content, tag, s_tag)
            dt.update({link: 0})
            result.append(res)
        except requests.exceptions.Timeout:
            print('Timeout while accessing:', base + link)
            continue
        except Exception as e:
            print('Error while accessing:', base + link, e)
            continue
    try:
        with open(filename, 'w') as file:
            json.dump(dt, file)
        print('memo')
    except Exception:
        print('false')
    return result

def get_page_num(soup):
    """Get the number of result pages.
    :param soup: parsed listing page
    :return: page count, or -1 if no pagination bar is found
    """
    li = soup.find_all('li', class_='hidden sm:block')
    if li is None or len(li) == 0:
        return -1
    num = int(li[-1].text)
    return num

def one_by_one(url, mclass='', sclass=''):
    """Page through one tag's dataset listing and collect the dataset links."""
    try:
        response = requests.get(base + url, timeout=10)
    except Exception as e:
        print(e)
        return None
    content = response.text
    # print(content)
    soup = BeautifulSoup(response.text, 'html.parser')
    # links = soup.find_all('a', class_='block p-2')
    links = []
    num = get_page_num(soup)
    if num == -1:
        return links
    for i in range(5, num):
        params = {'p': i, 'sort': 'trending'}
        page_url = base + url
        try:
            content = requests.get(page_url, params=params, timeout=10).text
            tsoup = BeautifulSoup(content, 'html.parser')
            tlinks = tsoup.find_all('a', class_='block p-2')
            links += tlinks
        except Exception:
            continue
    return links
def trytorep():
    """Re-crawl a single tag by hand (used to repair a failed run)."""
    url = '/datasets?task_categories=task_categories%3Aimage-classification'
    links = one_by_one(url)
    res = judge(links, "Computer Vision", 'Image Classification')
    store_Web(res)

def back():
    """Resume crawling from a specific tag index after an interruption."""
    url_list = obtain_url()
    sum = 45
    for d in url_list[45:46]:
        link = d.get('link')
        mclass = d.get('mainclass')
        sclass = d.get('subclass')
        links = one_by_one(link)
        res = judge(links, mclass, sclass)
        store_Web(res)
        print(sum)
        sum += 1

def main():
    url_list = obtain_url()
    sum = 5
    for d in url_list[5:6]:
        link = d.get('link')
        mclass = d.get('mainclass')
        sclass = d.get('subclass')
        links = one_by_one(link)
        res = judge(links, mclass, sclass)
        store_Web(res)
        print(sum)
        sum += 1

def get_tag_size(url_list):
    """Crawl one special page that is only reachable with the not-for-all-audiences flag."""
    print("tag_size:", len(url_list))
    d = url_list[5]
    mclass = d.get('mainclass')
    sclass = d.get('subclass')
    links = [{'href': '/datasets/CyberHarem/surtr_arknights?not-for-all-audiences=true'}]
    res = judge(links, mclass, sclass)
    # store_Web(res)
    return

url_list = obtain_url()
get_tag_size(url_list)
Notes:
Pay attention to paging through the listings correctly (see the sketch after these notes).
Some special pages can only be accessed after appending "?not-for-all-audiences=true" to the link.
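As a hedged illustration of both notes, the sketch below pages through a tag listing with the p query parameter the way one_by_one does, and builds a gated-page URL by appending the not-for-all-audiences flag; iter_listing_pages and gated_url are hypothetical helper names, and in practice the page count comes from get_page_num.

import requests
from bs4 import BeautifulSoup

base = 'https://huggingface.co'

def iter_listing_pages(tag_url, num_pages):
    # Hypothetical helper: pages through one tag listing with the 'p' query
    # parameter, mirroring the request loop in one_by_one (which also passes
    # sort=trending). In practice num_pages comes from get_page_num.
    for p in range(num_pages):
        try:
            html = requests.get(base + tag_url,
                                params={'p': p, 'sort': 'trending'},
                                timeout=10).text
        except requests.RequestException:
            continue
        yield BeautifulSoup(html, 'html.parser')

def gated_url(dataset_path):
    # Hypothetical helper: appends the flag required for "not for all
    # audiences" pages, choosing '&' if the path already has a query string.
    sep = '&' if '?' in dataset_path else '?'
    return base + dataset_path + sep + 'not-for-all-audiences=true'

# Example path taken from get_tag_size above:
print(gated_url('/datasets/CyberHarem/surtr_arknights'))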