import requests,bs4,csv,os,re,time
# --- Collect product URLs (listing-page scraper) ---
def shopifylist(url):
    """Fetch one collection (listing) page and dispatch each product link to shopify().

    Retries the HTTP request indefinitely with a 3-second back-off until it
    succeeds.  NOTE(review): relies on the module-level global `site` for the
    CSV output path — confirm that is intentional before reuse.

    :param url: absolute URL of a collection page to scrape.
    :returns: None (side effect: one shopify() call per product found).
    """
    while True:
        try:
            res = requests.get(url, timeout=30)
            res.encoding = res.apparent_encoding
            print('请求', url, '状态', res.status_code)
            res.raise_for_status()  # raise on non-2xx so the except branch retries
            break
        except requests.RequestException:  # was a bare except: it also swallowed KeyboardInterrupt
            timeout = 3
            print('链接失败,等待', timeout, '秒重试')
            time.sleep(timeout)
            print('')
            print('重新链接中')
    print('链接顺畅,开始获取商品链接')
    noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')  # html.parser = stdlib parser
    # Each matched element is a product-card anchor with a site-relative href.
    # (Original shadowed the `url` parameter with this result list; renamed.)
    for card in noStarchSoup.select('.product-card.sc-pb-element'):
        imgurl = 'https://tribalhollywood.com' + card.get('href')
        print('获取产品url')
        shopify(imgurl, site)  # scrape the product page (uses global `site` as CSV path)
        print('\n')
# --- End of product-URL collection ---
# --- Product-page scraper ---
def shopify(url, site):
    """Scrape a single product page and append one row to the CSV at `site`.

    Row layout: [title, product url, price, description, comma-joined image urls].
    If the CSV is empty a header row is written first.

    :param url: absolute URL of the product page.
    :param site: path of the CSV file to append to.
    :returns: None.
    """
    print('开始请求产品页面', url)
    while True:
        try:
            res = requests.get(url, timeout=30)
            res.encoding = res.apparent_encoding
            print('成功请求商品页面:', res.status_code)
            res.raise_for_status()  # non-2xx -> retry
            break
        except requests.RequestException:  # was bare except; also add a pause (was a busy retry loop)
            print('请求商品页面', url, '失败,重新链接')
            time.sleep(3)
    noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
    name = noStarchSoup.select('.product-single__title')[0].getText()
    price = noStarchSoup.select('.product-single__price')[0].getText()
    # Plain-literal substitutions: str.replace is clearer than re.sub here.
    price = price.replace(' ', '').replace('\n', '')
    # NB: the element's class is "rte product-single__description"; selecting
    # by the second class alone is sufficient.
    des = noStarchSoup.select('.product-single__description')[0].getText()
    des = re.sub('Hollywood', 'customadd.com', des)  # replace the brand/copyright name
    img = noStarchSoup.select('#ProductThumbs-product-template img')
    if img == []:
        # No thumbnail strip: fall back to the single main image.
        l = noStarchSoup.select('.sc-pb-element1')[0].get('src')
        l = 'http:' + l
        l = l.replace('_960x', '')  # drop the size suffix to get the full-resolution image
    else:
        image_urls = []
        for tag in img:
            imgurl = tag.get('src').replace('_160x160', '')  # full-resolution variant
            image_urls.append('https:' + imgurl)
        l = ','.join(image_urls)
    fileHeader = ['标题', '产品url', '价格', '描述', '图片']
    file = [name, url, price, des, l]
    # Retry until the CSV is writable (e.g. currently open in Excel).
    # newline='' is required by the csv module to avoid blank lines on Windows.
    while True:
        try:
            csvFile = open(site, 'a', encoding='utf-8', newline='')
            break
        except OSError:  # was bare except
            print('')
            print(site + '文件写入失败,重试中。。。。。')
            time.sleep(5)
    try:
        size = os.path.getsize(site)  # 0 bytes => brand-new file, so write the header first
        writer = csv.writer(csvFile)
        if size == 0:
            writer.writerow(fileHeader)
        writer.writerow(file)
    finally:
        csvFile.close()  # always release the handle, even if a write raises
    print('采集成功!')
# --- End of product-page scraper ---
# urlpro = str(input('输入要采集的商品列表'))
urlpro = 'https://www.tribalhollywood.com/collections/mens-necklaces'
# Raw string: the original non-raw literal relied on invalid escape sequences
# ('\B', '\桌', ...) passing through unchanged — a DeprecationWarning since
# Python 3.6.  The raw form is byte-for-byte the same path.
site = r'D:\Backup\桌面\python3\mens-necklaces1.csv'
nt = ['我不是空的']  # non-empty sentinel so the loop body runs at least once
n = 1
while nt != []:
    url = urlpro + '?page=' + str(n)
    shopifylist(url)  # scrape every product on page n (returns None; old `prourl=` assignment dropped)
    print('成功采集', n, '页')
    n = n + 1
    # Re-fetch the page just scraped to check whether a "next page" link exists;
    # an empty .next selection ends the loop.
    res = requests.get(url)
    res.raise_for_status()
    noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
    nt = noStarchSoup.select('.next')
print('全部采集完毕!!')