1. re (regex) implementation
import os
import re

import requests
from requests.exceptions import RequestException

MAX_PAGE = 10  # maximum number of search-result pages to crawl
KEYWORD = 'python'
headers = {
    'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
file_name = 're_job51_python.txt'

# Compiled once at import time instead of on every getTarget() call.
# re.S lets '.' match newlines, since each job row spans several lines.
_JOB_PATTERN = re.compile(
    r'class="t1 ">.*? <a target="_blank" '
    'title="(.*?)".*? <span class="t2"><a target="_blank" '
    'title="(.*?)".*?<span '
    'class="t3">(.*?)</span>.*?<span '
    'class="t4">(.*?)</span>.*? <span '
    'class="t5">(.*?)</span>',
    re.S)


# Fetch the raw HTML of one result page
def getHtml(page):
    """Fetch one 51job search-result page.

    page: 1-based page index substituted into the listing URL.
    Returns the decoded page text, or None when the request fails.
    """
    try:
        url = ('https://search.51job.com/list/040000,000000,0000,00,9,99,'
               '{0},2,{1}.html?'.format(KEYWORD, page))
        response = requests.get(url, headers=headers)
        # The site does not always declare its charset; let requests guess.
        response.encoding = response.apparent_encoding
        return response.text
    except RequestException:
        print('请求出错')
        return None


# Parse the page source and extract the target fields
def getTarget(html):
    """Return a list of (title, company, location, salary, date) tuples
    scraped from one page of HTML."""
    return _JOB_PATTERN.findall(html)


# Append one record to the text file
def save_to_txt(item):
    """Write one job tuple to the output file as a comma-separated line.

    Explicit utf-8 keeps the Chinese fields readable regardless of the
    platform's default locale encoding.
    """
    with open(file_name, 'a', encoding='utf-8', newline='') as f:
        f.write(','.join(item) + '\n')


def main():
    # Start from a clean output file on every run.
    if os.path.exists(file_name):
        os.remove(file_name)

    # 51job result pages are 1-based; starting at 0 requested a
    # non-existent page (original bug: range(MAX_PAGE + 1)).
    for page in range(1, MAX_PAGE + 1):
        html = getHtml(page)
        if html is None:
            # Request failed; skip this page instead of crashing in getTarget.
            continue
        for item in getTarget(html):
            save_to_txt(item)


if __name__ == '__main__':
    main()
2. xpath (lxml) implementation
import os

import requests
from requests.exceptions import RequestException
from lxml import etree
import pymongo
from spiders.前程无忧.mongo_config import *

# MongoDB connection; MONGO_URL / MONGO_DB / MONGO_TABLE_XPATH come from
# the star-imported mongo_config module.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

MAX_PAGE = 5
KEYWORD = 'python'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.132 Safari/537.36'
}
file_name = 'xpath_job51_python.txt'


# Fetch the raw HTML of one result page
def get_html(page):
    """Fetch one 51job search-result page; return its text, or None on
    any request error."""
    try:
        url = ('https://search.51job.com/list/040000,000000,0000,00,9,99,'
               '{},2,{}.html?'.format(KEYWORD, page))
        response = requests.get(url, headers=headers)
        # Let requests sniff the charset; the site's headers are unreliable.
        response.encoding = response.apparent_encoding
        return response.text
    except RequestException:
        return None


# Parse one result page with XPath
def parse_html(html):
    """Extract job rows from one page of HTML.

    Returns an iterator of 7-tuples:
    (position_name, position_url, company_name, company_url,
     location, salary, release_date).
    """
    # etree.HTML builds a parse tree and auto-repairs sloppy markup.
    doc = etree.HTML(html)

    # Job titles are padded with whitespace/newlines in the source.
    position_names = [name.strip()
                      for name in doc.xpath('//div[@class="el"]/p/span/a/text()')]
    position_urls = doc.xpath('//div[@class="el"]/p/span/a/@href')
    company_names = doc.xpath('//div[@class="el"]/span[1]/a/text()')
    company_urls = doc.xpath('//div[@class="el"]/span[1]/a/@href')
    locations = doc.xpath('//div[@class="el"]/span[@class="t3"]/text()')
    salaries = doc.xpath('//div[@class="el"]/span[@class="t4"]/text()')
    release_dates = doc.xpath('//div[@class="el"]/span[4]/text()')

    # zip truncates to the shortest column, so a row with a missing field
    # is dropped rather than shifting every later row out of alignment.
    return zip(position_names, position_urls, company_names, company_urls,
               locations, salaries, release_dates)


def save_to_txt(element):
    """Append one job tuple to the text file as a comma-separated line."""
    # Explicit utf-8 keeps Chinese text readable on any platform.
    with open(file_name, 'a', encoding='utf-8', newline='') as f:
        f.write(','.join(element) + '\n')


def save_to_mongo(element):
    """Store one job tuple as a MongoDB document.

    Returns True when the insert succeeds, False otherwise.
    """
    keys = ['position_name', 'position_url', 'company_name',
            'company_url', 'location', 'salary', 'release_date']
    result = dict(zip(keys, element))
    # insert_one replaces Collection.insert, which was deprecated in
    # pymongo 3.x and removed in 4.0.
    if db[MONGO_TABLE_XPATH].insert_one(result):
        print('数据成功存储到mongo数据库中')
        return True
    return False
    # NOTE(review): the original had a dead debug loop here, after the
    # returns, printing the dict's key/value pairs — removed as unreachable.


def main():
    # Start from a clean output file on every run.
    if os.path.exists(file_name):
        os.remove(file_name)
    for page in range(1, MAX_PAGE + 1):
        html = get_html(page)
        if html is None:
            # Request failed; etree.HTML(None) would raise, so skip the page.
            continue
        for element in parse_html(html):
            save_to_txt(element)
            save_to_mongo(element)


if __name__ == '__main__':
    main()