The point of this demo is to show how to write the various parameters of a urllib.request.Request() call, e.g. url, data, headers, and so on.
Demo (POST request):
import urllib.request
import urllib.parse
import json
import jsonpath  # third-party: pip install jsonpath
import csv
url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"
headers = {
"Accept": "application/json, text/javascript, */*; q=0.单线程",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
"Content-Length": "38",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Cookie": "_ga=GA1.2.1963509933.1531996888; user_trace_token=20180719184127-4a8c7914-8b40-11e8-9eb6-525400f775ce; LGUID=20180719184127-4a8c7df2-8b40-11e8-9eb6-525400f775ce; JSESSIONID=ABAAABAAAIAACBI0F0B14254DA54E3CCF3B1F22FE32B179; _gid=GA1.2.1918046323.1536408617; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536408620; X_HTTP_TOKEN=339034308973d0bd323cc0b9b6b3203a; LG_LOGIN_USER_ID=24096d6ba723e146bd326de981ab924b23c1f21775136c3a8be953e855211e61; _putrc=95519B7FB60FCF58123F89F2B170EADC; login=true; unick=%E9%A9%AC%E7%BB%A7%E4%B8%9A; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=55; gate_login_token=3b4fa15daef090780ae377bbcd66dc83af9af0cc6a7f1dd697770790f3b9f9ef; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180908221639-cd6d2a72-b371-11e8-b62b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E4%25B8%258A%25E6%25B5%25B7%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; SEARCH_ID=e559a417b4464fd9bc0b439a67ef0a5a; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536416580; LGRID=20180908222259-afed9b74-b372-11e8-b62b-5254005c3644",
"Host": "www.lagou.com",
"Origin": "https://www.lagou.com",
"Referer": "https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"X-Anit-Forge-Code": "0",
"X-Anit-Forge-Token": "None",
"X-Requested-With": "XMLHttpRequest"}
# params = {"city": "上海", "needAddtionalResult": "false"}
list_position = []
for pn in range(1, 5):
    data = {
        "first": "false",
        "pn": pn,
        "kd": "爬虫"
    }
    # params = urllib.parse.urlencode(params)
    # url = url + params
    data = urllib.parse.urlencode(data).encode('utf-8')
    req = urllib.request.Request(url, data=data, headers=headers)
    print('Requesting page %d' % pn)
    str_data = urllib.request.urlopen(req).read()
    with open('03.html', 'wb') as f:
        f.write(str_data)
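    # Note (assumption): urllib never decompresses responses by itself, so if the
    # server actually honors the "gzip" in Accept-Encoding above, json.loads()
    # below would fail on the raw bytes. Defensive check for the gzip magic number:
    if str_data[:2] == b'\x1f\x8b':
        import gzip
        str_data = gzip.decompress(str_data)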
    # Parse the JSON bytes into a Python object
    data_list = json.loads(str_data)
    job_list = jsonpath.jsonpath(data_list, "$..result")[0]
    for item in job_list:
        position_dict = {}
        position_dict['positionName'] = item.get('positionName')
        position_dict['createTime'] = item.get('createTime')
        position_dict['url'] = 'https://www.lagou.com/jobs/' + str(item.get('positionId')) + '.html'
        position_dict['salary'] = item.get('salary')
        position_dict['workYear'] = item.get('workYear')
        position_dict['companySize'] = item.get('companySize')
        list_position.append(position_dict)
# Save to a JSON file (ensure_ascii=False keeps the Chinese text readable)
with open('03.json', 'w', encoding='utf-8') as f:
    json.dump(list_position, f, ensure_ascii=False)
# Save to a CSV file; encoding='utf-8' avoids the "'gbk' codec can't encode
# character '\u200b'" error raised by the default Windows encoding, and
# newline='' prevents blank lines between rows
with open('04.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    sheets = list_position[0].keys()  # header row
    row_content = []
    for item in list_position:
        row_content.append(item.values())  # row values
    try:
        csv_writer.writerow(sheets)
        csv_writer.writerows(row_content)
    except Exception as e:
        print(e)
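For comparison, the commented-out params lines above hint at the GET-style usage, where the query string is urlencoded and appended to the URL instead of being sent as a request body. A minimal sketch (assumption: whether this particular endpoint accepts a plain GET is site-specific):

import urllib.request
import urllib.parse

base_url = "https://www.lagou.com/jobs/positionAjax.json?"
params = {"city": "上海", "needAddtionalResult": "false"}
# urlencode() turns the dict into "city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"
get_url = base_url + urllib.parse.urlencode(params)
# passing no data argument makes urlopen() issue a GET instead of a POST
req = urllib.request.Request(get_url, headers={"User-Agent": "Mozilla/5.0"})
# response = urllib.request.urlopen(req).read()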
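As an aside, the manual header/rows bookkeeping in the CSV step can be replaced with csv.DictWriter, which maps dict keys to columns directly. A minimal sketch, assuming the same list_position structure built above:

import csv

with open('04.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=list(list_position[0].keys()))
    writer.writeheader()             # header row taken from the field names
    writer.writerows(list_position)  # one row per position dict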