爬虫之拉勾网职位获取

重点在于演示 urllib.request.Request() 请求中各项参数(譬如 url、data、headers 等)的书写格式。

Demo演示(POST请求):

import urllib.request
import urllib.parse
import json, jsonpath, csv

# Ajax endpoint behind Lagou's job-search page. The city filter (Shanghai,
# percent-encoded) and needAddtionalResult=false travel in the query string;
# the search keyword and page number are sent separately in the POST body.
url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"

# Request headers copied from a browser session so that the request resembles
# the XHR call made by the site's own search page.
#
# Two browser headers are deliberately NOT sent:
#   * "Accept-Encoding": urllib does not transparently decompress gzip/br
#     bodies, so advertising them would hand compressed bytes to json.loads().
#   * "Content-Length": urllib derives it from the encoded POST body; the
#     hard-coded "38" is only correct while the page number has one digit.
headers = {
    # Header values must be latin-1 encodable; the q-value is plain "0.01".
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    # NOTE(review): this is a session cookie captured from a logged-in browser.
    # It will expire and it identifies a real account - substitute your own
    # and avoid committing real credentials.
    "Cookie": "_ga=GA1.2.1963509933.1531996888; user_trace_token=20180719184127-4a8c7914-8b40-11e8-9eb6-525400f775ce; LGUID=20180719184127-4a8c7df2-8b40-11e8-9eb6-525400f775ce; JSESSIONID=ABAAABAAAIAACBI0F0B14254DA54E3CCF3B1F22FE32B179; _gid=GA1.2.1918046323.1536408617; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536408620; X_HTTP_TOKEN=339034308973d0bd323cc0b9b6b3203a; LG_LOGIN_USER_ID=24096d6ba723e146bd326de981ab924b23c1f21775136c3a8be953e855211e61; _putrc=95519B7FB60FCF58123F89F2B170EADC; login=true; unick=%E9%A9%AC%E7%BB%A7%E4%B8%9A; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=55; gate_login_token=3b4fa15daef090780ae377bbcd66dc83af9af0cc6a7f1dd697770790f3b9f9ef; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180908221639-cd6d2a72-b371-11e8-b62b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E4%25B8%258A%25E6%25B5%25B7%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; SEARCH_ID=e559a417b4464fd9bc0b439a67ef0a5a; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536416580; LGRID=20180908222259-afed9b74-b372-11e8-b62b-5254005c3644",
    "Host": "www.lagou.com",
    "Origin": "https://www.lagou.com",
    "Referer": "https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest",
}
# Crawl result pages 1-4 and collect one summary dict per job posting.
# The query-string parameters (city, needAddtionalResult) are already embedded
# in `url`, so only the form fields below are sent in the POST body.
list_position = []
for pn in range(1, 5):
    # Form fields for one result page: "pn" is the page number, "kd" the
    # search keyword.
    form_fields = {
        "first": "false",
        "pn": pn,
        "kd": "爬虫"
    }
    # Request() expects the POST body as bytes, hence urlencode + encode.
    data = urllib.parse.urlencode(form_fields).encode('utf-8')
    req = urllib.request.Request(url, data=data, headers=headers)
    print('正在请求第%d页' % pn)
    # The context manager closes the connection deterministically; the timeout
    # keeps a stalled server from blocking the script forever.
    with urllib.request.urlopen(req, timeout=10) as response:
        str_data = response.read()
    # Keep the raw body of the most recent page on disk for debugging
    # (overwritten on every iteration).
    with open('03.html', 'wb') as f:
        f.write(str_data)
    # Parse the JSON payload into Python objects.
    data_list = json.loads(str_data)
    # jsonpath() yields False rather than an empty list when nothing matches
    # (for example when the server answers with an error payload), so guard
    # before indexing.
    matches = jsonpath.jsonpath(data_list, "$..result")
    if not matches:
        print('第%d页没有返回职位数据, 已跳过' % pn)
        continue
    job_list = matches[0]

    for item in job_list:
        # Key order matters: it becomes the CSV column order later on.
        list_position.append({
            'positionName': item.get('positionName'),
            'createTime': item.get('createTime'),
            'url': 'https://www.lagou.com/jobs/' + str(item.get('positionId')) + '.html',
            'salary': item.get('salary'),
            'workYear': item.get('workYear'),
            'companySize': item.get('companySize'),
        })

# Save every collected position to a JSON file. `with` guarantees the file is
# flushed and closed even if serialisation fails half-way.
with open('03.json', 'w', encoding='utf-8') as json_file:
    json.dump(list_position, json_file)

# Save the same data to a CSV file.
# The file is opened explicitly as UTF-8 because the original author hit
# "'gbk' codec can't encode character '\u200b'" with the platform default
# encoding. newline='' is what the csv module asks for, so that no extra blank
# line appears after every row on Windows.
sheets = list(list_position[0].keys()) if list_position else []  # header row
row_content = [list(item.values()) for item in list_position]  # data rows
if sheets:
    with open('04.csv', 'w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        try:
            csv_writer.writerow(sheets)
            csv_writer.writerows(row_content)
        except (OSError, csv.Error, UnicodeError) as e:
            # Best effort, as in the original: report the failure and carry on.
            print(e)
else:
    # Nothing was scraped, so there is no header to derive; skip the CSV
    # instead of failing on list_position[0].
    print('没有抓取到任何职位, 未写入04.csv')


 1 import urllib.request
 2 import urllib.parse
 3 import json, jsonpath, csv
 4 
 5 url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"
 6 headers = {
 7     "Accept": "application/json, text/javascript, */*; q=0.单线程",
 8     "Accept-Encoding": "gzip, deflate, br",
 9     "Accept-Language": "zh-CN,zh;q=0.9",
10     "Connection": "keep-alive",
11     "Content-Length": "38",
12     "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
13     "Cookie": "_ga=GA1.2.1963509933.1531996888; user_trace_token=20180719184127-4a8c7914-8b40-11e8-9eb6-525400f775ce; LGUID=20180719184127-4a8c7df2-8b40-11e8-9eb6-525400f775ce; JSESSIONID=ABAAABAAAIAACBI0F0B14254DA54E3CCF3B1F22FE32B179; _gid=GA1.2.1918046323.1536408617; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536408620; X_HTTP_TOKEN=339034308973d0bd323cc0b9b6b3203a; LG_LOGIN_USER_ID=24096d6ba723e146bd326de981ab924b23c1f21775136c3a8be953e855211e61; _putrc=95519B7FB60FCF58123F89F2B170EADC; login=true; unick=%E9%A9%AC%E7%BB%A7%E4%B8%9A; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=55; gate_login_token=3b4fa15daef090780ae377bbcd66dc83af9af0cc6a7f1dd697770790f3b9f9ef; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180908221639-cd6d2a72-b371-11e8-b62b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E4%25B8%258A%25E6%25B5%25B7%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; SEARCH_ID=e559a417b4464fd9bc0b439a67ef0a5a; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536416580; LGRID=20180908222259-afed9b74-b372-11e8-b62b-5254005c3644",
14     "Host": "www.lagou.com",
15     "Origin": "https://www.lagou.com",
16     "Referer": "https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=",
17     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
18     "X-Anit-Forge-Code": "0",
19     "X-Anit-Forge-Token": "None",
20     "X-Requested-With": "XMLHttpRequest"}
21 # params = {"city": "上海", "needAddtionalResult": "false"}
22 list_position = []
23 for pn in range(1, 5):
24     data = {
25         "first": "false",
26         "pn": pn,
27         "kd": "爬虫"
28     }
29     # params = urllib.parse.urlencode(params)
30     # url = url + params
31     data = urllib.parse.urlencode(data).encode('utf-8')
32     req = urllib.request.Request(url, data=data, headers=headers)
33     print('正在请求第%d页' % pn)
34     str_data = urllib.request.urlopen(req).read()
35     with open('03.html', 'wb') as f:
36         f.write(str_data)
37     # 转换成python对象
38     data_list = json.loads(str_data)
39     job_list = jsonpath.jsonpath(data_list, "$..result")[0]
40 
41     for item in job_list:
42         position_dict = {}
43         position_dict['positionName'] = item.get('positionName')
44         position_dict['createTime'] = item.get('createTime')
45         position_dict['url'] = 'https://www.lagou.com/jobs/' + str(item.get('positionId')) + '.html'
46 
47         position_dict['salary'] = item.get('salary')
48         position_dict['workYear'] = item.get('workYear')
49         position_dict['companySize'] = item.get('companySize')
50         list_position.append(position_dict)
51 
52 # 保存到json文件
53 json.dump(list_position, open('03.json', 'w'))
54 
55 # 保存到csv文件  'gbk' codec can't encode character '\u200b' in position 0: illegal multibyte seq
56 csv_writer = csv.writer(open('04.csv', 'w', encoding='utf-8'))
57 sheets = list_position[0].keys()  # 表头
58 row_content = []
59 for item in list_position:
60     row_content.append(item.values())  # 内容
61 try:
62     csv_writer.writerow(sheets)
63     csv_writer.writerows(row_content)
64 except Excepti

转载于:https://www.cnblogs.com/We612/p/9978288.html

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/250231.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

小程序 --- 点击放大功能、获取位置信息、文字样式省略、页面跳转(navigateTo)

1. 点击放大功能的实现 需求: 点击轮播图中的图片会实现放大预览的功能。首先有轮播图的样式如下 <!-- pages/goods_detail/index.wxml --> <!-- 轮播图 --> <view class"detail_swiper"><swiperautoplaycircularindicator-dots><swip…

Axure实现多用户注册验证

*****多用户登录验证***** 一、（常规想法）方法：工作量较大，做起来繁琐 1、当用户名和密码相同时怎么区分两者，使用冒号和括号来区分： eg. (admin:123456)(123456:demo)(zhang:san);由此得出前面是括号后面是…

前端插件网址

http://www.swiper.com.cn/转载于:https://www.cnblogs.com/luchuangao/p/9088057.html

python --- opencv部分学习

1. OpenCV 1.1 opencv概念 OpenCV是一个基于BSD许可(开源)发行的跨平台计算机视觉库可以运行在Linux、Windows、Android和Mac OS操作系统上它轻量级而且高效 – 有一系列C函数和少量 C 类构成同时提供了 Python、Ruby、MATLAB等语言的接口实现了图像处理和计算机视觉方面的很…

hive与hbase集成

环境: hadoop2.7.7 hive3.1.0 hbase2.0.2 1.jar包拷贝(之所以用这种方式,是因为这种方式最为稳妥,最开始用的软连接的方式,总是却少jar包)到hive的lib目录下删除所有hbase相关的jar rm -rf hbase-*.jar 接着从hbase的lib目录下拷贝所有的hbase相关jar cp -a hbasehome/lib/hba…

Winform(C#)输入完毕后,按Enter键触发Button事件

如在输入“用户名”和“密码”之后，有些人习惯按“回车键”来代替页面上的“确定”按钮，那么这一功能在Winform(C#)里如何实现呢？ 触发密码文本框的KeyDown事件，代码如下： [c-sharp] view plaincopy private void txtP…

Maximum Xor Secondary(单调栈好题)

Maximum Xor Secondary CodeForces - 280B Bike loves looking for the second maximum element in the sequence. The second maximum element in the sequence of distinct numbers x1, x2, ..., xk (k > 1) is such maximum element xj, that the following inequa…

python --- udp的使用

1. python的模块导入规则 参考 1.1 系统自带模块 系统自带的模块直接import导入 import time import unittest1.2 第三方下载模块 第三方下载模块也可以直接导入 import HTMLTestRunner import requests1.3 导入模块的部分函数或类 from time import sleep,strftime fro…

杂项-公司:唯品会

ylbtech-杂项-公司：唯品会 唯品会公司成立于2008年08月，2012年3月23日登陆美国纽约证券交易所上市（股票代码：VIPS）。成为华南第一家在美国纽交所上市的电子商务企业。主营B2C商城唯品会名牌折扣网站是一家致力于打造中高…

python --- 使用socket创建tcp服务

1. 网络-tcp 参考 1.1 tcp简介 介绍 TCP协议,传输控制协议(英语: Transmission Control Protocol, 缩写为TCP)是一种面向连接的、可靠的、基于字节流的传输层通信协议,由IETF的RFC 793定义. TCP通信需要经过创建连接、数据传送、终止连接三个步骤. TCP通信模型中,在通信开…

Linux基本的操作

一、为什么我们要学习Linux 相信大部分人的PC端都是用Windows系统的，那我们为什么要学习Linux这个操作系统呢？？？Windows图形化界面做得这么好，日常基本使用的话，学习成本几乎为零。 而Linux不一样…

汇编语言 实验4

实验4 实验内容1：综合使用 loop,[bx]，编写完整汇编程序，实现向内存 b800:07b8 开始的连续 16 个 字单元重复填充字数据 0403H；修改0403H为0441H，再次运行 步骤1：在记事本中编写好temp.asm文件 步骤2…

python --- 线程

1. 多任务 - 线程 参考 首先考虑一个没有多任务的程序: import timedef sing():# 唱歌 5 秒钟for i in range(5):print("-----菊花台ing....-----")time.sleep(1)def dance():# 跳舞 5秒钟for i in range(5):print("-----跳舞.....-----")time.sleep(5)d…

Python 链接汇总

MNIST手写识别 转载于:https://www.cnblogs.com/bycnboy/p/9095199.html

17种常用的JS正则表达式 非负浮点数 非负正数

<input typetext idSYS_PAGE_JumpPage nameSYS_PAGE_JumpPage size3 maxlength5 οnkeyupthis.valuethis.value.replace(/[^1-9]\D*$/,"") οndragenter"return false" οnpaste"return !clipboardData.getData(text).match(/\D/)"" sty…

python --- 使用conda配置pytorch

使用Conda配置PyTorch 1. 添加channels 下载地址 $ conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ $ conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ $ conda config --add channels htt…

LDAP第三天 MySQL+LDAP 安装

https://www.easysoft.com/applications/openldap/back-sql-odbc.html OpenLDAP 使用 SQLServer 和 Oracle 数据库。 https://www.cnblogs.com/bigbrotherer/p/7251372.html          CentOS7安装OpenLDAPMySQLPHPLDAPadmin 1.安装和设置数据库 在CentOS7下&…

Myeclipse连接Mysql数据库时报错:Error while performing database login with the pro driver:unable...

driver template: Mysql connector/j（下拉框进行选择） driver name: 任意填，最好是数据库名称，方便查找 connection URL: jdbc:mysql://localhost:3306/programmableweb User name: 用户名 password: 密码 Driver jars: 添加jar包…

Centos6.5静态IP设置

1.创建新的虚拟机 2.打开终端，打开/etc/sysconfig/network-scripts/ifcfg-eth0文件 3.将BOOTPROTO=static，原值为dhcp 4.添加 IPADDR=192.168.43.125  #静态IP GATEWAY=192.168.43.1  #网关 NETMASK=255.255.255.0  #子网掩码 NETWORK=192.168.43.0  …

matlab --- 图像处理基础

MATLAB图像处理 1. 数字图像处理 参考 数字图像处理(Digital Image Processing)又称为计算机图像处理,是一种将图像信号数字化利用计算进行处理的过程。随着计算机科学、电子学和光学的发展,数字图像处理已经广泛的应用到诸多领域之中。本小节主要介绍图像的概念、分类和数字…