Contents: Crawler introduction · The requests module and quick start · Carrying request headers · Two ways to carry cookies · Carrying parameters with POST · Simulated login: extracting and handling cookies · The response object · Certificates · Using a proxy · Timeouts, exception handling, and file upload
Crawler introduction
- Using programming techniques, take data from the internet: fetch it, clean it, and store it in a database (a minimal pipeline is sketched below).
  - Tools in Python: requests, selenium
  - Targets: apps, mini programs, websites
  - Parsing: xpath, lxml
  - Storage: mysql, redis, files, excel, mongodb
- In short: use a programming language to simulate HTTP requests, fetch the data, parse it, and store it.
- Both the crawling step and the parsing step can run into anti-scraping measures.
- To scrape apps and mini programs, use a packet-capture tool to intercept every request the phone sends:
  - Charles
  - Fiddler
- The "gentleman's agreement": https://xxx/robots.txt tells crawlers which pages a site allows them to fetch.
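A minimal sketch of that fetch → parse → store pipeline, assuming a hypothetical target page (example.com) and a local SQLite file — both placeholders, not from the original notes:

import sqlite3
import requests
from lxml import etree  # lxml is one of the parsing tools listed above

# Fetch: hypothetical URL; replace with a page you are allowed to crawl
res = requests.get('https://example.com/')
res.raise_for_status()

# Parse: pull the page title out with an XPath expression
root = etree.HTML(res.text)
title = root.xpath('//title/text()')[0]

# Store: write the result into a local SQLite database
conn = sqlite3.connect('crawl.db')
conn.execute('CREATE TABLE IF NOT EXISTS pages (url TEXT, title TEXT)')
conn.execute('INSERT INTO pages VALUES (?, ?)', (res.url, title))
conn.commit()
conn.close()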
- Baidu / Google: a search engine runs a crawler that never stops crawling websites and stores them in its index (es). A user types a query into the search box, the engine searches its own index and returns results to the front end; clicking a result takes the user to the real address.
- SEO optimization: free; make your pages rank higher in the results for a keyword (e.g. via pseudo-static pages).
- SEM optimization: pay to buy keywords.
The requests module and quick start
import requests

# Query parameters can be written straight into the URL
res = requests.get('https://api.map.baidu.com/place/v2/search?ak=6E823f587c95f0148c19993539b99295&region=上海&query=肯德基&output=json')
print(res.text)

# Or passed via params; requests URL-encodes them for you
params = {
    'ak': '6E823f587c95f0148c19993539b99295',
    'region': '上海',
    'query': '肯德基',
    'output': 'json',
}
res = requests.get('https://api.map.baidu.com/place/v2/search', params=params)
print(res.text)

# URL-encode / decode non-ASCII characters by hand
from urllib.parse import quote, unquote
s = '上海'
print(quote(s))                        # %E4%B8%8A%E6%B5%B7
print(unquote('%E4%B8%8A%E6%B5%B7'))  # 上海
Carrying request headers
import requests

# Many sites reject requests that don't carry a browser User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}
res = requests.get('https://dig.chouti.com/', headers=headers)
print(res.text)
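To see what actually gets sent, httpbin.org (a neutral echo service, used here as an assumption rather than part of the original example) reflects the headers it received:

import requests

# Without a custom header, requests identifies itself as python-requests/x.y.z
r1 = requests.get('https://httpbin.org/headers')
# With the header, the server sees the browser string we set
r2 = requests.get('https://httpbin.org/headers',
                  headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
print(r1.json()['headers']['User-Agent'])
print(r2.json()['headers']['User-Agent'])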
Two ways to carry cookies
import requests

# Way 1: put the cookie directly into the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Cookie': 'token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiJjdHVfNzA4NjEwMzY4NTkiLCJleHBpcmUiOiIxNzExMjAyMzY4ODAyIn0.TnUYjU6KqR1itEW6QkTSSUfqc48rkT3hnsg4Cvh4XA4',
}
data = {
    'linkId': '41601398'
}
res = requests.post('https://dig.chouti.com/link/vote', headers=headers, data=data)
print(res.text)
# Way 2: pass a dict (or a CookieJar) via the cookies parameter
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
}
data = {
    'linkId': '41600539'
}
# Only the cookie's value goes here; Set-Cookie attributes such as
# "path=/" and "Max-Age" are not part of the value
cookie = {
    'token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiJjdHVfNzA4NjEwMzY4NTkiLCJleHBpcmUiOiIxNzExMjAyMzY4ODAyIn0.TnUYjU6KqR1itEW6QkTSSUfqc48rkT3hnsg4Cvh4XA4',
}
res = requests.post('https://dig.chouti.com/link/vote', headers=headers, data=data, cookies=cookie)
print(res.text)
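When you need domain or path scoping, requests also accepts a RequestsCookieJar instead of a plain dict; a minimal sketch (the token value is a placeholder):

import requests

jar = requests.cookies.RequestsCookieJar()
# set() lets you scope the cookie to a domain and path, unlike a plain dict
jar.set('token', 'token-value-here', domain='dig.chouti.com', path='/')
res = requests.get('https://dig.chouti.com/', cookies=jar)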
Carrying parameters with POST
import requests

# Body as raw bytes (already form-encoded)
res = requests.post('url', data=b'name=lqz&age=19')
# Body as JSON; requests serializes it and sets Content-Type: application/json
res = requests.post('url', json={'name': 'lqz', 'age': 19})
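For completeness, the most common variant is a plain dict passed to data: requests form-encodes it and sets Content-Type: application/x-www-form-urlencoded (httpbin.org is used here to echo the result back):

import requests

# Dict body: form-encoded as name=lqz&age=19 automatically
res = requests.post('https://httpbin.org/post', data={'name': 'lqz', 'age': 19})
print(res.json()['form'])  # httpbin echoes the form fields it received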
Simulated login: extracting and handling cookies
import requests

data = {
    'username': '616564099@qq.com',
    'password': 'lqz123111',
    'captcha': '3333',
    'remember': '1',
    'ref': 'http://www.aa7a.cn/',
    'act': 'act_login',
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}
res = requests.post('http://www.aa7a.cn/user.php', headers=header, data=data)
print(res.text)
# The login cookies come back on the response as a RequestsCookieJar
cookies = res.cookies
print(cookies)
# Carry them on the next request; the home page then shows the logged-in user
res = requests.get('http://www.aa7a.cn/', cookies=cookies)
print('616564099@qq.com' in res.text)
import requests

# A Session keeps cookies across requests automatically
session = requests.session()
data = {
    'username': '616564099@qq.com',
    'password': 'lqz12311',
    'captcha': '3333',
    'remember': '1',
    'ref': 'http://www.aa7a.cn/',
    'act': 'act_login',
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}
res = session.post('http://www.aa7a.cn/user.php', headers=header, data=data)
res1 = session.get('http://www.aa7a.cn/')  # no need to pass cookies by hand
print('616564099@qq.com' in res1.text)
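To keep the login across program runs, the cookie jar can be persisted to disk; one possible sketch using pickle (the file name cookies.pkl is arbitrary):

import pickle
import requests

session = requests.session()
# ... log in with session.post(...) as above ...

# Save the cookie jar after logging in
with open('cookies.pkl', 'wb') as f:
    pickle.dump(session.cookies, f)

# Later, restore it into a fresh session
session2 = requests.session()
with open('cookies.pkl', 'rb') as f:
    session2.cookies.update(pickle.load(f))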
The response object
import requests

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}
respone = requests.get('https://www.jianshu.com/', headers=header)
print(respone.text)          # body decoded to str
print(respone.content)       # raw body bytes
print(respone.status_code)   # HTTP status code
print(respone.headers)       # response headers
print(respone.cookies)       # cookies the server set
print(respone.cookies.get_dict())
print(respone.cookies.items())
print(respone.url)           # final URL after redirects
print(respone.history)       # list of redirect responses
print(respone.encoding)      # encoding used for .text
res = respone.iter_content()  # iterate the body chunk by chunk
import requests

# Some image hosts check the Referer header before serving the file
header = {
    'Referer': 'http://www.baidu.com'
}
res = requests.get('https://meizi5.com/wp-content/uploads/2024/02/VOL_181_1.jpg', headers=header)
print(res.url)
print(res.content)
with open('美女.jpg', 'wb') as f:
    f.write(res.content)
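For large files, .content loads the whole body into memory at once; a sketch of the streaming alternative with stream=True and iter_content (the chunk size is an arbitrary choice):

import requests

# stream=True defers downloading the body until we iterate it
res = requests.get('https://meizi5.com/wp-content/uploads/2024/02/VOL_181_1.jpg',
                   headers={'Referer': 'http://www.baidu.com'}, stream=True)
with open('美女.jpg', 'wb') as f:
    for chunk in res.iter_content(chunk_size=8192):
        f.write(chunk)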
Certificates
- http: HyperText Transfer Protocol.
- https: secure HTTP; https = http + SSL/TLS.
- Prevents tampering, interception, and so on.
- Both sides must have a certificate to communicate.

import requests

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}
# verify=False skips certificate verification (e.g. for self-signed certs)
respone = requests.get('https://www.jianshu.com/', headers=header, verify=False)
print(respone.text)
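Note that verify=False makes urllib3 emit an InsecureRequestWarning on every request; you can silence it, or better, keep verification and point verify at a CA bundle (the file path below is a placeholder):

import requests
import urllib3

# Option 1: suppress the warning that verify=False triggers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Option 2: keep verification but trust a specific CA bundle (placeholder path)
res = requests.get('https://www.jianshu.com/', verify='/path/to/ca-bundle.pem')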
Using a proxy
import requests

# Fetch a free proxy from a public proxy pool
res = requests.get('http://demo.spiderpy.cn/get/?type=https')
print(res.json())
print(res.json()['proxy'])
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}
# Route the request through the proxy
response = requests.get('https://www.jianshu.com/', headers=header, proxies={'https': res.json()['proxy']})
print(response.text)
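To confirm traffic really goes through the proxy, httpbin.org/ip echoes back the IP address it sees (the proxy address below is a placeholder):

import requests

proxies = {'https': 'http://12.34.56.78:8888'}  # placeholder proxy address
res = requests.get('https://httpbin.org/ip', proxies=proxies)
print(res.json()['origin'])  # should print the proxy's IP, not yours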
Timeouts, exception handling, and file upload
import requests

# Give up if the server doesn't respond within 1 second
respone = requests.get('https://www.baidu.com', timeout=1)

import requests
from requests.exceptions import ReadTimeout, RequestException

try:
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except ReadTimeout:
    print('request timed out')
except RequestException:
    print('Error')

import requests

# Upload a file as multipart/form-data
files = {'file': open('a.jpg', 'rb')}
respone = requests.post('http://httpbin.org/post', files=files)
print(respone.status_code)
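files also accepts a (filename, fileobj, content_type) tuple when you need to control the reported name and MIME type; a small sketch against the same httpbin endpoint:

import requests

# Tuple form: (filename as the server sees it, file object, content type)
files = {'file': ('photo.jpg', open('a.jpg', 'rb'), 'image/jpeg')}
respone = requests.post('http://httpbin.org/post', files=files)
print(respone.json()['files'].keys())  # httpbin echoes uploaded files back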