from bs4 import BeautifulSoup
# Sample document for the traversal examples below. It has no closing
# </body> or </html> tags; section 1 prints what the parser makes of it.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_xx' xx='zz'>lqz <b>The Dormouse's story <span>彭于晏</span></b> xx</p><p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p><p class="story">...</p>
"""

if __name__ == '__main__':
    # 'html.parser' can be passed instead of 'lxml'; the lxml parser needs
    # `pip install lxml`.
    soup = BeautifulSoup(html_doc, 'lxml')

    # 1. Fault tolerance: pretty-print the parsed document.
    res = soup.prettify()
    print(res)

    # 2. Traversing the document tree (everything between the opening and
    #    closing html tags). Attribute access with "." returns only the
    #    first matching tag.
    print(soup.html)
    print(soup.html.body.p)  # walk down one level at a time
    print(soup.p)  # skip levels and look the tag up directly

    # 3. Tag name.
    print(soup.body.name)

    # 4. Tag attributes.
    p = soup.html.body.p
    print(p.attrs)  # every attribute of the <p> tag
    # A single attribute; class may hold several values, so it is a list:
    # ['title']
    print(p.attrs['class'])
    print(p.attrs.get('xx'))
    print(soup.a.attrs['href'])

    # 5. Tag content.
    # tag.text: all text inside the tag.
    print(soup.p.b.text)
    # tag.string: only available when the tag holds nothing but its own
    # text; with nested child tags it is None.
    print(soup.p.b.string)  # None, because <b> contains a <span>
    print(soup.p.b.span.string)  # 彭于晏
    # tag.strings: yields the text of every descendant lazily, which is
    # similar to .text but does not build the whole string up front.
    print(soup.p.b.strings)
    print(list(soup.p.b.strings))  # ["The Dormouse's story ", '彭于晏']

    # 6. Nested selection.
    print(soup.html.head.title)

    # ------ Good to know ------

    # 7. Children and descendants.
    print(soup.p.contents)  # direct children of the first <p>
    print(soup.p.children)  # iterator over the same direct children
    for i, child in enumerate(soup.p.children):
        print(i, child)
    print(soup.p.descendants)  # iterator over all nested nodes of the <p>
    for i, child in enumerate(soup.p.descendants):
        print(i, child)

    # 8. Parent and ancestors.
    print(soup.a.parent)  # parent of the first <a>
    print(soup.a.parents)  # iterator over every ancestor of the first <a>
    print(list(soup.a.parents))

    # 9. Siblings (a sibling may be a text node rather than a tag).
    print(soup.a.next_sibling)  # next sibling node
    print(soup.a.previous_sibling)  # previous sibling node
    print(list(soup.a.next_siblings))  # all following siblings
    print(soup.a.previous_siblings)  # iterator over preceding siblings
三、搜索文档树
from bs4 import BeautifulSoup
# Sample document for the search examples below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my_p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b>
</p><p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p><p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# Five kinds of filters: string, regular expression, list, True, function.
# find returns the first match; find_all returns every match.
# Each `res = ...` below is an independent example; only the last one in
# each section is printed.

# 1. String filter: the query condition is a plain string.
res = soup.find(id='my_p')
res = soup.find(class_='boldest')
res = soup.find(href='http://example.com/elsie')
# Several keyword conditions are combined with AND.
res = soup.find(name='a', href='http://example.com/elsie', id='link1')
# The same kind of query through attrs; the tag name cannot go inside attrs.
res = soup.find(attrs={'class': 'sister', 'href': 'http://example.com/elsie'})
print(res)

# 2. Regular-expression filter.
import re
res = soup.find_all(href=re.compile(r'^http'))  # href starts with "http"
res = soup.find_all(class_=re.compile(r'^s'))  # class starts with "s"
print(res)

# 3. List filter.
res = soup.find_all(name=['a', 'b'])  # every <a> and <b> tag
res = soup.find_all(class_=['sister', 'boldest'])  # class sister or boldest
print(res)

# 4. Boolean filter.
res = soup.find_all(id=True)  # every tag that has an id attribute
res = soup.find_all(href=True)  # every tag that has an href attribute
res = soup.find_all(class_=True)  # every tag that has a class attribute
print(res)

# 5. Function filter.
def has_class_but_no_id(tag):
    """Return True for a tag that has a class attribute but no id attribute.

    Intended as a filter function for find_all, which calls it with each
    tag and keeps the ones for which it returns True.
    """
    return tag.has_attr('class') and not tag.has_attr('id')


print(soup.find_all(has_class_but_no_id))

# 6. Searching the tree can be combined with traversing it.
# Narrow the scope by traversing to <body> first, then search inside it.
print(soup.html.body.find_all('p'))

# 7. The `limit` and `recursive` parameters of find_all.
# limit caps the number of results: only the first two <p> tags.
print(soup.find_all(name='p', limit=2))
# recursive=False considers direct children only, so <p> tags nested deeper
# under the top-level object are not matched.
print(soup.find_all(name='p', recursive=False))
四、css选择器
from bs4 import BeautifulSoup
# Sample document for the CSS selector examples below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my_p" class="title">asdfasdf<b id="bbb" class="boldest">The Dormouse's story</b>
</p><p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p><p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# select() takes a CSS selector. Each `res = ...` is an independent
# example; only the last one is printed.
res = soup.select('a.sister')  # <a> tags with class "sister"
res = soup.select('#link1')  # the element with id "link1"
res = soup.select('p#my_p b')  # <b> tags inside the <p> with id "my_p"
print(res)

# In the browser developer tools, right-click the element and choose
# "Copy selector" to obtain a selector for it.
import requests
# Request headers: send a desktop-browser User-Agent with the request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}
# A timeout keeps the script from waiting forever on an unresponsive server.
response = requests.get('https://www.zdaye.com/free/', headers=header, timeout=10)
# Fail on an HTTP error status instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
res = soup.select('#ipc > tbody > tr:nth-child(2) > td.mtd')
if res:
    print(res[0].text)
else:
    # NOTE(review): this looks like a selector copied from browser devtools;
    # browsers insert <tbody> elements that may be absent from the raw HTML,
    # so confirm the selector against the downloaded markup.
    print('No element matched the selector')
website address
https://www.youtube.com/watch?v=HD13eq_Pmp8
excerpt
All right, what’s going on, everybody? It’s your Bro, hope you’re doing well, and in this video I’m going to help you get started with HTML; so sit back, relax and enjoy the show. If y…