一、实验目的
使用Python解决问题
二、实验要求
自主编写并运行代码,按照模板要求撰写实验报告
三、实验步骤
1 爬取并下载当当网某一本书的网页内容,并保存为html格式
2 在豆瓣网上爬取某本书的前50条短评内容并计算评分的平均值(自学正则表达式)
3 从https://cs.lianjia.com/上爬取长沙某小区的二手房信息(以名都花园为例),并将其保存到EXCEL文件当中
四、实验结果
T1
"""
爬取并下载当当网某一本书的网页内容,并保存为html格式
"""
import os
from urllib import request

# Request headers that make the script identify itself like a desktop browser.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}
url = 'http://product.dangdang.com/24029955.html'
output_path = 'DangDang.html'

if os.path.exists(output_path):
    # Never overwrite an existing copy, and skip the download entirely.
    print('File already exsist')
else:
    req = request.Request(url, headers=header)
    # read() must be *called*; the context manager closes the connection.
    with request.urlopen(req) as response:
        html = response.read()
    # Save the raw bytes: str(bytes) would store a b'...' repr rather than
    # HTML, and binary mode keeps whatever encoding the server used.
    with open(output_path, 'wb') as f:
        f.write(html)
T2
"""
在豆瓣网上爬取某本书的前50条短评内容并计算评分的平均值(自学正则表达式)
"""
import re
from urllib import request

from bs4 import BeautifulSoup

PAGE_SIZE = 20      # comments requested per page
PAGE_COUNT = 3      # 3 pages x 20 covers the 50 comments wanted
WANTED = 50         # how many comments / ratings are reported

# Captures the digits in e.g. class="user-stars allstar40 rating".
SCORE_PATTERN = re.compile(r'<span class="user-stars allstar(\d+) rating"')

comments = []
scores = []


def get_commment(comment):
    """Append the text of every tag in *comment* to the global ``comments``.

    ``get_text()`` is used instead of ``.string`` because ``.string`` is
    ``None`` for a tag that contains nested markup.
    (A regular expression would also work here.)
    """
    for tag in comment:
        comments.append(tag.get_text())


def get_score(score):
    """Collect every rating found in the markup of *score*.

    *score* is converted with ``str()`` and searched with ``SCORE_PATTERN``;
    each captured number is appended to the global ``scores`` as a float.
    """
    for raw in SCORE_PATTERN.findall(str(score)):
        scores.append(float(raw))


header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}

for page in range(PAGE_COUNT):
    # ``start`` advances one page per request; ``limit`` stays at the page
    # size instead of growing with the page index.
    url = f'https://book.douban.com/subject/26912767/comments/?start={page * PAGE_SIZE}&limit={PAGE_SIZE}&status=P&sort=new_score'
    req = request.Request(url, headers=header)
    with request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    get_score(soup)
    get_commment(soup.find_all("span", class_="short"))

# Slicing tolerates pages that yield fewer than WANTED items.
for text in comments[:WANTED]:
    print(text)

rated = scores[:WANTED]
if rated:
    # Presumably allstarNN is stars x 10 (allstar40 = 4 stars) -- TODO
    # confirm; "* 2 / 10" then rescales the mean to a 10-point score.
    print(sum(rated) / len(rated) * 2 / 10)
else:
    print('No ratings found')
T3
"""
从https://cs.lianjia.com/上爬取长沙某小区的二手房信息(以名都花园为例),并将其保存到EXCEL文件当中
"""
from urllib import request
import xlwt
from bs4 import BeautifulSoup


def getHouseList(url):
    """Fetch one search-result page and return a row for every listing link.

    Args:
        url: Address of the result page to download.

    Returns:
        A list of six-element lists:
        ``[title, href, field1, field2, field3, total_price]``.
        ``field1``-``field3`` are the first three ``|``-separated pieces of
        the matching ``houseInfo`` div, stripped of surrounding whitespace.
        The spreadsheet header labels them layout, area and orientation --
        NOTE(review): confirm against the live page.
        Missing pieces are filled with ``''`` so every row has the same
        number of columns.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    req = request.Request(url, headers=header)
    with request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # One row per <a> found inside a div.title: [link text, href].
    house = []
    for title_div in soup.find_all('div', class_='title'):
        for link in title_div.find_all('a'):
            house.append([link.get_text(), link.get('href')])

    info_divs = soup.find_all('div', class_='houseInfo')
    price_divs = soup.find_all('div', class_='totalPrice')

    # Rows are paired with info/price divs by position. Iterating over the
    # rows (not the divs) avoids indexing past the end of ``house``.
    for index, row in enumerate(house):
        if index < len(info_divs):
            fields = [part.strip()
                      for part in info_divs[index].get_text().split('|')]
        else:
            fields = []
        # Pad to exactly three fields so later columns never shift.
        fields += [''] * (3 - len(fields))
        row.extend(fields[:3])

        # Total price text.
        if index < len(price_divs):
            row.append(price_divs[index].get_text().strip())
        else:
            row.append('')
    return house


# Scrape per-listing details: district and inner floor area
def houseinfo(url):
    """Fetch a listing's detail page and return ``[district, area_sum]``.

    Args:
        url: Address of the detail page.

    Returns:
        A two-element list. ``district`` is the text of the first link
        inside a ``span.info`` whose href is present and does not start
        with ``javascript``; it is ``''`` when no such link exists, so the
        result always has two columns. ``area_sum`` is the sum of every
        ``div.col`` value under ``#infoList`` that parses as a number.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    req = request.Request(url, headers=header)
    with request.urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # District: first usable link wins.
    district = ''
    for span in soup.find_all('span', class_='info'):
        anchor = span.find('a')
        if not anchor:
            continue
        href = anchor.get('href')
        if not href or href.startswith('javascript'):
            continue
        district = anchor.get_text()
        break

    # Inner area: the last two characters of each cell are dropped before
    # parsing (presumably a two-character unit suffix -- TODO confirm);
    # cells that still are not numbers are ignored.
    areas = []
    for info in soup.find_all('div', id='infoList'):
        for col in info.find_all('div', class_='col'):
            text = col.get_text()
            try:
                areas.append(float(text[:-2]))
            except ValueError:
                continue
    return [district, sum(areas)]


def writeExcel(excelPath, houses):
    """Write *houses* to an .xls workbook saved at *excelPath*.

    Row 0 holds the fixed column titles; each entry of *houses* is printed
    and then written to the next row, one cell per element.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('git')
    row0 = ['标题', '链接地址', '户型', '面积', '朝向', '总价', '所属区域', '套内面积']
    for col, title in enumerate(row0):
        sheet.write(0, col, title)
    for row, house in enumerate(houses, start=1):
        print(house)
        for col, value in enumerate(house):
            sheet.write(row, col, value)
    workbook.save(excelPath)


# Main entry point
def main(excel_path='C:/Users/Lunatic/Desktop/cs.xls', pages=4):
    """Scrape the listing pages, add per-listing details, and save to Excel.

    Args:
        excel_path: Where the workbook is written. The default is the path
            the script originally hard-coded.
        pages: Number of result pages to fetch, starting from page 1.
    """
    data = []
    for i in range(1, pages + 1):
        print('-----分隔符', i, '-------')
        # Page 1 has no "pgN" segment; later pages insert it before the
        # search filter.
        page_part = '' if i == 1 else 'pg' + str(i)
        url = ('https://cs.lianjia.com/ershoufang/' + page_part
               + 'c3511059937033rs%E5%90%8D%E9%83%BD%E8%8A%B1%E5%9B%AD/')
        houses = getHouseList(url)
        for house in houses:
            link = house[1]
            # Only absolute http(s) links have a detail page to fetch.
            if not link or not link.startswith('http'):
                continue
            house.extend(houseinfo(link))
        data.extend(houses)
    writeExcel(excel_path, data)


if __name__ == '__main__':
    main()
五、实验体会
爬虫是Python重要的应用场景,在使用相关技术时不仅仅需要熟悉相关的Python库,更要仔细分析网页,寻找其中规律进行爬取,达成自动化的初衷。