爬虫安居客新房

一、首先看网址

后面有全部代码

https://hf.fang.anjuke.com/loupan/baohequ/p3

这种形式很好分析：网址 = https://hf.fang.anjuke.com/loupan/ + 行政区域 + /p页码（第 1 页不带页码）

xinfang_area = ["feixixian", "baohequ", "shushanqu", "luyangqu", "yaohaiqu", "gaoxinqu","feidongxian", "zhengwuqu", "jingjikaifaqu"]  # administrative districts, as they appear in the listing URL
url = "https://hf.fang.anjuke.com/loupan"  # base URL of the new-home listings
new_url = f"{url}/{area}/p{n}" # page URL; `area` and `n` are not defined here, they come from the crawl loop

我们用requests库获取页面内容，再用bs解析，获得bs对象，代码：

# Walk every district, fetching its listing pages one at a time.
for area in xinfang_area:
    n = 1  # current page number within this district
    while True:
        headers = make_headers()
        # Page 1 has no page suffix; later pages append /p{n}.
        if n == 1:
            new_url = f"{url}/{area}"
        else:
            new_url = f"{url}/{area}/p{n}"
        print(new_url)
        res = requests.get(new_url, headers=headers).text
        content = BeautifulSoup(res, "html.parser")
        # NOTE(review): BeautifulSoup() returns a parser object rather than
        # None, so this branch is not expected to run; checking the HTTP
        # status would be needed to detect a failed download.
        if content is None:  # retry
            n = 1
            continue

二、看内容

每一块的内容都是在 <div class="item-mod">标签下面

根据刚获取的页面内容（页面包含当页所有楼盘的内容），用bs的find_all根据class:item-mod获得所有块的列表，我们看看每一块的网页是什么：

根据每一块的内容，解析代码基本就可以写出来了：

# Every listing on the page lives in its own <div class="item-mod">.
data = content.find_all('div', attrs={'class': 'item-mod'})
for d in data:
    lp_name = d.find_next("a", attrs={"class": "lp-name"}).text
    address = d.find_next("a", attrs={"class": "address"}).text
    huxing = d.find_next("a", attrs={"class": "huxing"}).text
    tags = d.find_next("a", attrs={"class": "tags-wrap"}).text
    prices = d.find_next("a", attrs={"class": "favor-pos"}).text
    # First run of digits in the price text. A price text without digits
    # gives no match, so fall back to a placeholder instead of indexing an
    # empty list (which would raise IndexError).
    matches = re.findall(r'\d+', prices)
    price = matches[0] if matches else "待定"
    # Append the row to the CSV; newline='' stops the csv module from
    # emitting blank lines between rows on Windows.
    row_data = [area, lp_name, address, huxing, tags, prices, price]
    with open(file_name, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(row_data)
    m += 1
    print(area, f"第{n}页第{m}条数据")

三、换区域逻辑

不废话，直接分析

我们看到页面有下一页标签，我们对比有下一页与尾页的下一页标签的不同

这是有下一页的

这是尾页的

我们发现，如果页面里的“下一页”标签是 <span class="next-page stat-disable">（不可点击状态），就说明当前已经是尾页了

此时就可以结束当前区域，切换到下一个区域继续爬取了

next_page = content.find('span', attrs={'class': 'next-page stat-disable'})
if next_page is not None:  # 没有下一页break

四、全部代码

注意，如果没有数据可能是网页需要验证！

其他城市自己分析网页试试吧，我就不解释了

import csv
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from user_agent import make_headers

# District identifiers used as path segments of the listing URL.
xinfang_area = ["feixixian", "baohequ", "shushanqu", "luyangqu", "yaohaiqu", "gaoxinqu",
                "feidongxian", "zhengwuqu", "jingjikaifaqu"]
url = "https://hf.fang.anjuke.com/loupan"  # base URL of the new-home listings
file_name = 'anjuke/xinfang.csv'
# Fixed request headers; make_headers() (imported above) is the alternative
# header source the original listing left disabled.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"}

REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever
MAX_RETRIES = 3  # consecutive failed downloads tolerated per page

# Make sure the output directory exists, then (re)create the CSV file with
# its header row. newline='' prevents blank lines between rows on Windows.
out_dir = os.path.dirname(file_name)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
with open(file_name, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['区域', '楼盘', '地址', '户型', "其他", '价格', '单价'])

for area in xinfang_area:
    n = 1  # current page number within this district
    failures = 0  # consecutive download failures for the current page
    while True:
        # Page 1 has no page suffix; later pages append /p{n}.
        if n == 1:
            new_url = f"{url}/{area}"
        else:
            new_url = f"{url}/{area}/p{n}"
        print(new_url)

        try:
            response = requests.get(new_url, headers=headers, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            failures += 1
            print(err)
            if failures >= MAX_RETRIES:
                break  # give up on this district and move on to the next
            print("正在重试")
            time.sleep(2)
            continue  # retry the same page rather than restarting at page 1
        failures = 0
        content = BeautifulSoup(response.text, "html.parser")

        # A disabled "next page" control marks the last page; a page without
        # any pagination control has nothing further to fetch either.
        next_page = content.find('span', attrs={'class': 'next-page stat-disable'})
        footer = content.find('div', attrs={"class": "pagination"})

        # Parse every listing block on the page and append it to the CSV.
        print(area, f"第{n}页数据")
        m = 0
        data = content.find_all('div', attrs={'class': 'item-mod'})
        with open(file_name, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for d in data:
                try:
                    lp_name = d.find_next("a", attrs={"class": "lp-name"}).text
                    address = d.find_next("a", attrs={"class": "address"}).text
                    huxing = d.find_next("a", attrs={"class": "huxing"}).text
                    tags = d.find_next("a", attrs={"class": "tags-wrap"}).text
                    prices = d.find_next("a", attrs={"class": "favor-pos"}).text
                except AttributeError as err:
                    # find_next() returned None: an expected element is missing.
                    print(err)
                    print("数据获取有误！")
                    continue
                # First run of digits in the price text; use a placeholder
                # when there is none instead of writing an empty list.
                matches = re.findall(r'\d+', prices)
                price = matches[0] if matches else "待定"
                writer.writerow([area, lp_name, address, huxing, tags, prices, price])
                m += 1
                print(area, f"第{n}页第{m}条数据")

        if next_page is not None or footer is None:  # no next page
            break
        n += 1
        time.sleep(2)  # pause between page requests

代码更新：

import csv
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from user_agent import make_headers


class CrawlAnJuKeXinFang:
    """Crawl Anjuke new-home listings for several districts into a CSV file.

    Constructing an instance runs the whole job: it creates the CSV file,
    crawls every district page by page and prints the number of saved rows.

    Args:
        areas: District identifiers used as path segments of the listing URL.
        url: Base listing URL (without district or page suffix).
        save_path: Path of the CSV file to (re)create and fill.
    """

    REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever
    MAX_RETRIES = 3  # download attempts per page before the district is skipped

    def __init__(self, areas, url, save_path):
        self.areas = areas
        self.url = url
        self.save_path = save_path
        # make_headers() is the author's own helper; per the original comment
        # it returns a randomly chosen set of request headers.
        self.__headers = make_headers()
        self.__create_file()
        self.__nums = 0
        self.__crawl()
        print(f"完成！总计条{self.__nums}数据")

    def __create_file(self):
        """Create the output directory if needed and write the CSV header row."""
        out_dir = os.path.dirname(self.save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # newline="" prevents blank lines between rows on Windows.
        with open(self.save_path, 'w', encoding='utf-8', newline="") as f:
            writer = csv.writer(f)
            writer.writerow(['区域', '楼盘', '地址', '户型', "其他", '价格', '单价', '在售'])

    def __save_data(self, row_data):
        """Append one row to the CSV file."""
        with open(self.save_path, 'a', encoding='utf-8', newline="") as f:
            writer = csv.writer(f)
            writer.writerow(row_data)

    def __handle_data(self, data, area):
        """Extract the fields of every listing block in ``data`` and save them.

        Args:
            data: Listing elements of one page (result of ``find_all``).
            area: District identifier written into the first CSV column.
        """
        m = 0  # rows saved from this page
        for d in data:
            try:
                lp_name = d.find_next("a", attrs={"class": "lp-name"}).text
                address = d.find_next("a", attrs={"class": "address"}).text
                huxing = d.find_next("a", attrs={"class": "huxing"}).text
                tags = d.find_next("a", attrs={"class": "tags-wrap"}).text.replace("\n", ";")
                # Drop the first two characters; presumably the separators
                # produced by leading newlines in the tag text (TODO confirm).
                tags = tags[2:]
                onsale = tags.split(';')[0]  # first tag is used as the sale status
                prices = d.find_next("a", attrs={"class": "favor-pos"}).text
                # First run of digits in the price text, or a placeholder.
                price = re.findall(r'\d+', prices)
                price = price[0] if len(price) > 0 else "待定"
                row_data = [area, lp_name, address, huxing, tags, prices, price, onsale]
                self.__save_data(row_data)
            except Exception as err:
                # Best effort: report the broken listing and keep going. It is
                # not counted, so the final total reflects saved rows only.
                print(err)
                print("数据获取有误！")
                continue
            m += 1
            print(area, f"第{m}条数据")
        self.__nums += m

    def __fetch_page(self, page_url):
        """Download and parse one page; return None after MAX_RETRIES failures."""
        for _ in range(self.MAX_RETRIES):
            try:
                response = requests.get(page_url, headers=self.__headers,
                                        timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as err:
                print(err)
                print("正在重试！")
                time.sleep(2)
                continue
            return BeautifulSoup(response.text, "html.parser")
        return None

    def __crawl(self):
        """Crawl every district page by page until its last page is reached."""
        for area in self.areas:
            n = 1  # current page number within this district
            while True:
                # Page 1 has no page suffix; later pages append /p{n}.
                if n == 1:
                    new_url = f"{self.url}/{area}"
                else:
                    new_url = f"{self.url}/{area}/p{n}"
                print(new_url)
                print(f"{area}第{n}页数据——————————————————————————————————")
                content = self.__fetch_page(new_url)
                if content is None:  # page could not be downloaded: skip district
                    break
                # A disabled "next page" control marks the last page.
                next_page = content.find('span', attrs={'class': 'next-page stat-disable'})
                data = content.find_all('div', attrs={'class': 'item-mod'})
                self.__handle_data(data, area)
                # No pagination control at all: move on to the next district.
                footer = content.find('div', attrs={"class": "pagination"})
                if next_page is not None or footer is None:  # no next page
                    break
                n += 1
                time.sleep(2)  # pause between page requests


if __name__ == '__main__':
    # NOTE(review): "huaningxian" may be a typo for "huainingxian"; verify
    # the district identifier against the live site.
    xinfang_area = ["taihuxian", "susongxian", "wangjiangxian", "yuexixian", "qianshanshi", "tongchengshi",
                    "huaningxian", "daguanqu", "yixiuqu", "yingjiangqu"]
    url = "https://aq.fang.anjuke.com/loupan"  # Anqing new-home listings
    file_name = 'anjuke/anqing_xinfang.csv'
    Crawler = CrawlAnJuKeXinFang(xinfang_area, url, file_name)

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.mzph.cn/news/641804.shtml

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈email:809451989@qq.com，一经查实，立即删除！