# Crawler: fetch vehicle information from the Renrenche (人人车) used-car website.
import base64
import re
import time

import pymysql
from fontTools.ttLib import TTFont
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options

# IDs of the cars already collected during this run; used to skip duplicates.
car_id_list = []


# Font parsing
def font_analysis(driver):
    """Build the character-to-number lookup for the page's embedded font.

    The first ``<style>`` element of the loaded page is expected to embed a
    font as base64 text (``...base64,<payload>'...``). The payload is decoded,
    written to ``./rrCar.ttf`` and parsed with fontTools.

    Args:
        driver: Selenium WebDriver on which the listing page is already loaded.

    Returns:
        dict: maps each character covered by the font's best cmap to the
        position of its glyph in the glyph order, not counting the first
        glyph (presumably ``.notdef`` -- TODO confirm).

    Raises:
        ValueError: if no base64 payload is found in the style text, or if a
            mapped glyph name is missing from the glyph order.
    """
    style_text = driver.find_element(By.XPATH, "//style[1]").get_attribute("innerHTML")

    # Extract the base64 payload. ``re.search`` is used instead of an anchored
    # ``re.match`` because ``.`` does not cross newlines: a style body that
    # starts with a line break would otherwise never match.
    payload = re.search(r"base64,(.*?)'", style_text or "")
    if payload is None:
        raise ValueError("no base64-encoded font found in the first <style> element")

    # str.encode() gives UTF-8 bytes; base64.decodebytes() returns the decoded
    # binary font data.
    font_bytes = base64.decodebytes(payload.group(1).encode())
    with open("./rrCar.ttf", 'wb') as fp:
        fp.write(font_bytes)

    # Parse the font file that was just written.
    ttf = TTFont("./rrCar.ttf")
    # Glyph names in font order, without the first entry.
    glyph_order = ttf.getGlyphOrder()[1:]
    # Mapping of Unicode code point -> glyph name.
    cmap = ttf.getBestCmap()

    return {chr(code): glyph_order.index(name) for code, name in cmap.items()}


# Fetch the listing data shown on the page
def _decode_price(raw_price, font_dict):
    """Translate obfuscated price text into plain text using *font_dict*."""
    decoded = []
    for ch in raw_price:
        if ch == '.':
            # The decimal point is never remapped.
            decoded.append('.')
        else:
            # Characters the font does not cover are kept unchanged instead of
            # being turned into the literal text "None".
            decoded.append(str(font_dict.get(ch, ch)))
    return "".join(decoded)


def fetch_data(driver, font_dict):
    """Collect the car listings shown on the currently loaded page.

    Args:
        driver: Selenium WebDriver on which a listing page is already loaded.
        font_dict: character-to-number mapping used to decode the price text
            (as returned by ``font_analysis``).

    Returns:
        list[tuple]: one ``(car_id, car_brand, car_title, car_year, car_mile,
        new_car_price)`` tuple per listing whose id is not yet in the
        module-level ``car_id_list``. Every new id is appended to
        ``car_id_list`` as a side effect.
    """
    li_list = driver.find_elements(By.XPATH, "//ul[@class='infos infos-card h-clearfix']/li")
    car_data = []
    for li in li_list:
        try:
            car_id = li.get_attribute("data-entid")
            car_brand = li.find_element(
                By.XPATH, "div[@class='info--wrap']//span[@class='info_link']/font"
            ).text.strip()
            car_title = li.find_element(
                By.XPATH, "div[@class='info--wrap']//span[@class='info_link']"
            ).text.strip()
            car_year_mile = li.find_element(
                By.XPATH, "div[@class='info--wrap']//div[@class='info_params']"
            ).text.strip()
            car_price = li.find_element(
                By.XPATH, "div[@class='info--wrap']//b[@class='info_price fontSecret']"
            ).text.strip()

            # The parameter text looks like "<year>年·<mileage>万..."; a listing
            # that does not follow this pattern makes re.match return None and
            # is skipped by the except clause below.
            car_year = re.match(r"(.*?)年", car_year_mile).group(1)
            car_mile = re.match(r"(.*?)·(.*?)万", car_year_mile).group(2)

            new_car_price = _decode_price(car_price, font_dict)
            print(car_id, "#", car_brand, "#", car_title, "#", car_year, "#", car_mile, "#", car_price, "#",
                  new_car_price)

            if car_id not in car_id_list:
                car_id_list.append(car_id)
                car_data.append((car_id, car_brand, car_title, car_year, car_mile, new_car_price))
        except Exception as e:
            # Best effort: one malformed listing must not abort the whole page.
            print(e)
    return car_data


# Collect the pinyin names of all cities
def fecth_cities(driver):
    """Return the ``listname`` attribute of every city link on the page.

    The city selector panel is forced visible with JavaScript, the function
    waits five seconds, then reads the ``listname`` attribute of each
    ``a.city-item`` inside the panel. The main script uses these values as the
    city segment of the listing URL.

    NOTE: the function name is misspelled ("fecth"); it is kept as-is because
    the main script calls it under this name.

    Args:
        driver: Selenium WebDriver on which a Renrenche page is already loaded.

    Returns:
        list[str]: the collected values, in document order. Links without a
        ``listname`` attribute are skipped so that no ``None`` entry ends up
        in a URL.
    """
    js = """document.getElementsByClassName('citySelectWrap')[0].style.display='block';"""
    driver.execute_script(js)
    # Give the page time to render the city panel.
    time.sleep(5)

    anchors = driver.find_elements(By.XPATH, "//div[@class='citySelectWrap']//a[@class='city-item']")
    names = (anchor.get_attribute("listname") for anchor in anchors)
    city_list = [name for name in names if name]
    print(city_list)
    return city_list


# Write to the database
# Table creation statement (DDL) for the target MySQL table
"""
create table rrCar(car_id bigint PRIMARY KEY, car_brand varchar(255), car_title varchar(255), car_year int, car_mile float, car_price float, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP on UPDATE CURRENT_TIMESTAMP
);
"""
# Insert data into the database
def mysql_insert(conn, cursor, car_data):
    """Insert the scraped rows into ``project.rrCar`` and commit.

    Args:
        conn: open DB-API connection (used for commit / rollback).
        cursor: cursor created from *conn*.
        car_data: sequence of ``(car_id, car_brand, car_title, car_year,
            car_mile, car_price)`` tuples.

    Returns:
        None. Prints "success" after a commit, or the error followed by
        "failure" when the insert fails; a failed batch is rolled back.
        Nothing is sent to the database when *car_data* is empty.
    """
    if not car_data:
        return

    # The price column is named car_price in the table definition.
    insert_sql = (
        "insert into project.rrCar(car_id, car_brand,car_title, car_year, car_mile, car_price) "
        "values(%s,%s,%s,%s,%s,%s)"
    )
    try:
        # car_data holds many rows, so executemany() is required; execute()
        # would treat the whole list as the parameters of a single row.
        cursor.executemany(insert_sql, car_data)
    except Exception as e:
        print(e)
        print("failure")
        # Discard the partially applied batch; a rollback error is reported
        # but must not turn this best-effort insert into a crash.
        try:
            conn.rollback()
        except Exception as rollback_error:
            print(rollback_error)
    else:
        conn.commit()
        print("success")


if __name__ == '__main__':
    # Run Edge in headless mode.
    opt = Options()
    opt.add_argument("--headless")
    driver = webdriver.Edge(options=opt)
    conn = None
    cursor = None
    try:
        # Connect to the database.
        # NOTE(review): credentials are hard-coded; consider loading them from
        # the environment or a config file.
        conn = pymysql.connect(host='master', port=3306, user='root', password='123456')
        cursor = conn.cursor()

        # This address is only used to read the list of all cities.
        url = 'https://www.renrenche.com/hf/ershouche'
        driver.get(url)
        time.sleep(5)
        city_list = fecth_cities(driver)

        for city in city_list:
            time.sleep(5)
            print(city)
            url = f'https://www.renrenche.com/{city}/ershouche'
            driver.get(url)
            # Build the font lookup for this page.
            font_dict = font_analysis(driver)
            # Extract the listing details from the loaded page.
            car_data = fetch_data(driver, font_dict)
            # Store the rows in MySQL.
            mysql_insert(conn, cursor, car_data)
            print(city)
    finally:
        # Release everything even when a page fails; quit() ends the whole
        # browser session, whereas close() only closes the current window.
        try:
            if cursor is not None:
                cursor.close()
            if conn is not None:
                conn.close()
        finally:
            driver.quit()