深交所的股票信息读取比较简单:
看上图,爬虫读取到下载按钮的链接之后发起请求,得到XLS文件后直接解析就可以了。
这里放出深交所爬虫模块的代码:
# -*- coding: utf-8 -*-
# 深圳交易所爬虫
import osimport pandas as pd
import requests#读取最新深交所股票列表
def get_stock_list():cache_file_path = "./sotck_file.xlsx"url = "https://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1110&TABKEY=tab1"response = requests.get(url)open(cache_file_path, "wb").write(response.content)file = pd.read_excel(cache_file_path, dtype={'A股代码': str})stocks = []for index, row in file.iterrows():# 处理每一行的数据stocks.append(row)os.remove(cache_file_path)return stocks
爬虫模块向控制模块返回数据后由控制模块作格式统一处理:
import A_SH_basic
from MySQL import ExecInsert, ExecSelect
import A_SZ_basic
from Tools import CustomExceptionLIST = "L" #上市状态:上市
DELISTED = "D"#上市状态:退市
PAUSED = "P" #上市状态:暂停上市
SZSE = "SZSE" #交易所:深交所
SSE = "SSE" #交易所:上交所#更新A股股票列表
def update_A_stock_list(SZ=False,SH=False,BJ=False):database = "stock_a"if SZ == True:# 执行更新前先读取历史数据,若某条待更新数据与历史数据一致则跳过该条数据select_sql = "select * from stock_list where exchange = 'SZSE'"insert_sql = "insert into stock_list(stock_code,stock_name,province,industry,industry_2,enname,market,exchange,list_status,list_date,delist_date,total_share,float_share) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"update_sql = "update stock_list set stock_name=%s,province=%s,industry=%s,industry_2=%s,enname=%s,market=%s,exchange=%s,list_status=%s,list_date=%s,delist_date=%s,total_share=%s,float_share=%s where stock_code=%s"select_result = ExecSelect(database, select_sql) # 读取查询结果# 调用深交所爬虫读取上市股票列表stocks = A_SZ_basic.get_stock_list()insert_rows = []update_rows = []for stock in stocks:exist_flag = False# 遍历历史数据查询该股票是否在历史数据中for history in select_result:if history[0] == stock["A股代码"]:exist_flag = Truebreak# 该股票在历史数据中查得到,说明是旧股票,检查是否有需要更新的地方,有则放入update_rows等待写入,无则丢弃if exist_flag:if history[1] == stock["A股简称"] and history[2] == stock["省 份"] and history[3] == stock["所属行业"].split(" ")[0] and history[5] == stock["英文名称"] and history[6] == stock["板块"] and history[7] == SZSE and history[8] == LIST and history[9] == stock["A股上市日期"] and history[11] == stock["A股总股本"].replace(',', '') and history[12] == stock["A股流通股本"].replace(',', ''):passelse:stock_code = stock["A股代码"]stock_name = stock["A股简称"]province = stock["省 份"]industry = stock["所属行业"].split(" ")[0] # 一级行业industry_2 = None # 深交所数据无二级行业enname = stock["英文名称"]market = stock["板块"] # 市场类型(主板、创业板、科创板、CDR)exchange = SZSE # 交易所list_status = LIST # 上市状态list_date = stock["A股上市日期"]delist_date = None # 上市股暂无退市日期total_share = stock["A股总股本"].replace(',', '')float_share = stock["A股流通股本"].replace(',', '')update_rows.append((stock_name, province, industry, industry_2, enname, market, exchange,list_status, list_date, delist_date, total_share, float_share, stock_code))# 如果该股票在历史数据中查不到,说明是新股票,放入insert_rows等待写入else:stock_code = stock["A股代码"]stock_name = stock["A股简称"]province = stock["省 份"]industry = stock["所属行业"].split(" ")[0] # 一级行业industry_2 = None # 深交所数据无二级行业enname = stock["英文名称"]market = stock["板块"] # 市场类型(主板、创业板、科创板、CDR)exchange = SZSE # 交易所list_status = LIST # 上市状态list_date = stock["A股上市日期"]delist_date = None # 上市股暂无退市日期total_share = stock["A股总股本"].replace(',', '')float_share = stock["A股流通股本"].replace(',', '')insert_rows.append((stock_code, stock_name, province, industry, industry_2, enname, market, exchange,list_status, list_date, delist_date, total_share, float_share))# 写入数据库中不存在的新股票if len(insert_rows) > 0:result = ExecInsert(database, insert_sql, insert_rows)if result == 'success':print("写入深交所上市股票成功.")else:raise CustomException("写入深交所上市股票时发生数据库异常:" + result)# 更新数据库中存在的股票信息if len(update_rows) > 0:result = ExecInsert(database, update_sql, update_rows)if result == 'success':print("更新深交所上市股票成功.")else:raise CustomException("更新深交所上市股票时发生数据库异常:" + result)print("深交所上市股票更新结束.")
至此,我们实现了第一步:本地化存储两市5000多家股票的基本信息:
stock_list的字段结构如下: