# 确定股票池 (Define the stock universe)
import os
import urllib
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from time import sleep

import pandas as pd
# Shanghai candidate codes: every integer from 600000 through 605999, as text.
shanghaicode = [str(code) for code in range(600000, 606000)]

# Shenzhen candidate codes: 000000 through 004999, zero-padded to six digits
# so the leading zeros survive the conversion to text.
shenzhencode = [f"{code:06d}" for code in range(5000)]
# 爬取数据 (Download the data)
def get_data(num, out_dir='利润表_multi/', max_retries=5):
    """Download the income-statement CSV for one stock code.

    The file is saved as ``<out_dir>/<num>.csv``. If that file already exists
    nothing is requested from the network.

    Args:
        num: Stock code (string or int); it is inserted into the URL and the
            output file name via ``str(num)``.
        out_dir: Directory that receives the CSV. Created when missing.
        max_retries: Maximum number of download attempts before giving up on
            transient errors (timeouts, connection failures, non-404 HTTP
            errors).

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    path = os.path.join(out_dir, str(num) + '.csv')

    # Check the cache before touching the network, so an existing file costs
    # no request at all.
    if os.path.exists(path):
        print(path + " already existed!!!")
        return

    # A missing output directory would otherwise make every write fail.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    url = 'http://quotes.money.163.com/service/lrb_' + str(num) + '.html'
    for attempt in range(max_retries):
        if attempt:
            # Brief pause between retries instead of hammering the server.
            sleep(1)
        try:
            with urllib.request.urlopen(url, timeout=2) as response:
                content = response.read()
        except urllib.error.HTTPError as e:
            if e.code == 404:
                # The code does not exist on the server; retrying cannot help.
                print(f"{num} : {e}")
                return
            print(e)
        except OSError as e:
            # URLError, socket timeouts and connection resets are all OSError.
            print(e)
        else:
            with open(path, 'wb') as f:
                f.write(content)
            print(num)
            # Throttle: keep each worker to roughly one download per second.
            sleep(1)
            return

    print(f"{num} : giving up after {max_retries} attempts")
# 多线程运作 (Run the downloads on multiple threads)
# Download the Shenzhen list first, then the Shanghai list, each on its own
# pool of 10 worker threads. Leaving the ``with`` block waits for every
# submitted download and shuts the pool down even if an error interrupts
# the script. The iterator returned by ``map`` is deliberately not consumed:
# get_data reports its own progress and returns nothing.
for codes in (shenzhencode, shanghaicode):
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(get_data, codes)
# 读取本地数据 (Read the local data)
def generatefile(path):
    """Return the names of the files located directly inside ``path``.

    Only the top-level directory is listed. ``os.walk`` yields the top
    directory first, so taking its first result ignores any subdirectories,
    whose file names could not be joined onto ``path`` by the caller anyway.

    Args:
        path: Directory to list.

    Returns:
        A list of file names (without directory part); an empty list when
        ``path`` does not exist or holds no files.
    """
    # os.walk yields nothing for a missing directory, hence the default.
    _dirpath, _dirnames, filenames = next(os.walk(path), (path, [], []))
    return filenames
def _passes_growth_screen(profits, ratio, periods=4):
    """Tell whether each of the ``periods`` most recent steps grew by ``ratio``.

    Args:
        profits: Profit figures ordered newest first.
        ratio: Minimum relative growth required for every step (0.3 = 30%).
        periods: Number of consecutive steps that must satisfy ``ratio``.

    Returns:
        True when ``profits`` holds at least ``periods + 1`` values and every
        one of the first ``periods`` steps has a positive base and a relative
        growth of at least ``ratio``; False otherwise.
    """
    if len(profits) < periods + 1:
        return False
    for newer, older in zip(profits[:periods], profits[1:periods + 1]):
        # A zero or negative base makes the relative change meaningless:
        # dividing by a loss flips the sign, dividing by zero yields inf.
        if older <= 0:
            return False
        if (newer - older) / older < ratio:
            return False
    return True


datapath = '利润表_multi/'
datalist = generatefile(datapath)
ratio = 0.3  # minimum growth per step required by the screen
invest = []
for data in datalist:
    # Per-file boundary: any failure (unreadable file, missing column,
    # non-numeric profit value) is reported and the file is skipped.
    try:
        raw = pd.read_csv(datapath + data, encoding='gbk', header=None)

        # Transpose, then promote the first transposed row to column labels.
        table = raw.T
        table.columns = table.loc[0]
        # Drop the label row and the final row.
        # NOTE(review): the final row is presumably an empty trailing column
        # in the downloaded file - confirm against real data.
        table = table.iloc[1:-1]

        profit = table['净利润(万元)'].astype(int)
        # NOTE(review): the original had a date-conversion step commented
        # out here, so values are summed per distinct '报告日期' value as
        # stored in the file - confirm that this is the intended period.
        per_period = profit.groupby(table['报告日期']).sum()
        # Drop the last group (the original comment says this removes 2021).
        per_period = per_period.iloc[:-1]

        # Reverse so the most recent period comes first.
        newest_first = per_period.to_numpy()[::-1]
        if _passes_growth_screen(newest_first, ratio):
            invest.append(data)
    except Exception as e:
        print(data + f':{e}')

invest = pd.DataFrame(invest)
# NOTE(review): recent pandas releases may no longer ship a writer for the
# legacy .xls format; if this call fails, switch to an .xlsx file name.
invest.to_excel('连续4年增长30%.xls')