记录最近一次爬虫和数据可视化练习
0、数据获取
import pandas as pd

# Fiscal years to fetch, one income-statement page per year.
year = [2018, 2019, 2020, 2021, 2022]

# Browser-like request header.
# NOTE(review): this dict is never passed to anything below; the
# ``header=None`` argument of ``pd.read_html`` is the table header-row
# option, not HTTP headers. Confirm whether it was meant to be used.
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36 Edg/111.0.1661.44"
}

for i in year:
    # NOTE(review): the URL is blank in this listing; it must be filled in
    # (presumably with a per-year address built from ``i``) before running.
    url = ""
    # The table at position 13 of the parsed page is the one used here.
    df = pd.read_html(url, header=None)[13]
    # Drop the empty row.
    df = df.drop(index=[1])
    # Drop the last column.
    df = df.drop(columns=["久其软件(002279) 利润表单位:万元.5"])
    # For every year except 2018, drop the first column as well, so it is
    # written to the CSV only once.
    if i != 2018:
        df = df.drop(columns=["久其软件(002279) 利润表单位:万元"])
    df1 = df.T
    # Append the transposed table to the CSV file. Because of mode="a",
    # re-running the script appends the same rows again.
    df1.to_csv('2019.csv', header=None, mode="a", index=None)
1、数据可视化
1.1从文件中读取数据
# Read back the CSV file written by the scraping step.
all_data = pd.read_csv("2019.csv")
# all_data.duplicated().sum() # count duplicated rows
# all_data.drop_duplicates(inplace=True)
# all_data.isnull().sum() # count missing values
# all_data.fillna(0, inplace=True)
# all_data.reset_index(drop=True, inplace=True) # reset the index
1.2绘图
这里使用 pyecharts 进行绘制,具体用法可参考其官方文档和官方示例(demo)。
绘制折柱混合图
import pyecharts
from pyecharts.charts import Line, Bar, Grid, WordCloud
import pyecharts.options as opts

# The data-preparation step is omitted in this listing; ``year_income`` and
# ``year_fee`` are expected to be defined by that omitted code, and ``year``
# by the scraping step.

# Bar chart of operating income per year.
bar1 = (
    Bar()
    .add_xaxis(year)
    .add_yaxis("营业收入", year_income, bar_width=50)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="营业收入", subtitle="单位(元)"),
        yaxis_opts=opts.AxisOpts(min_=200000, max_=400000),
        legend_opts=opts.LegendOpts(is_show=False),
    )
)
# Red line over the same income data, to be overlaid on bar1.
line1 = (
    Line()
    .add_xaxis(year)
    .add_yaxis(
        "营业收入",
        year_income,
        linestyle_opts=opts.LineStyleOpts(color="red", width=4),
    )
)

# Bar chart of operating expenses per year; its title is shifted right so it
# sits above the second grid cell.
bar2 = (
    Bar()
    .add_xaxis(year)
    .add_yaxis("营业费用", year_fee, bar_width=50)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="营业费用", subtitle="单位(元)", pos_left="48%"),
        yaxis_opts=opts.AxisOpts(min_=250000, max_=400000),
        legend_opts=opts.LegendOpts(is_show=False),
    )
)
# Red line over the same expense data, to be overlaid on bar2.
line2 = (
    Line()
    .add_xaxis(year)
    .add_yaxis(
        "营业费用",
        year_fee,
        linestyle_opts=opts.LineStyleOpts(color="red", width=4),
    )
)

# Combine each bar chart with its line chart.
overlap1 = bar1.overlap(line1)
overlap2 = bar2.overlap(line2)

# Place the two combined charts side by side. Rendering is a separate step so
# that ``grid`` refers to the Grid chart rather than to the return value of
# ``render``.
grid = (
    Grid(init_opts=opts.InitOpts(width="1200px", height="800px", page_title="营收与利润"))
    .add(overlap1, grid_opts=opts.GridOpts(pos_right="58%"))
    .add(overlap2, grid_opts=opts.GridOpts(pos_left="58%"))
)
grid.render('1.html')
绘制标签切换图
# Plotting
# ``Tab`` is used below; import it here so the snippet does not fail with a
# NameError.
from pyecharts.charts import Tab


def create_line(profit):
    """Build a line chart of per-quarter profit from cumulative figures.

    Args:
        profit: Mapping of quarter label to cumulative profit. Its iteration
            order is reversed before differencing, so it is assumed to be
            listed latest quarter first (TODO confirm against the data).

    Returns:
        A pyecharts ``Line`` chart with the quarter labels on the x axis and
        the profit of each individual quarter on the y axis.
    """
    # Reverse both sequences so that they run in the order used for plotting.
    season = list(profit)[::-1]
    accu_money = [profit[key] for key in profit][::-1]  # cumulative profit

    # Profit per quarter: the first cumulative value as-is, then the
    # difference between each pair of consecutive cumulative values. This
    # works for any number of quarters, not only four.
    money = accu_money[:1] + [
        current - previous
        for previous, current in zip(accu_money, accu_money[1:])
    ]

    line = (
        Line()
        .add_xaxis(season)
        .add_yaxis("", money)
        .set_global_opts(
            title_opts=opts.TitleOpts(title="季度利润", subtitle="单位(元)")
        )
    )
    return line


tab = Tab(page_title="季度利润")  # sets the title of the browser tab
tab.add(create_line(profit_2018), "2018")
tab.add(create_line(profit_2019), "2019")
tab.add(create_line(profit_2020), "2020")
tab.add(create_line(profit_2021), "2021")
tab.add(create_line(profit_2022), "2022")
tab.render("2.html")
绘制大屏
首先绘制好要展示的各个图表,再通过 Page 对象将它们整合到同一个页面中。
# Gather the charts into a single Page layout.
from pyecharts.charts import Page

# Instantiate the page with the draggable layout.
page = Page(layout=Page.DraggablePageLayout, page_title="基于Pyecharts的数据大屏")

# Add the bar/line charts first, then one quarterly-profit chart per year.
page.add(
    bar1,
    line1,
    bar2,
    line2,
    *map(
        create_line,
        (profit_2018, profit_2019, profit_2020, profit_2021, profit_2022),
    ),
)
page.render("4.html")
接着在浏览器中打开 4.html,调整各图表的大小和位置,并将布局保存为 chart_config.json 文件,然后使用该配置文件重新进行渲染:
# Re-render 4.html into 5.html using the layout stored in ./chart_config.json.
Page.save_resize_html(source="4.html", cfg_file="./chart_config.json", dest="5.html")