import pandas as pd


def calculate_goods_covariance():
    """Return the covariance matrix of the per-period fruit sales figures.

    The sales data is hard-coded: four periods with sales for three goods
    ("苹果", "橘子", "石榴"). The text column "时期" is excluded before the
    covariance is computed.

    Returns:
        pandas.DataFrame: the 3x3 covariance matrix whose rows and columns
        are labelled with the three goods names.
    """
    # Sales per period; "时期" is a text label, the other columns are numeric.
    goods_sales_data = {
        "时期": ["一期", "二期", "三期", "四期"],
        "苹果": [15, 16, 3, 2],
        "橘子": [12, 14, 16, 18],
        "石榴": [11, 8, 7, 1],
    }
    goods_dataframe = pd.DataFrame(goods_sales_data)
    # Keep only the numeric columns: covariance is undefined for text labels.
    numerical_columns = goods_dataframe.select_dtypes(include=['number'])
    # DataFrame.cov() normalises by n - 1 (sample covariance) by default.
    covariance_matrix = numerical_columns.cov()
    return covariance_matrix


# Compute the covariance matrix and print it.
result = calculate_goods_covariance()
print(result)
苹果 橘子 石榴
苹果 56.666667 -17.333333 24.333333
橘子 -17.333333 6.666667 -10.333333
石榴 24.333333 -10.333333 17.583333
import pandas as pd# 示例数据
# Two columns of equal length; y holds the values of x in reverse order.
data = dict(
    x=[1, 2, 3, 4, 5],
    y=[5, 4, 3, 2, 1],
)
df = pd.DataFrame(data)
# Covariance between column x and column y, computed by pandas.
result = df['x'].cov(other=df['y'])
print("使用 Pandas 计算的协方差:", result)
# Manual derivation of the same value:
# mean of x = (1 + 2 + 3 + 4 + 5) / 5 = 3
# mean of y = (5 + 4 + 3 + 2 + 1) / 5 = 3
# w = (1-3)*(5-3) + (2-3)*(4-3) + (3-3)*(3-3) + (4-3)*(2-3) + (5-3)*(1-3)
# covariance = w / (n - 1), with n = 5
# = ((1-3)*(5-3) + (2-3)*(4-3) + (3-3)*(3-3) + (4-3)*(2-3) + (5-3)*(1-3)) / 4
# => -2.5
样本协方差公式:
$$\mathrm{Cov}(X,Y) = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})$$
总体协方差公式:
$$\mathrm{Cov}(X,Y) = \frac{1}{n} \sum_{i=1}^{n} (x_i - \mu_x)(y_i - \mu_y)$$
样本协方差公式
样本协方差是衡量两个变量之间相关性的统计量,其公式为:
$$s_{XY} = \frac{1}{n-1} \sum_{i=1}^{n} (X_i - \bar{X})(Y_i - \bar{Y})$$
其中:
- $X_i$ 和 $Y_i$ 是样本数据点。
- $\bar{X}$ 和 $\bar{Y}$ 分别是样本 $X$ 和 $Y$ 的平均值。
- $n$ 是样本数据点的数量。
- $s_{XY}$ 是样本协方差。
样本协方差的意义:
- 正协方差表示两个变量倾向于同方向变动。
- 负协方差表示两个变量倾向于反方向变动。
- 协方差接近零表示两个变量之间无线性相关关系。
from pandas import DataFrame
import numpy as np
# Per-period sales: "时期" is a text label, the three goods columns are numeric.
paints = {
    "时期": ["一期", "二期", "三期", "四期"],
    "苹果": [15, 16, 3, 2],
    "橘子": [12, 14, 16, 18],
    "石榴": [11, 8, 7, 1],
}
goods_in = DataFrame(data=paints)
# Restrict to the numeric columns so the text label is left out.
numerical_columns = goods_in.select_dtypes(include='number')
# Pairwise Pearson correlation between the goods columns.
goods_sum = numerical_columns.corr(method="pearson")
print(goods_sum)
苹果 橘子 石榴
苹果 1.000000 -0.891793 0.770881
橘子 -0.891793 1.000000 -0.954411
石榴 0.770881 -0.954411 1.000000
from pandas import DataFrame
# Catalogue of paintings: title, reserve price and auction mark-up.
paints = {
    "字画名称": ["旭日东升", "富水长流", "招财进宝", "鸿运当头"],
    "字画底价": [2860, 498, 1068, 598],
    "字画拍卖加价": [1000, 2000, 500, 1500],
}
# Label the rows explicitly instead of using the default integer index.
goods_in = DataFrame(
    data=paints,
    index=["第一幅", "第二幅", "第三幅", "第四幅"],
)
# Write the table to a CSV file in the current working directory.
goods_in.to_csv("paint.csv")
from pandas import DataFrame
import numpy as np
# Masked phone numbers, one per row.
paints = {
    "电话号码": [
        "138xxxx1111",
        "189xxxx1111",
        "139xxxx1111",
        "130xxxx1111",
        "131xxxx1111",
    ],
}
goods_in = DataFrame(data=paints)
# The first three characters of each number become a new
# "carrier prefix" column.
goods_in["运营商前缀"] = goods_in["电话号码"].str[:3]
print(goods_in)
电话号码 运营商前缀
0 138xxxx1111 138
1 189xxxx1111 189
2 139xxxx1111 139
3 130xxxx1111 130
4 131xxxx1111 131
import pandas as pd
from pandas import DataFrame
# Two tables describing the same members: login credentials and profile data.
# Both carry a "会员Id" column, which links their rows one-to-one.
login = {
    "会员Id": [110, 111, 112, 113],
    "会员名称": ["刘一", "赵二", "薛三", "陆四"],
    "会员密码": ["admin", "123456", "000000", "888888"],
}
info = {
    "会员Id": [110, 111, 112, 113],
    "会员地址": ["北京朝阳", "北京丰台", "北京大兴", "河北廊坊"],
    "会员会费": [250, 360, 470, 550],
}
login_member = DataFrame(data=login, index=[1, 2, 3, 4])
member_info = DataFrame(data=info, index=[1, 2, 3, 4])
# merge() combines the two frames; on= names "会员Id" as the key column
# used to match rows.
member = login_member.merge(member_info, on="会员Id")
print(member)
会员Id 会员名称 会员密码 会员地址 会员会费
0 110 刘一 admin 北京朝阳 250
1 111 赵二 123456 北京丰台 360
2 112 薛三 000000 北京大兴 470
3 113 陆四 888888 河北廊坊 550
import pandas as pd
from pandas import DataFrame
# The key column has a different name in each table, so the merge must name
# the left and right key columns separately.
login = {
    "会员Number号码": [110, 111, 112, 113],
    "会员名称": ["刘一", "赵二", "薛三", "陆四"],
    "会员密码": ["admin", "123456", "000000", "888888"],
}
info = {
    "会员Card": [110, 111, 112, 113],
    "会员地址": ["北京朝阳", "北京丰台", "北京大兴", "河北廊坊"],
    "会员会费": [250, 360, 470, 550],
}
login_member = DataFrame(data=login, index=[1, 2, 3, 4])
member_info = DataFrame(data=info, index=[1, 2, 3, 4])
# Both key columns are kept in the merged result.
member = login_member.merge(
    member_info,
    left_on="会员Number号码",
    right_on="会员Card",
)
print(member)
会员Number号码 会员名称 会员密码 会员Card 会员地址 会员会费
0 110 刘一 admin 110 北京朝阳 250
1 111 赵二 123456 111 北京丰台 360
2 112 薛三 000000 112 北京大兴 470
3 113 陆四 888888 113 河北廊坊 550
import pandas as pd
from pandas import DataFrame
# The key values no longer match fully: the login table has 114 where the
# info table has 113. merge() defaults to an inner join, so only the keys
# present in both tables (110, 111, 112) appear in the result.
login = {
    "会员Number号码": [110, 111, 112, 114],
    "会员名称": ["刘一", "赵二", "薛三", "陆四"],
    "会员密码": ["admin", "123456", "000000", "888888"],
}
info = {
    "会员Card": [110, 111, 112, 113],
    "会员地址": ["北京朝阳", "北京丰台", "北京大兴", "河北廊坊"],
    "会员会费": [250, 360, 470, 550],
}
login_member = DataFrame(login, index=[1, 2, 3, 4])
member_info = DataFrame(info, index=[1, 2, 3, 4])
# The right key is written as one literal; the original split it into two
# adjacent string literals, which only worked through implicit concatenation.
member = pd.merge(
    login_member,
    member_info,
    left_on="会员Number号码",
    right_on="会员Card",
)
print(member)
会员Number号码 会员名称 会员密码 会员Card 会员地址 会员会费
0 110 刘一 admin 110 北京朝阳 250
1 111 赵二 123456 111 北京丰台 360
2 112 薛三 000000 112 北京大兴 470
$$\text{Cov}(X, Y) = \frac{1}{n-1} \sum_{i=1}^{n} (X_i - \bar{X})(Y_i - \bar{Y})$$
import pandas as pd
from pandas import DataFrame
# Same mismatched keys as before (114 on the left, 113 on the right), but
# merged with how="outer" so that unmatched rows from both sides are kept.
login = {
    "会员Number号码": [110, 111, 112, 114],
    "会员名称": ["刘一", "赵二", "薛三", "陆四"],
    "会员密码": ["admin", "123456", "000000", "888888"],
}
info = {
    "会员Card": [110, 111, 112, 113],
    "会员地址": ["北京朝阳", "北京丰台", "北京大兴", "河北廊坊"],
    "会员会费": [250, 360, 470, 550],
}
login_member = DataFrame(data=login, index=[1, 2, 3, 4])
member_info = DataFrame(data=info, index=[1, 2, 3, 4])
# Cells with no counterpart on the other side are filled with NaN.
member = login_member.merge(
    member_info,
    how="outer",
    left_on="会员Number号码",
    right_on="会员Card",
)
print(member)
会员Number号码 会员名称 会员密码 会员Card 会员地址 会员会费
0 110.0 刘一 admin 110.0 北京朝阳 250.0
1 111.0 赵二 123456 111.0 北京丰台 360.0
2 112.0 薛三 000000 112.0 北京大兴 470.0
3 NaN NaN NaN 113.0 河北廊坊 550.0
4 114.0 陆四 888888 NaN NaN NaN
import pandas as pd
from pandas import DataFrame
# Neither table has a key column here; rows are matched on the row index.
login = {
    "会员名称": ["刘一", "赵二", "薛三", "陆四"],
    "会员密码": ["admin", "123456", "000000", "888888"],
}
info = {
    "会员地址": ["北京朝阳", "北京丰台", "北京大兴", "河北廊坊"],
    "会员会费": [250, 360, 470, 550],
}
login_member = DataFrame(data=login, index=[1, 2, 3, 4])
member_info = DataFrame(data=info, index=[1, 2, 3, 4])
# left_index/right_index tell merge() to use each frame's index as its key.
member = login_member.merge(member_info, left_index=True, right_index=True)
print(member)
会员名称 会员密码 会员地址 会员会费
1 刘一 admin 北京朝阳 250
2 赵二 123456 北京丰台 360
3 薛三 000000 北京大兴 470
4 陆四 888888 河北廊坊 550
import pandas as pd
from pandas import DataFrame
# The two tables have different row labels: the left one ends with 5,
# the right one with 4.
login = {
    "会员名称": ["刘一", "赵二", "薛三", "陆四"],
    "会员密码": ["admin", "123456", "000000", "888888"],
}
info = {
    "会员地址": ["北京朝阳", "北京丰台", "北京大兴", "河北廊坊"],
    "会员会费": [250, 360, 470, 550],
}
login_member = DataFrame(data=login, index=[1, 2, 3, 5])
member_info = DataFrame(data=info, index=[1, 2, 3, 4])
# join() aligns on the index and keeps the left frame's rows (a left join),
# so label 5 stays with NaN in the right-hand columns and label 4 is dropped.
member = login_member.join(member_info, how="left")
print(member)
会员名称 会员密码 会员地址 会员会费
1 刘一 admin 北京朝阳 250.0
2 赵二 123456 北京丰台 360.0
3 薛三 000000 北京大兴 470.0
5 陆四 888888 NaN NaN
import pandas as pd
from pandas import Series
# Three Series with disjoint labels.
member1 = Series(data=[1, 350], index=["会员级别", "会员最低消费"])
member2 = Series(
    data=[2, 100, 10],
    index=["会员购买产品次数", "会员卡最低存额", "会员活动次数"],
)
member3 = Series(data=[2], index=["会员推荐人数"])
# Stacked end to end along the index (axis 0), giving one longer Series.
parts = [member1, member2, member3]
member = pd.concat(parts, axis=0)
print(member)
会员级别 1
会员最低消费 350
会员购买产品次数 2
会员卡最低存额 100
会员活动次数 10
会员推荐人数 2
dtype: int64
import pandas as pd
from pandas import Series
# Three Series with disjoint labels.
member1 = Series(data=[1, 350], index=["会员级别", "会员最低消费"])
member2 = Series(
    data=[2, 100, 10],
    index=["会员购买产品次数", "会员卡最低存额", "会员活动次数"],
)
member3 = Series(data=[2], index=["会员推荐人数"])
# With axis=1 the result is a DataFrame: each Series becomes one column, and
# every position that has no data shows NaN. concat() acts as a union here,
# joining all the distinct labels together. Because the Series do not overlap
# on the other axis, the row index is the ordered union (outer join) of the
# three indexes.
member = pd.concat([member1, member2, member3], axis="columns")
print(member)
0 1 2
会员级别 1.0 NaN NaN
会员最低消费 350.0 NaN NaN
会员购买产品次数 NaN 2.0 NaN
会员卡最低存额 NaN 100.0 NaN
会员活动次数 NaN 10.0 NaN
会员推荐人数 NaN NaN 2.0
import pandas as pd
from pandas import Series
# Three Series whose labels overlap only partly; "会员级别" is the one label
# present in all of them.
member1 = Series(data=[1, 350], index=["会员级别", "会员最低消费"])
member2 = Series(
    data=[1, 100, 10, 2],
    index=["会员级别", "会员卡最低存额", "会员活动次数", "会员推荐人数"],
)
member3 = Series(
    data=[1, 350, 2],
    index=["会员级别", "会员最低消费", "会员推荐人数"],
)
# join="inner" keeps only the labels shared by every input.
member = pd.concat([member1, member2, member3], axis="columns", join="inner")
print(member)
0 1 2
会员级别 1 1 1
# groupby() splits the DataFrame of member level and member spending into
# groups keyed by "会员级别". sum() then totals "会员消费情况" within each
# group, giving the total spending for each member level.
import pandas as pd
from pandas import DataFrame
member = DataFrame({
    "会员级别": [1, 2, 5, 3, 1, 1, 2, 5, 2, 3, 1, 1, 2, 3, 5, 4],
    "会员消费情况": [100, 500, 2500, 1427, 90, 90, 490, 2498, 486, 1315, 89, 97, 490, 1489, 2389, 1900],
})
member_group = member.groupby("会员级别").sum()
print(member_group)
会员消费情况
会员级别
1 466
2 1966
3 4231
4 1900
5 7387
import pandas as pd
from pandas import DataFrame
# One row per purchase record: member level, amount spent, activities joined.
member = DataFrame(
    data={
        "会员级别": [1, 2, 5, 3, 1, 1, 2, 5, 2, 3, 1, 1, 2, 3, 5, 4],
        "会员消费情况": [100, 500, 2500, 1427, 90, 90, 490, 2498, 486, 1315, 89, 97, 490, 1489, 2389, 1900],
        "会员参与活动数目": [1, 3, 10, 5, 3, 3, 6, 8, 4, 2, 3, 3, 6, 5, 4, 1],
    }
)
# Group by the (level, activity count) pair and total the spending per pair;
# the result is indexed by both keys.
member_group = member.groupby(by=["会员级别", "会员参与活动数目"]).sum()
print(member_group)
               会员消费情况
会员级别 会员参与活动数目
1    1            100
     3            366
2    3            500
     4            486
     6            980
3    2           1315
     5           2916
4    1           1900
5    4           2389
     8           2498
     10          2500
import pandas as pd
import numpy as np
from pandas import DataFrame
# One row per purchase record: member level, amount spent, activities joined.
member = DataFrame({
    "会员级别": [1, 2, 5, 3, 1, 1, 2, 5, 2, 3, 1, 1, 2, 3, 5, 4],
    "会员消费情况": [100, 500, 2500, 1427, 90, 90, 490, 2498, 486, 1315, 89, 97, 490, 1489, 2389, 1900],
    "会员参与活动数目": [1, 3, 10, 5, 3, 3, 6, 8, 4, 2, 3, 3, 6, 5, 4, 1],
})
# For every member level compute the sum, mean and standard deviation of each
# remaining column. The aggregations are named by string rather than passed
# as NumPy callables: pandas 2.1+ emits a FutureWarning for np.sum / np.mean /
# np.std here, while the string names give the same column labels and values.
# A group with a single row (level 4) has no sample standard deviation, so
# its std is NaN.
member_group = member.groupby("会员级别").agg(["sum", "mean", "std"])
print(member_group)
       会员消费情况                        会员参与活动数目
        sum         mean        std      sum      mean       std
会员级别
1 466 93.200000 4.969909 13 2.600000 0.894427
2 1966 491.500000 5.972158 19 4.750000 1.500000
3 4231 1410.333333 88.189191 12 4.000000 1.732051
4 1900 1900.000000 NaN 1 1.000000 NaN
5 7387 2462.333333 63.516402 22 7.333333 3.055050
# 代码中pivot_table后面的参数,第一个参数是需要进行透视表操
# 作的DataFrame数据,第二个参数是建立透视表时以“会员级别”维度
# 作为索引,第三个参数是统计的时候的运算方法,如是求和还是求平
# 均数等。这里是求和
import pandas as pd
import numpy as np
from pandas import DataFrame
# One row per purchase record: member level, amount spent, activities joined.
member = DataFrame({
    "会员级别": [1, 2, 5, 3, 1, 1, 2, 5, 2, 3, 1, 1, 2, 3, 5, 4],
    "会员消费情况": [100, 500, 2500, 1427, 90, 90, 490, 2498, 486, 1315, 89, 97, 490, 1489, 2389, 1900],
    "会员参与活动数目": [1, 3, 10, 5, 3, 3, 6, 8, 4, 2, 3, 3, 6, 5, 4, 1],
})
# Pivot table indexed by member level, summing every other column per level.
# The aggregation is named by string rather than passed as np.sum: pandas
# 2.1+ emits a FutureWarning for the NumPy callable, while "sum" produces the
# same "sum" column level and the same totals.
member_table = pd.pivot_table(member, index=["会员级别"], aggfunc=["sum"])
print(member_table)
          sum
     会员参与活动数目 会员消费情况
会员级别
1 13 466
2 19 1966
3 12 4231
4 1 1900
5 22 7387