Quick Start
1
import numpy as np
import pandas as pd
Series
s=pd.Series([1,3,5,np.nan,8,4])
s
Out[6]:
0    1.0
1    3.0
2    5.0
3    NaN
4    8.0
5    4.0
dtype: float64
date_range
dates=pd.date_range('20190301',periods=6)
dates
Out[10]:
DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
               '2019-03-05', '2019-03-06'],
              dtype='datetime64[ns]', freq='D')
DataFrame: a 2-D array with an index
data=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
data
Out[14]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-06 -0.286186  0.768972  0.755786  0.151190
data.shape
Out[15]: (6, 4)
data.values
Out[16]:
array([[ 1.86621859, -1.0030566 ,  2.52689932, -0.56334339],
       [-1.41362647,  1.11672695, -0.47005354, -0.56793016],
       [ 0.11220208, -0.62181257, -0.65821206, -0.53731156],
       [-0.5678393 ,  2.17593279,  1.12604991, -0.412436  ],
       [-1.85478576,  0.98009218,  0.79754332, -0.43238061],
       [-0.28618627,  0.7689724 ,  0.75578607,  0.15118955]])
Dict with a Timestamp
d={'A':1,'B':pd.Timestamp('20130301'),'C':range(4),'D':np.arange(4)}
d
Out[19]:
{'A': 1,
 'B': Timestamp('2013-03-01 00:00:00'),
 'C': range(0, 4),
 'D': array([0, 1, 2, 3])}
Constructing a DataFrame from the dict
df=pd.DataFrame(d)
df
Out[21]:
   A          B  C  D
0  1 2013-03-01  0  0
1  1 2013-03-01  1  1
2  1 2013-03-01  2  2
3  1 2013-03-01  3  3
Viewing, sorting, and modifying data
data.head(2)
Out[22]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
data.tail()
Out[23]:
                   A         B         C         D
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-06 -0.286186  0.768972  0.755786  0.151190
data.index
Out[24]:
DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
               '2019-03-05', '2019-03-06'],
              dtype='datetime64[ns]', freq='D')
data.columns
Out[25]: Index(['A', 'B', 'C', 'D'], dtype='object')
data.describe()
Out[26]:
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.357336  0.569476  0.679669 -0.393702
std    1.309362  1.181577  1.161292  0.275140
min   -1.854786 -1.003057 -0.658212 -0.567930
25%   -1.202180 -0.274116 -0.163594 -0.556835
50%   -0.427013  0.874532  0.776665 -0.484846
75%    0.012605  1.082568  1.043923 -0.417422
max    1.866219  2.175933  2.526899  0.151190
data.T
Out[27]:
   2019-03-01  2019-03-02  2019-03-03  2019-03-04  2019-03-05  2019-03-06
A    1.866219   -1.413626    0.112202   -0.567839   -1.854786   -0.286186
B   -1.003057    1.116727   -0.621813    2.175933    0.980092    0.768972
C    2.526899   -0.470054   -0.658212    1.126050    0.797543    0.755786
D   -0.563343   -0.567930   -0.537312   -0.412436   -0.432381    0.151190
Sorting by column label:
data.sort_index(axis=1)
Out[28]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-06 -0.286186  0.768972  0.755786  0.151190
data.sort_index(axis=1,ascending=False)
Out[29]:
                   D         C         B         A
2019-03-01 -0.563343  2.526899 -1.003057  1.866219
2019-03-02 -0.567930 -0.470054  1.116727 -1.413626
2019-03-03 -0.537312 -0.658212 -0.621813  0.112202
2019-03-04 -0.412436  1.126050  2.175933 -0.567839
2019-03-05 -0.432381  0.797543  0.980092 -1.854786
2019-03-06  0.151190  0.755786  0.768972 -0.286186
data.sort_index(axis=1,ascending=True)   # same result as the default ascending order above
data.sort_values(by='A')
Out[31]:
                   A         B         C         D
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-06 -0.286186  0.768972  0.755786  0.151190
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
data.A
Out[32]:
2019-03-01    1.866219
2019-03-02   -1.413626
2019-03-03    0.112202
2019-03-04   -0.567839
2019-03-05   -1.854786
2019-03-06   -0.286186
Freq: D, Name: A, dtype: float64
data[2:4]
Out[33]:
                   A         B         C         D
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
data['20190302':'20190305']
Out[34]:
                   A         B         C         D
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
loc is relatively efficient:
data.loc['20190302':'20190304']
Out[35]:
                   A         B         C         D
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
data.iloc[2:4]
Out[36]:
                   A         B         C         D
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
data.loc[:,['B','C']]
Out[37]:
                   B         C
2019-03-01 -1.003057  2.526899
2019-03-02  1.116727 -0.470054
2019-03-03 -0.621813 -0.658212
2019-03-04  2.175933  1.126050
2019-03-05  0.980092  0.797543
2019-03-06  0.768972  0.755786
at is even more efficient for accessing a single value:
data.at[pd.Timestamp('20190302'),'B']
Out[38]: 1.116726953479249
data.iat[1,1]
Out[39]: 1.116726953479249
Boolean indexing:
data[data.A>0]
Out[40]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
data[data>0]
Out[41]:
                   A         B         C        D
2019-03-01  1.866219       NaN  2.526899      NaN
2019-03-02       NaN  1.116727       NaN      NaN
2019-03-03  0.112202       NaN       NaN      NaN
2019-03-04       NaN  2.175933  1.126050      NaN
2019-03-05       NaN  0.980092  0.797543      NaN
2019-03-06       NaN  0.768972  0.755786  0.15119
data2=data.copy()
data2
Out[43]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-06 -0.286186  0.768972  0.755786  0.151190
tag=['a']*2+['b']*2+['c']*2
data2['TAG']=tag
data2
Out[46]:
                   A         B         C         D TAG
2019-03-01  1.866219 -1.003057  2.526899 -0.563343   a
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930   a
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312   b
2019-03-04 -0.567839  2.175933  1.126050 -0.412436   b
2019-03-05 -1.854786  0.980092  0.797543 -0.432381   c
2019-03-06 -0.286186  0.768972  0.755786  0.151190   c
data2[data2.TAG.isin(['a','c'])]
Out[48]:
                   A         B         C         D TAG
2019-03-01  1.866219 -1.003057  2.526899 -0.563343   a
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930   a
2019-03-05 -1.854786  0.980092  0.797543 -0.432381   c
2019-03-06 -0.286186  0.768972  0.755786  0.151190   c
data.iat[0,0]=100
data
Out[50]:
                     A         B         C         D
2019-03-01  100.000000 -1.003057  2.526899 -0.563343
2019-03-02   -1.413626  1.116727 -0.470054 -0.567930
2019-03-03    0.112202 -0.621813 -0.658212 -0.537312
2019-03-04   -0.567839  2.175933  1.126050 -0.412436
2019-03-05   -1.854786  0.980092  0.797543 -0.432381
2019-03-06   -0.286186  0.768972  0.755786  0.151190
data.A=range(6)
data
Out[53]:
            A         B         C         D
2019-03-01  0 -1.003057  2.526899 -0.563343
2019-03-02  1  1.116727 -0.470054 -0.567930
2019-03-03  2 -0.621813 -0.658212 -0.537312
2019-03-04  3  2.175933  1.126050 -0.412436
2019-03-05  4  0.980092  0.797543 -0.432381
2019-03-06  5  0.768972  0.755786  0.151190
2
import matplotlib.pyplot as plt
import numpy as np     # matrix operations
import pandas as pd
dates=pd.date_range('20190301',periods=6)
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df
Out[6]:
                   A         B         C         D
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108
2019-03-02  1.557134 -1.545051 -0.961491  0.515089
2019-03-03  1.331631  0.714138  1.231407 -0.422883
2019-03-04  0.719223 -0.260048  0.523958  1.172832
2019-03-05  0.293174 -0.045853  1.150185  0.631227
2019-03-06 -1.177185 -0.906234  2.267641 -0.571508
Handling missing data
df1=df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1
Out[8]:
                   A         B         C         D   E
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108 NaN
2019-03-02  1.557134 -1.545051 -0.961491  0.515089 NaN
2019-03-03  1.331631  0.714138  1.231407 -0.422883 NaN
2019-03-04  0.719223 -0.260048  0.523958  1.172832 NaN
df1.loc[dates[1:3],'E']=2
df1
Out[10]:
                   A         B         C         D    E
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108  NaN
2019-03-02  1.557134 -1.545051 -0.961491  0.515089  2.0
2019-03-03  1.331631  0.714138  1.231407 -0.422883  2.0
2019-03-04  0.719223 -0.260048  0.523958  1.172832  NaN
Drop rows with missing data:
df1.dropna()
Out[11]:
                   A         B         C         D    E
2019-03-02  1.557134 -1.545051 -0.961491  0.515089  2.0
2019-03-03  1.331631  0.714138  1.231407 -0.422883  2.0
Fill missing data:
df1.fillna(value=5)
Out[12]:
                   A         B         C         D    E
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108  5.0
2019-03-02  1.557134 -1.545051 -0.961491  0.515089  2.0
2019-03-03  1.331631  0.714138  1.231407 -0.422883  2.0
2019-03-04  0.719223 -0.260048  0.523958  1.172832  5.0
Check whether any data is missing:
pd.isnull(df1)
Out[13]:
                A      B      C      D      E
2019-03-01  False  False  False  False   True
2019-03-02  False  False  False  False  False
2019-03-03  False  False  False  False  False
2019-03-04  False  False  False  False   True
pd.isnull(df1).any()
Out[14]:
A    False
B    False
C    False
D    False
E     True
dtype: bool
pd.isnull(df1).any().any()
Out[15]: True
shift pushes the values down along the index:
s=pd.Series((1,3,5,np.nan,6,8),index=dates).shift(2)
s
Out[17]:
2019-03-01    NaN
2019-03-02    NaN
2019-03-03    1.0
2019-03-04    3.0
2019-03-05    5.0
2019-03-06    NaN
Freq: D, dtype: float64
df
Out[18]:
                   A         B         C         D
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108
2019-03-02  1.557134 -1.545051 -0.961491  0.515089
2019-03-03  1.331631  0.714138  1.231407 -0.422883
2019-03-04  0.719223 -0.260048  0.523958  1.172832
2019-03-05  0.293174 -0.045853  1.150185  0.631227
2019-03-06 -1.177185 -0.906234  2.267641 -0.571508
df.sub(s,axis='index')
Out[19]:
                   A         B         C         D
2019-03-01       NaN       NaN       NaN       NaN
2019-03-02       NaN       NaN       NaN       NaN
2019-03-03  0.331631 -0.285862  0.231407 -1.422883
2019-03-04 -2.280777 -3.260048 -2.476042 -1.827168
2019-03-05 -4.706826 -5.045853 -3.849815 -4.368773
2019-03-06       NaN       NaN       NaN       NaN
apply: pass a function in:
df.apply(np.cumsum)
Out[22]:
                   A         B         C         D
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108
2019-03-02 -0.359533 -1.836306 -1.695351  2.949196
2019-03-03  0.972098 -1.122168 -0.463944  2.526314
2019-03-04  1.691322 -1.382216  0.060014  3.699146
2019-03-05  1.984496 -1.428069  1.210199  4.330373
2019-03-06  0.807311 -2.334303  3.477840  3.758864
df.apply(lambda x:x.max()-x.min())
Out[25]:
A    3.473800
B    2.259189
C    3.229132
D    3.005616
dtype: float64
Data operations
s=pd.Series(np.random.randint(10,20,size=20))
s
Out[29]:
0     13
1     13
2     10
3     16
4     10
5     19
6     13
7     14
8     13
9     13
10    18
11    10
12    13
13    16
14    18
15    16
16    16
17    14
18    16
19    13
dtype: int32
s.value_counts()
Out[31]:
13    7
16    5
10    3
18    2
14    2
19    1
dtype: int64
s.mode()
Out[33]:
0    13
dtype: int32
Merging data
df=pd.DataFrame(np.random.randn(10,4),columns=list('ABCD'))
df
Out[35]:
          A         B         C         D
0 -0.068485 -0.731070 -1.158196 -0.952469
1 -0.776078  0.118621  0.359391 -0.427518
2  2.190398 -0.170339 -0.275725  0.184332
3  0.111006  2.263383 -1.164128 -1.653160
4  0.454094 -0.390870  0.181000 -0.713891
5  1.770307 -0.125938 -0.470755  0.073045
6  0.178509 -0.007117 -1.474438  1.280151
7 -1.074046 -1.068972  0.821342 -1.032382
8  0.767212  0.886415 -0.453329 -1.260979
9 -0.053771  2.024723 -0.387595  0.039338
df.iloc[:3]
Out[36]:
          A         B         C         D
0 -0.068485 -0.731070 -1.158196 -0.952469
1 -0.776078  0.118621  0.359391 -0.427518
2  2.190398 -0.170339 -0.275725  0.184332
df.iloc[3:7]
Out[37]:
          A         B         C         D
3  0.111006  2.263383 -1.164128 -1.653160
4  0.454094 -0.390870  0.181000 -0.713891
5  1.770307 -0.125938 -0.470755  0.073045
6  0.178509 -0.007117 -1.474438  1.280151
df.iloc[7:]
Out[38]:
          A         B         C         D
7 -1.074046 -1.068972  0.821342 -1.032382
8  0.767212  0.886415 -0.453329 -1.260979
9 -0.053771  2.024723 -0.387595  0.039338
pd.concat([df.iloc[:3],df.iloc[3:7],df.iloc[7:]])   # reassembles the original 10-row frame
Out[40]:
          A         B         C         D
0 -0.068485 -0.731070 -1.158196 -0.952469
...
9 -0.053771  2.024723 -0.387595  0.039338
left=pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})    # left and right were not defined in the notes;
right=pd.DataFrame({'key':['foo','foo'],'lval':[4,5]})   # reconstructed from the merge output below
pd.merge(left,right,on='key')
Out[45]:
   key  lval_x  lval_y
0  foo       1       4
1  foo       1       5
2  foo       2       4
3  foo       2       5
Equivalent SQL: select * from left INNER JOIN right ON left.key=right.key
df.append(s,ignore_index=True)   # s is the 20-element integer Series from above; its values land in new columns 0..19
Out[46]:
           A         B         C         D     0  ...    15    16    17    18    19
0  -0.068485 -0.731070 -1.158196 -0.952469   NaN  ...   NaN   NaN   NaN   NaN   NaN
...
9  -0.053771  2.024723 -0.387595  0.039338   NaN  ...   NaN   NaN   NaN   NaN   NaN
10       NaN       NaN       NaN       NaN  13.0  ...  16.0  16.0  14.0  16.0  13.0
Grouping data
df=pd.DataFrame({'A':['foo','bar','foo','bar','foo','bar','foo','foo'],
                 'B':['one','one','two','three','two','two','one','three'],
                 'C':np.random.randn(8),
                 'D':np.random.randn(8)})
df
Out[49]:
     A      B         C         D
0  foo    one -0.985579 -1.184493
1  bar    one  0.462434  2.005462
2  foo    two -0.397243 -0.402288
3  bar  three  1.273866 -0.170183
4  foo    two  0.621279  0.342449
5  bar    two  0.839009  0.278341
6  foo    one -0.620062  1.062472
7  foo  three -0.368640 -0.846278
df.groupby('A').sum()
Out[50]:
            C         D
A
bar  2.575308  2.113620
foo -1.750244 -1.028139
3
Reshaping data
tuples=list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],
                  ['one','two','one','two','one','two','one','two']]))
tuples
Out[54]:
[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]
index=pd.MultiIndex.from_tuples(tuples,names=['first','second'])
index
Out[57]:
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           codes=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])
df=pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B'])
df
Out[59]:
                     A         B
first second
bar   one     0.811642 -1.801518
      two    -1.307845 -0.983155
baz   one    -1.939176 -0.270718
      two     0.141660 -1.333879
foo   one    -0.311622  1.027429
      two    -1.819690  0.366389
qux   one    -0.714346  2.603627
      two    -1.325926  1.577150
stack turns the column labels into an extra level of the row index:
stack=df.stack()
stack
Out[61]:
first  second
bar    one     A    0.811642
               B   -1.801518
       two     A   -1.307845
               B   -0.983155
baz    one     A   -1.939176
               B   -0.270718
       two     A    0.141660
               B   -1.333879
foo    one     A   -0.311622
               B    1.027429
       two     A   -1.819690
               B    0.366389
qux    one     A   -0.714346
               B    2.603627
       two     A   -1.325926
               B    1.577150
dtype: float64
stack.index
Out[62]:
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two'], ['A', 'B']],
           codes=[[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
                  [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
                  [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second', None])
stack.unstack()   # undoes the stack
Out[63]:
                     A         B
first second
bar   one     0.811642 -1.801518
      two    -1.307845 -0.983155
baz   one    -1.939176 -0.270718
      two     0.141660 -1.333879
foo   one    -0.311622  1.027429
      two    -1.819690  0.366389
qux   one    -0.714346  2.603627
      two    -1.325926  1.577150
stack.unstack().unstack()
Out[64]:
               A                   B
second       one       two       one       two
first
bar     0.811642 -1.307845 -1.801518 -0.983155
baz    -1.939176  0.141660 -0.270718 -1.333879
foo    -0.311622 -1.819690  1.027429  0.366389
qux    -0.714346 -1.325926  2.603627  1.577150
Pivot tables
df=pd.DataFrame({'A':['one','one','two','three']*3,
                 'B':['A','B','C']*4,
                 'C':['foo','foo','foo','bar','bar','bar']*2,
                 'D':np.random.randn(12),
                 'E':np.random.randn(12)})
df
Out[66]:
        A  B    C         D         E
0     one  A  foo  1.242650 -2.186249
1     one  B  foo  0.390195 -0.003180
2     two  C  foo  0.572055 -0.556580
3   three  A  bar  1.125122 -0.182827
4     one  B  bar  0.518552  0.274675
5     one  C  bar -2.022590 -1.309944
6     two  A  foo  0.509303  0.574005
7   three  B  foo  0.204616  0.286539
8     one  C  foo  1.657085  0.699781
9     one  A  bar -0.502274  0.191885
10    two  B  bar  0.799141  0.494775
11  three  C  bar -0.355739  0.357618
df.pivot_table(values=['D'],index=['A','B'],columns=['C'])
Out[68]:
                D
C             bar       foo
A     B
one   A -0.502274  1.242650
      B  0.518552  0.390195
      C -2.022590  1.657085
three A  1.125122       NaN
      B       NaN  0.204616
      C       NaN -0.355739
two   A       NaN  0.509303
      B  0.799141       NaN
      C       NaN  0.572055
When several rows fall into the same cell, pivot_table averages them by default.
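The default aggregation is the mean; other reductions can be requested explicitly. A minimal sketch using the same df as above (choosing np.sum and fill_value=0 here is just for illustration, not something shown in the notes):
df.pivot_table(values=['D'],index=['A','B'],columns=['C'],aggfunc=np.sum)          # sum instead of mean
df.pivot_table(values=['D'],index=['A','B'],columns=['C'],aggfunc=np.sum,fill_value=0)  # replace the NaN cells for missing combinations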
Time series
rng=pd.date_range('20190301',periods=600,freq='s')
rng
Out[71]:
DatetimeIndex(['2019-03-01 00:00:00', '2019-03-01 00:00:01',
               '2019-03-01 00:00:02', '2019-03-01 00:00:03',
               ...
               '2019-03-01 00:09:58', '2019-03-01 00:09:59'],
              dtype='datetime64[ns]', length=600, freq='S')
a=pd.Series(np.random.randint(0,500,len(rng)),index=rng)
a
Out[73]:
2019-03-01 00:00:00    187
2019-03-01 00:00:01    140
2019-03-01 00:00:02    406
2019-03-01 00:00:03    398
2019-03-01 00:00:04     92
                      ...
2019-03-01 00:09:58    147
2019-03-01 00:09:59    455
Freq: S, Length: 600, dtype: int32
a.resample('2Min').sum()   # the old form a.resample('2Min',how='sum') raises a FutureWarning: how= is deprecated
Out[74]:
2019-03-01 00:00:00    32498
2019-03-01 00:02:00    32954
2019-03-01 00:04:00    29239
2019-03-01 00:06:00    30275
2019-03-01 00:08:00    27312
Freq: 2T, dtype: int32
rng=pd.period_range('2000Q1','2019Q1',freq='Q')
rng
Out[76]:
PeriodIndex(['2000Q1', '2000Q2', '2000Q3', '2000Q4', '2001Q1', '2001Q2',
             ...
             '2018Q1', '2018Q2', '2018Q3', '2018Q4', '2019Q1'],
            dtype='period[Q-DEC]', freq='Q-DEC')
rng.to_timestamp()
Out[77]:
DatetimeIndex(['2000-01-01', '2000-04-01', '2000-07-01', '2000-10-01',
               ...
               '2018-01-01', '2018-04-01', '2018-07-01', '2018-10-01',
               '2019-01-01'],
              dtype='datetime64[ns]', freq='QS-OCT')
Time arithmetic:
pd.Timestamp('20190302')-pd.Timestamp('20180817')
Out[78]: Timedelta('197 days 00:00:00')
pd.Timestamp('20190821')+pd.Timedelta(days=90)
Out[79]: Timestamp('2019-11-19 00:00:00')
Data visualization
Inserting a new column and converting it to a categorical type:
df=pd.DataFrame({'ID':(1,2,3,4,5,6),'raw_grade':('a','b','b','a','a','d')})
df
Out[82]:
   ID raw_grade
0   1         a
1   2         b
2   3         b
3   4         a
4   5         a
5   6         d
df['grade']=df.raw_grade.astype('category')
df
Out[84]:
   ID raw_grade grade
0   1         a     a
1   2         b     b
2   3         b     b
3   4         a     a
4   5         a     a
5   6         d     d
df.grade.cat.categories=['very good','good','bad']
df
Out[86]:
   ID raw_grade      grade
0   1         a  very good
1   2         b       good
2   3         b       good
3   4         a  very good
4   5         a  very good
5   6         d        bad
s=pd.Series(np.random.randn(1000),index=pd.date_range('20000101',periods=1000))
s
Out[88]:
2000-01-01    0.211737
2000-01-02    0.977688
2000-01-03    0.158780
2000-01-04   -0.608021
2000-01-05   -0.578980
                ...
2002-09-25    0.304181
2002-09-26   -0.369478
Freq: D, Length: 1000, dtype: float64
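The notes stop before actually plotting. A minimal sketch of the usual next step (the cumulative-sum plot from the pandas quick-start; matplotlib was imported as plt above, and the title text is just illustrative):
s.cumsum().plot()          # plot the cumulative sum of the random series built above
plt.title('cumsum of s')   # illustrative label, not from the notes
plt.show()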
Loading and saving data
df=pd.DataFrame(np.random.randn(100,4),columns=list('ABCD'))
df
Out[94]:
           A         B         C         D
0   0.064095  0.909678  1.028639 -0.027457
1  -1.471539 -0.903830 -0.736624 -0.959836
2  -0.386030  0.435065 -0.550544  0.942760
3  -1.781232  0.709337  0.757159 -1.224160
4   3.036006  1.778238 -0.055653  2.688993
..       ...       ...       ...       ...
98 -0.388573 -0.994269  0.170004 -0.851865
99  2.851694  0.458063  0.691514 -0.507752
[100 rows x 4 columns]
df.to_csv(data.csv)      # wrong: the file name must be a string
NameError: name 'data' is not defined
df.to_csv('data.csv')    # correct
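To read the file back, a minimal sketch (index_col=0 restores the unnamed index column that to_csv wrote out):
df2=pd.read_csv('data.csv',index_col=0)   # reload the CSV written above
df2.head()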
Movie rating data
Merging the tables
Average rating by gender
Most-rated movies
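The notes only list these steps without code. A rough sketch of how they are usually done with a MovieLens-style data set (the file names users.dat / ratings.dat / movies.dat and the column lists are assumptions about that data set, not something given in the notes):
# assumed MovieLens-1M style files; adjust paths and column names to the actual data
users=pd.read_csv('users.dat',sep='::',engine='python',
                  names=['user_id','gender','age','occupation','zip'])
ratings=pd.read_csv('ratings.dat',sep='::',engine='python',
                    names=['user_id','movie_id','rating','timestamp'])
movies=pd.read_csv('movies.dat',sep='::',engine='python',
                   names=['movie_id','title','genres'])
# merge the three tables into one
data=pd.merge(pd.merge(ratings,users),movies)
# average rating per movie, split by gender
mean_ratings=data.pivot_table(values='rating',index='title',columns='gender')
# "popular" movies: titles with the most ratings
ratings_by_title=data.groupby('title').size()
top_movies=ratings_by_title.sort_values(ascending=False).head(10)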
Core data structures
Series
A one-dimensional labeled array that can hold data of any type (integers, floats, strings, Python objects).
The basic constructor is
s=pd.Series(data,index=index)
where index is a list used as the data labels, and data can be one of:
a Python dict
an ndarray
a scalar value
Creation
s=pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
s
Out[4]:
a    0.643028
b   -0.006442
c    1.466846
d   -0.828449
e    1.953144
dtype: float64
d={'a':0,'b':1,'d':3}
s=pd.Series(d,index=list('abcd'))
s
Out[6]:
a    0.0
b    1.0
c    NaN
d    3.0
dtype: float64
s=pd.Series(5,index=list('abcd'))
s
Out[7]:
a    5
b    5
c    5
d    5
dtype: int64
Properties
A Series behaves like an ndarray, and operations align on the index labels.
np.sin(s)
Out[9]:
a   -0.958924
b   -0.958924
c   -0.958924
d   -0.958924
dtype: float64
s['g']=100
s
Out[11]:
a      5
b      5
c      5
d      5
g    100
dtype: int64
s1=pd.Series(np.random.randn(3),index=['a','c','e'])
s2=pd.Series(np.random.randn(3),index=['a','d','e'])
s1+s2
Out[12]:
a    1.054570
c         NaN
d         NaN
e   -0.658003
dtype: float64
DataFrame
1. A DataFrame is a two-dimensional array with row and column labels. You can think of it as an Excel sheet, a SQL table, or a dict of Series objects. It is the most commonly used data structure in pandas.
2. The basic way to create a DataFrame is
df=pd.DataFrame(data,index=index,columns=columns)
where index gives the row labels and columns gives the column labels.
data can be any of the following:
a dict of 1-D numpy arrays, lists, or Series
a 2-D numpy array
a Series
another DataFrame
Creation
d={'one':pd.Series([1,2,3],index=['a','b','c']),
   'two':pd.Series([1,2,3,4],index=['a','b','c','d'])}
df=pd.DataFrame(d)
df
Out[14]:
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
d={'one':[1,2,3,4],'two':[21,22,23,24]}
df=pd.DataFrame(d)
df
Out[15]:
   one  two
0    1   21
1    2   22
2    3   23
3    4   24
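The other data types listed above (a 2-D ndarray, a single Series, another DataFrame) are not shown in the notes; a minimal sketch, assuming the same np/pd imports:
df=pd.DataFrame(np.arange(6).reshape(3,2),index=list('abc'),columns=['one','two'])  # from a 2-D numpy array with labels
df=pd.DataFrame(pd.Series([1,2,3],name='one'))   # from a single Series: one column named after the Series
df2=pd.DataFrame(df)                             # from another DataFrame: copies the data and labels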
Properties
Column selection / addition / deletion
Use assign() to insert new columns (see the sketch after this list)
Indexing and selection:
Select a column -> df[col] -> Series
Select a row by label -> df.loc[label] -> Series
Select a row by position -> df.iloc[pos] -> Series
Select multiple rows -> df[5:10] -> DataFrame
Select rows with a boolean vector -> df[bool_vector] -> DataFrame
Data alignment
Using numpy functions
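A minimal sketch of the column operations and selection rules listed above, assuming a small illustrative frame (the column names here are just for the example):
df=pd.DataFrame({'one':[1.0,2.0,3.0],'two':[4.0,5.0,6.0]},index=list('abc'))
df['three']=df['one']*df['two']                 # add a column
del df['two']                                   # delete a column
df2=df.assign(ratio=df['three']/df['one'])      # assign returns a new DataFrame with the extra column
df['one']          # column -> Series
df.loc['b']        # row by label -> Series
df.iloc[1]         # row by position -> Series
df[0:2]            # slice of rows -> DataFrame
df[df['one']>1]    # boolean selection -> DataFrame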
Panel
A Panel is a three-dimensional labeled array. In fact the name pandas partly derives from panel data: pan(el)-da(ta)-s.
Panels are used less often, but they are still one of the important data structures.
items: axis 0; each item corresponds to one DataFrame
major_axis: axis 1; the row labels of each DataFrame
minor_axis: axis 2; the column labels of each DataFrame
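A minimal sketch of constructing a Panel. Note that Panel was deprecated and later removed in newer pandas releases, so this assumes an older version like the one used in these notes; the axis labels are just illustrative:
wp=pd.Panel(np.random.randn(2,3,4),                     # 2 items x 3 major_axis rows x 4 minor_axis columns
            items=['Item1','Item2'],
            major_axis=pd.date_range('20190301',periods=3),
            minor_axis=['A','B','C','D'])
wp['Item1']   # each item is a DataFrame with major_axis rows and minor_axis columns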
Basic operations
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
s=pd.Series([1,3,5,6,8],index=list('acefh'))
s
Out[6]:
a 1
c 3
e 5
f 6
h 8
dtype: int64
Reindexing
s.reindex(list('abcdefgh'))
Out[7]:
a 1.0
b NaN
c 3.0
d NaN
e 5.0
f 6.0
g NaN
h 8.0
dtype: float64
s.reindex(list('abcdefgh'),fill_value=0)
Out[8]:
a 1
b 0
c 3
d 0
e 5
f 6
g 0
h 8
dtype: int64
s.reindex(list('abcdefgh'),method='ffill')
Out[9]:
a 1
b 1
c 3
d 3
e 5
f 6
g 6
h 8
dtype: int64
DataFrame
df=pd.DataFrame(np.random.randn(4,6),index=list('ADFH'),columns=['one','two','three','four','five','six'])
df
Out[11]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
df2=df.reindex(index=list('ABCDEFGH'))
df2
Out[14]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
B NaN NaN NaN NaN NaN NaN
C NaN NaN NaN NaN NaN NaN
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
E NaN NaN NaN NaN NaN NaN
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
G NaN NaN NaN NaN NaN NaN
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
Forward fill
df.reindex(index=list('ABCDEFGH'),method='ffill')
Out[15]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
B -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
C -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
E -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
G 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
Backward fill
df.reindex(index=list('ABCDEFGH'),method='bfill')
Out[16]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
B -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
C -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
E 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
G -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
Dropping data
df
Out[17]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
df.drop('A')
Out[18]: one two three four five six
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
Dropping columns
df.drop(['two','four'],axis=1)
Out[23]: one three five six
A -0.636050 0.037713 0.146753 0.227147
D -0.407534 -0.477477 0.036828 0.095793
F 1.562102 0.353708 -0.209741 0.081160
H -0.936974 -2.331884 1.599978 0.010540
Mapping functions
df=pd.DataFrame(np.arange(12).reshape(4,3),index=['one','two','three','four'],columns=list('ABC'))
df
Out[25]: A B C
one 0 1 2
two 3 4 5
three 6 7 8
four 9 10 11
apply
df.apply(lambda x:x.max()-x.min())
Out[27]:
A 9
B 9
C 9
dtype: int64
df.apply(lambda x:x.max()-x.min(),axis=1)
Out[28]:
one 2
two 2
three 2
four 2
dtype: int64
def min_max(x):return pd.Series([x.min(),x.max()],index=['min','max'])
df.apply(min_max)
Out[29]: A B C
min 0 1 2
max 9 10 11
applymap
formater='{0:.03f}'.format
df.applymap(formater)
Out[39]:
            A       B       C
one     0.000   1.000   2.000
two     3.000   4.000   5.000
three   6.000   7.000   8.000
four    9.000  10.000  11.000
Ranking
(The notes omit the line that redefines s as a new random integer Series here.)
s
Out[42]:
0 3
1 6
2 2
3 6
4 4
dtype: int64
s.rank()
Out[44]:
0 2.0
1 4.5
2 1.0
3 4.5
4 3.0
dtype: float64
s.rank(method='first')
Out[45]:
0 2.0
1 4.0
2 1.0
3 5.0
4 3.0
dtype: float64
Uniqueness and membership
s=pd.Series(list('ABCDEFGHDFDFDFD'))
s.value_counts()
Out[47]:
D 5
F 4
B 1
C 1
E 1
A 1
H 1
G 1
dtype: int64
s.unique()
Out[48]: array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], dtype=object)
s.isin(['A','B','C'])
Out[50]:
0 True
1 True
2 True
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
dtype: bool
s.isin(s.unique())
Out[51]:
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
dtype: bool