Quick Start
1
import numpy as np
import pandas as pd
Series
s=pd.Series([1,3,5,np.nan,8,4])
s
Out[6]:
0    1.0
1    3.0
2    5.0
3    NaN
4    8.0
5    4.0
dtype: float64
date_range
dates=pd.date_range('20190301',periods=6)
dates
Out[10]:
DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
               '2019-03-05', '2019-03-06'],
              dtype='datetime64[ns]', freq='D')
DataFrame: a 2-D array with an index
data=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
data
Out[14]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-06 -0.286186  0.768972  0.755786  0.151190
data.shape
Out[15]: (6, 4)
data.values
Out[16]:
array([[ 1.86621859, -1.0030566 ,  2.52689932, -0.56334339],
       [-1.41362647,  1.11672695, -0.47005354, -0.56793016],
       [ 0.11220208, -0.62181257, -0.65821206, -0.53731156],
       [-0.5678393 ,  2.17593279,  1.12604991, -0.412436  ],
       [-1.85478576,  0.98009218,  0.79754332, -0.43238061],
       [-0.28618627,  0.7689724 ,  0.75578607,  0.15118955]])
Dict with a Timestamp
d={'A':1,'B':pd.Timestamp('20130301'),'C':range(4),'D':np.arange(4)}
d
Out[19]:
{'A': 1,
 'B': Timestamp('2013-03-01 00:00:00'),
 'C': range(0, 4),
 'D': array([0, 1, 2, 3])}
Constructing a DataFrame from the dict
df=pd.DataFrame(d)
df
Out[21]:
   A          B  C  D
0  1 2013-03-01  0  0
1  1 2013-03-01  1  1
2  1 2013-03-01  2  2
3  1 2013-03-01  3  3
Viewing, sorting, and modifying data
data.head(2)
Out[22]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
data.tail()
Out[23]:
                   A         B         C         D
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-06 -0.286186  0.768972  0.755786  0.151190
data.index
Out[24]:
DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
               '2019-03-05', '2019-03-06'],
              dtype='datetime64[ns]', freq='D')
data.columns
Out[25]: Index(['A', 'B', 'C', 'D'], dtype='object')
data.describe()
Out[26]:
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.357336  0.569476  0.679669 -0.393702
std    1.309362  1.181577  1.161292  0.275140
min   -1.854786 -1.003057 -0.658212 -0.567930
25%   -1.202180 -0.274116 -0.163594 -0.556835
50%   -0.427013  0.874532  0.776665 -0.484846
75%    0.012605  1.082568  1.043923 -0.417422
max    1.866219  2.175933  2.526899  0.151190
data.T
Out[27]:
   2019-03-01  2019-03-02  2019-03-03  2019-03-04  2019-03-05  2019-03-06
A    1.866219   -1.413626    0.112202   -0.567839   -1.854786   -0.286186
B   -1.003057    1.116727   -0.621813    2.175933    0.980092    0.768972
C    2.526899   -0.470054   -0.658212    1.126050    0.797543    0.755786
D   -0.563343   -0.567930   -0.537312   -0.412436   -0.432381    0.151190
Sorting by column label:
data.sort_index(axis=1)
Out[28]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-06 -0.286186  0.768972  0.755786  0.151190
data.sort_index(axis=1,ascending=False)
Out[29]:
                   D         C         B         A
2019-03-01 -0.563343  2.526899 -1.003057  1.866219
2019-03-02 -0.567930 -0.470054  1.116727 -1.413626
2019-03-03 -0.537312 -0.658212 -0.621813  0.112202
2019-03-04 -0.412436  1.126050  2.175933 -0.567839
2019-03-05 -0.432381  0.797543  0.980092 -1.854786
2019-03-06  0.151190  0.755786  0.768972 -0.286186
data.sort_index(axis=1,ascending=True)   # same result as the default ascending order above
data.sort_values(by='A')
Out[31]:
                   A         B         C         D
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-06 -0.286186  0.768972  0.755786  0.151190
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
data.A
Out[32]:
2019-03-01    1.866219
2019-03-02   -1.413626
2019-03-03    0.112202
2019-03-04   -0.567839
2019-03-05   -1.854786
2019-03-06   -0.286186
Freq: D, Name: A, dtype: float64
data[2:4]
Out[33]:
                   A         B         C         D
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
data['20190302':'20190305']
Out[34]:
                   A         B         C         D
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
loc is relatively efficient:
data.loc['20190302':'20190304']
Out[35]:
                   A         B         C         D
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
data.iloc[2:4]
Out[36]:
                   A         B         C         D
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
data.loc[:,['B','C']]
Out[37]:
                   B         C
2019-03-01 -1.003057  2.526899
2019-03-02  1.116727 -0.470054
2019-03-03 -0.621813 -0.658212
2019-03-04  2.175933  1.126050
2019-03-05  0.980092  0.797543
2019-03-06  0.768972  0.755786
at is even more efficient for accessing a single value:
data.at[pd.Timestamp('20190302'),'B']
Out[38]: 1.116726953479249
data.iat[1,1]
Out[39]: 1.116726953479249
Boolean indexing:
data[data.A>0]
Out[40]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
data[data>0]
Out[41]:
                   A         B         C        D
2019-03-01  1.866219       NaN  2.526899      NaN
2019-03-02       NaN  1.116727       NaN      NaN
2019-03-03  0.112202       NaN       NaN      NaN
2019-03-04       NaN  2.175933  1.126050      NaN
2019-03-05       NaN  0.980092  0.797543      NaN
2019-03-06       NaN  0.768972  0.755786  0.15119
data2=data.copy()
data2
Out[43]:
                   A         B         C         D
2019-03-01  1.866219 -1.003057  2.526899 -0.563343
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839  2.175933  1.126050 -0.412436
2019-03-05 -1.854786  0.980092  0.797543 -0.432381
2019-03-06 -0.286186  0.768972  0.755786  0.151190
tag=['a']*2+['b']*2+['c']*2
data2['TAG']=tag
data2
Out[46]:
                   A         B         C         D TAG
2019-03-01  1.866219 -1.003057  2.526899 -0.563343   a
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930   a
2019-03-03  0.112202 -0.621813 -0.658212 -0.537312   b
2019-03-04 -0.567839  2.175933  1.126050 -0.412436   b
2019-03-05 -1.854786  0.980092  0.797543 -0.432381   c
2019-03-06 -0.286186  0.768972  0.755786  0.151190   c
data2[data2.TAG.isin(['a','c'])]
Out[48]:
                   A         B         C         D TAG
2019-03-01  1.866219 -1.003057  2.526899 -0.563343   a
2019-03-02 -1.413626  1.116727 -0.470054 -0.567930   a
2019-03-05 -1.854786  0.980092  0.797543 -0.432381   c
2019-03-06 -0.286186  0.768972  0.755786  0.151190   c
data.iat[0,0]=100
data
Out[50]:
                     A         B         C         D
2019-03-01  100.000000 -1.003057  2.526899 -0.563343
2019-03-02   -1.413626  1.116727 -0.470054 -0.567930
2019-03-03    0.112202 -0.621813 -0.658212 -0.537312
2019-03-04   -0.567839  2.175933  1.126050 -0.412436
2019-03-05   -1.854786  0.980092  0.797543 -0.432381
2019-03-06   -0.286186  0.768972  0.755786  0.151190
data.A=range(6)
data
Out[53]:
            A         B         C         D
2019-03-01  0 -1.003057  2.526899 -0.563343
2019-03-02  1  1.116727 -0.470054 -0.567930
2019-03-03  2 -0.621813 -0.658212 -0.537312
2019-03-04  3  2.175933  1.126050 -0.412436
2019-03-05  4  0.980092  0.797543 -0.432381
2019-03-06  5  0.768972  0.755786  0.151190
2
import matplotlib.pyplot as plt
import numpy as np     # matrix operations
import pandas as pd
dates=pd.date_range('20190301',periods=6)
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df
Out[6]:
                   A         B         C         D
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108
2019-03-02  1.557134 -1.545051 -0.961491  0.515089
2019-03-03  1.331631  0.714138  1.231407 -0.422883
2019-03-04  0.719223 -0.260048  0.523958  1.172832
2019-03-05  0.293174 -0.045853  1.150185  0.631227
2019-03-06 -1.177185 -0.906234  2.267641 -0.571508
Handling missing data
df1=df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1
Out[8]:
                   A         B         C         D   E
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108 NaN
2019-03-02  1.557134 -1.545051 -0.961491  0.515089 NaN
2019-03-03  1.331631  0.714138  1.231407 -0.422883 NaN
2019-03-04  0.719223 -0.260048  0.523958  1.172832 NaN
df1.loc[dates[1:3],'E']=2
df1
Out[10]:
                   A         B         C         D    E
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108  NaN
2019-03-02  1.557134 -1.545051 -0.961491  0.515089  2.0
2019-03-03  1.331631  0.714138  1.231407 -0.422883  2.0
2019-03-04  0.719223 -0.260048  0.523958  1.172832  NaN
Drop rows with missing data:
df1.dropna()
Out[11]:
                   A         B         C         D    E
2019-03-02  1.557134 -1.545051 -0.961491  0.515089  2.0
2019-03-03  1.331631  0.714138  1.231407 -0.422883  2.0
Fill missing data:
df1.fillna(value=5)
Out[12]:
                   A         B         C         D    E
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108  5.0
2019-03-02  1.557134 -1.545051 -0.961491  0.515089  2.0
2019-03-03  1.331631  0.714138  1.231407 -0.422883  2.0
2019-03-04  0.719223 -0.260048  0.523958  1.172832  5.0
Check whether any data is missing:
pd.isnull(df1)
Out[13]:
                A      B      C      D      E
2019-03-01  False  False  False  False   True
2019-03-02  False  False  False  False  False
2019-03-03  False  False  False  False  False
2019-03-04  False  False  False  False   True
pd.isnull(df1).any()
Out[14]:
A    False
B    False
C    False
D    False
E     True
dtype: bool
pd.isnull(df1).any().any()
Out[15]: True
shift pushes the values down along the index:
s=pd.Series((1,3,5,np.nan,6,8),index=dates).shift(2)
s
Out[17]:
2019-03-01    NaN
2019-03-02    NaN
2019-03-03    1.0
2019-03-04    3.0
2019-03-05    5.0
2019-03-06    NaN
Freq: D, dtype: float64
df
Out[18]:
                   A         B         C         D
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108
2019-03-02  1.557134 -1.545051 -0.961491  0.515089
2019-03-03  1.331631  0.714138  1.231407 -0.422883
2019-03-04  0.719223 -0.260048  0.523958  1.172832
2019-03-05  0.293174 -0.045853  1.150185  0.631227
2019-03-06 -1.177185 -0.906234  2.267641 -0.571508
df.sub(s,axis='index')
Out[19]:
                   A         B         C         D
2019-03-01       NaN       NaN       NaN       NaN
2019-03-02       NaN       NaN       NaN       NaN
2019-03-03  0.331631 -0.285862  0.231407 -1.422883
2019-03-04 -2.280777 -3.260048 -2.476042 -1.827168
2019-03-05 -4.706826 -5.045853 -3.849815 -4.368773
2019-03-06       NaN       NaN       NaN       NaN
apply: pass a function in:
df.apply(np.cumsum)
Out[22]:
                   A         B         C         D
2019-03-01 -1.916667 -0.291255 -0.733860  2.434108
2019-03-02 -0.359533 -1.836306 -1.695351  2.949196
2019-03-03  0.972098 -1.122168 -0.463944  2.526314
2019-03-04  1.691322 -1.382216  0.060014  3.699146
2019-03-05  1.984496 -1.428069  1.210199  4.330373
2019-03-06  0.807311 -2.334303  3.477840  3.758864
df.apply(lambda x:x.max()-x.min())
Out[25]:
A    3.473800
B    2.259189
C    3.229132
D    3.005616
dtype: float64
Data operations
s=pd.Series(np.random.randint(10,20,size=20))
s
Out[29]:
0     13
1     13
2     10
3     16
4     10
5     19
6     13
7     14
8     13
9     13
10    18
11    10
12    13
13    16
14    18
15    16
16    16
17    14
18    16
19    13
dtype: int32
s.value_counts()
Out[31]:
13    7
16    5
10    3
18    2
14    2
19    1
dtype: int64
s.mode()
Out[33]:
0    13
dtype: int32
Merging data
df=pd.DataFrame(np.random.randn(10,4),columns=list('ABCD'))
df
Out[35]:
          A         B         C         D
0 -0.068485 -0.731070 -1.158196 -0.952469
1 -0.776078  0.118621  0.359391 -0.427518
2  2.190398 -0.170339 -0.275725  0.184332
3  0.111006  2.263383 -1.164128 -1.653160
4  0.454094 -0.390870  0.181000 -0.713891
5  1.770307 -0.125938 -0.470755  0.073045
6  0.178509 -0.007117 -1.474438  1.280151
7 -1.074046 -1.068972  0.821342 -1.032382
8  0.767212  0.886415 -0.453329 -1.260979
9 -0.053771  2.024723 -0.387595  0.039338
df.iloc[:3]
Out[36]:
          A         B         C         D
0 -0.068485 -0.731070 -1.158196 -0.952469
1 -0.776078  0.118621  0.359391 -0.427518
2  2.190398 -0.170339 -0.275725  0.184332
df.iloc[3:7]
Out[37]:
          A         B         C         D
3  0.111006  2.263383 -1.164128 -1.653160
4  0.454094 -0.390870  0.181000 -0.713891
5  1.770307 -0.125938 -0.470755  0.073045
6  0.178509 -0.007117 -1.474438  1.280151
df.iloc[7:]
Out[38]:
          A         B         C         D
7 -1.074046 -1.068972  0.821342 -1.032382
8  0.767212  0.886415 -0.453329 -1.260979
9 -0.053771  2.024723 -0.387595  0.039338
pd.concat([df.iloc[:3],df.iloc[3:7],df.iloc[7:]])   # reassembles the original 10-row frame
Out[40]:
          A         B         C         D
0 -0.068485 -0.731070 -1.158196 -0.952469
...
9 -0.053771  2.024723 -0.387595  0.039338
left=pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})    # left and right were not defined in the notes;
right=pd.DataFrame({'key':['foo','foo'],'lval':[4,5]})   # reconstructed from the merge output below
pd.merge(left,right,on='key')
Out[45]:
   key  lval_x  lval_y
0  foo       1       4
1  foo       1       5
2  foo       2       4
3  foo       2       5
Equivalent SQL: select * from left INNER JOIN right ON left.key=right.key
df.append(s,ignore_index=True)   # s is the 20-element integer Series from above; its values land in new columns 0..19
Out[46]:
           A         B         C         D     0  ...    15    16    17    18    19
0  -0.068485 -0.731070 -1.158196 -0.952469   NaN  ...   NaN   NaN   NaN   NaN   NaN
...
9  -0.053771  2.024723 -0.387595  0.039338   NaN  ...   NaN   NaN   NaN   NaN   NaN
10       NaN       NaN       NaN       NaN  13.0  ...  16.0  16.0  14.0  16.0  13.0
Grouping data
df=pd.DataFrame({'A':['foo','bar','foo','bar','foo','bar','foo','foo'],
                 'B':['one','one','two','three','two','two','one','three'],
                 'C':np.random.randn(8),
                 'D':np.random.randn(8)})
df
Out[49]:
     A      B         C         D
0  foo    one -0.985579 -1.184493
1  bar    one  0.462434  2.005462
2  foo    two -0.397243 -0.402288
3  bar  three  1.273866 -0.170183
4  foo    two  0.621279  0.342449
5  bar    two  0.839009  0.278341
6  foo    one -0.620062  1.062472
7  foo  three -0.368640 -0.846278
df.groupby('A').sum()
Out[50]:
            C         D
A
bar  2.575308  2.113620
foo -1.750244 -1.028139
3
Reshaping data
tuples=list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],
                  ['one','two','one','two','one','two','one','two']]))
tuples
Out[54]:
[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]
index=pd.MultiIndex.from_tuples(tuples,names=['first','second'])
index
Out[57]:
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           codes=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])
df=pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B'])
df
Out[59]:
                     A         B
first second
bar   one     0.811642 -1.801518
      two    -1.307845 -0.983155
baz   one    -1.939176 -0.270718
      two     0.141660 -1.333879
foo   one    -0.311622  1.027429
      two    -1.819690  0.366389
qux   one    -0.714346  2.603627
      two    -1.325926  1.577150
stack turns the column labels into an extra level of the row index:
stack=df.stack()
stack
Out[61]:
first  second
bar    one     A    0.811642
               B   -1.801518
       two     A   -1.307845
               B   -0.983155
baz    one     A   -1.939176
               B   -0.270718
       two     A    0.141660
               B   -1.333879
foo    one     A   -0.311622
               B    1.027429
       two     A   -1.819690
               B    0.366389
qux    one     A   -0.714346
               B    2.603627
       two     A   -1.325926
               B    1.577150
dtype: float64
stack.index
Out[62]:
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two'], ['A', 'B']],
           codes=[[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
                  [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
                  [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second', None])
stack.unstack()   # undoes the stack
Out[63]:
                     A         B
first second
bar   one     0.811642 -1.801518
      two    -1.307845 -0.983155
baz   one    -1.939176 -0.270718
      two     0.141660 -1.333879
foo   one    -0.311622  1.027429
      two    -1.819690  0.366389
qux   one    -0.714346  2.603627
      two    -1.325926  1.577150
stack.unstack().unstack()
Out[64]:
               A                   B
second       one       two       one       two
first
bar     0.811642 -1.307845 -1.801518 -0.983155
baz    -1.939176  0.141660 -0.270718 -1.333879
foo    -0.311622 -1.819690  1.027429  0.366389
qux    -0.714346 -1.325926  2.603627  1.577150
Pivot tables
df=pd.DataFrame({'A':['one','one','two','three']*3,
                 'B':['A','B','C']*4,
                 'C':['foo','foo','foo','bar','bar','bar']*2,
                 'D':np.random.randn(12),
                 'E':np.random.randn(12)})
df
Out[66]:
        A  B    C         D         E
0     one  A  foo  1.242650 -2.186249
1     one  B  foo  0.390195 -0.003180
2     two  C  foo  0.572055 -0.556580
3   three  A  bar  1.125122 -0.182827
4     one  B  bar  0.518552  0.274675
5     one  C  bar -2.022590 -1.309944
6     two  A  foo  0.509303  0.574005
7   three  B  foo  0.204616  0.286539
8     one  C  foo  1.657085  0.699781
9     one  A  bar -0.502274  0.191885
10    two  B  bar  0.799141  0.494775
11  three  C  bar -0.355739  0.357618
df.pivot_table(values=['D'],index=['A','B'],columns=['C'])
Out[68]:
                D
C             bar       foo
A     B
one   A -0.502274  1.242650
      B  0.518552  0.390195
      C -2.022590  1.657085
three A  1.125122       NaN
      B       NaN  0.204616
      C       NaN -0.355739
two   A       NaN  0.509303
      B  0.799141       NaN
      C       NaN  0.572055
When several rows fall into the same cell, pivot_table averages them by default.
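The default aggregation is the mean; other reductions can be requested explicitly. A minimal sketch using the same df as above (choosing np.sum and fill_value=0 here is just for illustration, not something shown in the notes):
df.pivot_table(values=['D'],index=['A','B'],columns=['C'],aggfunc=np.sum)          # sum instead of mean
df.pivot_table(values=['D'],index=['A','B'],columns=['C'],aggfunc=np.sum,fill_value=0)  # replace the NaN cells for missing combinations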
Time series
rng=pd.date_range('20190301',periods=600,freq='s')
rng
Out[71]:
DatetimeIndex(['2019-03-01 00:00:00', '2019-03-01 00:00:01',
               '2019-03-01 00:00:02', '2019-03-01 00:00:03',
               ...
               '2019-03-01 00:09:58', '2019-03-01 00:09:59'],
              dtype='datetime64[ns]', length=600, freq='S')
a=pd.Series(np.random.randint(0,500,len(rng)),index=rng)
a
Out[73]:
2019-03-01 00:00:00    187
2019-03-01 00:00:01    140
2019-03-01 00:00:02    406
2019-03-01 00:00:03    398
2019-03-01 00:00:04     92
                      ...
2019-03-01 00:09:58    147
2019-03-01 00:09:59    455
Freq: S, Length: 600, dtype: int32
a.resample('2Min').sum()   # the old form a.resample('2Min',how='sum') raises a FutureWarning: how= is deprecated
Out[74]:
2019-03-01 00:00:00    32498
2019-03-01 00:02:00    32954
2019-03-01 00:04:00    29239
2019-03-01 00:06:00    30275
2019-03-01 00:08:00    27312
Freq: 2T, dtype: int32
rng=pd.period_range('2000Q1','2019Q1',freq='Q')
rng
Out[76]:
PeriodIndex(['2000Q1', '2000Q2', '2000Q3', '2000Q4', '2001Q1', '2001Q2',
             ...
             '2018Q1', '2018Q2', '2018Q3', '2018Q4', '2019Q1'],
            dtype='period[Q-DEC]', freq='Q-DEC')
rng.to_timestamp()
Out[77]:
DatetimeIndex(['2000-01-01', '2000-04-01', '2000-07-01', '2000-10-01',
               ...
               '2018-01-01', '2018-04-01', '2018-07-01', '2018-10-01',
               '2019-01-01'],
              dtype='datetime64[ns]', freq='QS-OCT')
Time arithmetic:
pd.Timestamp('20190302')-pd.Timestamp('20180817')
Out[78]: Timedelta('197 days 00:00:00')
pd.Timestamp('20190821')+pd.Timedelta(days=90)
Out[79]: Timestamp('2019-11-19 00:00:00')
Data visualization
Inserting a new column and converting it to a categorical type:
df=pd.DataFrame({'ID':(1,2,3,4,5,6),'raw_grade':('a','b','b','a','a','d')})
df
Out[82]:
   ID raw_grade
0   1         a
1   2         b
2   3         b
3   4         a
4   5         a
5   6         d
df['grade']=df.raw_grade.astype('category')
df
Out[84]:
   ID raw_grade grade
0   1         a     a
1   2         b     b
2   3         b     b
3   4         a     a
4   5         a     a
5   6         d     d
df.grade.cat.categories=['very good','good','bad']
df
Out[86]:
   ID raw_grade      grade
0   1         a  very good
1   2         b       good
2   3         b       good
3   4         a  very good
4   5         a  very good
5   6         d        bad
s=pd.Series(np.random.randn(1000),index=pd.date_range('20000101',periods=1000))
s
Out[88]:
2000-01-01    0.211737
2000-01-02    0.977688
2000-01-03    0.158780
2000-01-04   -0.608021
2000-01-05   -0.578980
                ...
2002-09-25    0.304181
2002-09-26   -0.369478
Freq: D, Length: 1000, dtype: float64
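The notes stop before actually plotting. A minimal sketch of the usual next step (the cumulative-sum plot from the pandas quick-start; matplotlib was imported as plt above, and the title text is just illustrative):
s.cumsum().plot()          # plot the cumulative sum of the random series built above
plt.title('cumsum of s')   # illustrative label, not from the notes
plt.show()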
Loading and saving data
df=pd.DataFrame(np.random.randn(100,4),columns=list('ABCD'))
df
Out[94]:
           A         B         C         D
0   0.064095  0.909678  1.028639 -0.027457
1  -1.471539 -0.903830 -0.736624 -0.959836
2  -0.386030  0.435065 -0.550544  0.942760
3  -1.781232  0.709337  0.757159 -1.224160
4   3.036006  1.778238 -0.055653  2.688993
..       ...       ...       ...       ...
98 -0.388573 -0.994269  0.170004 -0.851865
99  2.851694  0.458063  0.691514 -0.507752
[100 rows x 4 columns]
df.to_csv(data.csv)      # wrong: the file name must be a string
NameError: name 'data' is not defined
df.to_csv('data.csv')    # correct
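To read the file back, a minimal sketch (index_col=0 restores the unnamed index column that to_csv wrote out):
df2=pd.read_csv('data.csv',index_col=0)   # reload the CSV written above
df2.head()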
Movie rating data
Merging the tables
Average rating by gender
Most-rated movies
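The notes only list these steps without code. A rough sketch of how they are usually done with a MovieLens-style data set (the file names users.dat / ratings.dat / movies.dat and the column lists are assumptions about that data set, not something given in the notes):
# assumed MovieLens-1M style files; adjust paths and column names to the actual data
users=pd.read_csv('users.dat',sep='::',engine='python',
                  names=['user_id','gender','age','occupation','zip'])
ratings=pd.read_csv('ratings.dat',sep='::',engine='python',
                    names=['user_id','movie_id','rating','timestamp'])
movies=pd.read_csv('movies.dat',sep='::',engine='python',
                   names=['movie_id','title','genres'])
# merge the three tables into one
data=pd.merge(pd.merge(ratings,users),movies)
# average rating per movie, split by gender
mean_ratings=data.pivot_table(values='rating',index='title',columns='gender')
# "popular" movies: titles with the most ratings
ratings_by_title=data.groupby('title').size()
top_movies=ratings_by_title.sort_values(ascending=False).head(10)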
Core data structures
Series
A one-dimensional labeled array that can hold data of any type (integers, floats, strings, Python objects).
The basic constructor is
s=pd.Series(data,index=index)
where index is a list used as the data labels, and data can be one of:
a Python dict
an ndarray
a scalar value
Creation
s=pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
s
Out[4]:
a    0.643028
b   -0.006442
c    1.466846
d   -0.828449
e    1.953144
dtype: float64
d={'a':0,'b':1,'d':3}
s=pd.Series(d,index=list('abcd'))
s
Out[6]:
a    0.0
b    1.0
c    NaN
d    3.0
dtype: float64
s=pd.Series(5,index=list('abcd'))
s
Out[7]:
a    5
b    5
c    5
d    5
dtype: int64
Properties
A Series behaves like an ndarray, and operations align on the index labels.
np.sin(s)
Out[9]:
a   -0.958924
b   -0.958924
c   -0.958924
d   -0.958924
dtype: float64
s['g']=100
s
Out[11]:
a      5
b      5
c      5
d      5
g    100
dtype: int64
s1=pd.Series(np.random.randn(3),index=['a','c','e'])
s2=pd.Series(np.random.randn(3),index=['a','d','e'])
s1+s2
Out[12]:
a    1.054570
c         NaN
d         NaN
e   -0.658003
dtype: float64
DataFrame
1. A DataFrame is a two-dimensional array with row and column labels. You can think of it as an Excel sheet, a SQL table, or a dict of Series objects. It is the most commonly used data structure in pandas.
2. The basic way to create a DataFrame is
df=pd.DataFrame(data,index=index,columns=columns)
where index gives the row labels and columns gives the column labels.
data can be any of the following:
a dict of 1-D numpy arrays, lists, or Series
a 2-D numpy array
a Series
another DataFrame
Creation
d={'one':pd.Series([1,2,3],index=['a','b','c']),
   'two':pd.Series([1,2,3,4],index=['a','b','c','d'])}
df=pd.DataFrame(d)
df
Out[14]:
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
d={'one':[1,2,3,4],'two':[21,22,23,24]}
df=pd.DataFrame(d)
df
Out[15]:
   one  two
0    1   21
1    2   22
2    3   23
3    4   24
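The other data types listed above (a 2-D ndarray, a single Series, another DataFrame) are not shown in the notes; a minimal sketch, assuming the same np/pd imports:
df=pd.DataFrame(np.arange(6).reshape(3,2),index=list('abc'),columns=['one','two'])  # from a 2-D numpy array with labels
df=pd.DataFrame(pd.Series([1,2,3],name='one'))   # from a single Series: one column named after the Series
df2=pd.DataFrame(df)                             # from another DataFrame: copies the data and labels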
Properties
Column selection / addition / deletion
Use assign() to insert new columns (see the sketch after this list)
Indexing and selection:
Select a column -> df[col] -> Series
Select a row by label -> df.loc[label] -> Series
Select a row by position -> df.iloc[pos] -> Series
Select multiple rows -> df[5:10] -> DataFrame
Select rows with a boolean vector -> df[bool_vector] -> DataFrame
Data alignment
Using numpy functions
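A minimal sketch of the column operations and selection rules listed above, assuming a small illustrative frame (the column names here are just for the example):
df=pd.DataFrame({'one':[1.0,2.0,3.0],'two':[4.0,5.0,6.0]},index=list('abc'))
df['three']=df['one']*df['two']                 # add a column
del df['two']                                   # delete a column
df2=df.assign(ratio=df['three']/df['one'])      # assign returns a new DataFrame with the extra column
df['one']          # column -> Series
df.loc['b']        # row by label -> Series
df.iloc[1]         # row by position -> Series
df[0:2]            # slice of rows -> DataFrame
df[df['one']>1]    # boolean selection -> DataFrame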
Panel
A Panel is a three-dimensional labeled array. In fact the name pandas partly derives from panel data: pan(el)-da(ta)-s.
Panels are used less often, but they are still one of the important data structures.
items: axis 0; each item corresponds to one DataFrame
major_axis: axis 1; the row labels of each DataFrame
minor_axis: axis 2; the column labels of each DataFrame
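A minimal sketch of constructing a Panel. Note that Panel was deprecated and later removed in newer pandas releases, so this assumes an older version like the one used in these notes; the axis labels are just illustrative:
wp=pd.Panel(np.random.randn(2,3,4),                     # 2 items x 3 major_axis rows x 4 minor_axis columns
            items=['Item1','Item2'],
            major_axis=pd.date_range('20190301',periods=3),
            minor_axis=['A','B','C','D'])
wp['Item1']   # each item is a DataFrame with major_axis rows and minor_axis columns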
Basic operations
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
s=pd.Series([1,3,5,6,8],index=list('acefh'))
s
Out[6]:
a 1
c 3
e 5
f 6
h 8
dtype: int64
Reindexing
s.reindex(list('abcdefgh'))
Out[7]:
a 1.0
b NaN
c 3.0
d NaN
e 5.0
f 6.0
g NaN
h 8.0
dtype: float64
s.reindex(list('abcdefgh'),fill_value=0)
Out[8]:
a 1
b 0
c 3
d 0
e 5
f 6
g 0
h 8
dtype: int64
s.reindex(list('abcdefgh'),method='ffill')
Out[9]:
a 1
b 1
c 3
d 3
e 5
f 6
g 6
h 8
dtype: int64
DataFrame
df=pd.DataFrame(np.random.randn(4,6),index=list('ADFH'),columns=['one','two','three','four','five','six'])
df
Out[11]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
df2=df.reindex(index=list('ABCDEFGH'))
df2
Out[14]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
B NaN NaN NaN NaN NaN NaN
C NaN NaN NaN NaN NaN NaN
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
E NaN NaN NaN NaN NaN NaN
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
G NaN NaN NaN NaN NaN NaN
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
Forward fill
df.reindex(index=list('ABCDEFGH'),method='ffill')
Out[15]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
B -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
C -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
E -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
G 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
Backward fill
df.reindex(index=list('ABCDEFGH'),method='bfill')
Out[16]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
B -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
C -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
E 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
G -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
Dropping data
df
Out[17]: one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
df.drop('A')
Out[18]: one two three four five six
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
Dropping columns
df.drop(['two','four'],axis=1)
Out[23]: one three five six
A -0.636050 0.037713 0.146753 0.227147
D -0.407534 -0.477477 0.036828 0.095793
F 1.562102 0.353708 -0.209741 0.081160
H -0.936974 -2.331884 1.599978 0.010540
Mapping functions
df=pd.DataFrame(np.arange(12).reshape(4,3),index=['one','two','three','four'],columns=list('ABC'))
df
Out[25]: A B C
one 0 1 2
two 3 4 5
three 6 7 8
four 9 10 11
apply
df.apply(lambda x:x.max()-x.min())
Out[27]:
A 9
B 9
C 9
dtype: int64
df.apply(lambda x:x.max()-x.min(),axis=1)
Out[28]:
one 2
two 2
three 2
four 2
dtype: int64
def min_max(x):return pd.Series([x.min(),x.max()],index=['min','max'])
df.apply(min_max)
Out[29]: A B C
min 0 1 2
max 9 10 11
applymap
formater='{0:.03f}'.format
df.applymap(formater)
Out[39]:
            A       B       C
one     0.000   1.000   2.000
two     3.000   4.000   5.000
three   6.000   7.000   8.000
four    9.000  10.000  11.000
Ranking
(The notes omit the line that redefines s as a new random integer Series here.)
s
Out[42]:
0 3
1 6
2 2
3 6
4 4
dtype: int64
s.rank()
Out[44]:
0 2.0
1 4.5
2 1.0
3 4.5
4 3.0
dtype: float64
s.rank(method='first')
Out[45]:
0 2.0
1 4.0
2 1.0
3 5.0
4 3.0
dtype: float64
Uniqueness and membership
s=pd.Series(list('ABCDEFGHDFDFDFD'))
s.value_counts()
Out[47]:
D 5
F 4
B 1
C 1
E 1
A 1
H 1
G 1
dtype: int64
s.unique()
Out[48]: array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], dtype=object)
s.isin(['A','B','C'])
Out[50]:
0 True
1 True
2 True
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
dtype: bool
s.isin(s.unique())
Out[51]:
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
dtype: bool