—————————————————
***Pandas Cheat Sheet*** —————————————————
***1. Reading and Writing Data*** ——
import pandas as pd
# a. Reading a csv file
df = pd.read_csv('Analysis.csv')
# b. Writing the contents of a data frame to a csv file
df.to_csv('output.csv')
# c. Reading an Excel file
df = pd.read_excel('data.xlsx', 'sheet1')
# d. Writing the contents of a data frame to an Excel file
df.to_excel('output.xlsx', sheet_name='sheet2')
# pandas import/export: reading and storing data
# The pandas I/O API is a set of top-level reader functions, accessed like
# pd.read_csv(), that return a pandas object:
# read_csv          # csv files
# read_excel        # excel files
# read_hdf
# read_sql
# read_json
# read_msgpack (experimental)
# read_html
# read_gbq (experimental)
# read_stata
# read_sas
# read_clipboard
# read_pickle       # Python's built-in pickle serialization
# The corresponding writer functions are object methods, accessed like df.to_csv():
# to_csv
# to_excel
# to_hdf
# to_sql
# to_json
# to_msgpack
# to_html
# to_gbq
# to_stata
# to_clipboard
# to_pickle
import pandas as pd

data = pd.read_csv('student.csv')
print(data)
data.to_pickle('student.pickle')
—————————————————
***2. Getting Preview of Dataframe***
# a. Looking at the top n records
df.head(5)
# b. Looking at the bottom n records
df.tail(5)
# c. Viewing the column names
df.columns
—————————————————
***3. Rename Columns of Data Frame***
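This section had no snippet in the original; here is a minimal sketch, assuming a dataframe with columns column1 and column2 (the names are illustrative):
# a. Rename selected columns via a mapping (returns a new dataframe)
df = df.rename(columns={'column1': 'new1', 'column2': 'new2'})
# b. Replace all column names at once (the list length must match the number of columns)
df.columns = ['new1', 'new2']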
—————————————————
***4. Selecting Columns or Rows***
# a. Accessing sub-dataframes
df[['column1','column2']]
# b. Filtering records
df[df['column1'] > 10]
df[(df['column1'] > 10) & (df['column2'] == 30)]  # parentheses are required around each condition
df[(df['column1'] > 10) | (df['column2'] == 30)]
# pandas data selection
import pandas as pd
import numpy as np

dates = pd.date_range('20170101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A','B','C','D'])
print(df)
print(df['A'], df.A)
print(df[0:3], df['20170101':'20170104'])
# select by label: loc
print(df.loc['20170102'])
print(df.loc[:, ['A','B']])
print(df.loc['20170102', ['A','B']])
# select by position: iloc
print(df.iloc[3])
print(df.iloc[3,1])
print(df.iloc[1:3,1:3])
print(df.iloc[[1,3,5], 1:3])
# mixed selection used to be done with ix, which was removed in pandas 1.0;
# use loc/iloc instead:
print(df.iloc[:3, [0, 2]])  # equivalent of the old df.ix[:3, ['A','C']]
# Boolean indexing
print(df)
print(df[df.A>8])
—————————————————
***5. Handling Missing Values*** This is an inevitable part of dealing with data. To overcome this hurdle, use the dropna or fillna functions.
# a. dropna: drops rows or columns that have missing data
df1.dropna()
# b. fillna: fills in missing values
df2.fillna(value=5)  # replaces all missing values with 5
mean = df2['column1'].mean()
df2['column1'].fillna(mean)  # replaces all missing values of column1 with the mean of the available values
—————————————————
from pandas import Series, DataFrame
import pandas as pd

ser = Series([4.5, 7.2, -5.3, 3.6], index=['d','b','a','c'])
ser
ser.drop('c')
ser
.drop() returns a new object; the original object is not modified.
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, 5]],
                  columns=list('ABCD'))
df
# Drop the columns where all elements are NaN (drops only C)
df.dropna(axis=1, how='all')
# Drop the columns where any element is NaN (only D survives)
df.dropna(axis=1, how='any')
# Drop the rows where all elements are NaN (there is no such row, so df stays the same)
df.dropna(axis=0, how='all')
# Drop the rows where any element is NaN: returns an empty DataFrame
# with Columns: [A, B, C, D] and Index: []
df.dropna(axis=0, how='any')
# Keep only the rows with at least 2 non-NA values (drops the last row)
df.dropna(thresh=2)
# The row axis is the default:
df.dropna(how='all')   # no row is all-NaN, so df stays the same
df.dropna(how='any')   # empty DataFrame again

dfnew = pd.DataFrame([[3435234, 2, 5666, 0],
                      [3, 4, np.nan, 1],
                      [np.nan, np.nan, np.nan, 5]],
                     columns=list('ABCD'))
dfnew.dropna()  # operates on rows by default; only row 0 has no NaN and survives
# Handling missing data
import numpy as np
import pandas as pd

dates = pd.date_range('20170101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A','B','C','D'])
print(df)
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df.dropna(axis=0, how='any'))  # how={'any','all'}, default is 'any'
print(df.dropna(axis=1, how='all'))
# Fill in missing values
print(df.fillna(value=0))
# Print a boolean mask of the missing values
print(df.isnull())
# Check whether any value is missing (useful when the data is large)
print(np.any(df.isnull()) == True)
—————————————————
***6. Creating New Columns*** A new column is a function of existing columns
df['NewColumn1'] = df['column2']                  # create a copy of existing column2
df['NewColumn2'] = df['column2'] + 10             # add 10 to existing column2, then create a new column
df['NewColumn3'] = df['column1'] + df['column2']  # add the elements of column1 and column2, then create a new column
—————————————————
***7. Aggregate*** a. Groupby: Groupby helps to perform three operations: i. splitting the data into groups; ii. applying a function to each group individually; iii. combining the results into a data structure. A minimal groupby sketch follows below.
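Since the code below covers concat/merge rather than groupby itself, here is a minimal split-apply-combine sketch; the frame and its column names (team, points) are illustrative assumptions, not from the original:
import pandas as pd

df_g = pd.DataFrame({'team': ['A', 'A', 'B', 'B'],   # hypothetical example data
                     'points': [10, 20, 30, 40]})
# split by 'team', apply mean() to each group, combine into one Series
print(df_g.groupby('team')['points'].mean())
# team
# A    15.0
# B    35.0
# Name: points, dtype: float64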
# pandas concat
import pandas as pd
import numpy as np

# concatenating
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
result = pd.concat([df1,df2,df3], axis=0)  # concatenate rows
print(result)
# result1 = pd.concat([df1,df2,df3], axis=1)  # concatenate columns
# print(result1)
result = pd.concat([df1,df2,df3], axis=0, ignore_index=True)  # concatenate rows, ignoring the old index
print(result)

# join in {'inner','outer'}
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'], index=[2,3,4])
print(df1)
print(df2)
result2 = pd.concat([df1,df2], join='outer', ignore_index=True)  # missing cells are filled with NaN
print(result2)
result22 = pd.concat([df1,df2], join='outer')  # missing cells are filled with NaN
print(result22)
result3 = pd.concat([df1,df2], join='inner', ignore_index=True)  # non-shared columns are trimmed off
print(result3)
result33 = pd.concat([df1,df2], join='inner')  # non-shared columns are trimmed off
print(result33)

# join_axes (removed in pandas 1.0; reindex is the replacement)
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'], index=[2,3,4])
res = pd.concat([df1,df2], axis=1).reindex(df1.index)  # old form: pd.concat([df1,df2], axis=1, join_axes=[df1.index])
print(res)
res1 = pd.concat([df1,df2], axis=1)
print(res1)

# append (deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement)
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
res11 = pd.concat([df1, df2], ignore_index=True)       # was: df1.append(df2, ignore_index=True)
print(res11)
res12 = pd.concat([df1, df2, df3], ignore_index=True)  # was: df1.append([df2,df3], ignore_index=True)
print(res12)
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
res13 = pd.concat([df1, s1.to_frame().T], ignore_index=True)  # was: df1.append(s1, ignore_index=True)
print(res13)

# pandas merge
import pandas as pd
# merging two dataframes by key/keys (as in a database join)
# simple example
left = pd.DataFrame({'key':['K0','K1','K2','K3'],'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],'C':['C0','C1','C2','C3'],'D':['D0','D1','D2','D3']})
print(left)
print(right)
res14 = pd.merge(left, right, on='key')
print(res14)
# consider two keys
left = pd.DataFrame({'key1':['K0','K0','K1','K2'],'key2':['K0','K1','K0','K1'],'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K1','K2'],'key2':['K0','K0','K0','K0'],'C':['C0','C1','C2','C3'],'D':['D0','D1','D2','D3']})
print(left)
print(right)
res15 = pd.merge(left, right, on=['key1','key2'])
print(res15)
# how in ['left','right','inner','outer']
res16 = pd.merge(left, right, on=['key1','key2'], how='inner')
print(res16)
—————————————————
***9. Applying function to element, column or dataframe*** a. Map: iterates over each element of a series
df['column1'].map(lambda x: 10 + x)    # adds 10 to each element of column1
df['column2'].map(lambda x: 'AV' + x)  # prepends 'AV' to each element of column2 (the column must be of string type)
b. Apply: as the name suggests, applies a function along an axis of the DataFrame
df[['column1','column2']].apply(sum)   # returns the sum of all the values of column1 and column2
c. ApplyMap: applies a function to each element of the dataframe
func = lambda x: x + 2
df.applymap(func)  # adds 2 to each element of the dataframe (all columns must be of numeric type)
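A version note, as a hedge for newer installs: pandas 2.1 deprecated DataFrame.applymap in favor of DataFrame.map, so on recent versions the same element-wise call is:
df.map(func)  # pandas >= 2.1 replacement for df.applymap(func)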
—————————————————
***10. Identify unique values*** The unique function returns the unique values of a column
df['Column1'].unique()
—————————————————
***11. Basic Stats*** Pandas helps to understand the data using basic statistical methods. a. describe: returns quick stats (count, mean, std, min, first quartile, median, third quartile, max) on suitable columns
df.describe()
b. covariance: returns the covariance between suitable columns
df.cov()
c. correlation: returns the correlation between suitable columns
df.corr()
# https://python.freelycode.com/contribution/detail/333
# https://python.freelycode.com/contribution/detail/334
# http://www.datadependence.com/2016/05/scientific-python-pandas/
# Scientific computing in Python with Pandas
# The standard way to import Pandas
import pandas as pd  # This is the standard
# Load the data into Pandas, using the UK government rainfall data:
# https://data.gov.uk/dataset/average-temperature-and-rainfall-england-and-wales/resource/3fea0f7b-5304-4f11-a809-159f4558e7da
# Reading a csv into Pandas: the data is read from the csv file and stored in a dataframe.
# The header keyword tells Pandas whether the data has column names and where they are;
# set it to None if there are no column names.
df = pd.read_csv('uk_rain_2014.csv', header=0)
# Get the data ready for exploration and analysis.
# To quickly view the first x rows:
# Getting first x rows
df.head(5)
  Water Year  Rain (mm) Oct-Sep  Outflow (m3/s) Oct-Sep  Rain (mm) Dec-Feb  Outflow (m3/s) Dec-Feb  Rain (mm) Jun-Aug  Outflow (m3/s) Jun-Aug
0    1980/81               1182                    5408                292                    7248                174                    2212
1    1981/82               1098                    5112                257                    7316                242                    1936
2    1982/83               1156                    5701                330                    8567                124                    1802
3    1983/84                993                    4265                391                    8905                141                    1078
4    1984/85               1182                    5364                217                    5813                343                    4313
# To get the last x rows:
# Getting last x rows
# Pandas does not start from the end of the dataframe and print the rows backwards;
# it outputs them in their existing order.
df.tail(5)
# Finding out how many rows the dataset has (the number of entries)
len(df)
33
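The outputs from here on use short snake_case labels (rain_octsep and so on), so the source tutorial evidently renames the columns right after loading. A plausible reconstruction of that missing step (the labels are inferred from the outputs below, but the exact statement is an assumption):
# Assumed renaming step, inferred from the column labels used in later snippets
df.columns = ['water_year', 'rain_octsep', 'outflow_octsep',
              'rain_decfeb', 'outflow_decfeb', 'rain_junaug', 'outflow_junaug']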
# Finding out basic statistical information on your dataset
pd.options.display.float_format = '{:,.3f}'.format
# Limit output to 3 decimal places. describe() reports the count, mean, and standard deviation, among others.
df.describe()
       rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
count       33.000          33.000       33.000          33.000       33.000          33.000
mean     1,129.000       5,019.182      325.364       7,926.545      237.485       2,439.758
std        101.900         658.588       69.995       1,692.800       66.168       1,025.914
min        856.000       3,479.000      206.000       4,578.000      103.000       1,078.000
25%      1,053.000       4,506.000      268.000       6,690.000      193.000       1,797.000
50%      1,139.000       5,112.000      309.000       7,630.000      229.000       2,142.000
75%      1,182.000       5,497.000      360.000       8,905.000      280.000       2,959.000
max      1,387.000       6,391.000      484.000      11,486.000      379.000       5,261.000
# Filtering
# Extracting a whole column directly by its label
# Getting a column by label
df['rain_octsep']
# Filtering by multiple conditionals:
# this returns the entries where rain_octsep is below 1000 and outflow_octsep is below 4000.
df[(df.rain_octsep < 1000) & (df.outflow_octsep < 4000)]
# Can't use the keyword 'and'
   water_year  rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
15    1995/96          856            3479          245            5515          172            1439
# When the data contains strings, you can also filter with string methods.
# You must go through .str.[string method]; you cannot call the string method on the column directly.
# Filtering by string methods
df[df.water_year.str.startswith('199')]  # returns all the entries from the 1990s
   water_year  rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
10    1990/91         1022            4418          305            7120          216            1923
11    1991/92         1151            4506          246            5493          280            2118
12    1992/93         1130            5246          308            8751          219            2551
13    1993/94         1162            5583          422           10109          193            1638
14    1994/95         1110            5370          484           11486          103            1231
15    1995/96          856            3479          245            5515          172            1439
16    1996/97         1047            4019          258            5770          256            2102
17    1997/98         1169            4953          341            7747          285            3206
18    1998/99         1268            5824          360            8771          225            2240
19    1999/00         1204            5665          417           10021          197            2166
# Indexing
# If the rows have a numerical index, you can reference them with iloc.
# Getting a row via a numerical index
# iloc only works on numerical indices; it returns the row as a series.
df.iloc[30]
# Your dataset may have a column of years, or of decades, that makes a natural index.
# Setting a new index from an existing column
df = df.set_index(['water_year'])
df.head(5)
            rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
water_year
1980/81            1182            5408          292            7248          174            2212
1981/82            1098            5112          257            7316          242            1936
1982/83            1156            5701          330            8567          124            1802
1983/84             993            4265          391            8905          141            1078
1984/85            1182            5364          217            5813          343            4313
# In the example above we set the index values to strings, which means we can no longer
# use numeric positions to look up those rows by label. What then? We use loc.
# Getting a row via a label-based index
df.loc['2000/01']
# Here loc, like iloc, returns a series for the row you index;
# the only difference is that you reference it by a string label rather than a number.
# If loc is the label-based indexer and iloc the numerical one, what is ix?
# ix was a label-based indexer with a numerical fallback; it is not recommended
# and was removed in pandas 1.0.
# Getting a row via a label-based or numerical index
df.ix['1999/00']  # Label based with numerical index fallback * Not recommended (removed in pandas 1.0)
# As with loc and iloc, this returns a series containing the data of the row you indexed.
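Because ix is gone from current pandas, equivalent lookups under the water_year index set above would be as follows (the position 19 for 1999/00 is taken from the filtered table earlier):
df.loc['1999/00']   # label-based lookup, replaces df.ix['1999/00']
df.iloc[19]         # positional lookup, covers ix's numerical fallback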
# Call sort_index to sort the dataframe (inplace=True applies the sorting in place).
# Setting the keyword argument ascending=False sorts the data in descending order.
df.sort_index(ascending=False).head(5)
            rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
water_year
2012/13            1090            5329          350            9615          187            1797
2011/12            1285            5500          339            7630          379            5261
2010/11            1053            4521          265            6593          267            2885
2009/10            1103            4738          255            6435          244            1958
2008/09            1139            4941          268            6690          323            3189
# When you set a column as the index, those values are no longer part of the data itself.
# To turn the index back into a data column, use the opposite of set_index: reset_index.
# Returning an index to data
# This restores the original, numeric index.
df = df.reset_index('water_year')
df.head(5)
  water_year  rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
0    1980/81         1182            5408          292            7248          174            2212
1    1981/82         1098            5112          257            7316          242            1936
2    1982/83         1156            5701          330            8567          124            1802
3    1983/84          993            4265          391            8905          141            1078
4    1984/85         1182            5364          217            5813          343            4313
# Applying a function to a column
def base_year(year):
    base_year = year[:4]
    base_year = pd.to_datetime(base_year).year
    return base_year

df['year'] = df.water_year.apply(base_year)
df.head(5)
# Creating a new dataframe
# Create a new dataframe containing the entries which have rain_octsep values
# greater than 1250
high_rain = df[df.rain_octsep > 1250]
high_rain
   water_year  rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug  year
18    1998/99         1268            5824          360            8771          225            2240  1998
26    2006/07         1387            6391          437           10926          357            5168  2006
31    2011/12         1285            5500          339            7630          379            5261  2011
# The code above created the dataframe we will now pivot.
# A pivot is really a combination of operations we have already seen in this post:
# first it sets a new index (set_index()), then it sorts that index (sort_index()),
# and finally it does an unstack. Combined, that is a pivot. See if you can work out
# what will happen:
# Pivoting
# does set_index, sort_index and unstack in a row
high_rain.pivot('year', 'rain_octsep')[['outflow_octsep','outflow_decfeb','outflow_junaug']].fillna('')
# note: newer pandas requires keyword arguments here:
# high_rain.pivot(index='year', columns='rain_octsep')
            outflow_octsep                       outflow_decfeb                        outflow_junaug
rain_octsep      1268       1285       1387           1268       1285        1387           1268       1285       1387
year
1998        5,824.000                            8,771.000                             2,240.000
2006                              6,391.000                              10,926.000                          5,168.000
2011                   5,500.000                            7,630.000                             5,261.000
(the blank cells come from the fillna('') call above)
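As a check on the claim that pivot is set_index plus sort_index plus unstack, here is a minimal equivalent chain (a sketch that should produce the same frame as the pivot above):
(high_rain.set_index(['year', 'rain_octsep'])
          .sort_index()
          .unstack('rain_octsep')
          [['outflow_octsep', 'outflow_decfeb', 'outflow_junaug']]
          .fillna(''))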
# Merging datasets
# Sometimes you have two separate but directly related datasets, and you want to
# compare their differences or merge them.
# Merging two datasets together
rain_jpn = pd.read_csv('jpn_rain.csv')
rain_jpn.columns = ['year', 'jpn_rainfall']
uk_jpn_rain = df.merge(rain_jpn, on='year')
uk_jpn_rain.head(5)
# As you can see, the two datasets have been merged on the year column;
# the rain_jpn dataset only contains the year and the rainfall.
# Using Pandas to quickly plot graphs
%matplotlib inline
high_rain.plot(x='year', y='rain_octsep')
<matplotlib.axes._subplots.AxesSubplot at 0x7f1214a5d748>
# Saving your dataset
# Saving your data to a csv
df.to_csv('high_rain.csv')
# pandas plot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# plot data
# Series
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()

# DataFrame
data = pd.DataFrame(np.random.randn(1000,4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
print(data.head())
data.plot()
plt.show()

# plot methods: 'bar','hist','box','area','scatter','hexbin','pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class2', ax=ax)
plt.show()
References
Python科学计算之Pandas(上) (Scientific Python: Pandas, part 1)
Python科学计算之Pandas(下) (Scientific Python: Pandas, part 2)
An Introduction to Scientific Python – Pandas
CheatSheet: Data Exploration using Pandas in Python
python matplotlib 图像可视化 (matplotlib visualization in Python)
python-data-visualization-course
Interactive Web Plotting for Python
Interactive Web Plotting for Python (GitHub)
To be organized
Matplotlib: Introduction to Matplotlib and basic line plots
matplotlib — a 2D plotting library that produces…
Interpolation: bilinear interpolation is order 1,
nearest is order 0,
and cubic is the default (order 3).
Example:
import numpy as np
import scipy.ndimage

x = np.arange(64).reshape(8, 8)
print('Original array:')
print(x)
print('Resampled by a factor of 2 with nearest i…')