—————————————————
***Pandas Cheat Sheet*** —————————————————
***1. Reading and Writing Data*** ——
import pandas as pd
#a. Reading a csv file
df = pd.read_csv('Analysis.csv')
#b. Writing content of data frame to csv file
df.to_csv('werfer.csv')
#c. Reading an Excel file
df = pd.read_excel('sdfsdgsd.xlsx', 'sheet1')
#d. Writing content of data frame to Excel file
df.to_excel('sddg.xlsx', sheet_name='sheet2')
# pandas I/O: reading and writing data
# The pandas I/O API is a set of top-level reader functions, accessed like
# pd.read_csv(), that generally return a pandas object:
# read_csv, read_excel, read_hdf, read_sql, read_json,
# read_msgpack (experimental), read_html, read_gbq (experimental),
# read_stata, read_sas, read_clipboard,
# read_pickle # Python's built-in serialization
# The corresponding writer functions are object methods, accessed like df.to_csv():
# to_csv, to_excel, to_hdf, to_sql, to_json, to_msgpack, to_html,
# to_gbq, to_stata, to_clipboard, to_pickle
import pandas as pd
data = pd.read_csv('student.csv')
print(data)
data.to_pickle('student.pickle')
—————————————————
***2. Getting a Preview of the Dataframe***
#a. Looking at the top n records
df.head(5)
#b. Looking at the bottom n records
df.tail(5)
#c. Viewing the column names
df.columns
—————————————————
***3. Rename Columns of Data Frame***
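A minimal sketch with rename; the column names here are placeholders in the style of the sections below, not from the original:
#a. Renaming selected columns (returns a renamed copy; pass inplace=True to modify df)
df.rename(columns={'column1':'NewColumn1','column2':'NewColumn2'})
#b. Replacing all column names at once (the list length must match the number of columns)
df.columns = ['NewColumn1','NewColumn2']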
—————————————————
***4. Selecting Columns or Rows***
#a. Accessing sub data frames
df[['column1','column2']]
#b. Filtering records
df[df['column1']>10]
df[(df['column1']>10) & (df['column2']==30)]
df[(df['column1']>10) | (df['column2']==30)]
# pandas data selection
import pandas as pd
import numpy as np

dates = pd.date_range('20170101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A','B','C','D'])
print(df)
print(df['A'], df.A)
print(df[0:3], df['20170101':'20170104'])
# select by label: loc
print(df.loc['20170102'])
print(df.loc[:,['A','B']])
print(df.loc['20170102',['A','B']])
# select by position: iloc
print(df.iloc[3])
print(df.iloc[3,1])
print(df.iloc[1:3,1:3])
print(df.iloc[[1,3,5],1:3])
# mixed selection: ix (deprecated; prefer loc/iloc)
print(df.ix[:3,['A','C']])
# Boolean indexing
print(df)
print(df[df.A>8])
—————————————————
***5. Handling Missing Values*** This is an inevitable part of dealing with data. To overcome this hurdle, use the dropna or fillna functions.
#a. dropna: It is used to drop rows or columns having missing data
df1.dropna()
#b. fillna: It is used to fill missing values
df2.fillna(value=5) # It replaces all missing values with 5
mean = df2['column1'].mean()
df2['column1'].fillna(mean) # It replaces all missing values of column1 with the mean of the available values
—————————————————
from pandas import Series,DataFrame
import pandas as pd
ser = Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
ser
ser.drop('c')
ser
# .drop() returns a new object; the original object is unchanged.
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, 5]],
                  columns=list('ABCD'))
df
# Drop the columns where all elements are NaN
df.dropna(axis=1, how='all')
# Drop the columns where any of the elements is NaN (only column D survives)
df.dropna(axis=1, how='any')
# Drop the rows where all of the elements are NaN
# (there is no such row, so df stays the same)
df.dropna(axis=0, how='all')
# Drop the rows where any of the elements are NaN
# (every row has a NaN, so the result is an empty DataFrame)
df.dropna(axis=0, how='any')
# Keep only the rows with at least 2 non-NaN values
df.dropna(thresh=2)
# The default axis is the row, so these behave like the axis=0 calls above:
df.dropna(how='all')
df.dropna(how='any')

dfnew = pd.DataFrame([[3435234, 2, 5666, 0],
                      [3, 4, np.nan, 1],
                      [np.nan, np.nan, np.nan, 5]],
                     columns=list('ABCD'))
dfnew.dropna() # operates on rows by default, dropping any row containing NaN
# Handling missing data
import numpy as np
import pandas as pd

dates = pd.date_range('20170101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A','B','C','D'])
print(df)
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df.dropna(axis=0, how='any')) # how={'any','all'}, default is 'any'
print(df.dropna(axis=1, how='all'))
# Fill in missing data
print(df.fillna(value=0))
# Show which entries are missing
print(df.isnull())
# Check whether any value is missing (useful when the data is large)
print(np.any(df.isnull()) == True)
—————————————————
***6. Creating New Columns*** A new column is a function of existing columns
df['NewColumn1'] = df['column2'] # Create a copy of existing column2
df['NewColumn2'] = df['column2'] + 10 # Add 10 to existing column2, then create a new column
df['NewColumn3'] = df['column1'] + df['column2'] # Add elements of column1 and column2, then create a new column
—————————————————
***7. Aggregate*** a. Groupby: groupby helps to perform three operations:
i. Splitting the data into groups
ii. Applying a function to each group individually
iii. Combining the results into a data structure
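A minimal sketch of those three steps on a toy dataframe; the key/value column names here are placeholders, not from the original:
import pandas as pd
df_toy = pd.DataFrame({'key':['a','a','b','b'], 'value':[1,2,3,4]})
print(df_toy.groupby('key').sum()) # split on 'key', apply sum to each group, combine the results
print(df_toy.groupby('key')['value'].mean()) # aggregate a single column per group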
# pandas concatenation: concat
import pandas as pd
import numpy as np

# concatenating
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
result = pd.concat([df1,df2,df3], axis=0) # concatenate along rows
print(result)
#result1 = pd.concat([df1,df2,df3], axis=1) # concatenate along columns
#print(result1)
result = pd.concat([df1,df2,df3], axis=0, ignore_index=True) # concatenate along rows, ignoring the original index
print(result)
# join, ['inner','outer']
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'], index=[2,3,4])
print(df1)
print(df2)
result2 = pd.concat([df1,df2], join='outer', ignore_index=True) # missing entries are filled with NaN
print(result2)
result22 = pd.concat([df1,df2], join='outer') # missing entries are filled with NaN
print(result22)
result3 = pd.concat([df1,df2], join='inner', ignore_index=True) # non-shared parts are cut off
print(result3)
result33 = pd.concat([df1,df2], join='inner') # non-shared parts are cut off
print(result33)
# join_axes (removed in pandas 1.0; use .reindex instead)
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'], index=[2,3,4])
res = pd.concat([df1,df2], axis=1, join_axes=[df1.index])
print(res)
res1 = pd.concat([df1,df2], axis=1)
print(res1)
# append (removed in pandas 2.0; use pd.concat instead)
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
res11 = df1.append(df2, ignore_index=True)
print(res11)
res12 = df1.append([df2,df3], ignore_index=True)
print(res12)
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
res13 = df1.append(s1, ignore_index=True)
print(res13)

# pandas merging: merge
import pandas as pd
# merging two DataFrames by key/keys (as in a database join)
# simple example
left = pd.DataFrame({'key':['K0','K1','K2','K3'],'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],'C':['C0','C1','C2','C3'],'D':['D0','D1','D2','D3']})
print(left)
print(right)
res14 = pd.merge(left, right, on='key')
print(res14)
# consider two keys
left = pd.DataFrame({'key1':['K0','K0','K1','K2'],'key2':['K0','K1','K0','K1'],'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K1','K2'],'key2':['K0','K0','K0','K0'],'C':['C0','C1','C2','C3'],'D':['D0','D1','D2','D3']})
print(left)
print(right)
res15 = pd.merge(left, right, on=['key1','key2'])
print(res15)
# how = ['left','right','inner','outer']
res16 = pd.merge(left, right, on=['key1','key2'], how='inner')
print(res16)
—————————————————
***9. Applying function to element, column or dataframe*** a. Map: It iterates over each element of a series
df['column1'].map(lambda x: 10+x) # this will add 10 to each element of column1
df['column2'].map(lambda x: 'AV'+x) # this will concatenate 'AV' at the beginning of each element of column2 (column format is string)
b. Apply: As the name suggests, applies a function along any axis of the DataFrame
df[['column1','column2']].apply(sum) # It returns the sum of all the values of column1 and column2
c. ApplyMap: This helps to apply a function to each element of the dataframe
func = lambda x: x + 2
df.applymap(func) # it will add 2 to each element of the dataframe (all columns of the dataframe must be numeric)
—————————————————
***10. Identify unique values*** The unique function returns the unique values of a column
df['Column1'].unique()
—————————————————
***11. Basic Stats*** Pandas helps to understand the data using basic statistical methods. a. describe: This returns the quick stats (count, mean, std, min, first quartile, median, third quartile, max) on suitable columns
df.describe()
b. covariance: It returns the covariance between suitable columns
df.cov()
c. correlation: It returns the correlation between suitable columns
df.corr()
# https://python.freelycode.com/contribution/detail/333
# https://python.freelycode.com/contribution/detail/334
# http://www.datadependence.com/2016/05/scientific-python-pandas/
# Scientific Python: Pandas
# The standard way to import pandas:
import pandas as pd # This is the standard
# Load the data into pandas, using the UK government rainfall dataset
# (https://data.gov.uk/dataset/average-temperature-and-rainfall-england-and-wales/resource/3fea0f7b-5304-4f11-a809-159f4558e7da)
# Reading a csv into Pandas: read the data from the csv file and store it in a dataframe.
# The header keyword tells pandas whether the data has column names and where they are;
# if there are no column names, set header=None.
df = pd.read_csv('uk_rain_2014.csv', header=0)
# Prepare your data for mining and analysis
# To take a quick look at the first x rows of the data:
# Getting first x rows
df.head(5)
   Water Year  Rain (mm) Oct-Sep  Outflow (m3/s) Oct-Sep  Rain (mm) Dec-Feb  Outflow (m3/s) Dec-Feb  Rain (mm) Jun-Aug  Outflow (m3/s) Jun-Aug
0     1980/81               1182                    5408                292                    7248                174                    2212
1     1981/82               1098                    5112                257                    7316                242                    1936
2     1982/83               1156                    5701                330                    8567                124                    1802
3     1983/84                993                    4265                391                    8905                141                    1078
4     1984/85               1182                    5364                217                    5813                343                    4313
# Getting the last x rows
# Note that pandas does not print the rows in reverse order starting from the end;
# they appear in the order they hold in the dataframe.
df.tail(5)
# Finding out how many rows the dataset has (the number of entries)
len(df)
33
# Finding out basic statistical information on your dataset
pd.options.display.float_format = '{:,.3f}'.format
# Limit output to 3 decimal places. Gives count, mean, standard deviation, etc.
df.describe()
       rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
count       33.000          33.000       33.000          33.000       33.000          33.000
mean     1,129.000       5,019.182      325.364       7,926.545      237.485       2,439.758
std        101.900         658.588       69.995       1,692.800       66.168       1,025.914
min        856.000       3,479.000      206.000       4,578.000      103.000       1,078.000
25%      1,053.000       4,506.000      268.000       6,690.000      193.000       1,797.000
50%      1,139.000       5,112.000      309.000       7,630.000      229.000       2,142.000
75%      1,182.000       5,497.000      360.000       8,905.000      280.000       2,959.000
max      1,387.000       6,391.000      484.000      11,486.000      379.000       5,261.000
# Filtering
# Extract a whole column directly by using its label
# Getting a column by label
df['rain_octsep']
# Filtering by multiple conditionals
# This returns the entries where rain_octsep is below 1000 and outflow_octsep is below 4000.
df[(df.rain_octsep <1000) & (df.outflow_octsep <4000)]
# Can't use the keyword 'and'
    water_year  rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
15     1995/96          856            3479          245            5515          172            1439
# When the data contains strings, you can also filter with string methods.
# You must use .str.[string method]; you cannot call string methods on the column directly.
# Filtering by string methods
df[df.water_year.str.startswith('199')] # this statement returns all entries from the 1990s
    water_year  rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
10     1990/91         1022            4418          305            7120          216            1923
11     1991/92         1151            4506          246            5493          280            2118
12     1992/93         1130            5246          308            8751          219            2551
13     1993/94         1162            5583          422           10109          193            1638
14     1994/95         1110            5370          484           11486          103            1231
15     1995/96          856            3479          245            5515          172            1439
16     1996/97         1047            4019          258            5770          256            2102
17     1997/98         1169            4953          341            7747          285            3206
18     1998/99         1268            5824          360            8771          225            2240
19     1999/00         1204            5665          417           10021          197            2166
# Indexing
# If the rows have a numerical index, you can reference them with iloc
# Getting a row via a numerical index
# iloc only works on numerical indexes; it returns a Series of that row.
df.iloc[30]
# The dataset may have a column of years, or of decades, that would serve as an index
# Setting a new index from an existing column
df = df.set_index(['water_year'])
df.head(5)
            rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
water_year
1980/81            1182            5408          292            7248          174            2212
1981/82            1098            5112          257            7316          242            1936
1982/83            1156            5701          330            8567          124            1802
1983/84             993            4265          391            8905          141            1078
1984/85            1182            5364          217            5813          343            4313
# In the example above we set the index to string values, which means we can
# no longer use iloc to reference the rows. What then? We use loc.
# Getting a row via a label-based index
df.loc['2000/01']
# Here, like iloc, loc returns a Series of the row you index.
# The only difference is that you reference by string label rather than by numerical position.
# If loc is the string-label indexing method and iloc the numerical one, what is ix?
# In fact, ix indexes by string label but falls back to numerical indexing as an alternative.
# Getting a row via a label-based or numerical index
df.ix['1999/00'] # Label based with numerical index fallback * Not recommended
# As with loc and iloc, this returns a Series with the data of the indexed row
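ix was deprecated in pandas 0.20 and later removed; a hedged sketch of the modern equivalents:
df.loc['1999/00'] # label-based lookup, same row as df.ix['1999/00']
df.iloc[19] # positional lookup, assuming '1999/00' sits at position 19 as in the table above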
# Call sort_index to sort the dataframe
# inplace=True applies the sorting in place
# Setting the keyword argument ascending to False sorts the data in descending order
df.sort_index(ascending=False).head(5)
            rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
water_year
2012/13            1090            5329          350            9615          187            1797
2011/12            1285            5500          339            7630          379            5261
2010/11            1053            4521          265            6593          267            2885
2009/10            1103            4738          255            6435          244            1958
2008/09            1139            4941          268            6690          323            3189
# When you set a column as the index, its values are no longer data in their own right.
# To turn the index back into a data column, use the opposite of set_index: reset_index.
# Returning an index to data
# This restores the original form of the index.
df = df.reset_index('water_year')
df.head(5)
   water_year  rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug
0     1980/81         1182            5408          292            7248          174            2212
1     1981/82         1098            5112          257            7316          242            1936
2     1982/83         1156            5701          330            8567          124            1802
3     1983/84          993            4265          391            8905          141            1078
4     1984/85         1182            5364          217            5813          343            4313
# Applying a function to a column
def base_year(year):
    base_year = year[:4]
    base_year = pd.to_datetime(base_year).year
    return base_year

df['year'] = df.water_year.apply(base_year)
df.head(5)
# Creating a new dataframe
# Create a new dataframe containing entries which have rain_octsep values greater than 1250
high_rain = df[df.rain_octsep > 1250]
high_rain
    water_year  rain_octsep  outflow_octsep  rain_decfeb  outflow_decfeb  rain_junaug  outflow_junaug  year
18     1998/99         1268            5824          360            8771          225            2240  1998
26     2006/07         1387            6391          437           10926          357            5168  2006
31     2011/12         1285            5500          339            7630          379            5261  2011
# The code above created the dataframe we will now pivot.
# A pivot is actually a combination of operations we have already seen in this article:
# it sets a new index (set_index()), sorts that index (sort_index()), and finally unstacks it.
# Combined, that is a pivot. See if you can predict what will happen:
# Pivoting
# does set_index, sort_index and unstack in a row
high_rain.pivot('year', 'rain_octsep')[['outflow_octsep','outflow_decfeb','outflow_junaug']].fillna('')
            outflow_octsep                       outflow_decfeb                        outflow_junaug
rain_octsep  1268      1285      1387            1268      1285      1387              1268      1285      1387
year
1998         5,824.000                           8,771.000                             2,240.000
2006                             6,391.000                            10,926.000                           5,168.000
2011                   5,500.000                           7,630.000                             5,261.000
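As a sketch of the decomposition described above, the same pivot can be written by hand (assuming the high_rain dataframe built earlier):
res_pivot = high_rain.set_index(['year','rain_octsep']).sort_index().unstack('rain_octsep')
print(res_pivot[['outflow_octsep','outflow_decfeb','outflow_junaug']].fillna(''))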
# Merging datasets
# Sometimes you have two separate but directly related datasets,
# and you want to compare their differences or merge them
# Merging two datasets together
rain_jpn = pd.read_csv('jpn_rain.csv')
rain_jpn.columns = ['year', 'jpn_rainfall']
uk_jpn_rain = df.merge(rain_jpn, on = 'year')
uk_jpn_rain.head(5)
# As you can see, the two datasets have been merged on the year column;
# the rain_jpn dataset only contains the year and the rainfall.
# Using pandas to quickly plot graphs
%matplotlib inline
high_rain.plot(x='year', y='rain_octsep')
<matplotlib.axes._subplots.AxesSubplot at 0x7f1214a5d748>
# Saving your data to a csv
df.to_csv('high_rain.csv')
# pandas plot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# plot data
# Series
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()

# DataFrame
data = pd.DataFrame(np.random.randn(1000,4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
print(data.head())
data.plot()
plt.show()

# plot methods: 'bar','hist','box','area','scatter','hexbin','pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class2', ax=ax) # draw on the same axes
plt.show()
References
Python科学计算之Pandas (上): https://python.freelycode.com/contribution/detail/333
Python科学计算之Pandas (下): https://python.freelycode.com/contribution/detail/334
An Introduction to Scientific Python – Pandas: http://www.datadependence.com/2016/05/scientific-python-pandas/
CheatSheet: Data Exploration using Pandas in Python
python matplotlib 图像可视化
python-data-visualization-course
Interactive Web Plotting for Python
Interactive Web Plotting for Python (GitHub)