点击标题即可获取文章源代码和笔记
4.1.0 概要
Pandas基础处理Pandas是什么?为什么用?核心数据结构DataFramePanelSeries基本操作运算画图文件的读取与存储高级处理4. 1Pandas介绍4.1 .1 Pandas介绍 - 数据处理工具panel + data + analysispanel面板数据 - 计量经济学 三维数据4.1 .2 为什么使用Pandas便捷的数据处理能力读取文件方便封装了Matplotlib、Numpy的画图和计算4.1 .3 DataFrame结构:既有行索引,又有列索引的二维数组属性:shapeindexcolumnsvaluesT方法:head( ) tail( ) 3 DataFrame索引的设置1 )修改行列索引值2 )重设索引3 )设置新索引2 PanelDataFrame的容器3 Series带索引的一维数组属性indexvalues总结:DataFrame是Series的容器Panel是DataFrame的容器
4.2 基本数据操作4.2 .1 索引操作1 )直接索引先列后行2 )按名字索引loc3 )按数字索引iloc4 )组合索引数字、名字4.2 .3 排序对内容排序dataframeseries对索引排序dataframeseries
4.3 DataFrame运算算术运算逻辑运算逻辑运算符布尔索引逻辑运算函数query( ) isin( ) 统计运算min max mean median var stdnp. argmax( ) np. argmin( ) 自定义运算apply ( func, axis= 0 ) True func: 自定义函数
4.4 Pandas画图sr. plot( )
4.5 文件读取与存储4.5 .1 CSVpd. read_csv( path) usecols= names= dataframe. to_csv( path) columns= [ ] index= False header= False 4.5 .2 HDF5hdf5 存储 3 维数据的文件key1 dataframe1二维数据key2 dataframe2二维数据pd. read_hdf( path, key= ) df. to_hdf( path, key= ) 4.5 .3 JSONpd. read_json( path) orient= "records" lines= True df. to_json( patn) orient= "records" lines= True
4.1.3 DataFrame
import numpy as np
stock_change = np. random. normal( 0 , 1 , ( 10 , 5 ) )
stock_change
array([[ 0.77072465, 1.30408183, -0.44043464, 0.8900768 , -0.80947118],[ 0.92407994, 0.01646795, -1.26614793, 1.52393669, -0.85373051],[-1.68378051, 0.4302981 , 0.8069393 , 0.60557427, -0.03960376],[ 0.75708007, -0.39899325, 0.23027082, -0.89585658, -1.86590247],[-0.41516245, -1.31841546, 0.16256478, -0.67449097, -1.26234013],[-0.27687242, -0.74154521, -0.03755446, 1.24182603, -0.79444361],[-0.2549323 , -0.41034663, -1.85076521, -1.28663451, -0.28566877],[ 1.22453612, -1.60200055, -1.83171522, -0.85322799, -1.70950421],[ 2.00461483, 1.49338564, 0.33928513, -0.1776084 , -0.39698965],[ 0.2184662 , -0.03868143, -0.21432675, 0.00604093, 1.35011139]])
import pandas as pd
pd. DataFrame( stock_change)
0 1 2 3 4 0 0.770725 1.304082 -0.440435 0.890077 -0.809471 1 0.924080 0.016468 -1.266148 1.523937 -0.853731 2 -1.683781 0.430298 0.806939 0.605574 -0.039604 3 0.757080 -0.398993 0.230271 -0.895857 -1.865902 4 -0.415162 -1.318415 0.162565 -0.674491 -1.262340 5 -0.276872 -0.741545 -0.037554 1.241826 -0.794444 6 -0.254932 -0.410347 -1.850765 -1.286635 -0.285669 7 1.224536 -1.602001 -1.831715 -0.853228 -1.709504 8 2.004615 1.493386 0.339285 -0.177608 -0.396990 9 0.218466 -0.038681 -0.214327 0.006041 1.350111
stock_code = [ '股票' + str ( i) for i in range ( stock_change. shape[ 0 ] ) ]
stock_code
['股票0', '股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9']
data = pd. DataFrame( stock_change, index= stock_code)
data
0 1 2 3 4 股票0 0.770725 1.304082 -0.440435 0.890077 -0.809471 股票1 0.924080 0.016468 -1.266148 1.523937 -0.853731 股票2 -1.683781 0.430298 0.806939 0.605574 -0.039604 股票3 0.757080 -0.398993 0.230271 -0.895857 -1.865902 股票4 -0.415162 -1.318415 0.162565 -0.674491 -1.262340 股票5 -0.276872 -0.741545 -0.037554 1.241826 -0.794444 股票6 -0.254932 -0.410347 -1.850765 -1.286635 -0.285669 股票7 1.224536 -1.602001 -1.831715 -0.853228 -1.709504 股票8 2.004615 1.493386 0.339285 -0.177608 -0.396990 股票9 0.218466 -0.038681 -0.214327 0.006041 1.350111
date = pd. date_range( start= "20200618" , periods= 5 , freq= "B" )
date
DatetimeIndex(['2020-06-18', '2020-06-19', '2020-06-22', '2020-06-23','2020-06-24'],dtype='datetime64[ns]', freq='B')
data = pd. DataFrame( stock_change, index= stock_code, columns= date)
data
2020-06-18 2020-06-19 2020-06-22 2020-06-23 2020-06-24 股票0 0.770725 1.304082 -0.440435 0.890077 -0.809471 股票1 0.924080 0.016468 -1.266148 1.523937 -0.853731 股票2 -1.683781 0.430298 0.806939 0.605574 -0.039604 股票3 0.757080 -0.398993 0.230271 -0.895857 -1.865902 股票4 -0.415162 -1.318415 0.162565 -0.674491 -1.262340 股票5 -0.276872 -0.741545 -0.037554 1.241826 -0.794444 股票6 -0.254932 -0.410347 -1.850765 -1.286635 -0.285669 股票7 1.224536 -1.602001 -1.831715 -0.853228 -1.709504 股票8 2.004615 1.493386 0.339285 -0.177608 -0.396990 股票9 0.218466 -0.038681 -0.214327 0.006041 1.350111
DataFrame属性
data. shape
(10, 5)
data. index
Index(['股票0', '股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9'], dtype='object')
data. columns
DatetimeIndex(['2020-06-18', '2020-06-19', '2020-06-22', '2020-06-23','2020-06-24'],dtype='datetime64[ns]', freq='B')
data. values
array([[ 0.77072465, 1.30408183, -0.44043464, 0.8900768 , -0.80947118],[ 0.92407994, 0.01646795, -1.26614793, 1.52393669, -0.85373051],[-1.68378051, 0.4302981 , 0.8069393 , 0.60557427, -0.03960376],[ 0.75708007, -0.39899325, 0.23027082, -0.89585658, -1.86590247],[-0.41516245, -1.31841546, 0.16256478, -0.67449097, -1.26234013],[-0.27687242, -0.74154521, -0.03755446, 1.24182603, -0.79444361],[-0.2549323 , -0.41034663, -1.85076521, -1.28663451, -0.28566877],[ 1.22453612, -1.60200055, -1.83171522, -0.85322799, -1.70950421],[ 2.00461483, 1.49338564, 0.33928513, -0.1776084 , -0.39698965],[ 0.2184662 , -0.03868143, -0.21432675, 0.00604093, 1.35011139]])
data. T
股票0 股票1 股票2 股票3 股票4 股票5 股票6 股票7 股票8 股票9 2020-06-18 0.770725 0.924080 -1.683781 0.757080 -0.415162 -0.276872 -0.254932 1.224536 2.004615 0.218466 2020-06-19 1.304082 0.016468 0.430298 -0.398993 -1.318415 -0.741545 -0.410347 -1.602001 1.493386 -0.038681 2020-06-22 -0.440435 -1.266148 0.806939 0.230271 0.162565 -0.037554 -1.850765 -1.831715 0.339285 -0.214327 2020-06-23 0.890077 1.523937 0.605574 -0.895857 -0.674491 1.241826 -1.286635 -0.853228 -0.177608 0.006041 2020-06-24 -0.809471 -0.853731 -0.039604 -1.865902 -1.262340 -0.794444 -0.285669 -1.709504 -0.396990 1.350111
DataFrame方法
data. head( )
2020-06-18 2020-06-19 2020-06-22 2020-06-23 2020-06-24 股票0 0.770725 1.304082 -0.440435 0.890077 -0.809471 股票1 0.924080 0.016468 -1.266148 1.523937 -0.853731 股票2 -1.683781 0.430298 0.806939 0.605574 -0.039604 股票3 0.757080 -0.398993 0.230271 -0.895857 -1.865902 股票4 -0.415162 -1.318415 0.162565 -0.674491 -1.262340
data. tail( )
2020-06-18 2020-06-19 2020-06-22 2020-06-23 2020-06-24 股票5 -0.276872 -0.741545 -0.037554 1.241826 -0.794444 股票6 -0.254932 -0.410347 -1.850765 -1.286635 -0.285669 股票7 1.224536 -1.602001 -1.831715 -0.853228 -1.709504 股票8 2.004615 1.493386 0.339285 -0.177608 -0.396990 股票9 0.218466 -0.038681 -0.214327 0.006041 1.350111
3 DataFrame索引的设置
data. index[ 2 ]
'股票2'
data. index[ 2 ] = "股票88"
---------------------------------------------------------------------------TypeError Traceback (most recent call last)<ipython-input-19-9e95917cc4d9> in <module>
----> 1 data.index[2] = "股票88"D:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)3908 3909 def __setitem__(self, key, value):
-> 3910 raise TypeError("Index does not support mutable operations")3911 3912 def __getitem__(self, key):TypeError: Index does not support mutable operations
stock_ = [ "股票_{}" . format ( i) for i in range ( 10 ) ]
data. index = stock_
data. index
Index(['股票_0', '股票_1', '股票_2', '股票_3', '股票_4', '股票_5', '股票_6', '股票_7', '股票_8','股票_9'],dtype='object')
重设索引
reset_index(drop=False) 设置新的下标索引 drop:默认为False,不删除原来索引,如果为True,删除原来的索引值
data. reset_index( )
index 2020-06-18 00:00:00 2020-06-19 00:00:00 2020-06-22 00:00:00 2020-06-23 00:00:00 2020-06-24 00:00:00 0 股票_0 0.770725 1.304082 -0.440435 0.890077 -0.809471 1 股票_1 0.924080 0.016468 -1.266148 1.523937 -0.853731 2 股票_2 -1.683781 0.430298 0.806939 0.605574 -0.039604 3 股票_3 0.757080 -0.398993 0.230271 -0.895857 -1.865902 4 股票_4 -0.415162 -1.318415 0.162565 -0.674491 -1.262340 5 股票_5 -0.276872 -0.741545 -0.037554 1.241826 -0.794444 6 股票_6 -0.254932 -0.410347 -1.850765 -1.286635 -0.285669 7 股票_7 1.224536 -1.602001 -1.831715 -0.853228 -1.709504 8 股票_8 2.004615 1.493386 0.339285 -0.177608 -0.396990 9 股票_9 0.218466 -0.038681 -0.214327 0.006041 1.350111
data. reset_index( drop= True )
2020-06-18 2020-06-19 2020-06-22 2020-06-23 2020-06-24 0 0.770725 1.304082 -0.440435 0.890077 -0.809471 1 0.924080 0.016468 -1.266148 1.523937 -0.853731 2 -1.683781 0.430298 0.806939 0.605574 -0.039604 3 0.757080 -0.398993 0.230271 -0.895857 -1.865902 4 -0.415162 -1.318415 0.162565 -0.674491 -1.262340 5 -0.276872 -0.741545 -0.037554 1.241826 -0.794444 6 -0.254932 -0.410347 -1.850765 -1.286635 -0.285669 7 1.224536 -1.602001 -1.831715 -0.853228 -1.709504 8 2.004615 1.493386 0.339285 -0.177608 -0.396990 9 0.218466 -0.038681 -0.214327 0.006041 1.350111
以某列值设置为新的索引
set_index(keys,drop=True) keys:列索引名或者列索引名称的列表 drop:boolean,default True 当作新的索引,删除原来的索引列
设置新索引案例
df = pd. DataFrame( { 'month' : [ 1 , 4 , 7 , 10 ] , 'year' : [ 2012 , 2014 , 2013 , 2014 ] , 'sale' : [ 55 , 40 , 84 , 31 ]
} )
df
month year sale 0 1 2012 55 1 4 2014 40 2 7 2013 84 3 10 2014 31
df. set_index( 'month' )
year sale month 1 2012 55 4 2014 40 7 2013 84 10 2014 31
new_df = df. set_index( [ 'year' , 'month' ] )
new_df
sale year month 2012 1 55 2014 4 40 2013 7 84 2014 10 31
new_df. index
MultiIndex([(2012, 1),(2014, 4),(2013, 7),(2014, 10)],names=['year', 'month'])
4.1.4 MultiIndex 与 Panel的关系
1 Multilndex多级或分层索引对象。
names: levels的名称
levels:每个level的元组值
new_df. index. names
FrozenList(['year', 'month'])
new_df. index. levels
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])
2 Panel
p = pd. Panel( )
p
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version"""Entry point for launching an IPython kernel.<pandas.__getattr__.<locals>.Panel at 0x203fd31ea08>
data
2020-06-18 2020-06-19 2020-06-22 2020-06-23 2020-06-24 股票_0 0.770725 1.304082 -0.440435 0.890077 -0.809471 股票_1 0.924080 0.016468 -1.266148 1.523937 -0.853731 股票_2 -1.683781 0.430298 0.806939 0.605574 -0.039604 股票_3 0.757080 -0.398993 0.230271 -0.895857 -1.865902 股票_4 -0.415162 -1.318415 0.162565 -0.674491 -1.262340 股票_5 -0.276872 -0.741545 -0.037554 1.241826 -0.794444 股票_6 -0.254932 -0.410347 -1.850765 -1.286635 -0.285669 股票_7 1.224536 -1.602001 -1.831715 -0.853228 -1.709504 股票_8 2.004615 1.493386 0.339285 -0.177608 -0.396990 股票_9 0.218466 -0.038681 -0.214327 0.006041 1.350111
Series
data. iloc[ 1 , : ]
2020-06-18 0.924080
2020-06-19 0.016468
2020-06-22 -1.266148
2020-06-23 1.523937
2020-06-24 -0.853731
Freq: B, Name: 股票_1, dtype: float64
type ( data. iloc[ 1 , : ] )
pandas.core.series.Series
属性
data. iloc[ 1 , : ] . index
DatetimeIndex(['2020-06-18', '2020-06-19', '2020-06-22', '2020-06-23','2020-06-24'],dtype='datetime64[ns]', freq='B')
data. iloc[ 1 , : ] . values
array([ 0.92407994, 0.01646795, -1.26614793, 1.52393669, -0.85373051])
1. 创建Series
通过已有数据创建
pd. Series( np. arange( 10 ) )
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
pd. Series( [ 6.7 , 5.6 , 3 , 10 , 2 ] , index= [ 1 , 2 , 3 , 4 , 5 ] )
1 6.7
2 5.6
3 3.0
4 10.0
5 2.0
dtype: float64
pd. Series( { 'red' : 100 , 'blue' : 200 , 'green' : 500 , 'yellow' : 1000
} )
red 100
blue 200
green 500
yellow 1000
dtype: int64
总结
DataFrame 是 Series的容器 Panel 是 DataFrame的容器
4.2 基本数据操作
datas = pd. read_excel( "./datas/szfj_baoan.xls" )
datas
district roomnum hall AREA C_floor floor_num school subway per_price 0 baoan 3 2 89.3 middle 31 0 0 7.0773 1 baoan 4 2 127.0 high 31 0 0 6.9291 2 baoan 1 1 28.0 low 39 0 0 3.9286 3 baoan 1 1 28.0 middle 30 0 0 3.3568 4 baoan 2 2 78.0 middle 8 1 1 5.0769 ... ... ... ... ... ... ... ... ... ... 1246 baoan 4 2 89.3 low 8 0 0 4.2553 1247 baoan 2 1 67.0 middle 30 0 0 3.8060 1248 baoan 2 2 67.4 middle 29 1 0 5.3412 1249 baoan 2 2 73.1 low 15 1 0 5.9508 1250 baoan 3 2 86.2 middle 32 0 1 4.5244
1251 rows × 9 columns
datas. columns
Index(['district', 'roomnum', 'hall', 'AREA', 'C_floor', 'floor_num', 'school','subway', 'per_price'],dtype='object')
datas = datas. drop( columns= [ 'school' , 'subway' , ] , axis= 0 )
datas
district roomnum hall AREA C_floor floor_num per_price 0 baoan 3 2 89.3 middle 31 7.0773 1 baoan 4 2 127.0 high 31 6.9291 2 baoan 1 1 28.0 low 39 3.9286 3 baoan 1 1 28.0 middle 30 3.3568 4 baoan 2 2 78.0 middle 8 5.0769 ... ... ... ... ... ... ... ... 1246 baoan 4 2 89.3 low 8 4.2553 1247 baoan 2 1 67.0 middle 30 3.8060 1248 baoan 2 2 67.4 middle 29 5.3412 1249 baoan 2 2 73.1 low 15 5.9508 1250 baoan 3 2 86.2 middle 32 4.5244
1251 rows × 7 columns
4.2.1 索引操作
1.直接使用行列索引(先列后行)
datas[ "per_price" ] [ 0 ]
7.0773
2. 按名字索引(先行后列)
datas. loc[ 0 ] [ "per_price" ]
7.0773
datas. loc[ 0 , "per_price" ]
7.0773
3.按数字索引
datas. iloc[ 0 , 6 ]
7.0773
datas. index[ 0 : 4 ]
RangeIndex(start=0, stop=4, step=1)
datas. loc[ datas. index[ 0 : 4 ] , [ "district" , "roomnum" ] ]
district roomnum 0 baoan 3 1 baoan 4 2 baoan 1 3 baoan 1
datas. columns. get_indexer( [ "district" , "roomnum" ] )
array([0, 1], dtype=int64)
datas. iloc[ 0 : 4 , datas. columns. get_indexer( [ "district" , "roomnum" ] ) ]
district roomnum 0 baoan 3 1 baoan 4 2 baoan 1 3 baoan 1
4.2.2 赋值操作
datas[ "hall" ] = 5
datas. head( )
district roomnum hall AREA C_floor floor_num per_price 0 baoan 3 5 89.3 middle 31 7.0773 1 baoan 4 5 127.0 high 31 6.9291 2 baoan 1 5 28.0 low 39 3.9286 3 baoan 1 5 28.0 middle 30 3.3568 4 baoan 2 5 78.0 middle 8 5.0769
datas. hall = 1
datas. head( )
district roomnum hall AREA C_floor floor_num per_price 0 baoan 3 1 89.3 middle 31 7.0773 1 baoan 4 1 127.0 high 31 6.9291 2 baoan 1 1 28.0 low 39 3.9286 3 baoan 1 1 28.0 middle 30 3.3568 4 baoan 2 1 78.0 middle 8 5.0769
datas. iloc[ 0 , 0 ] = "zzzz"
datas. head( )
district roomnum hall AREA C_floor floor_num per_price 0 zzzz 3 1 89.3 middle 31 7.0773 1 baoan 4 1 127.0 high 31 6.9291 2 baoan 1 1 28.0 low 39 3.9286 3 baoan 1 1 28.0 middle 30 3.3568 4 baoan 2 1 78.0 middle 8 5.0769
4.2.3 排序
datas. sort_values( by= "per_price" , ascending= False )
district roomnum hall AREA C_floor floor_num per_price 917 baoan 4 1 93.59 high 28 21.9040 356 baoan 8 1 248.99 low 7 21.2860 576 baoan 1 1 21.95 middle 22 19.3622 296 baoan 4 1 93.59 high 28 19.2328 186 baoan 3 1 113.60 middle 31 16.5493 ... ... ... ... ... ... ... ... 911 baoan 2 1 89.00 middle 16 1.6854 841 baoan 2 1 75.00 high 7 1.6667 1188 baoan 3 1 110.00 middle 33 1.5909 684 baoan 3 1 89.00 middle 26 1.2247 1047 baoan 3 1 98.90 middle 26 1.1931
1251 rows × 7 columns
datas. sort_values( by= "per_price" )
district roomnum hall AREA C_floor floor_num per_price 1047 baoan 3 1 98.90 middle 26 1.1931 684 baoan 3 1 89.00 middle 26 1.2247 1188 baoan 3 1 110.00 middle 33 1.5909 841 baoan 2 1 75.00 high 7 1.6667 911 baoan 2 1 89.00 middle 16 1.6854 ... ... ... ... ... ... ... ... 186 baoan 3 1 113.60 middle 31 16.5493 296 baoan 4 1 93.59 high 28 19.2328 576 baoan 1 1 21.95 middle 22 19.3622 356 baoan 8 1 248.99 low 7 21.2860 917 baoan 4 1 93.59 high 28 21.9040
1251 rows × 7 columns
datas. sort_values( by= [ "district" , "per_price" ] )
district roomnum hall AREA C_floor floor_num per_price 1047 baoan 3 1 98.90 middle 26 1.1931 684 baoan 3 1 89.00 middle 26 1.2247 1188 baoan 3 1 110.00 middle 33 1.5909 841 baoan 2 1 75.00 high 7 1.6667 911 baoan 2 1 89.00 middle 16 1.6854 ... ... ... ... ... ... ... ... 296 baoan 4 1 93.59 high 28 19.2328 576 baoan 1 1 21.95 middle 22 19.3622 356 baoan 8 1 248.99 low 7 21.2860 917 baoan 4 1 93.59 high 28 21.9040 0 zzzz 3 1 89.30 middle 31 7.0773
1251 rows × 7 columns
datas. sort_index( )
district roomnum hall AREA C_floor floor_num per_price 0 zzzz 3 1 89.3 middle 31 7.0773 1 baoan 4 1 127.0 high 31 6.9291 2 baoan 1 1 28.0 low 39 3.9286 3 baoan 1 1 28.0 middle 30 3.3568 4 baoan 2 1 78.0 middle 8 5.0769 ... ... ... ... ... ... ... ... 1246 baoan 4 1 89.3 low 8 4.2553 1247 baoan 2 1 67.0 middle 30 3.8060 1248 baoan 2 1 67.4 middle 29 5.3412 1249 baoan 2 1 73.1 low 15 5.9508 1250 baoan 3 1 86.2 middle 32 4.5244
1251 rows × 7 columns
sr = datas[ "per_price" ]
sr
0 7.0773
1 6.9291
2 3.9286
3 3.3568
4 5.0769...
1246 4.2553
1247 3.8060
1248 5.3412
1249 5.9508
1250 4.5244
Name: per_price, Length: 1251, dtype: float64
sr. sort_values( )
1047 1.1931
684 1.2247
1188 1.5909
841 1.6667
911 1.6854...
186 16.5493
296 19.2328
576 19.3622
356 21.2860
917 21.9040
Name: per_price, Length: 1251, dtype: float64
sr. sort_index( )
0 7.0773
1 6.9291
2 3.9286
3 3.3568
4 5.0769...
1246 4.2553
1247 3.8060
1248 5.3412
1249 5.9508
1250 4.5244
Name: per_price, Length: 1251, dtype: float64
4.3 DataFrame运算
算术运算
datas[ "roomnum" ] + 3
0 6
1 7
2 4
3 4
4 5..
1246 7
1247 5
1248 5
1249 5
1250 6
Name: roomnum, Length: 1251, dtype: int64
datas[ "roomnum" ] . add( 3 ) . head( )
0 6
1 7
2 4
3 4
4 5
Name: roomnum, dtype: int64
datas. iloc[ : , 1 : 4 ]
roomnum hall AREA 0 3 1 89.3 1 4 1 127.0 2 1 1 28.0 3 1 1 28.0 4 2 1 78.0 ... ... ... ... 1246 4 1 89.3 1247 2 1 67.0 1248 2 1 67.4 1249 2 1 73.1 1250 3 1 86.2
1251 rows × 3 columns
datas. iloc[ : , 1 : 4 ] + 10
roomnum hall AREA 0 13 11 99.3 1 14 11 137.0 2 11 11 38.0 3 11 11 38.0 4 12 11 88.0 ... ... ... ... 1246 14 11 99.3 1247 12 11 77.0 1248 12 11 77.4 1249 12 11 83.1 1250 13 11 96.2
1251 rows × 3 columns
逻辑运算
datas[ 'AREA' ] > 100
0 False
1 True
2 False
3 False
4 False...
1246 False
1247 False
1248 False
1249 False
1250 False
Name: AREA, Length: 1251, dtype: bool
datas[ datas[ 'AREA' ] > 100 ]
district roomnum hall AREA C_floor floor_num per_price 1 baoan 4 1 127.00 high 31 6.9291 5 baoan 4 1 125.17 middle 15 5.8161 16 baoan 3 1 151.00 high 20 4.9669 25 baoan 3 1 116.00 high 18 5.0000 26 baoan 5 1 151.25 high 30 7.6033 ... ... ... ... ... ... ... ... 1232 baoan 5 1 127.17 low 24 5.1113 1238 baoan 4 1 130.74 low 30 13.0029 1239 baoan 3 1 102.10 middle 28 10.8717 1241 baoan 5 1 151.30 high 29 7.2703 1243 baoan 4 1 142.25 high 32 6.3269
322 rows × 7 columns
( datas[ "AREA" ] > 100 ) & ( datas[ "per_price" ] < 40000 )
0 False
1 True
2 False
3 False
4 False...
1246 False
1247 False
1248 False
1249 False
1250 False
Length: 1251, dtype: bool
datas[ ( datas[ "AREA" ] > 100 ) & ( datas[ "per_price" ] < 40000 ) ]
district roomnum hall AREA C_floor floor_num per_price 1 baoan 4 1 127.00 high 31 6.9291 5 baoan 4 1 125.17 middle 15 5.8161 16 baoan 3 1 151.00 high 20 4.9669 25 baoan 3 1 116.00 high 18 5.0000 26 baoan 5 1 151.25 high 30 7.6033 ... ... ... ... ... ... ... ... 1232 baoan 5 1 127.17 low 24 5.1113 1238 baoan 4 1 130.74 low 30 13.0029 1239 baoan 3 1 102.10 middle 28 10.8717 1241 baoan 5 1 151.30 high 29 7.2703 1243 baoan 4 1 142.25 high 32 6.3269
322 rows × 7 columns
逻辑运算函数
datas. query( "AREA>100 & per_price<40000" )
district roomnum hall AREA C_floor floor_num per_price 1 baoan 4 1 127.00 high 31 6.9291 5 baoan 4 1 125.17 middle 15 5.8161 16 baoan 3 1 151.00 high 20 4.9669 25 baoan 3 1 116.00 high 18 5.0000 26 baoan 5 1 151.25 high 30 7.6033 ... ... ... ... ... ... ... ... 1232 baoan 5 1 127.17 low 24 5.1113 1238 baoan 4 1 130.74 low 30 13.0029 1239 baoan 3 1 102.10 middle 28 10.8717 1241 baoan 5 1 151.30 high 29 7.2703 1243 baoan 4 1 142.25 high 32 6.3269
322 rows × 7 columns
datas[ "roomnum" ] . isin( [ 4 , 5 ] )
0 False
1 True
2 False
3 False
4 False...
1246 True
1247 False
1248 False
1249 False
1250 False
Name: roomnum, Length: 1251, dtype: bool
datas[ datas[ "roomnum" ] . isin( [ 4 , 5 ] ) ]
district roomnum hall AREA C_floor floor_num per_price 1 baoan 4 1 127.00 high 31 6.9291 5 baoan 4 1 125.17 middle 15 5.8161 26 baoan 5 1 151.25 high 30 7.6033 29 baoan 4 1 143.45 middle 25 6.9711 36 baoan 4 1 134.60 middle 32 9.1828 ... ... ... ... ... ... ... ... 1232 baoan 5 1 127.17 low 24 5.1113 1238 baoan 4 1 130.74 low 30 13.0029 1241 baoan 5 1 151.30 high 29 7.2703 1243 baoan 4 1 142.25 high 32 6.3269 1246 baoan 4 1 89.30 low 8 4.2553
224 rows × 7 columns
统计运算
datas. describe( )
roomnum hall AREA floor_num per_price count 1251.000000 1251.0 1251.000000 1251.000000 1251.000000 mean 2.906475 1.0 92.409976 24.598721 6.643429 std 0.940663 0.0 37.798122 9.332119 2.435132 min 1.000000 1.0 21.950000 1.000000 1.193100 25% 2.000000 1.0 75.000000 17.000000 5.075850 50% 3.000000 1.0 87.800000 28.000000 5.906800 75% 3.000000 1.0 101.375000 31.000000 7.761950 max 8.000000 1.0 352.900000 53.000000 21.904000
统计函数
datas. max ( axis= 0 )
district zzzz
roomnum 8
hall 1
AREA 352.9
C_floor middle
floor_num 53
per_price 21.904
dtype: object
datas. var( axis= 0 )
roomnum 0.884846
hall 0.000000
AREA 1428.698032
floor_num 87.088446
per_price 5.929870
dtype: float64
datas. std( axis= 0 )
roomnum 0.940663
hall 0.000000
AREA 37.798122
floor_num 9.332119
per_price 2.435132
dtype: float64
datas. iloc[ : , 3 ]
0 89.3
1 127.0
2 28.0
3 28.0
4 78.0...
1246 89.3
1247 67.0
1248 67.4
1249 73.1
1250 86.2
Name: AREA, Length: 1251, dtype: float64
datas. iloc[ : , 3 ] . idxmax( axis= 0 )
759
datas. iloc[ 759 , 3 ]
352.9
datas. iloc[ : , 3 ] . idxmin( axis= 0 )
576
datas. iloc[ 576 , 3 ]
21.95
累计统计函数
datas[ "per_price" ]
0 7.0773
1 6.9291
2 3.9286
3 3.3568
4 5.0769...
1246 4.2553
1247 3.8060
1248 5.3412
1249 5.9508
1250 4.5244
Name: per_price, Length: 1251, dtype: float64
datas[ "per_price" ] . cumsum( )
0 7.0773
1 14.0064
2 17.9350
3 21.2918
4 26.3687...
1246 8291.3076
1247 8295.1136
1248 8300.4548
1249 8306.4056
1250 8310.9300
Name: per_price, Length: 1251, dtype: float64
datas[ "per_price" ] . sort_index( ) . cumsum( ) . plot( )
<matplotlib.axes._subplots.AxesSubplot at 0x2039a3a3dc8>
import matplotlib. pyplot as plt
datas[ "per_price" ] . sort_index( ) . cumsum( ) . plot( )
plt. show( )
自定义运算
datas[ [ "per_price" ] ] . apply ( lambda x : x. max ( ) - x. min ( ) , axis= 0 )
per_price 20.7109
dtype: float64
4.4 Pandas画图
datas. plot( x= "AREA" , y= "per_price" , kind= "scatter" )
<matplotlib.axes._subplots.AxesSubplot at 0x203a343dec8>
datas. plot( x= "floor_num" , y= "per_price" , kind= "scatter" )
<matplotlib.axes._subplots.AxesSubplot at 0x203a3a81bc8>
datas. plot( x= "AREA" , y= "per_price" , kind= "barh" )
<matplotlib.axes._subplots.AxesSubplot at 0x203a2147f08>
4.5 文件的读取与存储
1.读取csv文件 read_csv()
iris_data = pd. read_csv( "./datas/iris.data.csv" )
iris_data. head( )
feature1 feature2 feature3 feature4 result 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa
iris_data1 = pd. read_csv( "./datas/iris.data.csv" , usecols= [ "feature1" , "feature2" , "result" ] )
iris_data1. head( )
feature1 feature2 result 0 5.1 3.5 Iris-setosa 1 4.9 3.0 Iris-setosa 2 4.7 3.2 Iris-setosa 3 4.6 3.1 Iris-setosa 4 5.0 3.6 Iris-setosa
iris_data2 = pd. read_csv( "./datas/iris.data2.csv" )
iris_data2. head( )
5.1 3.5 1.4 0.2 Iris-setosa 0 4.9 3.0 1.4 0.2 Iris-setosa 1 4.7 3.2 1.3 0.2 Iris-setosa 2 4.6 3.1 1.5 0.2 Iris-setosa 3 5.0 3.6 1.4 0.2 Iris-setosa 4 5.4 3.9 1.7 0.4 Iris-setosa
iris_data2 = pd. read_csv( "./datas/iris.data2.csv" , names= [ "feature1" , "feature2" , "feature3" , "feature4" , "result" ] )
iris_data2. head( )
feature1 feature2 feature3 feature4 result 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa
datas. head( 5 )
district roomnum hall AREA C_floor floor_num per_price 0 zzzz 3 1 89.3 middle 31 7.0773 1 baoan 4 1 127.0 high 31 6.9291 2 baoan 1 1 28.0 low 39 3.9286 3 baoan 1 1 28.0 middle 30 3.3568 4 baoan 2 1 78.0 middle 8 5.0769
datas[ : - 1 ] . to_csv( "./price_test" , columns= [ 'per_price' ] , index= False , mode= "a" , header= False )
perice_test = pd. read_csv( "./price_test" )
perice_test
per_price 0 7.0773 1 6.9291 2 3.9286 3 3.3568 4 5.0769 ... ... 3746 6.1932 3747 4.2553 3748 3.806 3749 5.3412 3750 5.9508
3751 rows × 1 columns