pandas更新中…
pandas版本:2.0.3
pandas两个比较重要的结构:Series和DataFrame
导包
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
Series 的reindex操作
# Build a small labelled Series to demonstrate reindex.
s1 = pd.Series([1, 2, 3], index=["a", "b", "c"])
print(s1)
# a 1
# b 2
# c 3
# dtype: int64

# Reindexing with a label that does not exist yet ("d") inserts NaN for it,
# which is why the dtype changes from int64 to float64.
print(s1.reindex(["a", "b", "c", "d"]))
# a 1.0
# b 2.0
# c 3.0
# d NaN
# dtype: float64

# fill_value supplies the value used for new labels, so the dtype stays int64.
print(s1.reindex(["a", "b", "c", "d"], fill_value=666))
# a 1
# b 2
# c 3
# d 666
# dtype: int64

# A second Series with a sparse integer index (1, 3, 5).
s2 = pd.Series(["A", "B", "C"], index=[1, 3, 5])
print(s2)

# Add new index labels.
# method="ffill" forward-fills: labels in [1, 3) take the value at index 1,
# labels in [3, 5) take the value at index 3, and labels in [5, 7) take the
# value at index 5. Label 0 has no earlier value to copy, so it stays NaN.
print(s2.reindex(range(7), method="ffill"))
reindex 也可以用来截取数据
# Reindexing with a subset of the existing labels keeps only those entries.
print(s1.reindex(index=["a", "b"]))
# a 1
# b 2
# dtype: int64
# drop is the complementary way to trim a Series: it removes the listed labels
# and keeps the rest. Note that drop("a") keeps b and c, so its output is NOT
# the same as the reindex call above; s1.drop("c") would reproduce that result.
print(s1.drop("a"))
# b 2
# c 3
# dtype: int64
DataFrame的reindex操作
# Build a 5x5 frame of uniform random values in [0, 1). Row label "C" is
# absent from the index, which leaves a gap for reindex to fill below.
# The sample outputs shown in the comments come from separate runs, so the
# numbers differ from one listing to the next.
df1 = DataFrame(
    np.random.rand(5, 5),
    index=["A", "B", "D", "E", "F"],
    columns=["c1", "c2", "c3", "c4", "c5"],
)
print(df1)

# Reindex the rows: the new label "C" has no source data, so its row is NaN.
print(
    df1.reindex(index=["A", "B", "C", "D", "E", "F"])
)
# c1 c2 c3 c4 c5
# A 0.205651 0.094378 0.548683 0.104871 0.497516
# B 0.086842 0.842261 0.912813 0.661971 0.849072
# C NaN NaN NaN NaN NaN
# D 0.570752 0.585542 0.606429 0.713678 0.013199
# E 0.650421 0.567925 0.303060 0.234874 0.911466
# F 0.011784 0.304507 0.040120 0.064653 0.209229

# Reindex the columns: the new column "c6" is all NaN.
print(
    df1.reindex(columns=["c1", "c2", "c3", "c4", "c5", "c6"])
)
# Note: reindex returns a new DataFrame (row "C" added above is not present
# here, so df1 itself was not modified).
# c1 c2 c3 c4 c5 c6
# A 0.279706 0.179977 0.744959 0.637003 0.574922 NaN
# B 0.179284 0.159424 0.857709 0.302644 0.051491 NaN
# D 0.571989 0.893290 0.337798 0.385577 0.098375 NaN
# E 0.128973 0.367170 0.556358 0.303979 0.628117 NaN
# F 0.358955 0.851620 0.789938 0.720290 0.169392 NaN

# Add a row and a column in a single call.
print(
    df1.reindex(
        index=["A", "B", "C", "D", "E", "F"],
        columns=["c1", "c2", "c3", "c4", "c5", "c6"],
    )
)
# c1 c2 c3 c4 c5 c6
# A 0.600788 0.478370 0.842493 0.645647 0.927288 NaN
# B 0.466357 0.523482 0.792704 0.004789 0.319052 NaN
# C NaN NaN NaN NaN NaN NaN
# D 0.677526 0.039438 0.846586 0.875791 0.354747 NaN
# E 0.577882 0.779355 0.089818 0.958951 0.697171 NaN
# F 0.130449 0.671984 0.922430 0.902640 0.100657 NaN
reindex也可以用来截取数据
# Reindexing with a subset of the existing row labels keeps only those rows.
print(df1.reindex(index=["A", "B"]))
# c1 c2 c3 c4 c5
# A 0.600788 0.478370 0.842493 0.645647 0.927288
# B 0.466357 0.523482 0.792704 0.004789 0.319052

# drop with axis=0 removes a row by label.
print(df1.drop("A", axis=0))
# c1 c2 c3 c4 c5
# B 0.466357 0.523482 0.792704 0.004789 0.319052
# D 0.677526 0.039438 0.846586 0.875791 0.354747
# E 0.577882 0.779355 0.089818 0.958951 0.697171
# F 0.130449 0.671984 0.922430 0.902640 0.100657

# drop with axis=1 removes a column by label.
print(df1.drop("c1", axis=1))
# c2 c3 c4 c5
# A 0.478370 0.842493 0.645647 0.927288
# B 0.523482 0.792704 0.004789 0.319052
# D 0.039438 0.846586 0.875791 0.354747
# E 0.779355 0.089818 0.958951 0.697171
# F 0.671984 0.922430 0.902640 0.100657
NaN操作
Series中的NaN
# A Series containing one missing value; the NaN makes the dtype float64.
s1 = pd.Series([1, 2, np.nan, 3, 4], index=["A", "B", "C", "D", "E"])
print(s1)
# A 1.0
# B 2.0
# C NaN
# D 3.0
# E 4.0
# dtype: float64

# isnull() returns a boolean mask that is True where the value is NaN.
print(s1.isnull())
# A False
# B False
# C True
# D False
# E False
# dtype: bool

# dropna() returns a new Series without the NaN entries; it is printed here
# so that the result is visible when this runs as a script.
print(s1.dropna())
# A 1.0
# B 2.0
# D 3.0
# E 4.0
# dtype: float64
DataFrame中的NaN
# A 4x3 frame with NaN scattered through it; the last row is entirely NaN.
dframe = DataFrame(
    [[1, 2, 3], [np.nan, 5, 6], [7, np.nan, 8], [np.nan, np.nan, np.nan]]
)
print(dframe)
# 0 1 2
# 0 1.0 2.0 3.0
# 1 NaN 5.0 6.0
# 2 7.0 NaN 8.0
# 3 NaN NaN NaN

# isnull() returns a same-shaped boolean frame that is True where a cell is NaN.
print(dframe.isnull())
# 0 1 2
# 0 False False False
# 1 True False False
# 2 False True False
# 3 True True True
过滤和填充NaN
# dropna(axis=0, how="any") drops every row (axis=0) that contains at least
# one NaN (how="any").
# Both arguments are the defaults, so this is equivalent to dframe.dropna().
# Note: axis here names the axis whose labels get dropped (0 = rows), which
# reads differently from the axis argument of numpy functions.
df1 = dframe.dropna(axis=0, how="any")
print(df1)
# 0 1 2
# 0 1.0 2.0 3.0

# thresh=2 keeps only the rows that have at least 2 non-NaN values, so the
# all-NaN row 3 is removed while rows 1 and 2 (one NaN each) survive.
df2 = dframe.dropna(thresh=2)
print(df2)
# 0 1 2
# 0 1.0 2.0 3.0
# 1 NaN 5.0 6.0
# 2 7.0 NaN 8.0

# fillna replaces every NaN with the given value.
df3 = dframe.fillna(value=99)
print(df3)
# 0 1 2
# 0 1.0 2.0 3.0
# 1 99.0 5.0 6.0
# 2 7.0 99.0 8.0
# 3 99.0 99.0 99.0
map操作
# Two-column frame: city name and population.
df1 = DataFrame(
    {
        "城市": ["北京", "上海", "广州"],
        "人口": [1000, 2000, 1500],
    }
)
print(df1)
# 城市 人口
# 0 北京 1000
# 1 上海 2000
# 2 广州 1500

# A complete mapping would be {"北京": 1, "上海": 2, "广州": 3}; "上海" is
# left out here to show that keys missing from the mapping produce NaN.
gdp_map = {"北京": 1, "广州": 3}

# Series.map looks each city up in gdp_map to build the new "GDP" column.
df1["GDP"] = df1["城市"].map(gdp_map)
print(df1)
# 城市 人口 GDP
# 0 北京 1000 1.0
# 1 上海 2000 NaN
# 2 广州 1500 3.0