1.describe():查看每一列的描述性统计量
import numpy as np
import pandas as pd
df = pd.DataFrame(data=np.random.randint(0,10,size=(5,3)),index=list("ABCDE"),columns=["Python","NumPy","Pandas"])
df
df.describe()
| Python | NumPy | Pandas |
---|
count | 5.000000 | 5.000000 | 5.000000 |
---|
mean | 5.600000 | 2.800000 | 5.400000 |
---|
std | 2.073644 | 2.167948 | 2.408319 |
---|
min | 4.000000 | 0.000000 | 2.000000 |
---|
25% | 4.000000 | 1.000000 | 4.000000 |
---|
50% | 5.000000 | 4.000000 | 6.000000 |
---|
75% | 6.000000 | 4.000000 | 7.000000 |
---|
max | 9.000000 | 5.000000 | 8.000000 |
---|
df.describe([0.01,0.3,0.4,0.9,0.99])
| Python | NumPy | Pandas |
---|
count | 5.000000 | 5.000000 | 5.000000 |
---|
mean | 5.600000 | 2.800000 | 5.400000 |
---|
std | 2.073644 | 2.167948 | 2.408319 |
---|
min | 4.000000 | 0.000000 | 2.000000 |
---|
1% | 4.000000 | 0.040000 | 2.080000 |
---|
30% | 4.200000 | 1.600000 | 4.400000 |
---|
40% | 4.600000 | 2.800000 | 5.200000 |
---|
50% | 5.000000 | 4.000000 | 6.000000 |
---|
90% | 7.800000 | 4.600000 | 7.600000 |
---|
99% | 8.880000 | 4.960000 | 7.960000 |
---|
max | 9.000000 | 5.000000 | 8.000000 |
---|
df.describe([0.01,0.3,0.4,0.9,0.99]).T
| count | mean | std | min | 1% | 30% | 40% | 50% | 90% | 99% | max |
---|
Python | 5.0 | 5.6 | 2.073644 | 4.0 | 4.00 | 4.2 | 4.6 | 5.0 | 7.8 | 8.88 | 9.0 |
---|
NumPy | 5.0 | 2.8 | 2.167948 | 0.0 | 0.04 | 1.6 | 2.8 | 4.0 | 4.6 | 4.96 | 5.0 |
---|
Pandas | 5.0 | 5.4 | 2.408319 | 2.0 | 2.08 | 4.4 | 5.2 | 6.0 | 7.6 | 7.96 | 8.0 |
---|
2.df.std():可以求得DataFrame对象每一列的标准差
df.std()
Python 2.073644
NumPy 2.167948
Pandas 2.408319
dtype: float64
3.df.drop():删除特定索引
df2 = df.copy()
df2
| Python | NumPy | Pandas |
---|
A | 9 | 0 | 8 |
---|
B | 5 | 1 | 2 |
---|
C | 6 | 5 | 7 |
---|
D | 4 | 4 | 6 |
---|
E | 4 | 4 | 4 |
---|
df2.drop("A")
| Python | NumPy | Pandas |
---|
B | 5 | 1 | 2 |
---|
C | 6 | 5 | 7 |
---|
D | 4 | 4 | 6 |
---|
E | 4 | 4 | 4 |
---|
df2.drop(index="A")
| Python | NumPy | Pandas |
---|
B | 5 | 1 | 2 |
---|
C | 6 | 5 | 7 |
---|
D | 4 | 4 | 6 |
---|
E | 4 | 4 | 4 |
---|
df2.drop("Python",axis=1)
| NumPy | Pandas |
---|
A | 0 | 8 |
---|
B | 1 | 2 |
---|
C | 5 | 7 |
---|
D | 4 | 6 |
---|
E | 4 | 4 |
---|
df2.drop(columns="Python")
| NumPy | Pandas |
---|
A | 0 | 8 |
---|
B | 1 | 2 |
---|
C | 5 | 7 |
---|
D | 4 | 6 |
---|
E | 4 | 4 |
---|
df2.drop(columns=["NumPy","Python"])
df2.drop(index=["A","B"])
| Python | NumPy | Pandas |
---|
C | 6 | 5 | 7 |
---|
D | 4 | 4 | 6 |
---|
E | 4 | 4 | 4 |
---|
df2.drop(index=["A","B"],inplace=True)
df2
| Python | NumPy | Pandas |
---|
C | 6 | 5 | 7 |
---|
D | 4 | 4 | 6 |
---|
E | 4 | 4 | 4 |
---|
4.unique():唯一,去重(只能用于Series一维数组)
df["Python"].unique()
array([9, 5, 6, 4])
5.df.query():按条件查询
df.query("Python == 6")
df.query("Python > 6")
df.query("Python < 6")
| Python | NumPy | Pandas |
---|
B | 5 | 1 | 2 |
---|
D | 4 | 4 | 6 |
---|
E | 4 | 4 | 4 |
---|
df.query("Python > 4 and NumPy == 5")
df.query("Python > 4 & NumPy == 5")
df.query("Python > 6 or NumPy == 8")
df.query("Python > 6 | NumPy == 8")
df.query("Python in [5,6,9]")
| Python | NumPy | Pandas |
---|
A | 9 | 0 | 8 |
---|
B | 5 | 1 | 2 |
---|
C | 6 | 5 | 7 |
---|
n = 6
df.query("Python == @n")
m = [5,6,9]
df.query("Python in @m")
| Python | NumPy | Pandas |
---|
A | 9 | 0 | 8 |
---|
B | 5 | 1 | 2 |
---|
C | 6 | 5 | 7 |
---|
6.df.sort_values():根据值排序
df.sort_values("Python")
| Python | NumPy | Pandas |
---|
D | 4 | 4 | 6 |
---|
E | 4 | 4 | 4 |
---|
B | 5 | 1 | 2 |
---|
C | 6 | 5 | 7 |
---|
A | 9 | 0 | 8 |
---|
df.sort_values("Python",ascending=False)
| Python | NumPy | Pandas |
---|
A | 9 | 0 | 8 |
---|
C | 6 | 5 | 7 |
---|
B | 5 | 1 | 2 |
---|
D | 4 | 4 | 6 |
---|
E | 4 | 4 | 4 |
---|
df.sort_values("B",axis=1)
| NumPy | Pandas | Python |
---|
A | 0 | 8 | 9 |
---|
B | 1 | 2 | 5 |
---|
C | 5 | 7 | 6 |
---|
D | 4 | 6 | 4 |
---|
E | 4 | 4 | 4 |
---|
7.df.sort_index():根据索引排序
df.sort_index(ascending=False)
| Python | NumPy | Pandas |
---|
E | 4 | 4 | 4 |
---|
D | 4 | 4 | 6 |
---|
C | 6 | 5 | 7 |
---|
B | 5 | 1 | 2 |
---|
A | 9 | 0 | 8 |
---|
df.sort_index(ascending=False,axis=1)
| Python | Pandas | NumPy |
---|
A | 9 | 8 | 0 |
---|
B | 5 | 2 | 1 |
---|
C | 6 | 7 | 5 |
---|
D | 4 | 6 | 4 |
---|
E | 4 | 4 | 4 |
---|
8.df.info():查看数据信息
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Python  5 non-null      int32
 1   NumPy   5 non-null      int32
 2   Pandas  5 non-null      int32
dtypes: int32(3)
memory usage: 272.0+ bytes
9.练习
- 新建一个形状为10000*3的标准正态分布的DataFrame(np.random.randn),去除掉所有满足以下情况的行:其中任一元素绝对值大于3倍标准差
df = pd.DataFrame(np.random.randn(10000,3))
df
| 0 | 1 | 2 |
---|
0 | 0.786386 | -0.204965 | -0.152465 |
---|
1 | 0.400526 | 1.447733 | 0.310461 |
---|
2 | -0.363709 | -0.989258 | -1.093327 |
---|
3 | -2.856978 | 2.336645 | 1.474821 |
---|
4 | -0.847757 | 1.141278 | -0.230877 |
---|
... | ... | ... | ... |
---|
9995 | 0.331052 | 0.263059 | 0.469468 |
---|
9996 | 1.172189 | -1.380337 | 0.648793 |
---|
9997 | -0.544049 | -0.509627 | -0.224698 |
---|
9998 | -0.034967 | -0.085575 | -0.687314 |
---|
9999 | 0.007202 | -0.069250 | -0.803754 |
---|
10000 rows × 3 columns
cond = df.abs() > df.std()*3
cond
| 0 | 1 | 2 |
---|
0 | False | False | False |
---|
1 | False | False | False |
---|
2 | False | False | False |
---|
3 | False | False | False |
---|
4 | False | False | False |
---|
... | ... | ... | ... |
---|
9995 | False | False | False |
---|
9996 | False | False | False |
---|
9997 | False | False | False |
---|
9998 | False | False | False |
---|
9999 | False | False | False |
---|
10000 rows × 3 columns
cond2 = cond.any(axis=1)
cond2
0 False
1 False
2 False
3 False
4 False
...
9995 False
9996 False
9997 False
9998 False
9999 False
Length: 10000, dtype: bool
df.loc[~cond2]
| 0 | 1 | 2 |
---|
0 | 0.786386 | -0.204965 | -0.152465 |
---|
1 | 0.400526 | 1.447733 | 0.310461 |
---|
2 | -0.363709 | -0.989258 | -1.093327 |
---|
3 | -2.856978 | 2.336645 | 1.474821 |
---|
4 | -0.847757 | 1.141278 | -0.230877 |
---|
... | ... | ... | ... |
---|
9995 | 0.331052 | 0.263059 | 0.469468 |
---|
9996 | 1.172189 | -1.380337 | 0.648793 |
---|
9997 | -0.544049 | -0.509627 | -0.224698 |
---|
9998 | -0.034967 | -0.085575 | -0.687314 |
---|
9999 | 0.007202 | -0.069250 | -0.803754 |
---|
9904 rows × 3 columns