沧海拾珠

Pandas 处理缺失数据

1. Series 中设置,查找缺失数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
missing = np.nan
series_obj = Series(['row 1', 'row 2', missing, 'row 4','row 5', 'row 6', missing, 'row 8'])
series_obj
0 row 1
1 row 2
2 NaN
3 row 4
4 row 5
5 row 6
6 NaN
7 row 8

查找缺失数据

1
2
3
4
5
6
7
8
9
series_obj.isnull()
0 False
1 False
2 True
3 False
4 False
5 False
6 True
7 False

2. DataFrame中设置,查找缺失数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
np.random.seed(25)
DF_obj = DataFrame(np.random.randn(36).reshape(6,6))
DF_obj
0 1 2 3 4 5
0 0.228273 1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915 1.837905 -2.053231 0.868583 -0.920734 -0.232312
2 2.152957 -1.334661 0.076380 -1.246089 1.202272 -1.049942
3 1.056610 -0.419678 2.294842 -2.594487 2.822756 0.680889
4 -1.577693 -1.976254 0.533340 -0.290870 -0.513520 1.982626
5 0.226001 -1.839905 1.607671 0.388292 0.399732 0.405477
#设置缺失数据
# 注意:这里按标签切片(.loc,起止两端都包含),才能得到下面的输出;
# 若用 .iloc 按位置切片则不包含终点,只会设置第 3-4 行和第 1-3 行。
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj
0 1 2 3 4 5
0 0.228273 1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915 1.837905 -2.053231 0.868583 -0.920734 NaN
2 2.152957 -1.334661 0.076380 -1.246089 1.202272 NaN
3 NaN -0.419678 2.294842 -2.594487 2.822756 NaN
4 NaN -1.976254 0.533340 -0.290870 -0.513520 NaN
5 NaN -1.839905 1.607671 0.388292 0.399732 0.405477
1
2
3
4
5
6
7
8
9
10
#将缺失数据填充为指定的值(此处为 0)
filled_DF = DF_obj.fillna(0)
filled_DF
0 1 2 3 4 5
0 0.228273 1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915 1.837905 -2.053231 0.868583 -0.920734 0.000000
2 2.152957 -1.334661 0.076380 -1.246089 1.202272 0.000000
3 0.000000 -0.419678 2.294842 -2.594487 2.822756 0.000000
4 0.000000 -1.976254 0.533340 -0.290870 -0.513520 0.000000
5 0.000000 -1.839905 1.607671 0.388292 0.399732 0.405477
1
2
3
4
5
6
7
8
#统计每一列中缺失数据的个数
DF_obj.isnull().sum()
0 3
1 0
2 0
3 0
4 0
5 4
1
2
3
4
5
6
7
8
9
10
11
12
13
#删除缺失数据
DF_no_NaN = DF_obj.dropna(axis=1) # 去除含有缺失数据的列。如果去除含有缺失数据的行,则dropna()就行。
DF_no_NaN
1 2 3 4
0 1.026890 -0.839585 -0.591182 -0.956888
1 1.837905 -2.053231 0.868583 -0.920734
2 -1.334661 0.076380 -1.246089 1.202272
3 -0.419678 2.294842 -2.594487 2.822756
4 -1.976254 0.533340 -0.290870 -0.513520
5 -1.839905 1.607671 0.388292 0.399732
#如果只想去除所有值都为缺失数据的行,使用 how='all';若要针对列,再加上 axis=1。
DF_obj.dropna(how='all')