python pandas使用经验

函数原型(官方文档):
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html

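fillna 的 method 参数取值:
pad/ffill:用前一个非缺失值填充该缺失值(向后传播)
backfill/bfill:用后一个非缺失值填充该缺失值(向前传播)
None(默认):不按方向填充,而是用 value 参数指定的值替换缺失值
注意:自 pandas 2.1 起 method 参数已被弃用,建议直接使用 DataFrame.ffill() / DataFrame.bfill()。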

示例代码:
# coding: utf-8
# Demonstrates directional filling of missing values in a DataFrame.
#
# The printed labels name the direction in which existing values travel:
#   'left'  -> bfill along axis=1 (a value fills the NaN cells to its left)
#   'up'    -> bfill along axis=0 (a value fills the NaN cells above it)
#   'right' -> ffill along axis=1 (a value fills the NaN cells to its right)
#   'down'  -> ffill along axis=0 (a value fills the NaN cells below it)
#
# NOTE: `DataFrame.fillna(method=...)` is deprecated since pandas 2.1, so the
# dedicated `DataFrame.bfill()` / `DataFrame.ffill()` methods are used instead.
# They produce the same results as the old `method='bfill'`/`'backfill'` and
# `method='ffill'`/`'pad'` spellings.
import pandas as pd

df = pd.DataFrame([
    [1, None, 2],
    [None, 3, None],
    [None, 4, 5],
])

print('origin')
print(df)
#      0    1    2
# 0  1.0  NaN  2.0
# 1  NaN  3.0  NaN
# 2  NaN  4.0  5.0

# First non-missing value of every row: back-fill along the row, then take
# the first column.
print('left')
data = df.bfill(axis=1).iloc[:, 0]
print(data)
# 0    1.0
# 1    3.0
# 2    4.0

# Back-fill down the columns and keep only the last column.
print('up')
data = df.bfill().iloc[:, -1]
print(data)
# 0    2.0
# 1    5.0
# 2    5.0

# Formerly written as df.fillna(method='bfill', axis=1).
print('left')
data = df.bfill(axis=1)
print(data)
#      0    1    2
# 0  1.0  2.0  2.0
# 1  3.0  3.0  NaN
# 2  4.0  4.0  5.0

# Formerly written as df.fillna(method='bfill').
print('up')
data = df.bfill()
print(data)
#      0    1    2
# 0  1.0  3.0  2.0
# 1  NaN  3.0  5.0
# 2  NaN  4.0  5.0

# Formerly written as df.fillna(method='ffill', axis=1).
print('right')
data = df.ffill(axis=1)
print(data)
#      0    1    2
# 0  1.0  1.0  2.0
# 1  NaN  3.0  3.0
# 2  NaN  4.0  5.0

# Formerly written as df.fillna(method='ffill').
print('down')
data = df.ffill()
print(data)
#      0    1    2
# 0  1.0  NaN  2.0
# 1  1.0  3.0  2.0
# 2  1.0  4.0  5.0

# Formerly written as df.fillna(method='backfill', axis=1); 'backfill' was an
# alias of 'bfill', so the result matches the 'left' frame above.
print('left')
data = df.bfill(axis=1)
print(data)
#      0    1    2
# 0  1.0  2.0  2.0
# 1  3.0  3.0  NaN
# 2  4.0  4.0  5.0

# Formerly written as df.fillna(method='backfill').
print('up')
data = df.bfill()
print(data)
#      0    1    2
# 0  1.0  3.0  2.0
# 1  NaN  3.0  5.0
# 2  NaN  4.0  5.0

# Formerly written as df.fillna(method='pad', axis=1); 'pad' was an alias of
# 'ffill', so the result matches the 'right' frame above.
print('right')
data = df.ffill(axis=1)
print(data)
#      0    1    2
# 0  1.0  1.0  2.0
# 1  NaN  3.0  3.0
# 2  NaN  4.0  5.0

# Formerly written as df.fillna(method='pad').
print('down')
data = df.ffill()
print(data)
#      0    1    2
# 0  1.0  NaN  2.0
# 1  1.0  3.0  2.0
# 2  1.0  4.0  5.0

示例二(用固定值填充、按列指定填充值以及 limit 参数):

# Demonstrates filling missing values with a scalar, with a forward fill,
# with per-column replacement values, and with a cap on the number of fills.
import pandas as pd
import numpy as np

df = pd.DataFrame(
    [
        [np.nan, 2, np.nan, 0],
        [3, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 5],
        [np.nan, 3, np.nan, 4],
    ],
    columns=list('ABCD'),
)
print(df)
#      A    B   C  D
# 0  NaN  2.0 NaN  0
# 1  3.0  4.0 NaN  1
# 2  NaN  NaN NaN  5
# 3  NaN  3.0 NaN  4

# Replace every missing value with the scalar 0.
filled_zero = df.fillna(0)
print(filled_zero)
#      A    B    C  D
# 0  0.0  2.0  0.0  0
# 1  3.0  4.0  0.0  1
# 2  0.0  0.0  0.0  5
# 3  0.0  3.0  0.0  4

# Propagate the last valid observation downwards. Written with ffill()
# because fillna(method='ffill') is deprecated since pandas 2.1.
filled_forward = df.ffill()
print(filled_forward)
#      A    B   C  D
# 0  NaN  2.0 NaN  0
# 1  3.0  4.0 NaN  1
# 2  3.0  4.0 NaN  5
# 3  3.0  3.0 NaN  4

# A dict maps each column label to its own replacement value.
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
filled_by_column = df.fillna(value=values)
print(filled_by_column)
#      A    B    C  D
# 0  0.0  2.0  2.0  0
# 1  3.0  4.0  2.0  1
# 2  0.0  1.0  2.0  5
# 3  0.0  3.0  2.0  4

# limit=1 fills only the first missing value found in each column.
filled_limited = df.fillna(value=values, limit=1)
print(filled_limited)
#      A    B    C  D
# 0  0.0  2.0  2.0  0
# 1  3.0  4.0  NaN  1
# 2  NaN  1.0  NaN  5
# 3  NaN  3.0  NaN  4

如果导入的 DataFrame 中某一列(例如 A10)保存的是 JSON 字典字符串,可以使用 data.join(data['A10'].apply(json.loads).apply(pd.Series)) 将字典拆开,把每个键展开为单独的一列,并与原表拼接。

示例代码:
# Loads a tab-separated log export and expands the JSON object stored in one
# of its columns into separate DataFrame columns.
import pandas as pd
import json

filename = 'top5.txt'
# The file has no header row, so pandas assigns integer column labels.
data = pd.read_csv(filename, sep="\t", header=None)

# Sample input row (wrapped here for readability):
# test model.8.10 modelname 810 8101 2018-03-28 04:21:13 2018-03-28 04:21:13
# 1 0 2018-04-02 14:50:54 {"cell_info":"LTE PLMN:46000 EARFCN:38400(B39) Cell Identity
# :197539969 PCI:141 TAC:37884 RSSI:-65 RSRP:-95 RSRQ:-11 SINR*10: 133 EMM state:REGISTERED
# service state:NORMAL reg domain:CS_PS lte_tx_power tx = 9 lte_rx_chain0 rssi=-64 rsrp=-94
# sinr=133 lte_rx_chain1 rssi=-69 rsrp=-99 sinr=118 ","log_from":"com.android.phone",
# "reg_at_time":"31112","rat":"14","reg_during_time":"3554","hplmn":"46002"} 2018-04-02

# Rename the positional columns to A0, A1, ... so they can be addressed by
# label.
columns = ['A' + str(i) for i in range(data.shape[1])]
data.columns = columns
print(data.columns)
# Index(['A0', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10','A11'],
# dtype='object')

print(data['A10'])
# 0 {"cell_info":"LTE PLMN:46000 EARFCN:38400(B39...

# Parse each JSON string into a dict, turn every dict into a Series so its
# keys become columns, then attach those columns to the original frame.
data = data.join(data['A10'].apply(json.loads).apply(pd.Series))

print(data.columns)
# Index(['A0', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10',
# 'A11', 'cell_info', 'hplmn', 'log_from', 'rat', 'reg_at_time',
# 'reg_during_time'],
# dtype='object')
# NOTE(review): the order of the expanded columns shown above may differ by
# Python/pandas version (presumably JSON key order on current versions) -
# confirm against your environment.

猜你喜欢

转载自www.cnblogs.com/LearnFromNow/p/9345414.html