Series的创建
import pandas as pd
import numpy as np
通过可迭代对象(如 range)、一维数组生成,可以自动生成索引,也可以自定义索引;也可以通过字典生成,此时字典的 key 作为索引
pd.Series(range(10))
pd.Series(range(10,20))
0 10
1 11
2 12
3 13
4 14
5 15
6 16
7 17
8 18
9 19
dtype: int64
pd.Series(np.arange(5,10),index=['a','b','c','d','e'])
a 5
b 6
c 7
d 8
e 9
dtype: int32
d = {
'a':1,'b':2,'c':3}
pd.Series(d)
a 1
b 2
c 3
dtype: int64
DataFrame的创建
通过字典构建df,key为列名,value为值
dic = {
'A':1,
'B':pd.Timestamp("20221227"),
"C": pd.Series(1, index=list(range(4)),dtype="float32"),
"D": np.array([3] * 4,dtype="int32"),
"E":["Python","Java","C++","C"],
"F": "tiger" }
pd.DataFrame(dic)
|
A |
B |
C |
D |
E |
F |
0 |
1 |
2022-12-27 |
1.0 |
3 |
Python |
tiger |
1 |
1 |
2022-12-27 |
1.0 |
3 |
Java |
tiger |
2 |
1 |
2022-12-27 |
1.0 |
3 |
C++ |
tiger |
3 |
1 |
2022-12-27 |
1.0 |
3 |
C |
tiger |
pd.DataFrame([[1,2,3],[4,5,6]],columns=['A','B','C'])
pd.DataFrame([[1,2,3],[4,5,6]])
读取excel
df1 = pd.read_excel(r'.\台账.xlsx',sheet_name='总表')
df1.head(2)
df1.tail(3)
df1.shape
(37795, 82)
读取文本文件
# Load a whitespace-delimited text file into a DataFrame.
# Lines 0 and 2 of the file are skipped, as are lines that split into
# zero or three fields; the first line kept supplies the column names.
path = r'.\tg20220727.txt'
rows = []
with open(path, 'r', encoding='utf8') as f:
    for index, line in enumerate(f):
        if index not in [0, 2]:
            row = line.split()
            if len(row) not in [0, 3]:
                rows.append(row)
# The first surviving line is the header; everything after it is data.
column, rows = rows[0], rows[1:]
df = pd.DataFrame(data=rows, columns=column)
df.head(3)
输出到excel
df.to_excel('./tg20220727.xlsx')
df.to_excel('./tg20220727.xlsx',index=False)
DataFrame列操作
- 插入列
- 删除列
- 修改列名
- 获取所有列名
插入列
df2 = pd.DataFrame({
'a':range(5),'b':range(5)})
df2
|
a |
b |
0 |
0 |
0 |
1 |
1 |
1 |
2 |
2 |
2 |
3 |
3 |
3 |
4 |
4 |
4 |
df2['c'] = range(5,10)
df2
|
a |
b |
c |
0 |
0 |
0 |
5 |
1 |
1 |
1 |
6 |
2 |
2 |
2 |
7 |
3 |
3 |
3 |
8 |
4 |
4 |
4 |
9 |
删除列
df2.drop('b',axis=1)
|
a |
c |
0 |
0 |
5 |
1 |
1 |
6 |
2 |
2 |
7 |
3 |
3 |
8 |
4 |
4 |
9 |
修改列名
df2
|
a |
b |
c |
0 |
0 |
0 |
5 |
1 |
1 |
1 |
6 |
2 |
2 |
2 |
7 |
3 |
3 |
3 |
8 |
4 |
4 |
4 |
9 |
df2 = df2.rename(columns={
'b':'bbb'})
df2
|
a |
bbb |
c |
0 |
0 |
0 |
5 |
1 |
1 |
1 |
6 |
2 |
2 |
2 |
7 |
3 |
3 |
3 |
8 |
4 |
4 |
4 |
9 |
获取所有列名
df2.columns
Index(['a', 'bbb', 'c'], dtype='object')
df2.columns.tolist()
['a', 'bbb', 'c']
索引操作
- 获取索引
- 修改索引
- 重置索引
df3 = pd.DataFrame({
'a':range(5),'b':range(5)},index=['a','b','c','d','e'])
df3
|
a |
b |
a |
0 |
0 |
b |
1 |
1 |
c |
2 |
2 |
d |
3 |
3 |
e |
4 |
4 |
df3.index
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
df3.index.tolist()
['a', 'b', 'c', 'd', 'e']
df3.rename(index={
'a':1},inplace=True)
df3
|
a |
b |
1 |
0 |
0 |
b |
1 |
1 |
c |
2 |
2 |
d |
3 |
3 |
e |
4 |
4 |
df3.reset_index()
|
index |
a |
b |
0 |
1 |
0 |
0 |
1 |
b |
1 |
1 |
2 |
c |
2 |
2 |
3 |
d |
3 |
3 |
4 |
e |
4 |
4 |
df3.reset_index(drop=True)
|
a |
b |
0 |
0 |
0 |
1 |
1 |
1 |
2 |
2 |
2 |
3 |
3 |
3 |
4 |
4 |
4 |
删除含空值的行、列(dropna 默认 how='any',只要含有空值就会被删除)
df3 = pd.DataFrame({
'a':range(3),'b':[1,2,np.nan],'c':[None,2,4]})
df3
|
a |
b |
c |
0 |
0 |
1.0 |
NaN |
1 |
1 |
2.0 |
2.0 |
2 |
2 |
NaN |
4.0 |
df3.dropna(axis=1)
df3.dropna(axis=0)
数据类型转换
- 查看数据类型
- 修改数据类型
df4 = pd.DataFrame({
'a':range(3),'b':[1,2,np.nan],'c':[None,2,4]})
df4
|
a |
b |
c |
0 |
0 |
1.0 |
NaN |
1 |
1 |
2.0 |
2.0 |
2 |
2 |
NaN |
4.0 |
df4['a'].dtype
dtype('int64')
df4.dtypes
a int64
b float64
c float64
dtype: object
df5 = df4.astype('str')
df5
|
a |
b |
c |
0 |
0 |
1.0 |
nan |
1 |
1 |
2.0 |
2.0 |
2 |
2 |
nan |
4.0 |
df5.dtypes
a object
b object
c object
dtype: object
数据筛选
- loc
- iloc
loc 通过索引名取值
df6 = pd.DataFrame({
'a':range(5),'b':range(15,20)},index=['a','b','c','d','e'])
df6
|
a |
b |
a |
0 |
15 |
b |
1 |
16 |
c |
2 |
17 |
d |
3 |
18 |
e |
4 |
19 |
df6.loc['d','a']
3
df6.loc['b':'d',['a','b']]
df6.loc['b':'d',:]
df6.loc[df6.a==2,:]
df6.loc[df6['a'] < 2,'b']
a 15
b 16
Name: b, dtype: int64
df6.loc[df6['a'].isin([0,3]),:]
df6.loc[~df6['a'].isin([0,3]),:]
df6.loc[(df6.a == 3) & (df6.b == 18),:]
iloc 通过索引编号取值
df6
|
a |
b |
a |
0 |
15 |
b |
1 |
16 |
c |
2 |
17 |
d |
3 |
18 |
e |
4 |
19 |
df6.iloc[1,:]
a 1
b 16
Name: b, dtype: int64
df6.iloc[2:4,:]
分组聚合
df1.columns
Index(['分中心', ... '核心发放机构'],
dtype='object')
# Per-group maximum of '发放金额' and average of '贷款余额', grouped by '分中心'.
# The string alias 'max' replaces np.max: pandas 2.1+ emits a FutureWarning
# when a NumPy reduction such as np.max is passed to agg and recommends the
# string alias to keep the current behaviour.
# np.average is kept as-is; NOTE(review): unlike the 'mean' alias it does not
# skip missing values, so confirm which behaviour is wanted before switching.
df1.groupby('分中心').agg({
    '发放金额': 'max', '贷款余额': np.average})
排序
- 指定某一列排序
- 指定多列排序,每列可指定升序或降序
df11 = df1.iloc[:10,[15,20]]
df11
|
授信金额 |
发放日期 |
0 |
300000.0 |
2014-07-21 |
1 |
400000.0 |
2014-08-01 |
2 |
250000.0 |
2014-08-06 |
3 |
400000.0 |
2014-08-29 |
4 |
694000.0 |
2014-08-29 |
5 |
200000.0 |
2014-09-24 |
6 |
500000.0 |
2014-10-15 |
7 |
300000.0 |
2014-12-04 |
8 |
500000.0 |
2015-03-10 |
9 |
300000.0 |
2015-05-18 |
df11.sort_values('授信金额')
|
授信金额 |
发放日期 |
5 |
200000.0 |
2014-09-24 |
2 |
250000.0 |
2014-08-06 |
0 |
300000.0 |
2014-07-21 |
7 |
300000.0 |
2014-12-04 |
9 |
300000.0 |
2015-05-18 |
1 |
400000.0 |
2014-08-01 |
3 |
400000.0 |
2014-08-29 |
6 |
500000.0 |
2014-10-15 |
8 |
500000.0 |
2015-03-10 |
4 |
694000.0 |
2014-08-29 |
df11.sort_values('授信金额',ascending=False)
|
授信金额 |
发放日期 |
4 |
694000.0 |
2014-08-29 |
6 |
500000.0 |
2014-10-15 |
8 |
500000.0 |
2015-03-10 |
1 |
400000.0 |
2014-08-01 |
3 |
400000.0 |
2014-08-29 |
0 |
300000.0 |
2014-07-21 |
7 |
300000.0 |
2014-12-04 |
9 |
300000.0 |
2015-05-18 |
2 |
250000.0 |
2014-08-06 |
5 |
200000.0 |
2014-09-24 |
df11.sort_values(['授信金额','发放日期'],ascending=[True,False])
|
授信金额 |
发放日期 |
5 |
200000.0 |
2014-09-24 |
2 |
250000.0 |
2014-08-06 |
9 |
300000.0 |
2015-05-18 |
7 |
300000.0 |
2014-12-04 |
0 |
300000.0 |
2014-07-21 |
3 |
400000.0 |
2014-08-29 |
1 |
400000.0 |
2014-08-01 |
8 |
500000.0 |
2015-03-10 |
6 |
500000.0 |
2014-10-15 |
4 |
694000.0 |
2014-08-29 |
数据合并concat
df7 = pd.DataFrame({
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3'],
'E': ['E0', 'E1', 'E2', 'E3']
})
df7
|
A |
B |
C |
D |
E |
0 |
A0 |
B0 |
C0 |
D0 |
E0 |
1 |
A1 |
B1 |
C1 |
D1 |
E1 |
2 |
A2 |
B2 |
C2 |
D2 |
E2 |
3 |
A3 |
B3 |
C3 |
D3 |
E3 |
df8 = pd.DataFrame({
'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7'],
'F': ['F4', 'F5', 'F6', 'F7']
})
df8
|
A |
B |
C |
D |
F |
0 |
A4 |
B4 |
C4 |
D4 |
F4 |
1 |
A5 |
B5 |
C5 |
D5 |
F5 |
2 |
A6 |
B6 |
C6 |
D6 |
F6 |
3 |
A7 |
B7 |
C7 |
D7 |
F7 |
pd.concat([df7,df8])
|
A |
B |
C |
D |
E |
F |
0 |
A0 |
B0 |
C0 |
D0 |
E0 |
NaN |
1 |
A1 |
B1 |
C1 |
D1 |
E1 |
NaN |
2 |
A2 |
B2 |
C2 |
D2 |
E2 |
NaN |
3 |
A3 |
B3 |
C3 |
D3 |
E3 |
NaN |
0 |
A4 |
B4 |
C4 |
D4 |
NaN |
F4 |
1 |
A5 |
B5 |
C5 |
D5 |
NaN |
F5 |
2 |
A6 |
B6 |
C6 |
D6 |
NaN |
F6 |
3 |
A7 |
B7 |
C7 |
D7 |
NaN |
F7 |
pd.concat([df7,df8],ignore_index=True)
|
A |
B |
C |
D |
E |
F |
0 |
A0 |
B0 |
C0 |
D0 |
E0 |
NaN |
1 |
A1 |
B1 |
C1 |
D1 |
E1 |
NaN |
2 |
A2 |
B2 |
C2 |
D2 |
E2 |
NaN |
3 |
A3 |
B3 |
C3 |
D3 |
E3 |
NaN |
4 |
A4 |
B4 |
C4 |
D4 |
NaN |
F4 |
5 |
A5 |
B5 |
C5 |
D5 |
NaN |
F5 |
6 |
A6 |
B6 |
C6 |
D6 |
NaN |
F6 |
7 |
A7 |
B7 |
C7 |
D7 |
NaN |
F7 |
pd.concat([df7,df8], ignore_index=True, join="inner")
|
A |
B |
C |
D |
0 |
A0 |
B0 |
C0 |
D0 |
1 |
A1 |
B1 |
C1 |
D1 |
2 |
A2 |
B2 |
C2 |
D2 |
3 |
A3 |
B3 |
C3 |
D3 |
4 |
A4 |
B4 |
C4 |
D4 |
5 |
A5 |
B5 |
C5 |
D5 |
6 |
A6 |
B6 |
C6 |
D6 |
7 |
A7 |
B7 |
C7 |
D7 |
pd.concat([df7,df8],axis=1)
|
A |
B |
C |
D |
E |
A |
B |
C |
D |
F |
0 |
A0 |
B0 |
C0 |
D0 |
E0 |
A4 |
B4 |
C4 |
D4 |
F4 |
1 |
A1 |
B1 |
C1 |
D1 |
E1 |
A5 |
B5 |
C5 |
D5 |
F5 |
2 |
A2 |
B2 |
C2 |
D2 |
E2 |
A6 |
B6 |
C6 |
D6 |
F6 |
3 |
A3 |
B3 |
C3 |
D3 |
E3 |
A7 |
B7 |
C7 |
D7 |
F7 |
apply
1.apply方法通过传入一个函数或者lambda表达式对数据进行批量处理
2.DataFrame.apply每次传给函数的都是一个Series对象:axis=0(默认)时是一列,axis=1时是一行
df9=pd.DataFrame([[4,5]]*3,columns=['A','B'])
df9
df9.apply(np.sqrt)
|
A |
B |
0 |
2.0 |
2.236068 |
1 |
2.0 |
2.236068 |
2 |
2.0 |
2.236068 |
df9.apply(sum,axis=0)
A 12
B 15
dtype: int64
df9.apply(sum,axis=1)
0 9
1 9
2 9
dtype: int64
def func(row):
    """Return a copy of *row* with 3 added to its 'A' entry.

    Intended for use as ``df.apply(func, axis=1)``, where each row is
    passed in as a Series.

    Args:
        row: A Series that has an 'A' label holding a numeric value.

    Returns:
        A new Series equal to *row* except that 'A' is increased by 3.
    """
    # Work on a copy: pandas does not support UDFs that mutate the object
    # handed to them by apply, and doing so can give unexpected results.
    updated = row.copy()
    updated['A'] += 3
    return updated
df9.apply(func,axis=1)
字符串操作
df10 = pd.DataFrame({
'A': ['A0 ', 'A1', ' A2', 'A3'],
'B': ['B0', 'B1,B11', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3'],
'E': ['E0', 'E1', 'E2', 'E3']
})
df10
|
A |
B |
C |
D |
E |
0 |
A0 |
B0 |
C0 |
D0 |
E0 |
1 |
A1 |
B1,B11 |
C1 |
D1 |
E1 |
2 |
A2 |
B2 |
C2 |
D2 |
E2 |
3 |
A3 |
B3 |
C3 |
D3 |
E3 |
df10['A'].str.len()
0 3
1 2
2 3
3 2
Name: A, dtype: int64
df10['A'].str.strip().str.len()
0 2
1 2
2 2
3 2
Name: A, dtype: int64
a = df10['B'].str.split(',')
a
0 [B0]
1 [B1, B11]
2 [B2]
3 [B3]
Name: B, dtype: object
type(a[1])
list
df10['A'].str.rjust(6,'0')
0 000A0
1 0000A1
2 000 A2
3 0000A3
Name: A, dtype: object