pandas 处理文本数据

import pandas as pd
import numpy as np

# Sample Series: upper-, mixed- and lower-case strings plus one missing value.
s = pd.Series(
    [
        "A", "B", "C",
        "AaBa", "Baca",
        np.nan,
        "dog", "cat",
    ]
)

0       A
1       B
2       C
3    AaBa
4    Baca
5     NaN
6     dog
7     cat
dtype: object

# Lowercase every string element-wise; the missing value stays NaN.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6     dog
7     cat
dtype: object

# Uppercase every string element-wise; the missing value stays NaN.
s.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6     DOG
7     CAT
dtype: object

# Length of each string; the NaN entry propagates, so the result is float64.
s.str.len()

0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
6    3.0
7    3.0
dtype: float64

# Index whose labels carry leading and/or trailing spaces.
idx = pd.Index(
    [" jack", "jill ", " jesse", "frank"],
)

idx.str.strip() # strip whitespace from both ends of every label

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')

idx.str.lstrip()  # strip whitespace on the left only; trailing spaces remain

Index(['jack', 'jill ', 'jesse', 'frank'], dtype='object')

idx.str.rstrip()  # strip whitespace on the right only; leading spaces remain

Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')

# 3x2 frame of random normal values; the column labels are padded with spaces.
df = pd.DataFrame(
    np.random.randn(3, 2),
    index=range(3),
    columns=[" Column A ", " Column B "],
)

# Display the frame; the values are random and the column labels still carry their padding spaces.
df

# Strip the padding from the column labels; this returns a new Index and leaves df unchanged.
df.columns.str.strip()

Index(['Column A', 'Column B'], dtype='object')

# Lowercase the column labels; the padding spaces are still present.
df.columns.str.lower()

Index([' column a ', ' column b '], dtype='object')

# Normalize the column labels in place: trim padding, lowercase, spaces -> underscores.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', "_")
)

# Display the frame again, now with the cleaned-up column labels.
df

# Underscore-delimited strings with one missing value, used for the split examples.
s2 = pd.Series(
    [
        "a_b_c",
        "c_D_e",
        np.nan,
        "f_g_H",
    ]
)

# Split each string on '_' into a list of pieces; the missing value stays NaN.
s2.str.split("_")

0    [a, b, c]
1    [c, D, e]
2          NaN
3    [f, g, H]
dtype: object

# Plain [1] indexes the resulting Series by label, so this is row 1's list of pieces.
s2.str.split('_')[1]

['c', 'D', 'e']

s2.str.split('_').str[1] # after splitting, .str[1] picks element 1 from each row's list

0      b
1      D
2    NaN
3      g
dtype: object

# .str.get(1) is the method form of .str[1]: element 1 of each row's list, NaN where missing.
s2.str.split('_').str.get(1)

0      b
1      D
2    NaN
3      g
dtype: object

s2.str.split('_',expand=True,n=1) # expand=True spreads the pieces into separate columns; n limits the number of splits

s2.str.rsplit('_',expand=True,n=1) # rsplit method: like split, but splitting starts from the right end

# Sample Series for the regex-replace example: mixed-case strings and one missing value.
s3 = pd.Series(
    [
        "A", "B", "C",
        "AaBa", "Baca",
        np.nan,
        "CABA",
        "dog", "cat",
    ]
)
s3

0       A
1       B
2       C
3    AaBa
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

# Replace either a leading "any character followed by 'a'" or any occurrence of
# 'dog' with 'XX_XX', ignoring case. The pattern is a regular expression, so
# regex=True must be passed explicitly: pandas >= 2.0 defaults to regex=False,
# which would treat '^.a|dog' as literal text and replace nothing.
s3.str.replace(r'^.a|dog', 'XX_XX', case=False, regex=True)

0          A
1          B
2          C
3    XX_XXBa
4    XX_XXca
5        NaN
6    XX_XXBA
7      XX_XX
8     XX_XXt
dtype: object

# Sample amounts: plain, negative with a dollar sign, and with a thousands comma.
dollars = pd.Series(['12', '-$10', '$10,000'])
# Remove every literal '$'. regex=False is spelled out so '$' is never read as
# the end-of-string anchor, whatever the installed pandas default is.
dollars.str.replace('$', '', regex=False)

0        12
1       -10
2    10,000
dtype: object

# Doesn't work as a regex: the unescaped '$' is the end-of-string anchor, so
# '-$' only matches a trailing '-' and nothing is replaced. regex=True is
# explicit because pandas >= 2.0 would otherwise match '-$' literally.
dollars.str.replace("-$", '-', regex=True)

0         12
1       -$10
2    $10,000
dtype: object

# Escape '$' so it is a literal dollar sign rather than the end-of-string
# anchor: '-\$' is replaced with '-'. regex=True is explicit because
# pandas >= 2.0 defaults to literal (non-regex) matching.
dollars.str.replace(r'-\$', '-', regex=True)

0         12
1        -10
2    $10,000
dtype: object

# Same regex written as a non-raw literal: the backslash is doubled because
# a lone backslash before '$' is an invalid escape sequence in a normal
# Python string. regex=True is explicit for pandas >= 2.0.
dollars.str.replace('-\\$', '-', regex=True)

0         12
1        -10
2    $10,000
dtype: object

# Rebind s to four single letters, then join all elements into one string with ','.
s = pd.Series(
    ["A", "B", "C", "D"],
)
s.str.cat(sep=',')

'A,B,C,D'

# With no separator the elements are concatenated directly into a single string.
s.str.cat()

'ABCD'

# Series with a missing value; na_rep supplies the text used in its place when joining.
t = pd.Series(
    ["a", "b", np.nan, "d"],
)
t.str.cat(sep=',', na_rep='_')

'a,b,_,d'

# Passing a list concatenates element-wise: each string in s gets the matching list item appended.
s.str.cat(['a',"b","c","d"])

0    Aa
1    Bb
2    Cc
3    Dd
dtype: object

# Named groups (?P<name>...) become the column names of the extracted frame.
# The pattern is a raw string so '\d' reaches the regex engine unchanged
# instead of being an invalid escape sequence in a normal string literal.
pd.Series(['a1', 'b2', 'c3']).str.extract(r'(?P<letter>[ab])(?P<digit>\d)', expand=False)

# One digit immediately followed by one lowercase ASCII letter.
pattern = r'[0-9][a-z]'
pd.Series(['1','2','3a','3b','03c']).str.contains(pattern)# contains: True where the pattern occurs anywhere in the string

0    False
1    False
2     True
3     True
4     True
dtype: bool

pd.Series(['1','2','3a','3b','03c']).str.match(pattern)# match: True only where the pattern matches at the start of the string ('03c' is False)

0    False
1    False
2     True
3     True
4    False
dtype: bool