python3 regular expression re module

Regular expressions are a small, highly specialized programming language embedded in Python and made available through the re module. A regular expression pattern is compiled into a series of bytecodes, which are then executed by a matching engine written in C.

Used to match strings

 

Ordinary characters and metacharacters:

Metacharacters:

. Wildcard: matches any single character except a newline; each dot stands for exactly one character

>>> import re
>>> re.findall('w\w{2}l','hello world')
['worl']
>>> re.findall('w..l','hello world')
['worl']
>>> re.findall('w.l','hello world')
[]

 

^ Match the beginning of the string

 

>>> re.findall('^h...o','hjkjlkjjkhello')
[]
>>> re.findall('h...o','hjkjlkjjkhello')
['hello']

 

$ Matches the end of the string

 

>>> re.findall('a..d$','hkekjannd')
['annd']
>>> re.findall('a..d$','hkekjanndy')
[]

 

* Repetition: matches the preceding character zero or more times

 

>>> re.findall('a.*li','gjkjalexlifjjl')
['alexli']
>>> re.findall('sak*jl','gkskkkkkkkkjljkljl')
[]
>>> re.findall('sak*j','gkskkkkkkkkjljkljl')
[]
>>> re.findall('sk*j','gkskkkkkkkkjljkljl')
['skkkkkkkkj']

 

+ Repetition: matches the preceding character one or more times (it must occur at least once)

 

>>> re.findall('sk+j','gkskkkkkkkkjljkljl')
['skkkkkkkkj']
>>> re.findall('a+b','bbbaabx')
['aab']
>>> re.findall('a+b','bbbaabxaaabm')
['aab', 'aaab']

 

? Repetition: matches the preceding character zero times or one time, i.e. a count in the range [0,1]

 

>>> re.findall('a?b','bbbaabx')
['b', 'b', 'b', 'ab']
>>> re.findall('a?b','baabx')
['b', 'ab']

 

{} Repetition with a custom count: {n} matches the preceding character exactly n times, and {m,n} matches it between m and n times. Matching is greedy by default, so as many repetitions as possible are taken.

 

>>> re.findall('a{2}b','abb')
[]
>>> re.findall('a{2}b','aabbx')
['aab']
>>> re.findall('a{2}b','aaabbx')
['aab']
>>> re.findall('a{2}b','aaaabbx')
['aab']
>>> re.findall('a{1,3}b','aaaabbx')
['aaab']
>>>
>>> re.findall('a{1,3}b','aaaabbx')
['aaab']
>>>
>>> re.findall('a{2,3}b','abbx')
[]

summary:

* Equivalent to {0, +∞}

+ Equivalent to {1, +∞}

? Equivalent to {0, 1}

 

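>>> re.findall('abc*?','abcccc')  # lazy matching takes the minimum number of repetitions: 0 for * (range 0-∞)
['ab']
>>> re.findall('abc+?','abcccc') # lazy matching takes the minimum number of repetitions: 1 for + (range 1-∞)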
['abc']

 

 

 

 

 

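[] Character set: matches any single character listed inside the brackets. Metacharacters written inside the brackets lose their special meaning, with the exception of \, ^ and -. Note that a comma inside the brackets is a literal comma that can itself be matched, not a separator between the listed characters.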

>>> import re
>>> re.findall('a[c,d]x','acx')
['acx']
>>> re.findall('a[c,d]x','adx')
['adx']
>>> re.findall('a[c,d]x','acdx')
[]
>>> re.findall('a[c,d,r]x','arx')
['arx']
>>> re.findall('a[c,d,r]x','ar x')
[]
>>> re.findall('a[c,d,r, ]x','ar x')
[]
>>> re.findall('a[c,d,r, ]x','a x')
['a x']
>>>
>>> re.findall('[a-z]','a x')
['a', 'x']
>>> re.findall('v[a-z]i','vaoxi')
[]
>>> re.findall('v[a-z]i','vxi')
['vxi']
>>> re.findall('v[w,*]i','v*i')
['v*i']
>>> re.findall('v[w,*]i','vni')
[]
>>> re.findall('v[w,*]i','vwi')
['vwi']
>>> re.findall('[w,*,,]','vwi,')
['w', ',']
>>> re.findall('[1-9,a-z,A-Z]','13tyQW')
['1', '3', 't', 'y', 'Q', 'W']
>>> re.findall('1-9a-zA-z','13tyQW')
[]
>>> re.findall('[1-9a-zA-z]','13tyQW')
['1', '3', 't', 'y', 'Q', 'W']

 

[^] A ^ as the first character inside the brackets negates the set: it matches any character that is not listed

 

>>> re.findall('[^ty]','13tyQW')
['1', '3', 'Q', 'W']
>>> re.findall('[^1ty]','13tyQW')
['3', 'Q', 'W']
>>> re.findall('[^t,y]','13tyQW')
['1', '3', 'Q', 'W']

 

\ Backslash: placed before a metacharacter, it removes that metacharacter's special meaning; placed before certain ordinary characters, it gives them a special meaning

 

\d matches any decimal digit, [0-9]

\D matches any non-digit character, [^0-9]

>>> import re
>>> re.findall('\d{3}','qq23490208')
['234', '902']
>>> re.findall('\D','qq23490208')
['q', 'q']

\s matches any whitespace character, [ \t\n\r\f\v]

\S matches any non-whitespace character, [^ \t\n\r\f\v]

>>> re.findall('\s{2}','qq23490208')
[]
>>> re.findall('\s{2}','qq 2 3 490208')
[]
>>> re.findall('\s','qq 2 3 490208')
[' ', ' ', ' ']
>>> re.findall('\S','qq 2 3 490208')
['q', 'q', '2', '3', '4', '9', '0', '2', '0', '8']
>>> re.findall('\S2','qq 2 3 490208')
['02']
>>> re.findall('\S4','qq 2 3 490208')
[]
>>> re.findall('\s2','qq 2 3 490208')
[' 2']
>>> re.findall('\s4','qq 2 3 490208')
[' 4']

 

\w matches any alphanumeric character or underscore, [a-zA-Z0-9_]

 

\W matches any character that is not alphanumeric or underscore, [^a-zA-Z0-9_]

>>> re.findall('\w','qq 2 3 490208')
['q', 'q', '2', '3', '4', '9', '0', '2', '0', '8']
>>> re.findall('\W','qq 2 3 490208')
[' ', ' ', ' ']

 

\b matches a word boundary: the zero-width position between a word character and a non-word character such as a space or a special character (or the start or end of the string)

 

>>> re.findall(r'I\b','I am Ice')
['I']
>>> re.findall(r'I\b','I am I$e')
['I', 'I']
>>> re.findall(r'\bI','hello,I am uI$e')
['I']
>>> re.findall(r'\bI','hello,I am I$e')
['I', 'I']

 

re.search() finds the first match in the string; call group() on the result to get the matched value

>>> import re
>>> re.search('a..c','xxaauocl')
<_sre.SRE_Match object; span=(3, 7), match='auoc'>
>>> ret = re.search('a..c','xxaauocl')
>>> ret.group() # returns the matched value
'auoc'

() Grouping: the pattern inside the parentheses is treated as a single unit

>>> re.search('(as)+','asdjkaskkoas').group()  # (as) is matched as a single unit
'as'
>>> re.search('(as)|3','asdjkaskkoas').group() # | is the pipe character, meaning "or"
'as'
>>> re.search('(as)|3','asdjkaskkoas3').group() # returns the first match found
'as'
>>> re.search('(as)|3','3asdjkaskkoa').group()
'3'

 

>>> import re
>>> ret = re.search('(?P<id>\d{3})','weeew34ttt123/ooo') # named group: wrap the pattern in parentheses; ?P<id> is the fixed syntax that names the group, and group('id') retrieves the text that group matched
>>> ret.group()
'123'
>>> ret = re.search('(?P<id>\d{3})/(?P<name>\w{3})','weeew34ttt123/ooo') # two named groups: id and name
>>> ret.group()
'123/ooo'
>>> ret.group('id')  # value matched by the id group
'123'
>>> ret.group('name') # value matched by the name group
'ooo'

Regular expression methods:

findall() returns all matches as a list

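>>> re.findall('www.(\w+).com','www.baidu.com') # with a capturing group, findall() returns only the group's content: 'baidu'
['baidu']
>>> re.search('www.(\w+).com','www.baidu.com').group() # search().group() returns the whole matched string
'www.baidu.com'
>>> re.findall('www.(?:\w+).com','www.baidu.com') # ?: makes the group non-capturing, so findall() returns the whole matched string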
['www.baidu.com']

finditer() returns an iterator over the match objects; use next() to fetch each match and group() to get its value

>>> ret = re.finditer('\d','q1w2e3r4t5y') # returns an iterator
>>> print(ret)
<callable_iterator object at 0x00000265533B1668>
>>> next(ret)
<_sre.SRE_Match object; span=(1, 2), match='1'>
>>> next(ret).group() # get the matched value
'2'

 

search() returns a match object for the first match only; use the group() method to get the matched text

 

match() only matches at the beginning of the string, use the group() method to get the returned result

>>> ret = re.match('asd',"asd45ooolasde")
>>> ret.group()
'asd'

 

split() splits the string at every match of the pattern

>>> import re
>>> re.split('[k,s]','djksal')
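['dj', '', 'al']  # an empty string appears because the delimiters k and s are adjacent
>>> re.split('[ks]','djksal') # splits at every k or s (as if splitting on k first, then splitting each piece on s)
['dj', '', 'al']
>>> re.split('[ks]','djkskasl') # splits at every k or s (as if splitting on k first, then splitting each piece on s)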
['dj', '', '', 'a', 'l']

 

 

sub() replaces matches of a pattern; it takes at least three arguments (pattern, replacement, string)

 

 

>>> re.sub('a..x','s..b','hfjasalexxdhf')
'hfjass..bxdhf'
>>> import re
>>> re.sub('\d','A','q1w2e3r4t4y',1) # arguments in order: the pattern to match, the replacement string, the string to operate on, and the maximum number of matches to replace
'qAw2e3r4t4y'
>>> re.sub('\d','A','q1w2e3r4t4y',2)
'qAwAe3r4t4y'
>>> re.sub('\d','A','q1w2e3r4t4y',4)
'qAwAeArAt4y'
>>> re.sub('\d','A','q1w2e3r4t4y') # without a count argument, all matches are replaced
'qAwAeArAtAy'

 

 

 

compile() compiles a pattern into a regular expression object that can be reused

 

 

 

>>> obj = re.compile('\.com')
>>> obj.findall('fajfskjk.comfsdds')  # call the method directly on the compiled object
['.com']

 

 

 

Guess you like

Origin blog.csdn.net/dance117/article/details/80490545