Python-正则模块

1、正则模块是啥

正则就是用一些具有特殊含义的符号组合到一起(称为正则表达式)来描述字符或者字符串的方法。或者说:正则就是用来描述一类事物的规则。在Python中,正则功能内置于标准库,通过 re 模块实现;运行下文所有示例之前需要先执行 import re。
正则表达式:由一系列特殊字符拼接而成的表达式/规则,该表达式用于从一个大字符串中匹配出符合规则的子字符串。

2、常用匹配模式(元字符)

\w 匹配字母数字及下划线

\W 匹配非字母数字下划线

# \w
print(re.findall('\w\w\w',"h ello 123_ (0"))
['ell', '123']

# \W
print(re.findall('\W',"h ello 123_ (0"))
[' ', ' ', ' ', '(']

\s 匹配任意空白字符,包括空格以及 \t、\n、\r、\f、\v

\S 匹配非空白字符

# \s
print(re.findall('\s',"h e\tll\no 123_ (0"))
print(re.findall('\w\s',"h ello 123_ (0"))
[' ', '\t', '\n', ' ', ' ']
['h ', 'o ', '_ ']
# \S
print(re.findall('\S',"h e\tll\no 123_ (0"))
['h', 'e', 'l', 'l', 'o', '1', '2', '3', '_', '(', '0']

\d 匹配任意数字,对ASCII文本等价于 [0-9]

\D 匹配任意非数字

# \d
print(re.findall('\d',"h e\tll\no 123_ (0"))
['1', '2', '3', '0']
# \D
print(re.findall('\D',"h e\tll\no 123_ (0"))
['h', ' ', 'e', '\t', 'l', 'l', '\n', 'o', ' ', '_', ' ', '(']

\d 等元字符与普通字符组合使用,定制匹配范围

print(re.findall("a\db","a1b a2b a b aab aaaaaaaa1b a2c a22c a 3c"))
['a1b', 'a2b', 'a1b']

\n 匹配换行符

\t 匹配制表符

msg = """h e\tll\n\no 123_ (0
# \t1
#     2
# 3
# """
print(re.findall('\n', msg))
print(re.findall('\t', msg))
print(re.findall(' ', msg))
['\n', '\n', '\n', '\n', '\n', '\n']
['\t', '\t']
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']

^ 匹配字符串的开头

$ 匹配字符串的结尾

print(re.findall("^egon","egon asdf 213123 egon afsadf egon"))
print(re.findall("egon$","egon asdf 213123 egonafsadfegon"))
print(re.findall("^a\w\w\wc$","ab12c3c a213c"))
print(re.findall("^a\w\w\wc$","ab_2c"))
['egon']
['egon']
[]
['ab_2c']

. 匹配除换行符以外的任意单个字符;指定re.DOTALL后可以匹配包括换行符在内的任意字符

print(re.findall("a\db","a1b a2b aab aaaaaaab a+b a-b a c"))
print(re.findall("a\wb","a1b a2b aab aaaaaaab a+b a-b a c"))
print(re.findall("a.b","a1b a2b aab aaaaaaab a+b a-b a b a c"))
print(re.findall("a.b","a1b a2b aab aaaaaaab a\tb a-b a\nb a c",re.DOTALL))
['a1b', 'a2b']
['a1b', 'a2b', 'aab', 'aab']
['a1b', 'a2b', 'aab', 'aab', 'a+b', 'a-b', 'a b']
['a1b', 'a2b', 'aab', 'aab', 'a\tb', 'a-b', 'a\nb']

[] 用来匹配一个字符,我们可以指定其范围

print(re.findall("a[+-]b", "a1b a2b aab aaaaaaab a+b a-b a c"))
print(re.findall("a[.*/+-]b", "a.b a2b a*b a/b aab aaaaaaab a+b a-b a c"))
# 如果要匹配-,则需要放到最左边或者最右边,否则会被认定成范围符号
print(re.findall("a[a-z]b", "a.b a2b a*b a/b aab aaaaaaab a+b a-b a c")) 
print(re.findall("a[a-zA-Z]b", "a.b a2b a*b a/b aAb aCb aab aaaaaaab a+b a-b a c")) 
print(re.findall("a\db", "a.b a2b a*b a/b aAb aCb aab aaaaaaab a+b a-b a c"))
print(re.findall("a[0-9]b", "a.b a2b a*b a/b aAb aCb aab aaaaaaab a+b a-b a c"))
['a+b', 'a-b']
['a.b', 'a*b', 'a/b', 'a+b', 'a-b']
['aab', 'aab']
['aAb', 'aCb', 'aab', 'aab']
['a2b']
['a2b']

[^…] 代表取反,也就是匹配不在括号内的字符

print(re.findall("a[^0-9]b", "a.b a2b a*b a/b aAb aCb aab aaaaaaab a+b a-b a c")) # 若要匹配-,需放在[]内的开头或结尾
['a.b', 'a*b', 'a/b', 'aAb', 'aCb', 'aab', 'aab', 'a+b', 'a-b']

* 左边字符出现零到无穷次

+ 左边字符出现一到无穷次

print(re.findall("ab*","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
print(re.findall("ab+","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
['a', 'ab', 'abb', 'abbbbbbbbbbbb']
['ab', 'abb', 'abbbbbbbbbbbb']

{n,m}: 左边那个字符出现n次到m次

# 类似于+*的用法
print(re.findall("ab{0,}", "a ab abb abbbbbbbbbbbb bbbbbbbbb"))
print(re.findall("ab*", "a ab abb abbbbbbbbbbbb bbbbbbbbb"))
print(re.findall("ab{1,}", "a ab abb abbbbbbbbbbbb bbbbbbbbb"))
print(re.findall("ab+", "a ab abb abbbbbbbbbbbb bbbbbbbbb"))

# 2-5
print(re.findall("ab{2,5}","a ab abb abbb abbbb abbbbbbbb abbbbbbbbbbbb bbbbbbbbb"))

['a', 'ab', 'abb', 'abbbbbbbbbbbb']
['a', 'ab', 'abb', 'abbbbbbbbbbbb']
['ab', 'abb', 'abbbbbbbbbbbb']
['ab', 'abb', 'abbbbbbbbbbbb']
['abb', 'abbb', 'abbbb', 'abbbbb', 'abbbbb']

?: 左边那个字符出现0次到1次

print(re.findall("ab?","a ab abb abbbbbbbbbbbb bbbbbbbbb"))
['a', 'ab', 'ab', 'ab']

.*: 匹配任意多个任意字符(换行符除外),默认为贪婪匹配(尽可能多地匹配,一直匹配到最后一个满足条件的位置)

print(re.findall("a.*b","123 a1231-==-000b123123123123123b"))
['a1231-==-000b123123123123123b']

.*?为非贪婪匹配(推荐使用):尽可能少地匹配,一旦得到符合条件的结果就结束本次匹配

print(re.findall("a.*?b","123 a1231-==-000b123123123123123b"))
['a1231-==-000b']
# 例:
msg = '<a href="https://pan.baidu.com/s/1skWyTT7" target="_blank"><strong><span style="color: #ff0000;">原理图:https://pan.baidu.com/s/1skWyTT7</span></strong></a><a href="https://www.baidu/com">"点我啊"</a>'
print(re.findall('href="(.*?)"',msg))
['https://pan.baidu.com/s/1skWyTT7', 'https://www.baidu/com']

()分组

print(re.findall('ab+','ababab123'))
print(re.findall('(ab)+123','ababab123')) 
['ab', 'ab', 'ab']
['ab']
# 取消分组
print(re.findall('(?:ab)+123','ababab123')) #findall的结果不是匹配的全部内容,而是组内的内容,?:可以让结果为匹配的全部内容
['ababab123']

| 或:匹配左边或者右边的表达式

# 和上面的取消分组结合使用
print(re.findall("compan(?:ies|y)","Too many companies have gone bankrupt, and the next one is my company'"))
['companies', 'company']
# 组合使用
print(re.findall("\d+\.?\d*","as9fdasl333...4444df1111asdf3333dfadf333.44dafadf3.5555asdfsafd.5555"))
['9', '333.', '4444', '1111', '3333', '333.44', '3.5555', '5555']

\ 使用注意

# 报错
print(re.findall('a\\c','a\c a1c aac')) #python解释器读取'a\\c'时会先进行一次转义,交给re的实际是a\c;而对于正则来说\c是非法的转义序列,所以抛出异常(re.error)。要匹配a\c,正则本身需要收到a\\c

print(re.findall('a\\\\c','a\c a1c aac'))
print(re.findall(r'a\\c','a\c a1c aac'))
['a\\c']
['a\\c']

3、re模块提供方法介绍

findall

print(re.findall('e','alex make love') )   #['e', 'e', 'e'],返回所有满足匹配条件的结果,放在列表里

search

print(re.search('e','alex make love')) #找到第一个匹配后立即返回一个包含匹配信息的对象,该对象可以通过调用group()方法得到匹配的字符串(此处为'e'),如果字符串没有匹配,则返回None。
<re.Match object; span=(2, 3), match='e'>

print(re.search('e','alex make love').group())
e

print(re.search("\d+\.?\d*","1.3 aa3.44aaa").group())
print(re.search("\d+\.?\d*","asdfsadf"))
1.3
None

match

print(re.match('e','alex make love'))    #None,同search,不过在字符串开始处进行匹配,完全可以用search+^代替match
print(re.search("\d+\.?\d*"," 1.3 aa3.44aaa"))
print(re.match("\d+\.?\d*"," 1.3 aa3.44aaa"))
<re.Match object; span=(1, 4), match='1.3'>
None

split

print(re.split('[ab]','abcd'))     #['', '', 'cd'],先按'a'分割得到''和'bcd',再对''和'bcd'分别按'b'分割

msg = "egon:18-male=10"
print(msg.split(':'))
print(re.split('[:=-]', msg))
['egon', '18-male=10']
['egon', '18', 'male', '10']

sub

print('===>', re.sub('a', 'A', 'alex make love'))  # ===> Alex mAke love,不指定n,默认替换所有
print('===>', re.sub('a', 'A', 'alex make love', 1))  # ===> Alex make love
print('===>', re.sub('a', 'A', 'alex make love', 2))  # ===> Alex mAke love
print('===>', re.sub('^(\w+)(.*?\s)(\w+)(.*?\s)(\w+)(.*?)$', r'\5\2\3\4\1', 'alex make love'))  # ===> love make alex
print('===>', re.subn('a', 'A', 'alex make love'))  # ===> ('Alex mAke love', 2),结果带有总共替换的个数
msg = '<a href="https://pan.baidu.com/s/1skWyTT7" target="_blank"><strong><span style="color: #ff0000;">原理图:https://pan.baidu.com/s/1skWyTT7</span></strong></a><a href="https://www.baidu/com">"点我啊"</a>'
print(re.findall('href="(.*?)"', msg))
url_pattern = re.compile('href="(.*?)"')
res = url_pattern.findall(msg)
print(res)
['https://pan.baidu.com/s/1skWyTT7', 'https://www.baidu/com']
['https://pan.baidu.com/s/1skWyTT7', 'https://www.baidu/com']

猜你喜欢

转载自blog.csdn.net/msmso/article/details/107764176