# 3.6 初识正则表达式

# 正则表达式
# 特定字符串的查找、切割、替换等
# 邮箱格式、URL、IP等的校验
# 使用原则
#只要使用字符串等函数可以解决的问题，就不要使用正则
# 正则表达式的效率比较低，同时会降低代码的可读性
# 正则表达式是用来写的不是用来读的，在不指定功能的
# 情况下，不要试图阅读别人的正则
# 正则引擎不需要我们自己实现，在python中是通过re模块来使用的
# 相关函数：
# match:从头开始进行匹配，匹配到就返回正则结果对象，没有就返回None
# search：从任意位置匹配，功能同上
import re
# re.match only succeeds when the pattern matches at the very start of the
# string; it returns a match object, or None when there is no match.
m = re.match(pattern='abc', string='abcdask; abcdefg')
if m is not None:
    # group() is the matched text, span() its (start, end) index pair.
    print(m.group(), m.span(), sep='\n')

# re.search scans forward and reports the first match found anywhere.
m = re.search(pattern='abc', string='shdlkghabclhrrlabc')
if m is not None:
    print(m.group(), m.span(), sep='\n')

# re.findall collects every non-overlapping match into a list; the list is
# empty when nothing matches.
f = re.findall(pattern='abcd', string='abcdeabcdeabcde')
print(f)

# compile: build a reusable regular-expression object.
c = re.compile('hello')
print(type(c))

# Pattern.match: the match must begin at the start of the string.
m = c.match('hellodkglg; hellodgkgihello')
if m:
    print(m.group())
    print(m.span())

# Pattern.search: the first match anywhere in the string.
s = c.search('hellodgkglglhhellogfgjfl')
if s:
    # Report the search result `s` that was just tested, not the earlier
    # match result `m`.
    print(s.group())
    print(s.span())

# Pattern.findall: every match, returned as a list.
f = c.findall('helloghfkghellokghello')
print(f)

# Pattern rules
# Ordinary characters match themselves, one for one.
# [...] matches any single character listed between the brackets.
# Each entry below is (pattern, sample text); every pattern is compiled, run
# through findall, and the resulting list printed.
_samples = (
    # [a-z]: any one lowercase ASCII letter.
    ('[a-z]', 'fkgdflgjflhjl1dg3dg;5djf711'),
    # [a-zA-Z]: any ASCII letter; the ranges are written back to back with
    # nothing extra between them.
    ('[a-zA-Z]', 'abcdflghhjtldleitih;dfkjoirtoeu'),
    # [^0-9]: any character that is not a digit 0-9.
    ('[^0-9]', 'dhkg*/.dl124596dfido'),
    # . : any character except \n.
    ('.', '1\n2\n3\n4\n5\n'),
)
for _pattern, _text in _samples:
    c = re.compile(_pattern)
    s = c.findall(_text)
    print(s)
# Drop the loop helpers so only `c` and `s` remain, as before.
del _samples, _pattern, _text

# \d matches a digit character, equivalent to [0-9].
# Patterns are written as raw strings so the backslash reaches the regex
# engine untouched; '\d' in a plain literal is an invalid string escape that
# newer Python versions warn about.
c = re.compile(r'\d')
s = c.findall('1a2b3c4d5e')
print(s)

# \D matches a non-digit character, equivalent to [^0-9].
c = re.compile(r'\D')
s = c.findall('1a2b3c4d5e')
print(s)

# \w matches a word character (digits, letters, underscore, Chinese
# characters). Raw strings keep the backslash intact for the regex engine and
# avoid the invalid-escape warning a plain '\w' literal triggers.
c = re.compile(r'\w')
s = c.findall('我爱我家521--0')
print(s)
# \W matches the opposite of \w.
c = re.compile(r'\W')
s = c.findall('我爱我家521--0')
print(s)

# \s matches any whitespace character. The pattern is a raw string so the
# backslash is passed to the regex engine as written; a plain '\s' literal is
# an invalid string escape that newer Python versions warn about.
c = re.compile(r'\s')
s = c.findall('h e l l 0 \t\n\r')
print(s)

# \S matches any non-whitespace character.
c = re.compile(r'\S')
s = c.findall('h e l l 0 \t\n\r')
print(s)

# \b matches a word boundary: here 'cc' is found only where it starts a word.
c = re.compile(r'\bcc')
s = c.findall('aaahelloworld; ccchelloworld')
print(s)

# 次数的限定
#  * 任意次（0次或多次）
#  + 至少一次
#  ? 最多一次（0次或1次）
# {m,n}: m <= 次数 <= n（花括号内不能有空格，否则会被当作普通字符）
# {m,} 至少m次
# {,n} 最多n次
# {m} 指定m次
# Anchors
#  ^ requires the match to start at the beginning of the string
#  $ requires the match to finish at the end of the string
# Examples
f = re.compile('^hello').findall('hellodkjgfgjfll')
print(f)

f = re.compile('hello$').findall('dkghfkgfkghhello')
print(f)

# Precedence control
# | means "or" and has the lowest precedence of all operators.
# () turns its contents into a single unit, making grouping and precedence
# explicit.
# Example
import re
# Collect what the (hello|world) group captured for each match; this is the
# same list findall produces for a pattern with exactly one group.
f = [hit.group(1)
     for hit in re.finditer('a(hello|world)c', 'dshfldghahellocaworldc')]
print(f)

# Group matching
# Example: three capture groups - digits, lowercase letters, digits.
c = re.compile(r'(\d+)([a-z]+)(\d+)')
s = c.search('abcd521abcdefghijk520msdn')
# Group 0 is the whole match; groups 1-3 are the parenthesised parts in order.
for _group_no in range(4):
    print(s.group(_group_no), s.span(_group_no))
# Remove the loop counter so only `c` and `s` remain, as before.
del _group_no

# Example 2
import re

# Fixed form: only a literal <a>...</a> element is recognised.
c = re.compile(r'<a>\w+</a>')
s = c.search('<div><a>百度一下</a></div>')
if s is not None:
    print(s.group())

# Backreferences: \1 stands for whatever the first () captured and \2 for the
# second, so each closing tag has to repeat its opening tag's name.
e = re.compile(r'<([a-z]+)><([a-z]+)>\w+</\2></\1>')
g = e.search('<div><a>百度一下</a></div>')
if g is not None:
    print(g.group())

# Greedy matching
# Greedy: match as much as possible. Regex quantifiers are greedy by default.
# Non-greedy: match as little as possible while still satisfying the pattern.
# A trailing '?' after a quantifier switches greediness off.
# Example:
import re
# '?' makes the "any number of times" quantifier (*) non-greedy.
c = re.compile(r'a.*?b')
# '?' makes the "at least once" quantifier (+) non-greedy.
e = re.compile(r'a.+?b')
s = c.search('sljgacedgkgblgljbgflb')
if s:
    print(s.group())

k = e.search('sljgacedgkgblgljbgflb')
if k:
    # Print the result of the second pattern (`k`), which is the one guarded
    # by this branch, rather than the first pattern's result `s`.
    print(k.group())

# Matching modes
# A matching mode (flag) modifies how the whole pattern is applied.
# Examples:
import re
# re.IGNORECASE (short form re.I): ignore letter case.
s = re.search(r'hello', 'HELLO world', flags=re.IGNORECASE)
if s is not None:
    print(s.group())

# By default ^ only matches at the start of the whole string;
# re.MULTILINE (short form re.M) also lets it match at the start of each line.
s = re.search(r'^hello', 'dkjh\nhello world', flags=re.MULTILINE)
if s is not None:
    print(s.group())

# re.DOTALL (short form re.S): "." normally stops at \n; with this flag it
# matches every character including \n, so the text is treated as one line.
string = '<div>hello</div>'
strings = '''<div>
hello
</div>'''
s = re.search(r'<div>.*?</div>', strings, flags=re.DOTALL)
if s is not None:
    print(s.group())
# 3.6 初识正则表达式

# 猜你喜欢