【Python网络爬虫】150讲轻松搞定Python网络爬虫付费课程笔记 篇十一——正则表达式和re模块

1. 匹配单字符

import re

#match 从最开始的位置开始查找匹配,如果第一个字符不匹配则返回
# text = "abc"
# ret = re.match('a', text)
# print(ret.group())

# . 匹配任意的字符, 除了'\n'
# text = '2bc'
# ret1 = re.match('.', text)
# print(ret1.group())

# \d 匹配任意的数字
# text = "2345"
# ret2 = re.match('\d', text)
# print(ret2.group())

# \D 匹配任意非数字
# text = "+S345"
# ret3 = re.match('\D', text)
# print(ret3.group())

# \s 匹配空白字符, \n, \t, \r 空格
# text = "\n2345"
# ret3 = re.match('\s', text)
# print('=' * 20)
# print(ret3.group())
# print('=' * 20)

# \S 匹配非空白字符, \n, \t, \r 空格
# text = "2345"
# ret4 = re.match('\S', text)
# print(ret4.group())

# \w 匹配字母a-z, A-Z 以及数字和下划线w
# text = "ac45"
# ret5 = re.match('\w', text)
# print(ret5.group())

# \W 匹配与\w相反
# text = "+ac45"
# ret6 = re.match('\W', text)
# print(ret6.group())a

# [] 组合匹配中括号中满足一个的条件
# text = "bc"
# ret6 = re.match('[1bc]', text)
# print(ret6.group())

# text = "123"
# ret6 = re.match('[0-9]', text)
# print(ret6.group())

# text = "bc"
# ret6 = re.match('[^0-9]', text)
# print(ret6.group())

text = "bc"
ret6 = re.match('[a-zA-Z0-9]', text)
print(ret6.group())

2. 匹配多字符

import re

# *, 匹配0个或多个字符,匹配条件由前面设置
# text = "+abc"
# result = re.match('\w*', text)
# print(result.group()) #空

# +, 匹配1个或多个字符,匹配条件由前面设置
# text = "abc"
# result = re.match('\w+', text)
# print(result.group()) #abc


# ?, 匹配前一个字符要求的0个或多个字符,匹配条件由前面设置
# text = "1abc"
# result = re.match('\w?', text)
# print(result.group()) #1

# {m}, 匹配m个字符,匹配条件由前面设置
# text = "12abc"
# result = re.match('\w{3}', text)
# print(result.group())

# {m,n}, 匹配m-n之间个字符,匹配条件由前面设置 
text = "12abc"
result = re.match('\w{1,3}', text)#注意{m,n}中间没有空格,加上空格会报错
print(result.group())

3. 开始 / 结束 / 贪婪 / 非贪婪

import re

# ^ 以...开头
# text = "hello world"
# result = re.search('^hello', text)
# print(result.group())

#$ 以..结尾
# text = "hello world"
# result = re.search('world$', text)
# print(result.group())
#
# text = ""
# result = re.search('^$', text)
# print(result.group())

# | 匹配多个表达式

# 贪婪和非贪婪
# text = "12345"
# result = re.search('\d+?', text)#非贪婪模式, 满足条件下尽可能少的匹配字符
# print(result.group())

#案例
# text = "<h1>这是个大标题</h1>"
# result1 = re.search('<.+>', text) #贪婪模式,满足条件下尽可能多的匹配字符
# result2 = re.search('<.+?>', text)#非贪婪模式, 满足条件下尽可能少的匹配字符
# print(result1.group())
# print(result2.group())

#案例: 验证一个字符是不是0-100之间的数字0,1, 99, 100
text = "100"
result = re.match('0$|[1-9]\d?$|100$', text)
print(result.group())

4. 转义字符和原生字符

import re

# 转义字符
# text = "hello\\nworld"
# print(text)

# 正则表达式中的转义字符
# text = r"hello\nworld"
# print(text)

# 正则表达式中的转义字符
# text = "apple price is $99, orange price is $88"
# result = re.findall("\$\d+", text)
# print(result) 

# 原生字符串 和正则表达式
# 正则表达式的字符串解析规则
# 先把字符串放到python层面解析,之后放到正则表达式解析
text = "\cba a"
result = re.match("\\\\c", text) # \\\\c (python层面) > \\c (正则表达式) > \c
result2 = re.match(r'\\c', text)
print(result.group())
print(result2.group())

5. 分组 

import re

#分组
text = "apple price is $99, orange price is $88"
result = re.search('.+(\$\d+).+(\$\d+)', text)
print(result.group(0))  # / group() 匹配整个分组 apple price is $99, orange price is $88
print(result.group(1))  #匹配第一个分组 $99
print(result.group(2))  #匹配第二个分组... $88
print(result.groups())  # 获取所有分组('$99', '$88')

6. 常用函数

6.1 sub

#sub 根据规则替换其他字符串
text = "nuhao zhongque, lala hello world"
# new_text = text.replace(" ", "\n")
# print(new_text)

new_text = re.sub(r' |,', '\n', text)
print(new_text)
# nuhao
# zhongque
#
# lala
# hello
# world

html = '''
<div class="job-detail">
        <p>工作职责:<br>1.从事计算机视觉相关服务、以及平台研发,以及现有服务维护及优化;<br>2.参与研发过程中的需求分析和架构设计。<br>任职资格:<br>1.三年及以上python开发经验,&nbsp;熟悉django&nbsp;flask等常见开发框架;<br>2.&nbsp;计算机相关专业,具备扎实的编程功底,熟悉常用数据结构和算法、设计模式;<br>3.&nbsp;具备独立数据库设计,以及系统设计能力,如mongo、redis、mysql,缓存、消息队列等相关技术;<br>4.&nbsp;具有良好的编程思想、沟通、团队合作精神、优秀的分析问题和解决问题的能力;具备强烈的责任心。</p>
</div>
'''

new_html = re.sub(r'<.+?>', '', html)
print(new_html)

6.2 split 

text = "nihao zhongguo, hello world"
result =  re.split(r' |,', text)
print(result) #['nihao', 'zhongguo', '', 'hello', 'world']

6.3 compile

# compile 编译正则表达式
text = "apple price is 34.56"
result = re.compile(r'\d+\.?\d*')
r = re.search(result, text)
print(r.group()) #34.56

6.4 VERBOSE 

# compile 编译正则表达式
text = "apple price is 34.56"
result = re.compile(r'''
\d+ #整数部分
\.? #小数点
\d* #小数部分
''', re.VERBOSE)#如果在正则表达式中加注释,需要在函数最后加re.VERBOSE 
r = re.search(result, text)
print(r.group())

6.5 findall 

#findall
# text = "apple price is $99, orange price is $88"
# result = re.findall(r'\$\d+', text) # findall()返回是一个list
# print(result) #['$99', '$88']

猜你喜欢

转载自blog.csdn.net/weixin_44566432/article/details/108691177