[Python3 crawler (5)] [Data analysis] [Regular expression]

Previous: [Python3 crawler (4)] [urllib.request module] [SSL authentication + cookies (string type conversion, session)]
++++++++++ start line+++++++ +++++++++++

1. Regular expressions

1.1 Mind Map

Insert picture description here

For details, please see: https://blog.csdn.net/make164492212/article/details/51699545

1.2 Greedy mode and non-greedy mode

re.py

import re

# Greedy mode matches as much text as possible (up to the last place the
# rest of the pattern can still match); quantifiers are greedy by default.

# Sample inputs; `one` is used by the greedy / non-greedy examples below.
one = 'mdfsdsfffdsn12345656n'
two = "2.5"
# Not a raw string: '\b' here is the backspace character, so `three` is
# 'a' followed by '\x08'.
three = 'a\b'

# Regular expressions: by convention the compiled regex is stored in a
# variable named `pattern`.

# pattern = re.compile('m(.*)n')  # greedy mode
# result = pattern.findall(one)  # findall returns a list
# print(result)
# ['dfsdsfffdsn12345656']

# pattern = re.compile('m(.*?)n')  # non-greedy mode
# result = pattern.findall(one)
# print(result)
# ['dfsdsfffds']

# NOTE: the '.' below is unescaped, so it matches any character (it would
# also match '2x5'); use r'2\.5' to match a literal dot only.
# pattern = re.compile('2.5')
# result = pattern.findall(two)
# print(result)
# ['2.5']

# Raw string: here \b reaches the regex engine as a word-boundary assertion,
# which succeeds between 'a' and the (non-word) backspace character.
# pattern = re.compile(r'a\b')
# result = pattern.findall(three)
# print(result)
# ['a']

1.3 Matching

re2.py

import re

# '.' matches any character except the newline '\n'
one = """
    msfdsdffdsdfsn
    1234567778888N
"""

# Match the characters between 'm' and 'n'
# pattern = re.compile('m(.*)n')
# result = pattern.findall(one)
# print(result)
# '.' does not match the newline, so the match stays on the first line
# ['sfdsdffdsdfs']


# Regular expressions are case-sensitive by default; the flags below change
# that and also let '.' cross line breaks
# pattern = re.compile('m(.*)n', re.S | re.I)
# result = pattern.findall(one)
# print(result)
# ['sfdsdffdsdfsn\n    1234567778888']

# re.S (DOTALL): '.' also matches the newline
# re.I (IGNORECASE): matching ignores upper/lower case

1.4 Matching digits only

re3.py

import re

# Digits-only regex: \d matches a single decimal digit, ^ and $ anchor the
# pattern to the start and end of the string.
# Written as a raw string so that \d is handed to the regex engine unchanged;
# in a normal string literal '\d' is an invalid escape sequence and triggers
# a DeprecationWarning / SyntaxWarning on modern Python.
pattern = re.compile(r'^\d+$')
one = '234'

# How to test for a match:
# match() tries once, anchored at the start of the string, and returns a
# Match object on success or None on failure.
result = pattern.match(one)
print(result.group())
# 234

1.5 Range operations

re4.py

import re


# Character classes: [...] matches any single character listed inside it.
one = '7893452'

# Match a 1, a 2 or a 3:
# pattern = re.compile('[123]')
# ['3', '2']

# Match any single digit in the range 1 through 9.
pattern = re.compile('[1-9]')
# The module-level re.findall accepts a compiled pattern as well.
result = re.findall(pattern, one)

print(result)
# ['7', '8', '9', '3', '4', '5', '2']

1.6 Regular expression methods

re5.py

import re

one = 'abc 123'
# Raw string so that \d is handed to the regex engine unchanged; in a normal
# string literal '\d' is an invalid escape sequence and triggers a
# DeprecationWarning / SyntaxWarning on modern Python.
patter = re.compile(r'\d+')

# match: anchored at the start of the string, matches at most once.
# 'abc 123' starts with a letter, so there is no match here.
result = patter.match(one)
# None

# search: scans the whole string and returns only the first match
# result = patter.search(one)
# <re.Match object; span=(4, 7), match='123'>

# findall: returns every non-overlapping match as a list
# result = patter.findall(one)
# ['123']

# sub: replaces every match with the given string
# result = patter.sub('#', one)
# abc #

# split: splits the string at every match of the pattern
# patter = re.compile(' ')
# result = patter.split(one)
# ['abc', '123']

print(result)

++++++++++End line++++++++++++++++++

Guess you like

Origin blog.csdn.net/qq_42893334/article/details/108433963