[Python web-crawler road, day 8]: Regular Expressions

Earlier we learned the lxml and BeautifulSoup parsing tools; today we move on to something a little harder: regular expressions.
In short, a regular expression in Python lets you extract the substring you want from a larger string.
import re
# I. Match a single character

# 1. Match a literal string at the start of the text.
text = "heooo"
ret = re.match(r"he", text)
print(ret.group())
# 2. A dot (.) matches any single character, but not a newline.
text = "heooo"
ret = re.match(r".", text)
print(ret.group())
# 3. \d matches a digit 0-9.
text = '2'
ret = re.match(r"\d", text)  # raw string: "\d" is not a valid Python escape
print(ret.group())
# 4. \D matches any non-digit character -- the exact opposite of \d.
text = 'd'
ret = re.match(r"\D", text)
print(ret.group())
# 5. \s matches a whitespace character: "\n", "\r", "\t" or a space.
# 6. \w matches a-z, A-Z, digits and the underscore.
# 7. \W matches the opposite of \w.
# 8. Character sets: anything listed inside [] can be matched.
text = '2332-23212112'
ret = re.match(r"[\d\-]+", text)
print(ret.group())
# 8.1. A [] set used in place of \d.
# ret = re.match(r"[0-9]", text)
# 8.2. A [] set used in place of \D, using "^" to negate the set.
text = '='
ret = re.match(r"[^0-9]", text)
print(ret.group())
# 8.3. A [] set used in place of \w (the underscore is included so the set
#      agrees with the definition of \w given in item 6).
text = 'D'
ret = re.match(r"[a-zA-Z0-9_]", text)
print(ret.group())
# 8.4. A [] set used in place of \W (the negation of the set in 8.3).
text = '--'
ret = re.match(r"[^a-zA-Z0-9_]+", text)
print(ret.group())

# II. Match multiple characters

# 1. * matches zero or more characters.
text = '98342'
ret = re.match(r"\d*", text)
print(ret.group())
# 2. + matches one or more characters.
text = '98342'
ret = re.match(r"\d+", text)
print(ret.group())
# 3. + matches one or more characters (same demonstration as example 2).
text = '98342'
ret = re.match(r"\d+", text)
print(ret.group())
# 4. ? matches zero or one character.
text = 'asd'
ret = re.match(r"\w?", text)
print(ret.group())
# 5. {m} matches exactly m characters.
text = 'asdsds'
ret = re.match(r"\w{4}", text)
print(ret.group())
# 6. {m,n} matches between m and n characters; of the allowed counts
#    [2, 3, 4, 5] the largest one that fits is taken.
text = 'asdj33g'
ret = re.match(r"\w{2,5}", text)
print(ret.group())

####### Small Case #########

# 1. Match a mobile phone number: a leading 1, a second digit from
#    [345789], then nine more digits.
text = '13837389987'
ret = re.match(r"1[345789]\d{9}", text)  # raw string avoids the invalid "\d" escape
print(ret.group())
# 2. Match an email address.
# NOTE: the sample address in the scraped page had been replaced by an
# "[email protected]" placeholder, which the pattern cannot match (re.match
# returned None and .group() raised). A sample address is used instead.
text = 'user@example.com'
ret = re.match(r"\w+@[a-z0-9]+\.[a-z]+", text)
print(ret.group())
# 3. Match a web address: a scheme, "://", then everything up to the next
#    whitespace character.
text = 'https://www.bilibili.com/'
# "[^\s]" (any non-whitespace) replaces the mistyped "[^/s]", which stopped
# the match at the first "/" or "s" after the scheme.
ret = re.match(r"(http|https|ftp)://[^\s]+", text)
print(ret.group())
# 4. Match an ID-card number: 17 digits followed by a digit, "x" or "X".
text = '12432318881149884X'
ret = re.match(r"\d{17}[\dxX]", text)  # raw string avoids the invalid "\d" escape
print(ret.group())
# III. Start, end and alternation syntax. (Inside [] a "^" means negation.)
# 1. ^ (the caret) anchors the match at the start.
text = 'fffs'
ret = re.search(r"^\w+", text)
print(ret.group())
# 2. $ anchors the match at the end.
# NOTE: the sample address in the scraped page had been replaced by an
# "[email protected]" placeholder, which can never match this pattern; a
# sample @qq.com address is used so the example runs.
text = 'example@qq.com'
ret = re.match(r"\w+@qq\.com$", text)
print(ret.group())
# 3. "|" matches any one of several strings or expressions.
# 4. Greedy and non-greedy matching.
text = '14232984'
ret = re.match(r"\d+", text)   # greedy: would give 14232984
ret = re.match(r"\d+?", text)  # non-greedy: gives 1 (this is the result printed)
print(ret.group())
# The opening "<h1>" tag had been stripped from the sample text (leaving
# "<>"), which made the non-greedy pattern match the whole string instead of
# the "<h1>" named in the comments below. It is restored here; the raw string
# keeps the "\h" exactly as written.
text = r'<h1>嘻嘻嘻<\h1>'
ret = re.match(r"<.+>", text)   # greedy: <h1>嘻嘻嘻<\h1>

ret = re.match(r"<.+?>", text)  # non-greedy: <h1>

# 4.1. Match an integer from 0 to 100: "0", or a non-zero digit with an
#      optional second digit, or "100".
text = '12'
ret = re.match(r"0$|[1-9]\d?$|100$", text)
# The question mark means the preceding item appears once or not at all.

# IV. Escape characters and raw strings
# NOTE(review): this section was mangled by machine translation; the code
# below is reconstructed from the surrounding explanation -- confirm against
# the original article.
text = 'Money is $222'
# "$" is a regex metacharacter; a leading "\" makes it a literal dollar sign.
ret = re.search(r"\$\d+", text)
print(ret.group())
# Adding "\" removes a character's special meaning, so it becomes an
# ordinary character.
# Alternatively, write the pattern as r"" (a raw string).
# Both regular expressions and Python treat "\" as the escape character, so
# matching a literal backslash with an ordinary string takes four of them;
# a raw string solves this problem.
text = r'Money is \c'
ret = re.search("\\\\c", text)
print(ret.group())
ret = re.search(r"\\c", text)  # raw string
print(ret.group())
# V. The match and search functions, and groups
1. match only matches from the beginning of the string.
2. search scans the whole string, but stops as soon as one match succeeds.
3. Groups:
[Image could not be transferred: the source site may use hotlink protection, so the picture is missing here.]
Result:
Here Insert Picture Description
# VI. Commonly used functions in the re library, described below.

# findall: returns all matches as a list.
text = 'the app is $33 and the sun is $23'
ret = re.findall(r'\$\d+', text)
print(ret)  # note: the result is a list, so .group() is not used here
# sub: replaces the parts matched by the pattern with the replacement string.
text = 'the app is $33 and the sun is $23'
ret = re.sub(r'\$\d+', "2", text)
print(ret)
# A small example: strip the tags from an HTML fragment.
# (The closing ''' after </dd> was missing, and stray ''' followed two of the
# print calls; the quotes are balanced here so the statements actually run.)
text = '''<dd class="job_bt">
        <h3 class="description">职位描述:</h3>
        <div class="job-detail">
        <p>【职位职责】<br> &nbsp;1. &nbsp;负责部门 python 组的管理;<br> &nbsp;2. &nbsp;负责高质量的设计和编码,承担重点、难点的技术攻坚;</p>
        </div>
    </dd>'''
ret = re.sub(r'<.+?>', '', text)
print(ret)
# split: splits the text into a list wherever the pattern matches.
text = 'he is a do%g'
ret = re.split(r'[^a-zA-Z]', text)
print(ret)
# compile
# 1. Pre-compile a pattern to improve efficiency when it is reused.
text = 'he is a  98.32'
r = re.compile(r'\d+\.?\d*')  # raw string avoids the invalid "\d" / "\." escapes
ret = r.search(text)  # call search on the compiled pattern itself
print(ret.group())
# 2. compile also allows a commented, multi-line way of writing a pattern.
#    With re.VERBOSE, whitespace and "#" comments inside the pattern are
#    ignored; the three parts below are: digits before the decimal point,
#    the decimal point, digits after the decimal point.
text = 'he is a  98.32'
r = re.compile(r'''
    \d+ #小数点前
    \.? #小数点
    \d*#小数点后
   ''', re.VERBOSE)
ret = r.search(text)  # call search on the compiled pattern itself
print(ret.group())
Published 12 original articles · won praise 3 · Views 2219

Guess you like

Origin blog.csdn.net/dinnersize/article/details/104415388