Python 正则表达式

Python 正则表达式在抓取网页、分析数据方面有特别重要的作用，下面是一些可供参考的例子：
# -*- coding: utf-8 -*- 
import re

# \d matches a single digit.
# A number in braces after a pattern, e.g. {3}, means "match this pattern
# exactly 3 times".
phoneNumRegex = re.compile(r'\d{3}-\d{3}-\d{4}')
mo = phoneNumRegex.search('My number is 412-555-3232.')
# A parenthesised single-argument print works on both Python 2 and Python 3.
print(mo.group())

# Parentheses create groups: group 1 captures the first three digits,
# group 2 captures the remaining "ddd-dddd" part.
phoneNumRegex1 = re.compile(r'(\d{3})-(\d{3}-\d{4})')
mo = phoneNumRegex1.search('My number is 412-555-3232.')
# groups() returns a tuple with the text of every captured group.
print(mo.groups())

# Parentheses have a special meaning in regular expressions; to match a
# literal parenthesis, escape it with a backslash: \( and \).
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My phone number is (415) 555-4242.')
print(mo.group())

# The pipe | matches one of several alternatives. When more than one
# alternative occurs in the text, search() returns the first occurrence.
heroRegex = re.compile(r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey.')
print(mo1.group())
mo2 = heroRegex.search('Tina Fey and Batman.')
print(mo2.group())

# The ? makes the preceding group optional: (wo)? matches "wo" zero or one
# time, so both "Batman" and "Batwoman" match.
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
print(mo1.group())
mo2 = batRegex.search('The Adventures of Batwoman')
print(mo2.group())

# Optional matching applied to phone numbers: the leading area code
# (three digits plus a dash) may be present or absent.
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = phoneRegex.search('My number is 415-555-4242')
print(mo1.group())
mo2 = phoneRegex.search('My number is 555-4242')
print(mo2.group())

# The star * matches the preceding group zero or more times, so the pattern
# matches no matter how often "wo" is repeated (including not at all).
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search('The Adventures of Batman')
print(mo1.group())
mo2 = batRegex.search('The Adventures of Batwoman')
print(mo2.group())
mo3 = batRegex.search('The Adventures of Batwowowowoman')
print(mo3.group())

# The plus + matches the preceding group one or more times, so "wo" must
# appear at least once.
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batman')
# No "wo" in the text, so search() returns None; print the result itself
# because calling .group() on None would raise AttributeError.
print(mo1)
mo2 = batRegex.search('The Adventures of Batwoman')
print(mo2.group())
mo3 = batRegex.search('The Adventures of Batwowowowoman')
print(mo3.group())

# Braces match a specific number of repetitions: {3} means exactly 3,
# {3,5} means 3 to 5, {,3} means 0 to 3, and {3,} means 3 or more.

# Quantifiers are greedy by default (they match as much as possible).
# Appending ? to the quantifier, e.g. {3,5}?, makes it non-greedy
# (it matches as little as possible).
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo1 = greedyHaRegex.search('HaHaHaHaHa')
print(mo1.group())
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')
mo2 = nongreedyHaRegex.search('HaHaHaHaHa')
print(mo2.group())

# findall() returns every non-overlapping match instead of only the first.
# When the pattern has no groups, the result is a list of matched strings.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')  # has no groups
print(phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000'))

# When the pattern has groups, the result is a list of tuples with one
# string per group.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')  # has groups
print(phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000'))

# \d+\s\w+ matches one or more digits (\d+), then exactly one whitespace
# character (\s), then one or more letter/digit/underscore characters (\w+).
xmasRegex = re.compile(r'\d+\s\w+')
# Adjacent string literals inside parentheses are concatenated, so the text
# can span two source lines without the second line's indentation leaking
# into it. (A backslash continuation inside the literal turned "7 swans"
# into "7    swans", which \s can no longer match.)
# "  doves" has no digits and "1partridge" has no whitespace, so neither
# of them is matched.
xmasText = ('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 '
            'swans, 6 geese, 5 rings, 4 birds, 3 hens,  doves, 1partridge')
print(xmasRegex.findall(xmasText))

# Square brackets define a custom character class; this one matches every
# vowel in either case. (A range-based class such as [a-zA-Z0-9] would match
# all lowercase letters, uppercase letters and digits.)
vowelRegex = re.compile(r'[aeiouAEIOU]')
print(vowelRegex.findall('RoboCop eats baby food. BABY FOOD.'))
# A caret right after the opening bracket negates the class: this matches
# every character that is NOT a vowel, including spaces and punctuation.
consonantRegex = re.compile(r'[^aeiouAEIOU]')
print(consonantRegex.findall('RoboCop eats baby food. BABY FOOD.'))

# The caret ^ anchors a match to the start of the text and the dollar sign $
# anchors it to the end.
beginsWithHello = re.compile(r'^Hello')
print(beginsWithHello.search('Hello world!').group())
# The text does not start with "Hello", so search() returns None.
print(beginsWithHello.search('He said hello.'))

# r'\d\d$' matches only when the text ends with two digits (0-9).
endsWithNumber = re.compile(r'\d\d$')
print(endsWithNumber.search('Your number is 423').group())

# Combining both anchors, r'^\d+$' matches only text that consists entirely
# of digits from start to end.

# sub() returns a copy of the text in which every match of the pattern is
# replaced by the given replacement string.
namesRegex = re.compile(r'Agent \w+')
print(namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.'))

# The dot . is a wildcard that matches any single character except a
# newline. Note that "flat" only yields "lat": the dot matches one character.
atRegex = re.compile(r'.at')
print(atRegex.findall('The cat in the hat sat on the flat mat.'))

# Dot-star (.*) matches any run of characters (zero or more).
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
# findall() is used here, so mo is a list of (first, last) tuples rather
# than a match object.
mo = nameRegex.findall('First Name: Al Last Name: Sweigart')
print(mo)

# Non-greedy matching: .*? stops at the first closing angle bracket.
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
print(mo.group())
# Greedy matching: .* extends to the last closing angle bracket.
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

# Sample text shared by both examples. It is built from adjacent literals
# inside parentheses so that no source indentation ends up in the string
# (a backslash continuation inside the literal injected four stray spaces
# before the second "\n").
directives = ('Serve the public trust.\nProtect the innocent.'
              '\n Uphold the law.')

# By default the dot matches everything except a newline, so .* stops at the
# end of the first line.
newline = re.compile(r'.*')
print(newline.search(directives).group())

# With re.DOTALL the dot also matches newline characters, so .* spans the
# whole text.
newline = re.compile(r'.*', re.DOTALL)
print(newline.search(directives).group())


# Case-insensitive matching: pass re.I (short for re.IGNORECASE) as the
# second argument to re.compile().
robocop = re.compile(r'robocop', re.I)
print(robocop.search('RoboCop is part man, part machine, all cop.').group())

# re.VERBOSE makes complex patterns manageable: whitespace, newlines and
# "#" comments inside the pattern are ignored by the regex compiler.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code, optionally in parentheses
    (\s|-|\.)?                      # optional separator
    \d{3}                           # first 3 digits
    (\s|-|\.)                       # separator
    \d{4}                           # last 4 digits
    (\s*(ext|x|ext\.)\s*\d{2,5})?   # optional extension; the dot is escaped
                                    # so only a literal period is accepted
    )''', re.VERBOSE)
猜你喜欢