# Splitting text into word tokens with regular expressions: two worked examples.
import os
import re  # regular expressions

# ---------------------------------------------------------------------------
# Example 1: split a sentence into lowercase word tokens.
# ---------------------------------------------------------------------------
mySent = 'This book is the best book.'

# \W matches any character that is not a word character (letter, digit or
# underscore).  The quantifier must be "+" rather than "*": a pattern that can
# match the empty string makes re.split() cut between every single character
# on Python 3.7+, instead of cutting only at the separators.
regEX = re.compile(r'\W+')
list0fTokens = regEX.split(mySent)

# The above can also be written without pre-compiling the pattern:
listOfTokens = re.split(r'\W+', mySent)

# Drop the empty strings produced by leading/trailing separators and
# lowercase every remaining token.
list1fTokens = [tok.lower() for tok in list0fTokens if len(tok) > 0]

# The comprehension above, written out in full:
list1fTokens = []
for tok in list0fTokens:
    if len(tok) > 0:
        # .lower() converts the whole string to lowercase;
        # .upper() converts the whole string to uppercase.
        list1fTokens.append(tok.lower())

print(list1fTokens)
# Result: ['this', 'book', 'is', 'the', 'best', 'book']

# ---------------------------------------------------------------------------
# Example 2: split the text of one e-mail into tokens.
# ---------------------------------------------------------------------------
# This part depends on a data directory that exists only on the author's
# machine, so it runs only when the file is executed as a script.
if __name__ == '__main__':
    # Raw string: the backslashes are path separators, not escape sequences.
    os.chdir(r'E:\机器学习实战代码\machinelearninginaction\Ch04\email\ham')
    regEX = re.compile(r'\W+')

    # The context manager closes the file as soon as it has been read.
    with open('6.txt') as emailText:
        emailText1 = regEX.split(emailText.read())

    # Keeping only tokens longer than 3 characters removes the short
    # leftover fragments of URLs.
    emailText2 = [tok.lower() for tok in emailText1 if len(tok) > 3]
    print(emailText2)
    # Result quoted in the original article:
    # ['hello', 'since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least',
    #  ......'changes', 'to', 'google', 'groups']
    # NOTE(review): that quoted output contains tokens of 3 characters or
    # fewer ('you', 'an', 'of', ...), which the `> 3` filter above would
    # discard; it was presumably produced with a lower threshold - confirm.
正则切分解析文本数据文件
猜你喜欢
转载自blog.csdn.net/xiaobaicai4552/article/details/79404567
今日推荐
周排行