前情提要:手上有一份 HTML 报告,需要抓取其中的数据。该 HTML 文件内容较多,重复的标签也很多,而且文件本身不能修改,因此很难按标签定位。所以选择按文本内容定位:用循环加条件判断,把需要的数据放入一个列表(list)里。
代码呈现:
from HTMLParser import HTMLParser
import HTMLParser
class TitleParser(HTMLParser.HTMLParser):
    """Collect the text chunks found inside ``<title>`` and ``<body>``.

    After ``feed()`` has been called, ``data`` holds every text chunk the
    parser reported while one of the tags in ``handledtags`` was open, in
    document order.  Whitespace-only chunks are kept as well; filtering is
    left to the caller.
    """

    def __init__(self):
        # Explicit base-class call (rather than super()) because the
        # base is referenced through the ``HTMLParser`` module name.
        HTMLParser.HTMLParser.__init__(self)
        # Tags whose text content should be collected.
        self.handledtags = ['title', 'body']
        # Name of the handled tag currently open, or None when outside.
        self.processing = None
        # Collected text chunks, in the order the parser delivered them.
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Start collecting when a handled tag opens; ignore other tags."""
        if tag not in self.handledtags:
            return
        # A handled tag opening while another is still open simply
        # replaces the currently tracked tag.
        self.processing = tag

    def handle_data(self, data):
        """Store *data* if it was found inside a handled tag."""
        if not self.processing:
            return
        self.data.append(data)

    def handle_endtag(self, tag):
        """Stop collecting once the tracked tag closes.

        End tags of other elements (e.g. ``</p>`` inside ``<body>``) do
        not match ``self.processing`` and therefore leave collection on.
        """
        if tag != self.processing:
            return
        self.processing = None
# Script entry point: parse ``index.html`` and print the text chunks that lie
# between the 'Statistics:' heading and the 'Parameters:' heading.
if __name__ == '__main__':
    # The context manager closes the report file even if read() raises.
    with open('index.html') as fd:
        tp = TitleParser()
        tp.feed(fd.read())

    # Placeholder first element so that ``a[0]`` is always valid before the
    # 'Statistics:' heading has been seen.
    a = ['none']
    for each in tp.data:
        if each == 'Statistics:':
            # Restart the result list at the heading.  The heading also
            # passes the filter further down, so it ends up in the list
            # twice; this is kept on purpose to preserve the output.
            a = [each]
        if each == 'Parameters:':
            # The next section starts here; nothing after it is wanted.
            break
        # Once inside the statistics section, keep every chunk that does
        # not contain a newline (drops the layout whitespace between tags).
        if a[0] == 'Statistics:' and ('\n' not in each):
            a.append(each)
    # With a single argument the parenthesised form prints the same text on
    # Python 2 as ``print a`` and is also valid on Python 3.
    print(a)
运行结果(注意 'Statistics:' 出现了两次:它先被用作列表的初始元素,随后在同一次循环中又被 append 了一次):
['Statistics:', 'Statistics:', 'Mean TTFF:', ' 37.048 ', 'Standard Deviation TTFF:', ' 19.634 ', 'CEP50:', ' 0.522 ', 'CEP95:', ' 1.515 ']