(2)Python笔记:内建模块HTMLParser实践

HTMLParser为Python的常用内建模块,使用时经常是继承HTMLParser并重写其方法。
其中常用方法如下:

handle_starttag(tag, attrs)处理开始标签,比如<input type="text" value="3">,tag即为input,attrs为由(属性名, 属性值)元组(tuple)组成的列表(list):[('type', 'text'), ('value', '3')]
handle_endtag(tag)处理结尾标签
handle_data(data)处理标签内容
handle_startendtag(tag, attrs)处理<img src="" />这类标签
handle_comment(data)处理注释内容
#例子
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Demo parser that prints every parsing event it receives."""

    def __init__(self):
        super().__init__()

    @staticmethod
    def _report(label, value):
        """Print one event as ``<label> <value>`` on a single line."""
        print(label, value)

    def handle_starttag(self, tag, attrs):
        """
        Called for an opening tag such as <div>.
        :param tag: tag name
        :param attrs: list of (name, value) tuples for the tag's attributes
        :return: None
        """
        self._report("Encountered a start tag:", tag)
        # The attribute list is echoed on its own line, in its list repr.
        print(attrs)

    def handle_endtag(self, tag):
        """
        Called for a closing tag such as </div>.
        :param tag: tag name
        :return: None
        """
        self._report("Encountered an end tag :", tag)

    def handle_data(self, data):
        """
        Called for text content found between tags.
        :param data: the text
        :return: None
        """
        self._report("Encountered some data  :", data)

    def handle_startendtag(self, tag, attrs):
        """
        Called for a self-closing tag such as <img />.
        :param tag: tag name
        :param attrs: list of (name, value) tuples (not printed)
        :return: None
        """
        self._report("Encountered startendtag :", tag)

    def handle_comment(self, data):
        """
        Called for an HTML comment.
        :param data: text between <!-- and -->
        :return: None
        """
        self._report("Encountered comment :", data)


# Demo driver: run the printing parser over a small document that contains
# every kind of event the class handles (start/end tags, text, a
# self-closing tag and a comment).
sample_markup = (
    '<html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1><img src = "" /><input type="text" value="3"></input>'
    '<!-- comment --></body></html>'
)
parser = MyHTMLParser()
parser.feed(sample_markup)
parser.close()

#输出

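(原文此处的输出截图已丢失,以下为运行上例的预期输出)

Encountered a start tag: html
[]
Encountered a start tag: head
[]
Encountered a start tag: title
[]
Encountered some data  : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
[]
Encountered a start tag: h1
[]
Encountered some data  : Parse me!
Encountered an end tag : h1
Encountered startendtag : img
Encountered a start tag: input
[('type', 'text'), ('value', '3')]
Encountered an end tag : input
Encountered comment :  comment 
Encountered an end tag : body
Encountered an end tag : html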

简单实际用例

从csdn博客视图列表中获取所有自己博文的名字+url

http://blog.csdn.net/yeyinglingfeng?viewmode=contents
主要思路就是利用各个函数的执行顺序

#查看网页源代码后可知所需数据在下面的html中,url在<a>标签中的href中,标题在内容中,因为HTMLParser在检索html时是按照出现的先后顺序触发各个方法的,所以在触发handle_starttag函数后,只要有内容,就会触发handle_data函数,所以可以利用这点来绑定url和title并存入list。PS:我这里多加的锁定可能有点多余。
<h1>
    <span class="link_title"><a href="/yeyinglingfeng/article/details/78332515">
        (1)Python笔记:抓取CSDN博文            
        </a>
    </span>
</h1>
#从csdn博客视图列表中获取所有自己博文的名字+url
#http://blog.csdn.net/yeyinglingfeng?viewmode=contents

#coding:utf-8
from html.parser import HTMLParser
import urllib.request 
import re
import sys

class MyHTMLParser(HTMLParser):
    """Collect the URL and title of each article link on the blog list page.

    A link is collected when it is an ``<a>`` tag whose only attribute is an
    ``href`` containing ``/yeyinglingfeng/article/details/`` and not
    containing ``#comments``. The first non-blank text found inside that
    link becomes its title.

    Attributes:
        articleStr: list of dicts, one per collected link, each with the keys
            ``'url'`` and ``'articleTitle'`` (``''`` when the link has no text).
        check: ``'Y'`` while inside a collected link whose title has not been
            seen yet, otherwise ``'N'``.
        num: number of collected links.
    """

    def __init__(self):
        super().__init__()
        self.articleStr = []
        self.check = 'N'
        self.num = 0

    def handle_starttag(self, tag, attrs):
        """Start a new article record when the tag is a matching ``<a>``.

        :param tag: tag name
        :param attrs: list of (name, value) tuples for the tag's attributes
        """
        if tag != 'a' or len(attrs) != 1:
            return
        name, href = attrs[0]
        # A valueless attribute (<a href>) is reported with value None, which
        # would make the substring tests below raise TypeError.
        if name != 'href' or not href:
            return
        if '/yeyinglingfeng/article/details/' not in href or '#comments' in href:
            return
        # Pre-fill the title so every record has both keys even when the
        # link turns out to contain no text.
        self.articleStr.append({
            'url': 'http://blog.csdn.net' + href,
            'articleTitle': '',
        })
        self.check = 'Y'
        self.num += 1

    def handle_endtag(self, tag):
        """Stop waiting for a title once the link is closed.

        :param tag: tag name
        """
        # Without this reset, text following an empty link would be recorded
        # as that link's title.
        if tag == 'a':
            self.check = 'N'

    def handle_data(self, data):
        """Record the first non-blank text inside a collected link as its title.

        :param data: text content reported by the parser
        """
        if self.check != 'Y':
            return
        title = data.replace('\r\n', '').strip()
        if not title:
            # Only whitespace so far (e.g. before a nested tag); keep waiting.
            return
        self.articleStr[-1]['articleTitle'] = title
        self.check = 'N'

    def handle_startendtag(self, tag, attrs):
        """Ignore self-closing tags (they cannot contain a title)."""
        pass

    def handle_comment(self, data):
        """Ignore comments."""
        pass

def getHtmlInfo(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The URL is echoed to stdout first. The response object is closed as soon
    as the body has been read.

    :param url: address to fetch
    :return: the body as a str
    :raises UnicodeDecodeError: if the body is not valid UTF-8
    """
    print('url:'+url)
    # The context manager closes the response (and its socket) even if
    # reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')

def saveInfo(info, path='d:\\1\\csdnList.txt'):
    """Write *info* to a UTF-8 text file, replacing any existing content.

    Saving is best-effort: a filesystem error is reported on stdout instead
    of being raised, so the caller keeps running.

    :param info: text to write
    :param path: destination file; defaults to the original hard-coded path
    """
    try:
        with open(path,'w',encoding='utf-8') as file_write:
            file_write.write(info)
    except OSError as exc:
        # Only I/O failures are expected here; anything else (including
        # KeyboardInterrupt) should propagate instead of being hidden.
        print('error:failed to write '+str(path)+': '+str(exc))

# Fetch the blog's article list page, extract every article link, save one
# "<title> url:<url>" line per article, then print how many were found.
html=getHtmlInfo("http://blog.csdn.net/yeyinglingfeng?viewmode=contents")
parser = MyHTMLParser()
parser.feed(html)
parser.close()
# .get() keeps the export from failing with KeyError when a collected link
# had no text and therefore no title was recorded for it.
allInfo=''.join(
    each.get('articleTitle','')+' url:'+each['url']+'\n'
    for each in parser.articleStr
)
saveInfo(allInfo)
print(str(parser.num))

保存的txt文件
这里写图片描述

猜你喜欢

转载自blog.csdn.net/yeyinglingfeng/article/details/78342514