Using lxml XPath in a crawler to extract the title, URL, view count, date, and joke content from the jokeji (笑话集) website

No long preamble; straight to the source code.

from urllib import request, parse
from urllib import error
from lxml import etree
import string
def jokeji(url, beginPage, endPage):
    # Crawl the hot-joke listing pages from beginPage to endPage (inclusive)
    for page in range(beginPage, endPage + 1):
        fullurl = url + "me_page=" + str(page)
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        req = request.Request(fullurl, headers=headers)
        try:
            response = request.urlopen(req)
            resHtml = response.read()
            # The listing page is served in GBK encoding
            resHtml = resHtml.decode("gbk", 'ignore')
            html = etree.HTML(resHtml)
            # Each joke entry on the listing page is wrapped in a <table width="646"> block
            results = html.xpath('//table[@width="646"]')
            for site in results:
                # Title
                title = site.xpath('.//td/a')[0].text
                # View count
                view = site.xpath('.//td')[2].text
                # Date
                date = site.xpath('.//td/span')[0].text
                # Relative URL of the joke's detail page
                jokeurl = site.xpath('.//td[2]/a[@class="main_14"]/@href')[0]
                newjokeurl = "http://www.jokeji.cn/" + jokeurl
                # Percent-encode any non-ASCII characters in the URL; printable ASCII is left as-is
                newjokeurl = parse.quote(newjokeurl, safe=string.printable)
                # print(newjokeurl)
                # Joke content: request the detail page
                headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
                requrl = request.Request(newjokeurl, headers=headers)
                print("Title: %s, URL: %s, Views: %s, Date: %s" % (title, jokeurl, view, date))
                try:
                    jokresponse = request.urlopen(requrl)
                    jokresHtml = jokresponse.read()
                    jokresHtml = jokresHtml.decode("gbk")
                    html = etree.HTML(jokresHtml)
                    # The joke text lives inside <div class="left_up"> under <font face="Verdana">
                    result = html.xpath('//div[@class="left_up"]//font[@face="Verdana"]//text()')
                    for i in result:
                        print(i)

                except Exception:
                    # Skip jokes whose detail page fails to download or decode
                    pass


        except error.URLError as e:
            print(e)

if __name__ == "__main__":
    # Install a global HTTP proxy for all urllib requests (swap in a proxy that works for you)
    proxy = {"http": "118.31.220.3:8080"}
    proxy_support = request.ProxyHandler(proxy)
    opener = request.build_opener(proxy_support)
    request.install_opener(opener)
    beginPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))
    url = "http://www.jokeji.cn/hot.asp?"
    jokeji(url, beginPage, endPage)
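
The listing-page extraction above boils down to etree.HTML plus a handful of XPath expressions. As a quick, self-contained illustration of those calls (the HTML fragment below is invented for demonstration and is not the real jokeji.cn markup), the same kind of expressions can be tried like this:

from lxml import etree

# Made-up fragment mimicking the structure the crawler expects:
# one <table width="646"> per joke, holding a link, a view count and a date.
sample = """
<table width="646">
  <tr>
    <td><a class="main_14" href="/detail/1.htm">A short joke</a></td>
    <td>123</td>
    <td><span>2018-07-19</span></td>
  </tr>
</table>
"""

html = etree.HTML(sample)
for site in html.xpath('//table[@width="646"]'):
    title = site.xpath('.//td/a')[0].text                      # element text
    href = site.xpath('.//td/a[@class="main_14"]/@href')[0]    # attribute value via @href
    date = site.xpath('.//td/span')[0].text
    print(title, href, date)

Running it prints the title, the relative href and the date; in the crawler above, the relative href is then joined with http://www.jokeji.cn/ and percent-encoded with parse.quote before the detail page is requested.

Instead of hard-coding "gbk", the page encoding could also be detected from the raw bytes with chardet, roughly like this (a sketch, not part of the script above):

import chardet
from urllib import request

req = request.Request("http://www.jokeji.cn/hot.asp?",
                      headers={"User-Agent": "Mozilla/5.0"})
raw = request.urlopen(req).read()
detected = chardet.detect(raw)            # e.g. {'encoding': 'GB2312', 'confidence': ...}
text = raw.decode(detected['encoding'] or 'gbk', 'ignore')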


Reposted from blog.csdn.net/lzz781699880/article/details/81111723