# 人狠话不多,直接上源码
from urllib import request,parse
from urllib import error
import chardet
from lxml import etree
import csv,string
# Browser-like User-Agent header sent with every request made by jokeji().
_JOKEJI_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
}


def _fetch_gbk_html(page_url):
    """Download *page_url* and return its body parsed with ``etree.HTML``.

    The body is decoded as GBK with undecodable bytes dropped, so a single
    bad byte cannot discard a whole page.  The HTTP response is closed
    before the function returns.

    Raises:
        urllib.error.URLError: if the request itself fails.
    """
    req = request.Request(page_url, headers=_JOKEJI_HEADERS)
    with request.urlopen(req) as response:
        raw = response.read()
    return etree.HTML(raw.decode("gbk", "ignore"))


def jokeji(url, beginPage, endPage):
    """Print the jokes listed on pages ``beginPage`` up to ``endPage - 1``.

    For every page number the listing URL ``url + "me_page=" + str(page)``
    is fetched and each ``<table width="646">`` in it is treated as one
    entry.  The entry's title, link, view count and date are printed, then
    the linked page is fetched and every text node found under
    ``<font face="Verdana">`` inside ``<div class="left_up">`` is printed.

    Args:
        url: Listing URL prefix; ``me_page=<n>`` is appended to it as-is.
        beginPage: First page number to fetch.
        endPage: Page number to stop before (exclusive, like ``range``).

    Returns:
        None.  All results are written to stdout.
    """
    for page in range(beginPage, endPage):
        fullurl = url + "me_page=" + str(page)
        try:
            listing = _fetch_gbk_html(fullurl)
        except error.URLError as e:
            # A listing page that cannot be fetched is reported and skipped.
            print(e)
            continue

        for site in listing.xpath('//table[@width="646"]'):
            try:
                # Title
                title = site.xpath('.//td/a')[0].text
                # View count
                view = site.xpath('.//td')[2].text
                # Date
                date = site.xpath('.//td/span')[0].text
                # Link to the joke page, as written in the listing
                jokeurl = site.xpath('.//td[2]/a[@class="main_14"]/@href')[0]
            except IndexError:
                # A matched table without the expected cells is skipped so
                # that one odd table cannot abort the whole crawl.
                continue

            # Percent-encode only the non-ASCII characters of the link;
            # every printable ASCII character is left untouched.
            newjokeurl = parse.quote("http://www.jokeji.cn/" + jokeurl,
                                     safe=string.printable)
            print("标题:%s,url:%s,浏览数:%s,日期%s" % (title, jokeurl, view, date))

            # Joke content
            try:
                joke_page = _fetch_gbk_html(newjokeurl)
                for text in joke_page.xpath('//div[@class="left_up"]//font[@face="Verdana"]//text()'):
                    print(text)
            except Exception as e:
                # Best effort: a failing joke page must not stop the crawl,
                # but the failure is reported instead of silently dropped.
                print("joke fetch failed: %s (%s)" % (newjokeurl, e))
if __name__ == "__main__":
    # Route every urllib HTTP request made below through a fixed proxy.
    request.install_opener(
        request.build_opener(
            request.ProxyHandler({"http": "118.31.220.3:8080"})
        )
    )

    # Ask for the page range; jokeji() passes both values to range(), so
    # the second number is the page to stop before.
    start_page = int(input("请输入起始页:"))
    stop_page = int(input("请输入终止页:"))

    # jokeji() appends "me_page=<n>" to this prefix for every page.
    jokeji("http://www.jokeji.cn/hot.asp?", start_page, stop_page)