xpath使用实例之爬取好段子网好段子代码

haoduanzi.py

#
!/usr/local/bin/python3.7 import urllib.request import urllib.parse from lxml import etree import time def handler_request(url, page): # 创建请求头 headers = { 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15' } # 创建请求 req = urllib.request.Request(url=url, headers=headers) # 发送请求 rep = urllib.request.urlopen(req) # 获取内容 content = rep.read().decode() return content def location_element(tree): # 段子标题 titles = tree.xpath("//div[@class='head']/h2/text()") # print(ret) # 获取喜欢量和不喜欢量 good = tree.xpath("//div[@class='ping x1']/a[1]/span/text()") # print(good) bad = tree.xpath("//div[@class='ping x1']/a[2]/span/text()") # print(bad) for i in range(len(titles)): # print('标题:', titles[i]) # print('喜欢:', good[i]) # print('不喜欢:', bad[i] str1 = '<h3>%s</h3>'%titles[i]+ '\n' + '<b>喜欢:</b>' + '<span>%s</span>'%good[i] + ' ' + '<b>不喜欢:</b>' + '<span>%s</span>'%bad[i] with open('Reptile/duanzi.html', 'a') as stream: stream.write(str1) if __name__ == "__main__": start_page = input('请输入起始页码:') end_page = input('请输入结束页码:') url = 'http://www.haoduanzi.com/category/?1-{}.html' for page in range(int(start_page), int(end_page)+1): url = url.format(page) # print(url) # 创建请求 content = handler_request(url, page) # print(content) # 创建对象 time.sleep(1) tree = etree.HTML(content) # print(tree) # 定位内容 location_element(tree)

使用浏览器查看结果(第一页内容,页数可自己设定):

猜你喜欢

转载自www.cnblogs.com/lxmtx/p/12912184.html