解析库的使用[爬取知乎发现]

使用XPath

 1 import requests
 2 import json
 3 from lxml import etree
 4 from urllib import parse
 5 
 6 url = 'https://www.zhihu.com/explore'
 7 headers = {
 8     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
 9 }
10 html = requests.get(url, headers=headers).text
11 # 响应返回的是字符串,解析为HTML DOM模式 text = etree.HTML(html)
12 text = etree.HTML(html)
13 # 返回所有内容的结点位置
14 node_list = text.xpath('//div[@class="explore-feed feed-item"]')
15 items ={}
16 for node in node_list:
17     # xpath返回的列表,这个列表就这一个参数,用索引方式取出来
18     #问题
19     question = node.xpath('.//h2/a')[0].text.replace("\n","")
20     # 作者
21     author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
22     #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n","")
23     # 回答
24     answer = node.xpath('.//*[@class="content"]')[0].text
25     #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
26     #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]
27 
28     items = {
29         "question" : question,
30         "author" : author,
31         "answer" : answer,
32     } 
33 
34     with open("explore.json", "a") as f:
35         #f.write(json.dumps(items, ensure_ascii = False).encode("utf-8") + "\n")
36         f.write(json.dumps(items, ensure_ascii = False) + "\n")

猜你喜欢

转载自www.cnblogs.com/wanglinjie/p/9248573.html