Using XPath
"""Scrape the Zhihu "explore" page and save feed items as JSON lines.

Fetches the page HTML with requests, extracts question / author / answer
text from each feed item via lxml XPath, and appends one JSON object per
item to explore.json.
"""
import json

import requests
from lxml import etree
from urllib import parse  # kept from the original tutorial; unused here

URL = 'https://www.zhihu.com/explore'
HEADERS = {
    # Desktop-browser User-Agent: Zhihu blocks the default requests UA.
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}


def fetch_html(url, headers):
    """Return the response body of *url* as text."""
    return requests.get(url, headers=headers).text


def parse_items(html):
    """Parse explore-page HTML and yield one dict per feed item.

    The response is a string, so it is first parsed into an HTML DOM
    tree.  Joining all descendant text nodes (instead of taking
    ``xpath(...)[0].text``) avoids an IndexError when a sub-node is
    missing and keeps text that is split across nested child elements.
    """
    root = etree.HTML(html)
    # Each feed entry lives in a div carrying these two classes.
    for node in root.xpath('//div[@class="explore-feed feed-item"]'):
        question = "".join(node.xpath('.//h2/a//text()')).replace("\n", "")
        author = "".join(
            node.xpath('.//*[@class="author-link-line"]//text()')
        ).replace("\n", "")
        answer = "".join(node.xpath('.//*[@class="content"]//text()')).strip()
        yield {
            "question": question,
            "author": author,
            "answer": answer,
        }


def save_items(items, path="explore.json"):
    """Append each item to *path* as one JSON object per line.

    Writing happens inside the loop so EVERY item is persisted (the
    original wrote once after the loop, keeping only the last item).
    An explicit UTF-8 encoding makes output platform-independent, and
    ensure_ascii=False keeps Chinese text human-readable.
    """
    with open(path, "a", encoding="utf-8") as f:
        for item in items:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    save_items(parse_items(fetch_html(URL, HEADERS)))