Prerequisites
- python2.7
- html2text
- MarkdownPad (any editor works, as long as it supports Markdown)
- Basic packet-capture skills (to inspect the site's network requests)
- Most importantly, a proxy, because Zhihu has started banning IPs
Principle
The principle is simple: request the page, take the BODY section of the response, rebuild an HTML file from it, and use the html2text module to convert that HTML into a Markdown file; finally, post-process the images and headings so that they conform to Markdown format. At the moment the script is mainly used for Zhihu.
Code
Fetching Zhihu answers
The code covers two main usage scenarios: first, fetching and converting one given answer; second, fetching all the answers to a question and converting them one by one, where quality can be controlled by the number of upvotes an answer has received.
Fetching the data for a single answer
url:https://www.zhihu.com/question/27621722/answer/48658220
'''
(The first number is the question ID; the second is the answer ID.)
'''
Fetching this data takes two steps. The first step requests the URL above, which provides the answer body and the upvote count; the second step requests the following API endpoint:
https://www.zhihu.com/api/v4/answers/48658220
Why two steps? Because the data returned by this API endpoint does not contain the complete answer text, so both requests are needed.
Fetching the data for all answers to a question
This data can be obtained very simply, from the following API endpoint:
https://www.zhihu.com/api/v4/questions/27621722/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_collapsed%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=3
The response is JSON, so the fields are easy to extract. There is one catch: the answer body inside this data is only an HTML fragment, not a complete HTML file, so you need to look at its structure.
Saved fields
- author_name: the answerer's user name
- answer_id: the answer ID
- question_id: the question ID
- question_title: the question title
- vote_up_count: the number of upvotes
- create_time: the creation time
- content: the answer body
Main script: zhihu.py
import os
import re
import json
import requests
import html2text
from parse_content import parse
'''
More Python learning materials and source-code tutorials are available for free in group 821460695
'''
"""
just for study and fun
Talk is cheap
show me your code
"""
class ZhiHu(object):
    """Download Zhihu answers and save each one as a Markdown file.

    Two entry points are provided:

    * ``get_single_answer_content(answer_url)`` saves one answer.
    * ``get_all_answer_content(question_id)`` walks the paginated answers API
      of a question and saves every answer it returns.

    The code targets Python 2.7 (the version the surrounding article uses) and
    is written with syntax that is also valid on Python 3.
    """

    def __init__(self, proxy_ip='your proxy ip'):
        """Create a downloader.

        :param proxy_ip: ``host[:port]`` of the HTTP proxy every request is
            routed through. The default is the original placeholder and must
            be replaced with a real proxy; pass ``None`` or ``''`` to connect
            directly.
        """
        self.request_content = None
        self.proxy_ip = proxy_ip

    @staticmethod
    def _to_https(url):
        """Return ``url`` with a leading ``http://`` upgraded to ``https://``.

        Only the scheme is touched; URLs that are already https (or contain
        "http" elsewhere) are returned unchanged.
        """
        prefix = 'http://'
        if url.startswith(prefix):
            return 'https://' + url[len(prefix):]
        return url

    @staticmethod
    def _safe_name(name):
        """Replace path separators in ``name`` so it is a single path component."""
        return re.sub(r'[\\/]', '_', name)

    @staticmethod
    def _tidy_markdown(text):
        """Clean up the Markdown produced by html2text.

        * strips the padding inside ``** bold **`` and ``_ italic _`` spans,
        * removes empty ``_ _`` emphasis,
        * puts a blank line after every ``![](...)`` image.

        Substitutions are applied only to the matched spans, so identical text
        elsewhere in the document is left untouched.
        """
        def strip_inside(marker):
            def repl(match):
                stripped = match.group(1).strip()
                if not stripped:
                    # Nothing but whitespace between the markers: leave as is.
                    return match.group(0)
                return marker + stripped + marker
            return repl

        # Headings / bold text (may span several lines, hence re.S).
        text = re.sub(r'\*\*(.*?)\*\*', strip_inside('**'), text, flags=re.S)
        text = re.sub(r'_(.*)_', strip_inside('_'), text)
        text = text.replace('_ _', '')
        # Images: make sure each one is followed by a paragraph break.
        text = re.sub(r'!\[\]\((?:.*?)\)',
                      lambda match: match.group(0) + '\n\n', text)
        return text

    def request(self, url, retry_times=10):
        """GET ``url`` and return the raw response body.

        The request is retried up to ``retry_times`` times; every attempt and
        every error is printed.

        :returns: the response body, or ``None`` when every attempt failed.
        """
        header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
            'Host': 'www.zhihu.com'
        }
        # Both schemes go through the same HTTP proxy; no proxy when unset.
        proxies = None
        if self.proxy_ip:
            proxies = {
                'http': 'http://%s' % self.proxy_ip,
                'https': 'http://%s' % self.proxy_ip
            }
        times = 0
        while retry_times > 0:
            times += 1
            print('request %s, times: %d' % (url, times))
            try:
                self.request_content = requests.get(
                    url, headers=header, proxies=proxies, timeout=10).content
            except Exception as e:
                # Best-effort retry loop: report the failure and try again.
                print(e)
                retry_times -= 1
            else:
                return self.request_content
        return None

    def get_all_answer_content(self, question_id, flag=2):
        """Save every answer of ``question_id`` as Markdown.

        Pages of the answers API are followed through ``paging.next`` until a
        page reports ``paging.is_end``; the answers of that final page are
        saved as well.

        :param question_id: numeric id of the question (string or int).
        :param flag: forwarded to ``parse``; any value other than 1 selects
            the API-record parsing path.
        :raises ValueError: when a page cannot be fetched.
        """
        # offset=0 so that the listing starts with the first answer.
        first_url_format = 'https://www.zhihu.com/api/v4/questions/{}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_collapsed%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=0'
        first_url = first_url_format.format(question_id)
        response = self.request(first_url)
        if not response:
            raise ValueError('request failed, quit......')
        contents = json.loads(response)
        print((contents.get('paging') or {}).get('is_end'))
        while True:
            for content in contents.get('data') or []:
                self.parse_content(content, flag)
            paging = contents.get('paging') or {}
            next_page_url = paging.get('next')
            # A page without paging information is treated as the last one.
            if paging.get('is_end', True) or not next_page_url:
                break
            response = self.request(self._to_https(next_page_url))
            if not response:
                raise ValueError('request failed, quit......')
            contents = json.loads(response)

    def get_single_answer_content(self, answer_url, flag=1):
        """Save the single answer at ``answer_url`` as Markdown.

        The answer page itself and the ``/api/v4/answers/<id>`` record are both
        fetched and handed to ``parse`` together.

        :param answer_url: ``https://www.zhihu.com/question/<qid>/answer/<aid>``.
        :param flag: forwarded to ``parse``; 1 selects the single-answer path.
        :raises ValueError: when the URL has an unexpected shape or a request
            fails.
        """
        match = re.search(
            r'https://www\.zhihu\.com/question/(\d+)/answer/(\d+)', answer_url)
        if match is None:
            raise ValueError('not a zhihu answer url: %s' % answer_url)
        answer_id = match.group(2)

        all_content = {}
        html_content = self.request(answer_url)
        if not html_content:
            raise ValueError('request failed, quit......')
        all_content['main_content'] = html_content

        ajax_answer_url = 'https://www.zhihu.com/api/v4/answers/{}'.format(answer_id)
        ajax_content = self.request(ajax_answer_url)
        if not ajax_content:
            raise ValueError('request failed, quit......')
        all_content['ajax_content'] = json.loads(ajax_content)

        self.parse_content(all_content, flag)

    def parse_content(self, content, flag=None):
        """Parse one fetched answer with ``parse`` and write it out as Markdown."""
        data = parse(content, flag)
        self.transform_to_markdown(data)

    def transform_to_markdown(self, data):
        """Write one parsed answer to ``<question_title>/<file>.md``.

        ``data`` is the dict built by ``parse`` with the keys ``content``,
        ``author_name``, ``answer_id``, ``question_id``, ``question_title``,
        ``vote_up_count`` and ``create_time``. The folder is created below the
        current working directory, which is left unchanged so that several
        answers of one question end up side by side.
        """
        content = data['content']
        author_name = data['author_name']
        answer_id = data['answer_id']
        question_id = data['question_id']
        question_title = data['question_title']
        vote_up_count = data['vote_up_count']
        create_time = data['create_time']

        file_name = self._safe_name(
            u'%s--%s的回答[%d].md' % (question_title, author_name, answer_id))
        folder_name = self._safe_name(u'%s' % (question_title))
        if not os.path.isdir(folder_name):
            os.makedirs(folder_name)

        origin_url = u'https://www.zhihu.com/question/{}/answer/{}'.format(question_id, answer_id)
        rule = u'-' * 40 + u'\n'
        header = u''.join([
            rule,
            u'## 本答案原始链接: ' + origin_url + u'\n',
            u'### question_title: ' + question_title + u'\n',
            u'### Author_Name: ' + author_name + u'\n',
            u'### Answer_ID: %d' % answer_id + u'\n',
            u'### Question_ID: %d' % question_id + u'\n',
            u'### VoteCount: %s' % vote_up_count + u'\n',
            u'### Create_Time: ' + create_time + u'\n',
            rule,
        ])

        # NOTE(review): the decode call is kept exactly as in the original; it
        # is assumed to yield the answer HTML as text for whatever object
        # parse() stored under 'content' - confirm against the bs4 version used.
        text = self._tidy_markdown(html2text.html2text(content.decode('utf-8')))

        # Text is handled as unicode throughout and encoded once on output.
        with open(os.path.join(folder_name, file_name), 'wb') as f:
            f.write((header + text).encode('utf-8'))
if __name__ == '__main__':
    # Demo run: fetch one answer and store it as a Markdown file.
    answer_url = 'https://www.zhihu.com/question/27621722/answer/105331078'
    ZhiHu().get_single_answer_content(answer_url)
    # To save every answer of a question instead, use:
    # ZhiHu().get_all_answer_content('27621722')
The main script, zhihu.py, is very simple: it sends the requests, calls the parsing function to parse the responses, and finally saves the result.
Parsing script: parse_content.py
import time
from bs4 import BeautifulSoup
'''
More Python learning materials and source-code tutorials are available for free in group 821460695
'''
def html_template(data):
    """Wrap an HTML fragment in a minimal, well-formed HTML document.

    :param data: the fragment to place inside ``<body>`` (the answer body
        returned by the API); it is inserted with ``%s`` formatting.
    :returns: the complete document as a string.
    """
    # <body> follows a closed <head> instead of being nested inside it.
    html = '''
<html>
<head>
</head>
<body>
%s
</body>
</html>
''' % (data,)
    return html
def parse(content, flag=None):
    """Extract the answer body and its metadata from fetched Zhihu data.

    :param content: when ``flag == 1``, a dict with ``main_content`` (the raw
        bytes of the answer page) and ``ajax_content`` (the decoded answer API
        record); otherwise one answer record from the answers listing.
    :param flag: 1 selects the single-answer path, anything else the
        listing-record path.
    :returns: a dict with the keys ``content`` (the rebuilt BeautifulSoup
        document), ``author_name``, ``answer_id``, ``question_id``,
        ``question_title``, ``vote_up_count`` and ``create_time``.
    :raises ValueError: when the answer body cannot be located.
    """
    if flag == 1:
        # single
        main_content = content.get('main_content')
        ajax_content = content.get('ajax_content')
        soup = BeautifulSoup(main_content.decode("utf-8"), "lxml")
        answer = soup.find("span", class_="RichText CopyrightRichText-richText")
        author_name = ajax_content.get('author').get('name')
        answer_id = ajax_content.get('id')
        question_id = ajax_content.get('question').get('id')
        question_title = ajax_content.get('question').get('title')
        vote_meta = soup.find("meta", itemprop="upvoteCount")
        if vote_meta is not None and vote_meta.has_attr("content"):
            vote_up_count = vote_meta["content"]
        else:
            # NOTE(review): fallback for pages without the meta tag; assumes
            # the API record may carry 'voteup_count' - yields None otherwise.
            vote_up_count = ajax_content.get('voteup_count')
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ajax_content.get('created_time')))
    else:
        # all
        answer_content = content.get('content')
        author_name = content.get('author').get('name')
        answer_id = content.get('id')
        question_id = content.get('question').get('id')
        question_title = content.get('question').get('title')
        vote_up_count = content.get('voteup_count')
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(content.get('created_time')))
        # The API only returns a fragment, so wrap it in a full document.
        content = html_template(answer_content)
        soup = BeautifulSoup(content, 'lxml')
        answer = soup.find("body")

    if answer is None:
        raise ValueError('answer body not found in the fetched content')

    print('%s %s %s %s %s %s' % (author_name, answer_id, question_id,
                                 question_title, vote_up_count, create_time))

    # Not original: adapted from someone else's code.
    # Replace the document body with a fresh one holding only the answer.
    soup.body.extract()
    soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
    soup.body.append(answer)

    # Lazily loaded images keep their real address in data-actualsrc.
    lazy_image_classes = (
        "content_image lazy",
        "origin_image zh-lightbox-thumb lazy",
    )
    for css_class in lazy_image_classes:
        for img in soup.find_all("img", class_=css_class):
            actual_src = img.get("data-actualsrc")
            if actual_src:
                img["src"] = actual_src

    # Drop every <noscript> element from the document.
    for noscript in soup.find_all("noscript"):
        noscript.extract()

    return {
        'content': soup,
        'author_name': author_name,
        'answer_id': answer_id,
        'question_id': question_id,
        'question_title': question_title,
        'vote_up_count': vote_up_count,
        'create_time': create_time,
    }
parse_content.py is mainly responsible for building the new HTML and then parsing it to extract the data.
Test results
Make sure you are connected to the network!
Make sure you are connected to the network!
Make sure you are connected to the network!