16、爬取知乎大v张佳玮的文章“标题”、“摘要”、“链接”,并存储到本地文件

爬取知乎大v张佳玮的文章“标题”、“摘要”、“链接”,并存储到本地文件

 1 #   爬取知乎大v张佳玮的文章“标题”、“摘要”、“链接”,并存储到本地文件
 2 #   URL  https://www.zhihu.com/people/zhang-jia-wei/posts
 3 
 4 import requests
 5 import time
 6 import openpyxl
 7 import csv
 8 
 9 headers = {
10     'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
11     'accept-language': 'zh-CN,zh;q=0.9',
12     'cache-control': 'max-age=0',
13     'cookie':'__DAYU_PP=iJb63REJnnjIMmBvzNMV65ab0a6aae4f; q_c1=d75d908a13c44b95bd75f27578ad2088|1521641428000|1521641428000; _zap=bec28151-809b-4936-971a-d18f5255add0; tgw_l7_route=f2979fdd289e2265b2f12e4f4a478330; _xsrf=wQDRNSLBlRv3aimMzhUNyqg1BpLUnWAr; d_c0="ABDmoGi2RQ-PTp5SSFyQvDgz_QEjeQfCFgk=|1555156366"; capsion_ticket="2|1:0|10:1555156366|14:capsion_ticket|44:MTBhN2FkYjYyNWEyNDFjYWJiYTk2N2E1YTA1NDE4OTk=|084e15694c6993269b3aab564e9ea5d7983782f4b37dbc4537e000aa7b081901"',
14     'upgrade-insecure-requests': '1',
15     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
16 }
17 
18 res = requests.get('https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20&sort_by=created',headers=headers)
19 
20 totals = res.json()['paging']['totals']
21 num = 0
22 
23 #   excel 表头部分
24 
25 wb = openpyxl.Workbook()
26 sheet = wb.active
27 sheet.title = '张佳玮的文章'
28 sheet['A1'] = '编号'
29 sheet['B1'] = '标题'
30 sheet['C1'] = '创建时间'
31 sheet['D1'] = '链接'
32 sheet['E1'] = '摘要'
33 
34 #   csv 表头部分
35 
36 with open('zhihu.csv','w',newline='',encoding='utf-8') as csv_file:
37     writer = csv.writer(csv_file)
38     writer.writerow(['编号','标题','创建时间','链接','摘要'])
39 
40     for offset in range(0,totals,20):
41         res = requests.get('https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20&sort_by=created'.format(offset),headers=headers)
42         html = res.json()
43         items = res.json()['data']
44         for item in items:
45             num = num + 1
46             print(num,end='\t')
47             timeArray = time.localtime(item['created'])
48             print(time.strftime("%Y-%m-%d %H:%M:%S",timeArray),end='\t')
49             print(item['title'])
50 
51             #   excel 内容部分
52             sheet.append([num,item['title'],time.strftime("%Y-%m-%d %H:%M:%S",timeArray),item['url'],item['excerpt']])
53 
54             #   csv 内容部分
55             writer.writerow([num,item['title'],time.strftime("%Y-%m-%d %H:%M:%S",timeArray),item['url'],item['excerpt']])
56 
57 #   excel 保存到文件
58 wb.save('zhihu.xlsx')
 
下面截图是有一次只爬了一页的结果
 

猜你喜欢

转载自www.cnblogs.com/www1707/p/10720645.html