Scraping ifeng.com (Phoenix) news with Python: titles, link addresses, sources, publish times, and article content, using Selenium for browser automation and requests to fetch the data.
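Before writing the full scraper, it helps to confirm what the list API actually returns. A minimal exploratory sketch, assuming the endpoint and field names used in the scripts below (the response format belongs to ifeng and may change):

import requests

api = "https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219"
items = requests.get(api).json()['data']  # the API returns a JSON object with a 'data' list of news entries
print(len(items))
print(items[0].keys())  # expect fields such as title, newsTime, source, url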

import requests
from selenium import webdriver
import time

def grasp(urlT):
    driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')  # local path of the chromedriver executable
    resAll = []  # stores all scraped records
    res = requests.get(urlT)
    data = res.json()['data']
    for i in range(len(data)):
        rest = {}  # one record; a fresh dict each iteration so entries in resAll stay independent
        print(data[i]['title'])
        try:
            print(data[i]['newsTime'])
        except KeyError:
            print('None')
        print(data[i]['source'])
        print(data[i]['url'])
        rest['title'] = data[i]['title']
        try:
            rest['newsTime'] = data[i]['newsTime']
        except KeyError:
            rest['newsTime'] = 'None'
        rest['source'] = data[i]['source']
        url = data[i]['url']
        rest['url'] = url
        try:
            driver.get(url)
            time.sleep(4)
            contend = driver.find_element_by_class_name('text-3zQ3cZD4').text  # article body element on the ifeng detail page
            rest['contend'] = str(contend)
            print(contend)
            driver.back()
            time.sleep(6)
        except Exception:
            print(f'News item {i} failed')
            print('#-----------------------page layout did not match------------------------#')
        resAll.append(rest)
        with open('./news.txt', 'a+', encoding='utf-8') as f:
            try:
                f.write(''.join(resAll[i].values()) + '\n')
            except Exception:
                print('Write failed')

url = "https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219"
grasp(url)
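Joining the dict values into a single line drops the field names, which makes the output hard to re-parse later. A small alternative sketch that writes one JSON object per record (assuming the same rest dict built inside the loop above):

import json

with open('./news.jsonl', 'a+', encoding='utf-8') as f:
    f.write(json.dumps(rest, ensure_ascii=False) + '\n')  # one JSON line per record; ensure_ascii=False keeps Chinese readable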
46 
47 
class Grasp:

    def __init__(self):
        self.driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        self.resAll = []  # stores all scraped records
        self.res = requests.get("https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219")  # target API endpoint

    def run(self):
        data = self.res.json()['data']
        for i in range(len(data)):
            rest = {}  # one record; a fresh dict each iteration so entries in resAll stay independent
            print(data[i]['title'])  # print the title
            try:
                print(data[i]['newsTime'])  # print the publish time
            except KeyError:
                print('None')
            print(data[i]['source'])  # print the source
            print(data[i]['url'])  # print the link address
            rest['title'] = data[i]['title']  # store the title
            try:
                rest['newsTime'] = data[i]['newsTime']  # store the publish time
            except KeyError:
                rest['newsTime'] = 'None'
            rest['source'] = data[i]['source']  # store the source
            self.url = data[i]['url']
            rest['url'] = self.url  # store the link address
            try:
                self.driver.get(self.url)  # was driver.get(url), which pointed at the global API URL instead of the article link
                time.sleep(4)
                self.contend = self.driver.find_element_by_class_name('text-3zQ3cZD4').text  # text of the article body element
                rest['contend'] = str(self.contend)  # store the article text
                print(f'News item {i} succeeded')
                self.driver.back()
                time.sleep(4)
            except Exception:
                print(f'News item {i} failed')
                print('#-----------------------page layout did not match------------------------#')
            self.resAll.append(rest)
            with open('./news.txt', 'a+', encoding='utf-8') as f:
                try:
                    f.write(f'News item {i} start')
                    f.write(''.join(self.resAll[i].values()) + '\n')  # write the record
                    f.write(f'News item {i} end')
                except Exception:
                    print('Write failed')

g = Grasp()
g.run()
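Both scripts use the Selenium 3 style API. In Selenium 4 the find_element_by_class_name helpers were removed and the chromedriver path is passed through a Service object, so the equivalent calls would look roughly like this (a sketch; article_url stands for a detail-page link taken from the API, and the class name text-3zQ3cZD4 is simply whatever the current ifeng page uses):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(service=Service(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'))
driver.get(article_url)  # article_url: a news link from the API response (hypothetical variable here)
text = driver.find_element(By.CLASS_NAME, 'text-3zQ3cZD4').text  # article body text
driver.quit()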

Some rules (for example, which fields may be missing or which pages fail to load) you will need to define and check yourself.
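For instance, instead of wrapping every field lookup in try/except, a dict .get() with a default handles entries that lack a field such as newsTime (a sketch assuming one item from the API's 'data' list, as in the scripts above):

item = data[i]  # one entry from the API's 'data' list
rest = {
    'title': item.get('title', 'None'),
    'newsTime': item.get('newsTime', 'None'),  # some entries have no newsTime field
    'source': item.get('source', 'None'),
    'url': item.get('url', 'None'),
}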

Hope this helps.


Reposted from www.cnblogs.com/superSmall/p/11520883.html