Crawling Phoenix News (ifeng.com) with Python: collecting each article's link address, source, time and content, using Selenium for browser automation and requests for fetching the data

import time

import requests
from selenium import webdriver

def grasp(urlT):
    """Crawl the news index at ``urlT`` and append every article to ./news.txt.

    The JSON returned by ``urlT`` must contain a ``data`` list whose entries
    carry ``title``, ``source`` and ``url`` (``newsTime`` is optional and is
    replaced by the string ``'None'`` when absent).  At most the first 29
    entries are processed.  Each article page is opened in Chrome through
    Selenium and the text of the element with class ``text-3zQ3cZD4`` is
    stored under the ``contend`` key of that article's record.

    NOTE(review): ``webdriver.Chrome(<path>)`` and
    ``find_element_by_class_name`` are the Selenium 3 spellings -- confirm
    the installed Selenium version before running.

    Args:
        urlT: URL of the JSON news index.

    Returns:
        The list of scraped records (one dict per entry, in payload order).
        A record has no ``contend`` key when its page could not be read.

    Raises:
        KeyError: if the payload has no ``data`` list or an entry lacks
            ``title``, ``source`` or ``url``.  Errors raised by
            ``requests.get`` propagate unchanged.
    """
    driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')  # Automation tool local test program location
    resAll = []  # all records scraped so far
    try:
        # Parse the payload once; slicing (instead of indexing 0..28) keeps
        # short payloads from raising IndexError.
        items = requests.get(urlT).json()['data'][:29]
        for i, item in enumerate(items):
            # A fresh dict per article: reusing one dict would make every
            # entry of resAll alias the same object and leak the previous
            # article's text into a failed one.
            rest = {
                'title': item['title'],
                'newsTime': item.get('newsTime', 'None'),
                'source': item['source'],
                'url': item['url'],
            }
            print(rest['title'])
            print(rest['newsTime'])
            print(rest['source'])
            print(rest['url'])

            try:
                driver.get(rest['url'])
                time.sleep(4)  # give the page time to render
                contend = driver.find_element_by_class_name('text-3zQ3cZD4').text
                rest['contend'] = str(contend)
                print(contend)
                driver.back()
                time.sleep(6)
            except Exception:
                # Best-effort per article: pages with a different layout are
                # reported and skipped, but Ctrl-C is no longer swallowed.
                print(f'of failure news {i}')
                print('# ----------------------- some formats do not meet --------------------- # ---')

            resAll.append(rest)
            try:
                # Appending per article keeps earlier results on disk even if
                # a later article aborts the crawl.
                with open('./news.txt', 'a+', encoding='utf-8') as f:
                    # str() so a non-string field cannot fail the join.
                    f.write(''.join(str(value) for value in rest.values()) + '\n')
            except (OSError, UnicodeError):
                print('写入失败')
    finally:
        # Always release the browser process, even when the crawl fails.
        driver.quit()
    return resAll

# Run the function-based crawler against the news index URL below.
url = "https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219"
grasp(url)


class Grasp:
    """Class-based crawler: fetch a JSON news index and scrape every article.

    Attributes set by ``__init__``:
        driver: the Selenium Chrome driver used to open article pages.
        resAll: list of all scraped records (one dict per article).
        rest: the record of the article processed most recently.
        res: the ``requests`` response holding the JSON news index.

    ``run`` additionally sets ``url`` (link of the current article) and
    ``contend`` (text of the last page read successfully).

    The browser session in ``driver`` is left open after ``run``; call
    ``driver.quit()`` when finished with the instance.

    NOTE(review): ``webdriver.Chrome(<path>)`` and
    ``find_element_by_class_name`` are the Selenium 3 spellings -- confirm
    the installed Selenium version before running.
    """

    # JSON news index fetched by __init__.
    API_URL = "https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219"
    # File that run() appends every record to.
    OUTPUT_PATH = './news.txt'
    # Seconds to wait after loading a page and after navigating back.
    PAGE_WAIT = 4

    def __init__(self):
        self.driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        self.resAll = []  # all records scraped so far
        self.rest = {}  # record of the most recent article
        self.res = requests.get(self.API_URL)  # target links

    def run(self):
        """Scrape every entry of the fetched index and append it to OUTPUT_PATH.

        Each entry of ``self.res.json()['data']`` must carry ``title``,
        ``source`` and ``url``; ``newsTime`` is optional and becomes the
        string ``'None'`` when absent.  A page that cannot be read is
        reported and its record is stored without a ``contend`` key.

        Raises:
            KeyError: if the payload has no ``data`` list or an entry lacks
                ``title``, ``source`` or ``url``.
        """
        # Parse the payload once instead of on every field access.
        items = self.res.json()['data']
        for i, item in enumerate(items):
            # A fresh dict per article, so entries of resAll never alias
            # each other or inherit the previous article's text.
            self.rest = {
                'title': item['title'],  # Get Title
                'newsTime': item.get('newsTime', 'None'),  # acquisition time
                'source': item['source'],  # Get source
                'url': item['url'],  # link address
            }
            print(self.rest['title'])  # Output Header
            print(self.rest['newsTime'])  # output time
            print(self.rest['source'])  # output source
            print(self.rest['url'])  # output link address

            self.url = item['url']
            try:
                # Open the article link itself (self.url), not a module-level
                # ``url`` that may point at the JSON index.
                self.driver.get(self.url)
                time.sleep(self.PAGE_WAIT)
                self.contend = self.driver.find_element_by_class_name('text-3zQ3cZD4').text  # text under the article tag
                self.rest['contend'] = str(self.contend)  # insert single data
                print(f'of successful news {i}')
                self.driver.back()
                time.sleep(self.PAGE_WAIT)
            except Exception:
                # Best-effort per article: report and continue, without
                # swallowing KeyboardInterrupt the way a bare except would.
                print(f'of failure news {i}')
                print('# ----------------------- Some formats do not meet --------------------- # ---')

            self.resAll.append(self.rest)
            try:
                with open(self.OUTPUT_PATH, 'a+', encoding='utf-8') as f:
                    f.write(f'of {i} news starts')
                    # str() so a non-string field cannot fail the join.
                    f.write(''.join(str(value) for value in self.rest.values()) + '\n')  # write data
                    f.write(f'The first end of the news article {i}')
            except (OSError, UnicodeError):
                print('write failure')

# Run the class-based crawler.
g = Grasp()
g.run()
View Code

The rules for what gets written to the file need to be defined according to your own needs.

I hope this helps everyone.

Guess you like

Origin www.cnblogs.com/superSmall/p/11520883.html