This script crawls finance and economics news from the ifeng (Phoenix) website. Because there is a lot of content, the crawl takes a long time and the server may ban the crawler; to prevent this, set a sufficiently long delay with time.sleep().

import requests
from selenium import webdriver
import time


def grasp(urlT, count=29):
    """Crawl finance news listed by the ifeng JSON index API at *urlT*.

    Each of the first *count* index entries is printed, its article page is
    opened with Selenium, and the extracted fields are appended to
    ``./news.txt``.  Pages that only embed further links ("nested" pages)
    have those links collected and crawled in a second pass.

    Parameters
    ----------
    urlT : str
        URL of the ifeng news-index API; the response JSON is expected to
        contain a ``'data'`` list of article records.
    count : int, optional
        Maximum number of index entries to process (default 29, matching the
        original hard-coded loop).

    Returns
    -------
    None
    """
    # Selenium 3 style positional executable_path to a local chromedriver.
    # NOTE(review): the find_element_by_* API used below was removed in
    # Selenium 4 — this script assumes Selenium 3.
    driver = webdriver.Chrome(
        r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    resAll = []   # one dict per crawled article
    urls = []     # links harvested from "nested" pages, for the second pass
    try:
        # Parse the JSON once instead of calling res.json() per field access.
        data = requests.get(urlT).json()['data']
        for i in range(min(count, len(data))):
            item = data[i]
            # Fresh dict each iteration: reusing one shared dict would make
            # every resAll entry alias the last article.
            rest = {}
            print(f'第{i+1}条新闻开始')
            print(item['title'])
            rest['title'] = item['title']
            # Some entries have no 'newsTime' field.
            rest['newsTime'] = item.get('newsTime', 'None')
            print(rest['newsTime'])
            print(item['source'])
            rest['source'] = item['source']
            url = item['url']
            rest['url'] = url
            try:
                driver.get(url)
                time.sleep(4)  # long pause so the server does not ban us
                rest['contend'] = str(
                    driver.find_element_by_class_name('text-3zQ3cZD4').text)
                print(f'第{i+1}条新闻结束')
                time.sleep(6)
            except Exception:
                # No normal article body: this is a "nested" index page, so
                # collect the links it contains for the second pass.
                rest['contend'] = 'nested'
                time.sleep(6)
                paras = driver.find_elements_by_xpath(
                    "//p[@class='text-3YbAxaNR']")      # number of link rows
                links = driver.find_elements_by_xpath(
                    "//p[@class='text-3YbAxaNR']/a")    # all links on the page
                for j in range(len(paras)):
                    try:
                        href = links[j].get_attribute('href')
                        if href:
                            urls.append(str(href))
                        print(urls)
                    except Exception:
                        # Fall back to printing the page headline.
                        print(driver.find_element_by_class_name(
                            'topic-3bY8Hw-9').text)
            resAll.append(rest)
            with open('./news.txt', 'a+', encoding='utf-8') as f:
                try:
                    # str() each value: join would fail on non-string fields.
                    f.write(''.join(str(v) for v in rest.values()) + '\n')
                except Exception:
                    print('写入失败')
        # Second pass: crawl the links harvested from nested pages.
        print(urls)
        for link in urls:
            try:
                driver.get(link)
                rest = {}
                rest['title1'] = driver.find_element_by_class_name(
                    'topic-3bY8Hw-9').text
                rest['source1'] = driver.find_element_by_class_name(
                    'source-2pXi2vGI').text
                rest['newsTime1'] = driver.find_element_by_xpath(
                    '//p[@class="time-hm3v7ddj"]/span').text
                rest['contend1'] = driver.find_element_by_class_name(
                    'text-3zQ3cZD4').text
                resAll.append(rest)
                time.sleep(4)
                with open('./news.txt', 'a+', encoding='utf-8') as f:
                    time.sleep(5)
                    f.write(''.join(str(v) for v in rest.values()) + '\n')
            except Exception:
                print('too much content, the server is prohibited')
    finally:
        driver.quit()  # the original leaked the browser process
 74  
75  
if __name__ == '__main__':
    # Guarded so importing this module does not trigger a crawl.
    url = "https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219"  # finance index API
    t = grasp(url)  # grasp returns None; kept for parity with the original script
View Code

Nested retrieval — following the links embedded in index pages — has been implemented,

so the script can be used directly.

Guess you like

Origin www.cnblogs.com/superSmall/p/11528066.html