Crawling the latest national administrative division codes for all provinces from the Ministry of Civil Affairs with Python (requests, BeautifulSoup, lxml)

 

Written for Python 3.6.

On the Ministry of Civil Affairs website, the page structure is not consistent across different years of data. I stepped into quite a few pits because of this, which is why the code ended up rather long.

If this code stops working at some point, please first check carefully whether the page structure has changed again.
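A quick way to test that is a small sanity check before running the full crawl. This is just a sketch of mine (the helper name structure_ok is hypothetical), assuming the list pages still mark their entries with the td.arlisttd cells that the code below relies on:

import requests
from bs4 import BeautifulSoup

def structure_ok(url, selector='td.arlisttd'):
    # Fetch a list page and check that the selector the crawler depends on
    # still matches something; if it does not, the layout has changed again.
    resp = requests.get(url, timeout=200, verify=False)
    soup = BeautifulSoup(resp.text, 'lxml')
    return len(soup.select(selector)) > 0

# structure_ok('http://www.mca.gov.cn/article/sj/xzqh//1980/')  # True while the layout holds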

 

 

 

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 10 14:40:41 2019

@author: Administrator
"""

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

url1 = 'http://www.mca.gov.cn/article/sj/xzqh//1980/'
headers = {'content-type': 'application/json',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}

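# Note (my addition, not in the original post): every requests call below uses
# verify=False, so requests prints an InsecureRequestWarning on each call.
# urllib3 ships with requests and can silence it:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
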
# 1. Collect all the links ===============================================
def f1(url1):
    'All links to the 1980-2018 administrative division codes of the PRC'
    # make the requests call, setting the url and header parameters
    response = requests.get(url1, headers=headers, timeout=200, verify=False)
    soup = BeautifulSoup(response.text, 'lxml')  # page source parsed into a BeautifulSoup object
    _tmp1 = soup.select('td.arlisttd')
    end_1 = []
    for i in _tmp1:
        _a = i.select('a')[0].get('href')
        _b = i.select('a')[0].get('title')[:4]
        end_1.append(['http://www.mca.gov.cn'+_a, _b])
    return end_1

end_2 = []
for i in ['', '?2', '?3']:  # pagination suffixes for the list pages (my reconstruction; verify against the current site)
    end_2 = end_2 + f1(url1 + i)

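# Note (my addition): since the '?2'/'?3' pagination suffixes above are
# reconstructed, it is worth confirming the link count before crawling further.
print(len(end_2))  # should roughly match the number of yearly entries listed on the site
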
def f2(url1='http://www.mca.gov.cn/article/sj/xzqh/2019/'):
    '2019 administrative division codes of the PRC'
    response = requests.get(url1, headers=headers, timeout=200, verify=False)
    soup = BeautifulSoup(response.text, 'lxml')
    _tmp1 = soup.select('td.arlisttd')
    end_1 = []
    for i in _tmp1:
        _a = i.select('a')[0].get('href')
        _b = i.select('a')[0].get('title')[:7]
        end_1.append(['http://www.mca.gov.cn'+_a, _b])
    return end_1

end_2 = end_2 + f2()

# 2. Fetch the data =======================================================
def f3(url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201903/20190300014989.shtml'):
    #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854922.shtml'
    #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854918.shtml'
    #
    response = requests.get(url1, headers=headers, timeout=200, verify=False)
    soup = BeautifulSoup(response.text, 'lxml')
    # some detail pages redirect via an inline "window.location.href" script;
    # pull the real target url out of that script tag
    _txt = soup.select('script')[4].get_text().strip().replace('window.location.href="','').strip('";')
    if _txt[-4:]=='html':
        print('script!')
        url2 = _txt
    else:
        _tmp1 = soup.select('div.artext>div>p>a')
        if len(_tmp1) == 0:
            _tmp1 = soup.select('div#zoom>a')
        url2 = _tmp1[0].get('href')
    print(url2)
    #return url2
    #url2 = 'http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220856.html'
    time.sleep(0.5)
    response = requests.get(url2, headers=headers, timeout=200, verify=False)
    # page source parsed into a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'lxml')
    _tmp1 = soup.select('table>tr[height="19"]')
    end_1 = []
    if len(_tmp1) > 5:
        for i in _tmp1:
            _a = i.select('td')[1].get_text().strip()
            if len(_a) > 15:  # on some data pages the last row is a footnote
                continue
            else:
                _b = i.select('td')[2].get_text().strip()
                end_1.append([_a, _b])
    else:
        _tmp1 = soup.select('table>tr[height="20"]')
        for i in _tmp1:
            _a = i.select('td')[0].get_text().strip()
            if len(_a) > 15 or _a == '行政区划代码':  # footnote row, or the header row ('行政区划代码' = "administrative division code")
                continue
            else:
                _b = i.select('td')[1].get_text().strip()
                end_1.append([_a, _b])

    return end_1

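# Note (my addition, a sketch only; get_with_retries is a hypothetical helper):
# the MCA site sometimes drops connections mid-crawl, so a small retry wrapper
# around requests.get can make long runs more robust; f3 above could call this
# instead of requests.get directly.
def get_with_retries(url, n=3, wait=1.0):
    for attempt in range(n):
        try:
            return requests.get(url, headers=headers, timeout=200, verify=False)
        except requests.exceptions.RequestException:
            if attempt == n - 1:  # out of retries: re-raise the last error
                raise
            time.sleep(wait)
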
# loop over every link and fetch its data
end_3 = []  # end_4 = []
for j in range(len(end_2)):
    item = end_2[j]
    if '19' in item[1] or '20' in item[1]:  # keep only entries whose title looks like a year
        print(j, item[0], item[1])
        tmp2 = f3(item[0])
        print('.')
        end_3.extend([[item[1]]+i for i in tmp2])
        #end_4.append(tmp2)
        time.sleep(0.1)

df_result = pd.DataFrame(end_3)
#pd.DataFrame(end_4).to_excel('所有连接.xlsx', index=False)  # '所有连接' = "all links"
df_result.to_excel('地区编码.xlsx', index=False)  # '地区编码' = "region codes"
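
# Note (my addition; the column labels are my own, not from the original post):
# each row of end_3 is [year-label, code, name], so giving the frame headers
# makes the Excel file easier to work with than the bare numbered columns above.
df_result.columns = ['year', 'code', 'name']
df_result.to_excel('地区编码.xlsx', index=False)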

# scratch notes: CSS selectors copied from browser devtools (reconstructed)
'''
#\32 019年5月份县以上行政区划代码_3852 > table > tbody > tr:nth-child(4)
#list_content > div.list_right > div > ul > table > tbody > tr:nth-child(1) > td.arlisttd > a
'''

 

Origin: www.cnblogs.com/andylhc/p/11490563.html