Requires Python 3.6.
The Ministry of Civil Affairs website uses inconsistent page structures across different years of data; I ran into many such pitfalls, which is the main reason this code grew so long.
If this code stops working in the future, carefully check whether the page structure has changed again.
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 10 14:40:41 2019

Scraper for the administrative-division codes of the PRC published on the
Ministry of Civil Affairs website (www.mca.gov.cn).  The page structure
differs between years, so link collection and parsing are deliberately
defensive.

@author: Administrator
"""

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

url1 = 'http://www.mca.gov.cn/article/sj/xzqh//1980/'
headers = {
    'content-type': 'application/json',
    'User-Agent': ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) '
                   'Gecko/20100101 Firefox/22.0'),
}


# 1. Collect all article links =============================================
def f1(url1, title_len=4):
    """Return [article_url, title_prefix] pairs from one list page.

    Parameters
    ----------
    url1 : str
        URL of a division-code list page (1980-2018 archive, possibly
        a paginated sub-page).
    title_len : int
        Number of leading characters of the link title to keep
        (4 = just the year, e.g. '2018'; the 2019 page uses 7).

    Returns
    -------
    list of [str, str]
        Absolute article URL and the truncated title.
    """
    response = requests.get(url1, headers=headers, timeout=200, verify=False)
    soup = BeautifulSoup(response.text, 'lxml')
    end_1 = []
    for cell in soup.select('td.arlisttd'):
        link = cell.select('a')[0]
        end_1.append(['http://www.mca.gov.cn' + link.get('href'),
                      link.get('title')[:title_len]])
    return end_1


# The 1980-2018 archive is split over three pages; '' is page 1.
# NOTE(review): the pagination suffixes were garbled in the original
# source -- confirm '?2'/'?3' against the live site.
end_2 = []
for suffix in ['', '?2', '?3']:
    end_2 = end_2 + f1(url1 + suffix)


def f2(url1='http://www.mca.gov.cn/article/sj/xzqh/2019/'):
    """Return [article_url, title_prefix] pairs for the 2019 list page.

    Identical to f1 except that the first 7 title characters are kept
    (year plus month) instead of only the 4-digit year.
    """
    return f1(url1, title_len=7)


end_2 = end_2 + f2()
# 2. Fetch the data ========================================================
def f3(url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201903/20190300014989.shtml'):
    """Scrape one year's [code, name] rows from an article page.

    Each article page redirects to the real table page either via an inline
    ``window.location.href`` script or an in-body link.  The table layout
    differs between years, hence the two row-height-based parsing branches.

    Parameters
    ----------
    url1 : str
        URL of one article page collected by f1/f2.

    Returns
    -------
    list of [str, str]
        [division_code, division_name] pairs for that year.
    """
    response = requests.get(url1, headers=headers, timeout=200, verify=False)
    soup = BeautifulSoup(response.text, 'lxml')

    # Some pages redirect through an inline window.location.href script.
    _txt = (soup.select('script')[4].get_text().strip()
            .replace('window.location.href="', '').strip('";'))
    if _txt.endswith('html'):
        print('script!')
        url2 = _txt
    else:
        # Otherwise the real table page is linked from the article body.
        links = soup.select('div.artext > div > p > a')
        if len(links) == 0:
            links = soup.select('div#zoom > a')
        url2 = links[0].get('href')
    print(url2)

    time.sleep(0.5)  # be polite to the server
    response = requests.get(url2, headers=headers, timeout=200, verify=False)
    soup = BeautifulSoup(response.text, 'lxml')

    end_1 = []
    rows = soup.select('table > tr[height="19"]')
    if len(rows) > 5:
        # Layout A: rows have height 19; code in column 1, name in column 2.
        for row in rows:
            code = row.select('td')[1].get_text().strip()
            if len(code) > 15:
                # On some pages the last row is a footnote -- skip it.
                continue
            name = row.select('td')[2].get_text().strip()
            end_1.append([code, name])
    else:
        # Layout B: rows have height 20; code in column 0, name in column 1.
        rows = soup.select('table > tr[height="20"]')
        for row in rows:
            code = row.select('td')[0].get_text().strip()
            # Skip footnotes and the repeated header row.
            # NOTE(review): the header text was garbled in the original
            # source -- confirm '行政区划代码' against a live page.
            if len(code) > 15 or code == '行政区划代码':
                continue
            name = row.select('td')[1].get_text().strip()
            end_1.append([code, name])
    return end_1


# Fetch the table behind every collected link ------------------------------
end_3 = []
for j, item in enumerate(end_2):
    # Keep only entries whose title looks like a year (19xx / 20xx).
    if '19' in item[1] or '20' in item[1]:
        print(j, item[0], item[1])
        tmp2 = f3(item[0])
        print('.')
        # Prefix every [code, name] row with its year/title.
        end_3.extend([[item[1]] + row for row in tmp2])
        time.sleep(0.1)

df_result = pd.DataFrame(end_3)
df_result.to_excel('地区编码.xlsx', index=False)


# Handy CSS selectors kept from debugging:
# #list_content > div.list_right > div > ul > table > tbody >
#     tr:nth-child(1) > td.arlisttd > a