
# Ministry of Civil Affairs website crawler

**Today's goal**: crawl the latest county-level-and-above administrative division codes from the Ministry of Civil Affairs website (mca.gov.cn).

```python
import re

import requests
import pymysql
from lxml import etree


class Government(object):

    def __init__(self):
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/75.0.3770.100 Safari/537.36'
        }
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', database='govdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    # Extract the second-level page link (a "false" link that only redirects)
    def get_false_link(self):
        # xpath: //a[@class="artitlelist"]
        html = requests.get(url=self.one_url,
                            headers=self.headers).content.decode('utf-8', 'ignore')
        # Parse the list page
        parse_html = etree.HTML(html)
        a_list = parse_html.xpath('//a[@class="artitlelist"]')
        for a in a_list:
            # title = a.xpath('./@title')[0]
            title = a.get('title')
            # Keep only the link whose title mentions the
            # county-level-and-above administrative division codes
            if title and re.findall('.*以上行政区划代码', title, re.S):
                two_false_link = 'http://www.mca.gov.cn' + a.get('href')
                return two_false_link

    # Extract the real second-level page link (the page that holds the data)
    def get_true_link(self):
        # Fetch the response content of the false link
        false_link = self.get_false_link()
        html = requests.get(url=false_link, headers=self.headers).text
        # The response only contains a JavaScript redirect;
        # match the real data link out of it
        pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
        real_link = pattern.findall(html)[0]
        # Incremental crawling: look real_link up in the version table.
        # Present: the data is already the latest. Absent: crawl the data.
        sel = 'select * from version where link="{}"'.format(real_link)
        self.cursor.execute(sel)
        if self.cursor.fetchall():
            # Link already recorded, no need to fetch the data again
            print('Data is already up to date')
        else:
            # First time: crawl the data
            self.get_data(real_link)
            # Then record real_link in the version table
            ins = 'insert into version values(%s)'
            self.cursor.execute(ins, [real_link])
            self.db.commit()

    # The function that actually extracts the data
    def get_data(self, real_link):
        html = requests.get(url=real_link, headers=self.headers).text
        # Base xpath: //tr[@height="19"]
        parse_html = etree.HTML(html)
        tr_list = parse_html.xpath('//tr[@height="19"]')
        for tr in tr_list:
            code = tr.xpath('./td[2]/text()')[0]
            name = tr.xpath('./td[3]/text()')[0]
            print(name, code)

    # Main entry point
    def main(self):
        self.get_true_link()


if __name__ == '__main__':
    spider = Government()
    spider.main()
```
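The crawler assumes a MySQL database named `govdb` with a `version` table already exists; the post itself never shows how they are created. Below is a minimal setup sketch that matches the SQL used in `get_true_link()`. The column name `link` follows the `where link="..."` query in that method, but the exact column type and length are assumptions.

```python
# One-off setup sketch (not shown in the original post).
# Assumption: version has a single VARCHAR column named "link",
# matching 'select * from version where link="..."' above.
import pymysql

conn = pymysql.connect(host='localhost', user='root',
                       password='123456', charset='utf8')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS govdb CHARACTER SET utf8')
cursor.execute('USE govdb')
# One row per data page that has already been crawled
cursor.execute('CREATE TABLE IF NOT EXISTS version (link VARCHAR(255))')
conn.commit()
conn.close()
```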

 
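The "false link / real link" split exists because the article page reached from the list does not contain the data table itself, only a JavaScript redirect of the form `window.location.href="..."`. Here is a small self-contained illustration of how that redirect is matched; the HTML snippet below is invented for the example and is not actual mca.gov.cn output.

```python
import re

# Invented example of the kind of body the "false" link returns
html = '<script>window.location.href="http://www.mca.gov.cn/example/data.html";</script>'

pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
real_link = pattern.findall(html)[0]
print(real_link)  # http://www.mca.gov.cn/example/data.html
```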
