```python
import csv
import re
import threading
import time
import queue as Queue

import requests

g_writecount = 0
exist_url = []
all_url = []
csv_lock = threading.Lock()  # serialize CSV writes across worker threads

BASE = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2009/'


def GetUrls1(url, depth=1):
    """Depth-limited recursion: collect the level 1-3 pages into all_url.

    Entries look like '01/420112.html'; roughly 3,300 in total.
    """
    global g_writecount, all_url
    url1 = BASE + url  # e.g. BASE + 'index.html'
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url1, headers=kv, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except requests.RequestException:
        print(url1, 'failed to fetch')
        exist_url.append(url)
        return None
    exist_url.append(url)
    pattern = re.compile("<a href='(.*?)'>")
    unique_list = list(set(re.findall(pattern, r.text)) - set(exist_url))
    for eachone in unique_list:
        g_writecount += 1
        output = 'NO.' + str(g_writecount) + '\t Depth:' + str(depth) + '\t' + url + '->' + eachone + '\n'
        print(output)
        # with open('title1.txt', 'a+') as f:
        #     f.write(output)
        if depth < 3:
            GetUrls1(eachone, depth + 1)
    all_url = all_url + unique_list


def GetUrls2(q):
    """Take one relative URL from the queue, fetch its level-4 page,
    then scrape every level-5 page it links to."""
    i = q.get(timeout=2)  # one of the entries fed in by workQueue.put(url)
    url = BASE + i[3:5] + '/' + i  # build the full level-4 URL
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=kv, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        pattern = re.compile("<a href='(.*?)'>")
        result = list(set(re.findall(pattern, r.text)))
    except requests.RequestException:
        print(url, 'failed to fetch')
        return  # without this, the loop below would hit an undefined `result`
    for each in result:  # entries look like '01/110101001.html'
        url = BASE + each[3:5] + '/' + each[5:7] + '/' + each  # full level-5 URL
        try:
            GetDatas(url)  # scrape the fields we need from the level-5 page
        except Exception:
            continue  # on error, move on to the next page


def GetDatas(url):
    """Scrape the level-5 page and append its rows to the CSV file."""
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=kv, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
        pattern = re.compile(r"<tr class='villagetr'><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>")
        Data = re.findall(pattern, html)
        with csv_lock, open('a.csv', 'a+', newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(Data)
    except requests.RequestException:
        print(url, 'failed to fetch')


class myThread(threading.Thread):
    def __init__(self, q):
        threading.Thread.__init__(self)
        self.q = q

    def run(self):
        while not self.q.empty():
            try:
                GetUrls2(self.q)
            except Queue.Empty:  # another thread drained the queue first
                break


if __name__ == '__main__':
    start = time.time()
    headers = ['代码', '城乡分类', '名称']
    with open('a.csv', 'a+', newline='') as f:  # same file the workers append to
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
    GetUrls1('index.html')  # populate all_url
    listNum = len(all_url)
    workQueue = Queue.Queue(listNum)
    for url in all_url:
        workQueue.put(url)
    threads = []
    for i in range(5):
        thread = myThread(workQueue)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    end = time.time()
    print(end - start)
```
1. The recursive function GetUrls1(url, depth=1) avoids duplicating the same fetch-and-parse code for each level and keeps things concise. Note that recursion only works when every level is handled by code of the same shape! See the first sketch after this list.
2. Crawling is an I/O-bound task, so threading cuts the wall-clock time substantially, and pairing the threads with a Queue of work items saves even more! See the second sketch after this list.
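For the first point, here is a minimal sketch of the depth-limited recursion pattern used in GetUrls1, stripped of the bookkeeping. The names fetch_links and crawl are illustrative, not from the original code; the base URL and the href regex are the same ones used above.

```python
import re
import requests

BASE = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2009/'
visited = set()


def fetch_links(url):
    """Fetch one page and return the relative hrefs it contains."""
    r = requests.get(BASE + url, headers={'user-agent': 'Mozilla/5.0'}, timeout=5)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return re.findall(r"<a href='(.*?)'>", r.text)


def crawl(url, depth=1, max_depth=3):
    """One function handles every level, because each level has the same shape."""
    if url in visited:
        return
    visited.add(url)
    try:
        links = fetch_links(url)
    except requests.RequestException:
        return
    for link in links:
        if depth < max_depth:
            crawl(link, depth + 1)

# crawl('index.html')  # walks levels 1-3, just like GetUrls1
```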
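For the second point, here is a minimal sketch of the Queue + threading worker-pool pattern, with worker and process as hypothetical stand-ins for myThread.run and GetUrls2. Pulling with q.get(timeout=...) and catching queue.Empty sidesteps the race between checking q.empty() and actually getting an item when several threads share one queue.

```python
import queue
import threading


def worker(q):
    """Each thread repeatedly takes one URL from the shared queue."""
    while True:
        try:
            url = q.get(timeout=2)  # give up once the queue stays empty
        except queue.Empty:
            return
        process(url)                # hypothetical per-URL work (cf. GetUrls2)
        q.task_done()


def process(url):
    print('handling', url)


work_queue = queue.Queue()
for u in ['a.html', 'b.html', 'c.html']:
    work_queue.put(u)

threads = [threading.Thread(target=worker, args=(work_queue,)) for _ in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```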