Building on the previous post, this one again uses Re + Requests to crawl the ~700k urban/rural records on the National Bureau of Statistics site. The improvements: a depth-optimized recursive function, plus threading + Queue multithreading, to speed up the crawl.

import re
import requests
import threading
import csv
import time
import queue

# Shared state across threads
g_writecount = 0
exist_url = []
all_url = []
csv_lock = threading.Lock()  # serializes CSV writes from the worker threads

def GetUrls1(url, depth=1):  # depth-optimized recursion: walk levels 1-3 and build the all_url list
    global g_writecount, all_url  # entries look like ['01/420112.html', '01/420114.html', ...], roughly 3,300 in total
    try:
        url1 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2009/' + url  # 'index.html' on the first call
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url1, headers=kv, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except requests.RequestException:
        print(url1, 'failed to fetch')
        exist_url.append(url)
        return None
    exist_url.append(url)
    pattern = re.compile("<a href='(.*?)'>")
    unique_list = list(set(re.findall(pattern, r.text)) - set(exist_url))

    for eachone in unique_list:
        g_writecount += 1
        output = 'NO.' + str(g_writecount) + '\t Depth:' + str(depth) + '\t' + url + '->' + eachone + '\n'
        print(output)
        # Optionally log the crawl trail instead of printing:
        # with open('title1.txt', 'a+') as f:
        #     f.write(output)
        if depth < 3:
            GetUrls1(eachone, depth + 1)
    all_url = all_url + unique_list

def GetUrls2(q):
    try:
        i = q.get(timeout=2)  # take one level-3 path handed over via workQueue.put(url)
    except queue.Empty:  # another worker may have drained the queue first
        return
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2009/' + i[3:5] + '/' + i  # build the full crawlable URL
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=kv, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        pattern = re.compile("<a href='(.*?)'>")
        result = list(set(re.findall(pattern, r.text)))
    except requests.RequestException:
        print(url, 'failed to fetch')
        return  # without this, the loop below would hit an undefined result

    for i in result:  # result is a list like ['01/110101001.html', '01/110101002.html', ...]
        url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2009/' + i[3:5] + '/' + i[5:7] + '/' + i  # full fifth-level URL
        try:
            GetDatas(url)  # scrape the target fields from the fifth-level page
        except Exception:
            continue  # on error, move on to the next page

def GetDatas(url):  # scrape the code / classification / name fields from a fifth-level page
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=kv, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
        pattern = re.compile(r"<tr class='villagetr'><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>")
        Data = re.findall(pattern, html)
        with csv_lock:  # only one thread may write to the CSV at a time
            with open('a.csv', 'a+', newline='') as f:  # append the scraped rows to the CSV file
                f_csv = csv.writer(f)
                f_csv.writerows(Data)
    except Exception:
        print(url, 'failed to fetch')

class myThread (threading.Thread):
    def __init__(self,q):
        threading.Thread.__init__(self)
        self.q = q

    def run(self):
        while not self.q.empty():  # GetUrls2 also guards against a race on the final items
            GetUrls2(self.q)

if __name__ == '__main__':
    start = time.time()
    headers = ['代码', '城乡分类', '名称']  # code, urban-rural classification, name
    with open('a.csv', 'a+', newline='') as f:  # same file GetDatas appends to
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
    GetUrls1('index.html')  # populate all_url with the level 1-3 pages
    listNum = len(all_url)
    workQueue = queue.Queue(listNum)
    threads = []
    for url in all_url:
        workQueue.put(url)
    for i in range(5):  # spawn five worker threads
        thread = myThread(workQueue)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    end = time.time()
    print('Elapsed seconds:', end - start)
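
To see what the villagetr pattern actually captures, here is a quick standalone check. The HTML fragment is illustrative only (invented here to mirror the markup the script expects on the fifth-level pages); the three capture groups map to the CSV columns 代码 / 城乡分类 / 名称.

import re

# Illustrative fragment only -- made up to mirror the fifth-level page markup
html = "<tr class='villagetr'><td>110101001001</td><td>111</td><td>某社区居委会</td></tr>"

pattern = re.compile(r"<tr class='villagetr'><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>")
print(re.findall(pattern, html))
# -> [('110101001001', '111', '某社区居委会')]  (code, urban-rural classification, name)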

1. The recursive function GetUrls1(url, depth=1) removes duplicated code and keeps the crawler compact. Note that this only works when the pages at every level share the same link markup (see the first sketch below).
2. Crawling is an I/O-bound task, so threading saves a lot of crawl time, and pairing it with Queue saves even more (see the second sketch below).
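
A minimal sketch of the depth-limited recursion from point 1. Here fetch_links is a hypothetical stand-in for the Requests + Re extraction step; any function returning the hrefs found on a page would do.

def fetch_links(url):
    # Hypothetical stand-in for requests.get + re.findall on the page
    return []

def crawl(url, depth=1, max_depth=3, seen=None):
    # Depth-limited recursive walk in the spirit of GetUrls1
    if seen is None:
        seen = set()
    seen.add(url)
    for link in fetch_links(url):
        if link in seen:
            continue
        seen.add(link)
        if depth < max_depth:  # only descend while under the depth cap
            crawl(link, depth + 1, max_depth, seen)
    return seen

And a minimal sketch of the threading + Queue pattern from point 2. The queue.Empty guard matters: with several workers, the queue can empty between a q.empty() check and the q.get() call. process() is a placeholder for the per-URL work.

import queue
import threading

def process(item):
    print('handled', item)  # placeholder for fetch + parse

def worker(q):
    while True:
        try:
            item = q.get(timeout=2)  # another worker may drain the queue first
        except queue.Empty:
            return
        process(item)

q = queue.Queue()
for n in range(20):
    q.put(n)
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()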


Reposted from blog.csdn.net/weixin_42060681/article/details/80348137