Coroutines in Practice: Fetching Administrative Division and Urban-Rural Classification Codes


Reference article: 获取区划代码和城乡划分代码 (fetching administrative division and urban-rural classification codes)
Asynchronous I/O (asyncio) and coroutines: coroutines achieve high concurrency because, as the reference article explains, switching between coroutines requires no system calls. The original post illustrated the relationship between processes, threads, and coroutines with a figure (omitted here).
A coroutine is defined with the async keyword, and a coroutine is itself an object. A coroutine cannot run by itself: it has to be added to an event loop, which calls it at the appropriate time. asyncio.get_event_loop creates an event loop; run_until_complete then registers the coroutine with the loop and starts the loop.
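
A minimal, self-contained sketch of that workflow (the coroutine name and message here are made up for illustration):

import asyncio

# Defining a function with async makes calling it return a coroutine object.
async def say_after(delay, message):
    await asyncio.sleep(delay)  # yield control to the event loop while waiting
    print(message)

loop = asyncio.get_event_loop()  # create an event loop
# run_until_complete registers the coroutine with the loop and drives it to completion
loop.run_until_complete(say_after(1, 'hello from a coroutine'))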

Analysis and modifications to the aiohttp-based crawler:
throttle the crawl rate and add a cookie to get around the site's anti-scraping measures.

import functools, os, time, aiohttp, asyncio, random, re, csv, urllib.parse
import datetime as dt
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


DEBUG = False

HTML_ENCODING = 'gb18030'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
BASE_PATH = '../China_Province_2018/'
if not os.path.isdir(BASE_PATH):
    os.mkdir(BASE_PATH)


# Define a coroutine with the async keyword
async def get_html(sem, url, handle, result, counter=None):
    if not isinstance(result, list):
        raise TypeError('result must be a list')
    if not isinstance(url, str):
        raise TypeError('url must be a string')
    if not isinstance(counter, dict):
        counter = {'all': 0, 'done': -1, 'now': dt.datetime.now()}  # fallback counter when the caller passes none
    async with sem:  # async context manager that caps the number of concurrent requests
        async with aiohttp.ClientSession() as session:
            while True:
                status = -1
                try:
                    # async with session.get(url, headers={'user-agent': ua.random, 'cookie':'AD_RS_COOKIE=20080919; wzws_cid=0531b58062570b0f375dda6b007d21ea15ab01eb1ac6dd26aef9d4aeedda66a1f68f49a6d670349fb2eebd06e876f32cca61274cf20431ba78476eeae2727ed84eb7d0478e5243373d18c13f00feae97'}, timeout=20) as resp:
                    async with session.get(url, headers={'user-agent': USER_AGENT, 'cookie':'AD_RS_COOKIE=20080919'}, timeout=20) as resp:
                        await asyncio.sleep(1)  # throttle the crawl rate; time.sleep() would block the entire event loop
                        if resp.status != 200:
                            status = resp.status
                            raise Exception(str(status))
                        else:
                            # response = await resp.content.read()
                            try:
                                response = await resp.text(HTML_ENCODING)
                            except asyncio.TimeoutError as e:
                                raise e
                            except aiohttp.ClientPayloadError as e:
                                raise e
                            except (UnicodeDecodeError, UnicodeEncodeError) as e:
                                try:
                                    t = await resp.text('utf-8', errors='ignore')
                                except Exception as e:
                                    print(repr(e))
                                    exit(0)
                                else:
                                    if '请开启JavaScript并刷新该页' in t:  # the site returns this JS-challenge text when it blocks a client
                                        raise Exception('请开启JavaScript并刷新该页')
                                    else:
                                        print(t)
                                        print('Failed to decode the page; see the output above. url({})'.format(url))
                                        exit(0)
                            except Exception as e:
                                print('@' * 100)
                                raise e
                            result.extend(handle(response, url))
                            counter['done'] += 1
                            print('{} \tsuccess! \t({}/{}) \t{}'.format(url, counter['done'], counter['all'],
                                                                        str(dt.datetime.now() - counter['now'])[:-7]))
                            break
                except Exception as e:
                    await asyncio.sleep(3 + random.random() * 7)
                    print('{} \tretry due to status:{}\t{}'.format(url, status, repr(e)))


def get_htmls_and_handle(url_list, handle, handle_kwargs_list, count=False):
    sem = asyncio.Semaphore(6)
    tasks = []  # list of coroutine tasks

    result = []
    counter = {'all': len(url_list), 'done': 0, 'now': dt.datetime.now()} if count else None
    for url, kwargs in zip(url_list, handle_kwargs_list):
        tasks.append(get_html(sem, url, functools.partial(handle, **kwargs), result, counter))
    if not tasks:
        raise ValueError('tasks is empty')
    loop = asyncio.get_event_loop()     # get (or create) the event loop
    start_time = dt.datetime.now()
    loop.run_until_complete(asyncio.wait(tasks))    # run_until_complete takes a future; bare coroutines are wrapped into Tasks automatically
    print('#' * 100)
    print('time cost :{}, \t({}) tasks all done!'.format(str(dt.datetime.now() - start_time)[:-7], len(tasks)))
    print('#' * 100)
    return result
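
A side note on this helper: passing bare coroutines to asyncio.wait is deprecated from Python 3.8 on (and rejected in 3.11+), and asyncio.run has become the preferred entry point. A sketch of an equivalent, more current replacement for the loop handling above:

import asyncio

def run_tasks(tasks):
    # drive a list of coroutines to completion (replaces the get_event_loop / run_until_complete pair)
    async def _run_all():
        # gather wraps each coroutine in a Task and waits for all of them
        await asyncio.gather(*_run_all_tasks)
    _run_all_tasks = tasks
    asyncio.run(_run_all())  # creates, runs, and closes the event loop in one call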


def fun1(response, req_url):
    # pattern = re.compile("<a href='(.*?)'>(.*?)<")  # regex alternative
    # result = list(set(re.findall(pattern, response)))  # extract child-page links from the landing page
    soup = BeautifulSoup(response, 'lxml')
    a_list = soup.select('table.provincetable > tr.provincetr > td > a')
    result = [[i['href'], i.text] for i in a_list]
    return [{'url': urllib.parse.urljoin(req_url, url),
             'kwargs': {
                 'address_list': [address]
             }} for url, address in result]


def current_level(soup):
    if soup.select('table.citytable > tr.citytr'):
        return 2
    elif soup.select('table.countytable > tr.countytr'):
        return 3
    elif soup.select('table.towntable > tr.towntr'):
        return 4
    else:
        raise Exception('cannot recognize current level')


def fun2(response, req_url, address_list):
    # pattern = re.compile("<a href='(.*?)'>(.*?)<")
    # result = list(set(re.findall(pattern, response)))
    soup = BeautifulSoup(response, 'lxml')
    a_list = soup.select('table > tr > td:nth-of-type(2) > a')
    if a_list:
        delta_level = current_level(soup) - len(address_list) - 1
        assert delta_level >= 0, 'delta_level cannot be < 0'
        result = [[i['href'], i.text] for i in a_list]
        return [{'url': urllib.parse.urljoin(req_url, url),
                 'kwargs': {
                     'address_list': address_list + [''] * delta_level + [address]
                 }} for url, address in result]
    else:
        tr_list = soup.select('table.villagetable > tr.villagetr')
        result = [[i('td')[0].text, i('td')[1].text, i('td')[2].text] for i in tr_list]
        return [address_list + [code1,
                                code2,
                                address,
                                '1' if code2[0] == '1' else '0',
                                ] for code1, code2, address in result]
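
For reference, a finished level-5 (village) row that fun2 appends to the result has the shape sketched below; the concrete values are illustrative, not taken from the site:

# [province, city, county, town,
#  12-digit division code, 3-digit urban-rural classification code,
#  village name, '1' if the classification code starts with '1' (urban) else '0']
['广东省', '广州市', '天河区', '某某街道',
 '440106001001', '111', '某某社区居委会', '1']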


def main():
    handle_dict = {
        'level 1': fun1,
        'level 2': fun2,
        'level 3': fun2,
        'level 4': fun2,
        'level 5': fun2,
    }
    temp = []
    result = []
    for idx, (name, handle) in enumerate(handle_dict.items()):
        if idx == 0:
            url_list = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html']
            handle_kwargs_list = [{}]
        else:
            # debug filter example: keep only one province
            # if idx == 1:
            #     temp = [i for i in temp if '广东省' in i['kwargs']['address_list']]
            url_list = []
            handle_kwargs_list = []
            for row in temp:
                if isinstance(row, dict):
                    url_list.append(row['url'])
                    handle_kwargs_list.append(row['kwargs'])
                else:
                    result.append(row)
            if DEBUG:
                url_list = url_list[:30]
                handle_kwargs_list = handle_kwargs_list[:30]

        temp = get_htmls_and_handle(url_list, handle, handle_kwargs_list, count=True)
        print('-' * 50)
        print(temp)

    result.extend(temp)
    result.sort()
    with open(os.path.join(BASE_PATH, 'csv_{}.csv'.format(dt.datetime.now().strftime("%Y%m%d%H%M"))), 'w',
              newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        for row in result:
            writer.writerow(row)


if __name__ == '__main__':
    ua = UserAgent(use_cache_server=False)  # only used by the commented-out random-UA request above; newer fake_useragent releases dropped this argument
    main()
Reposted from blog.csdn.net/li123_123_/article/details/103406654