# python3 爬取古诗词 (crawl classical Chinese poetry from gushiwen.org)

import mysql.connector
from requests_html import HTMLSession
from multiprocessing import Pool

session = HTMLSession()


def run_proc(url):
    """Crawl one category listing page whose poems live behind detail links.

    Follows every <a> inside the first ``.sons`` box, parses each detail
    page (poem title is in an <h1>), and saves rows to MySQL in batches
    of at most 200.

    url: absolute listing-page URL.
    """
    print('参数:%s' % url)

    links = get(url).html.find('.main3', first=True).find('.left', first=True).find('.sons', first=True).find('a')
    result = []
    for link in links:
        url_info = 'https://so.gushiwen.org' + link.attrs['href']
        print(url_info)

        val = parse_html(url_info, False, 'h1', True)
        # BUG FIX: parse_html may return an empty list (detail page with no
        # parsable poem); indexing val[0] unconditionally raised IndexError.
        if not val:
            continue
        if len(result) == 200:
            save_db(result)
            result = []
        result.append(val[0])
    # flush the final partial batch
    if result:
        save_db(result)


def run_proc_v2(url):
    """Crawl a paginated listing (poem title in a <b> tag on the list page).

    Parses page 1 of *url* (which also yields the total poem count from the
    pager), derives the remaining page URLs, and saves rows to MySQL in
    batches.

    url: absolute URL of page 1, assumed to end with ``1.aspx``.
    """
    print('参数:%s' % url)
    first_page = parse_html(url, True, 'b', False)
    result = first_page['result']
    total = int(first_page['total'])
    page_no = (total + 9) // 10  # ceil(total / 10): 10 poems per page
    url_split = url.split('.aspx')[0]
    prefix = url_split[:-1]  # drop the trailing page number "1"
    # BUG FIX: the original looped range(1, page_no), which re-fetched page 1
    # (already parsed above, duplicating its rows) and never fetched the last
    # page. The remaining pages are 2..page_no inclusive.
    for page in range(2, page_no + 1):
        page_url = prefix + str(page) + '.aspx'
        if len(result) >= 200:
            save_db(result)
            result = []
        rows = parse_html(page_url, False, 'b', False)
        result.extend(rows)
    # flush the final partial batch
    if result:
        save_db(result)


def get(url):
    """Fetch *url* with the shared HTML session, forcing UTF-8 decoding."""
    response = session.get(url)
    response.encoding = 'utf-8'
    return response


# 解析html
# url:要解析的url
# is_total 是否解析总数
# title_node 诗词名html元素节点
# flag 是否只解析一条
def parse_html(url, is_total, title_node, flag):
    """Parse a gushiwen listing page into poem rows.

    url: page to fetch and parse.
    is_total: when True, also parse the total poem count from the pager
        and return ``{'result': rows, 'total': count_str}``; otherwise
        return just the list of rows.
    title_node: HTML tag that holds the poem title ('h1' or 'b').
    flag: when True, stop after the first poem on the page.

    Each row is ``[title, author, content, dynasty]``.
    """
    left = get(url).html.find('.main3', first=True).find('.left', first=True)
    sons = left.find('.sons')
    result = []
    if sons:
        for son in sons:
            # the .source line holds two links: [dynasty, author]
            source = son.find('.source', first=True).find('a')
            result.append([
                son.find(title_node, first=True).text,   # title
                source[1].text,                          # author
                son.find('.contson', first=True).text,   # content
                source[0].text,                          # dynasty
            ])
            if flag:
                break
    if is_total:
        # pager text looks like "(N)" — strip the surrounding parentheses
        total_str = left.find('.pages', first=True).find('span')[1].text
        return {'result': result, 'total': total_str[1:-1]}
    return result


def save_db(result):
    """Insert poem rows into the ``crawler.poetry`` MySQL table.

    result: list of ``[title, author, content, dynasty]`` rows;
        None or an empty list is a no-op.
    """
    if not result:
        return
    print('开始插入数据...')
    conn = mysql.connector.connect(user='root', password='wujinlei', host='127.0.0.1', port='3307',
                                   database='crawler')
    # BUG FIX: the original leaked the connection and cursor if execute()
    # raised; close the connection in a finally block.
    try:
        cursor = conn.cursor()
        # one batched round trip instead of one execute() per row
        cursor.executemany('insert into poetry (title,author,content,dynasty) values (%s,%s,%s,%s)',
                           result)
        conn.commit()
        print('已经插入%s条数据' % len(result))
        cursor.close()
    finally:
        conn.close()


if __name__ == "__main__":
    p = Pool(8)

    cate = get("https://www.gushiwen.org/shiwen/").html.find('.main3 .right', first=True).find('a')
    print(len(cate))
    if cate is not None and len(cate) > 0:
        for i in range(len(cate)):
            c = cate[i]
            url = c.attrs['href']
            if url[0] == '/':
                url = 'https://www.gushiwen.org' + url
                p.apply_async(run_proc_v2, args=(url,))
            else:
                p.apply_async(run_proc, args=(url,))

    p.close()
    p.join()
    session.close()

# 猜你喜欢 (blog "you may also like" residue from the original post)
# 转载自 my.oschina.net/u/3163032/blog/1802000