数据获取

我主要通过 BeautifulSoup 抓取数据，然后存储到本地的 Elasticsearch 中。

#!/usr/bin/env python
# coding:utf-8

import lxml
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

host = '127.0.0.1'  # Elasticsearch server host address
post = 9200  # Elasticsearch port; supplied as the 'port' entry when the client is built

# 1. Create the index
def create_index():
    """Create the ``poetry`` index with explicit field mappings if it is missing.

    ``title``, ``epigraph``, ``dynasty`` and ``author`` are mapped as
    ``keyword``; ``content`` is mapped as ``text``.

    The index-level API (``es.indices``) is used on purpose. The document-level
    ``es.exists`` / ``es.create`` calls operate on a single document: they
    would store the mapping definition below as a document with id 0 and
    leave the index without these mappings.
    """
    es = Elasticsearch([{'host': host, 'port': post}])

    # Nothing to do when the index is already there.
    if es.indices.exists(index="poetry"):
        return

    index_definition = {
        "mappings": {
            "properties": {
                "title": {
                    "type": "keyword"
                },
                "epigraph": {
                    "type": "keyword"
                },
                "dynasty": {
                    "type": "keyword"
                },
                "author": {
                    "type": "keyword"
                },
                "content": {
                    "type": "text"
                }
            }
        }
    }
    es.indices.create(index='poetry', body=index_definition)

# 2. Scrape poems and store them in the index
def _parse_poem(html):
    """Extract one poem document from a poem detail page.

    Returns a dict with the keys ``title``, ``epigraph``, ``dynasty``,
    ``author`` and ``content``, or ``None`` when the page does not contain
    the expected ``.main3 .left .sons .cont`` block, its ``<h1>`` title or
    its ``.contson`` body.
    """
    page = BeautifulSoup(html, "lxml")
    blocks = page.select('.main3 .left .sons .cont')
    if not blocks:
        return None
    cont = blocks[0]

    bodies = cont.select('.contson')
    if cont.h1 is None or not bodies:
        return None

    # Strip so that whitespace around the heading markup is not stored.
    title = cont.h1.text.strip()

    # Epigraph: the part of the title before '·', empty when there is none.
    epigraph = title.partition('·')[0] if '·' in title else ""

    # The first paragraph's links are read as author (first) and dynasty
    # (second); a missing link yields an empty string instead of an error.
    links = cont.p.select('a') if cont.p is not None else []
    author = links[0].text if len(links) > 0 else ""
    dynasty = links[1].text if len(links) > 1 else ""

    return {
        "title": title,
        "epigraph": epigraph,
        "dynasty": dynasty,
        "author": author,
        "content": bodies[0].text.strip()
    }


def get_poetry(list_url, timeout=10):
    """Crawl a poem list page and index every poem it links to.

    Args:
        list_url: URL of the list page. Poem links are looked up in the
            ``<span>`` children of elements with class ``typecont``.
        timeout: Seconds to wait for each HTTP request before
            ``requests`` raises instead of blocking indefinitely.

    Each poem is stored in the ``poetry`` index and the indexing response
    is printed. List entries without a link, and detail pages that do not
    have the expected layout, are skipped rather than aborting the crawl.
    """
    from urllib.parse import urljoin

    es = Elasticsearch([{'host': host, 'port': post}])

    # Fetch and parse the list page.
    listing = BeautifulSoup(requests.get(list_url, timeout=timeout).text, "lxml")

    for div in listing.find_all(attrs={"class": "typecont"}):
        for ch in div.children:
            if ch.name != 'span':
                continue

            link = ch.a
            href = link.get('href') if link is not None else None
            if not href:
                continue

            print('get:', link.text, href)

            # Resolve against the list page so relative links stay on the
            # host that was actually crawled and absolute links are used
            # unchanged.
            poem_url = urljoin(list_url, href)
            doc = _parse_poem(requests.get(poem_url, timeout=timeout).text)
            if doc is None:
                print('skip (unexpected page layout):', poem_url)
                continue

            ret = es.index(index='poetry', body=doc)
            print(ret)

# Entry point
def main():
    """Make sure the index exists, then crawl the summer poem list page."""
    create_index()

    # Other list pages that can be crawled the same way:
    #   Tang poems: get_poetry('https://so.gushiwen.org/gushi/tangshi.aspx')
    #   Song ci:    get_poetry('https://so.gushiwen.org/gushi/songsan.aspx')
    summer_list_url = 'https://so.gushiwen.cn/gushi/xiatian.aspx'
    get_poetry(summer_list_url)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
复制代码

执行日志如下：

数据查询

我们可以通过 elasticsearch-head 插件查询 李白 写过哪些有关夏天的诗歌。结果如下：

从上面可以看到，结果并不是完全匹配的：elasticsearch 其实是先做分词，再进行查询。这种方式非常适合全文搜索。比如我再通过内容搜索一下 初夏：

其实这个搜索的结果还是非常的好的，就出来了 白居易 的 首夏南池独酌

春尽杂英歇，夏初芳草深。薰风自南至，吹我池上林。绿蘋散还合，赪鲤跳复沉。新叶有佳色，残莺犹好音。依然谢家物，池酌对风琴。惭无康乐作，秉笔思沉吟。境胜才思劣，诗成不称心。

如果我们想全值匹配，可以通过以下方式查询：

通过 Python 来分析关于夏天的诗句

数据获取

数据查询

猜你喜欢