Connecting the Scrapy Framework to ES (Elasticsearch)

Goal: save the data scraped by Scrapy into ES (Elasticsearch)!
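
The steps below assume a local Elasticsearch instance on 127.0.0.1:9200 with the elasticsearch-analysis-ik plugin installed (the ik_max_word / ik_smart analyzers used later come from it), plus the elasticsearch-dsl Python package. A minimal connectivity check, as an optional sketch:

from elasticsearch_dsl.connections import connections

# Assumes ES is running locally on the default port 9200
connections.create_connection(hosts=['127.0.0.1'])
# Prints the cluster name/version if ES is reachable
print(connections.get_connection().info())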


1. Create a new .py file; name it whatever you like (e.g. es_model.py)

# Field types
from elasticsearch_dsl import DocType, Completion, Keyword, Text, Boolean, Integer, Date
# Connection helpers
from elasticsearch_dsl.connections import connections
# Analyzers
from elasticsearch_dsl.analysis import CustomAnalyzer

# 1. Create the ES connection; the parameter is the ES host address
connections.create_connection(hosts=["127.0.0.1"])


# Custom analyzer wrapper.
# CustomAnalyzer normally writes its own analysis definition into the index
# mapping; returning an empty definition lets us reference an analyzer that is
# already installed on the ES server (the IK plugin's ik_max_word) by name,
# which is what the Completion field below requires.
class Analyzer(CustomAnalyzer):
    # Return an empty analysis definition
    def get_analysis_definition(self):
        return {}


# Create the analyzer object
ik_analyzer = Analyzer('ik_max_word', filter=['lowercase'])


class Field(DocType):
    # Powers the search box autocomplete (completion suggester)
    suggest = Completion(analyzer=ik_analyzer)
    # ik_max_word: fine-grained tokenization
    # ik_smart:    coarse-grained tokenization
    # analyzer: the analyzer used to index this field
    name = Text(analyzer='ik_max_word')
    author = Text(analyzer='ik_max_word')
    content = Text()

    class Meta:
        index = 'novels'
        doc_type = 'novel'

if __name__ == '__main__':
    # Running this file directly creates the index and mapping in ES
    Field.init()
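
To confirm that Field.init() actually created the index and mapping, a quick sanity check can be run (a sketch; it assumes es_model.py has already been executed against the local ES instance):

from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['127.0.0.1'])
es = connections.get_connection()
# Should show the 'novels' index with name/author/content/suggest in its mapping
print(es.indices.get_mapping(index='novels'))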


2. Create a pipeline in pipelines.py

# Store the scraped data in the ES search server
class EsPipline(object):

    def process_item(self, item, spider):
        # Delegate the actual write to the item's own save_es() method
        item.save_es()
        return item
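
Note that process_item is called for every item type the project yields, and an item without a save_es() method would raise an AttributeError here. A slightly more defensive variant (a sketch, not from the original post) only writes items that actually implement save_es():

class EsPipline(object):

    def process_item(self, item, spider):
        # Only items that define save_es() (e.g. QiShuItem below) are written to ES;
        # any other item type passes through untouched.
        if hasattr(item, 'save_es'):
            item.save_es()
        return item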


3. Register the pipeline and its priority in settings.py

ITEM_PIPELINES = {
   'PaCongSpider.pipelines.EsPipline': 3,
}
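
The number is the pipeline's order: items flow through pipelines with lower values first, so 3 leaves room for other processing before the ES write. For example (SomeCleaningPipeline is a hypothetical placeholder, not part of the original project):

ITEM_PIPELINES = {
   'PaCongSpider.pipelines.SomeCleaningPipeline': 1,  # hypothetical: clean items first
   'PaCongSpider.pipelines.EsPipline': 3,             # then write them to ES
}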


4. In items.py

import scrapy
from .es_model import Field
from elasticsearch_dsl.connections import connections
# Low-level ES client used by the _analyze calls in conduct_suggest() below
es = connections.create_connection(hosts=['127.0.0.1'])


class PacongspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


# Tokenize the given fields and build the completion-suggester entries
# (*args is a tuple of (text, weight) pairs; **kwargs would be a dict)
def conduct_suggest(index, *args):
    """
    :param index: the ES index whose analyzer is used
    :param args: (text, weight) pairs whose text should be tokenized
    :return: the list of suggest entries built from the tokens
    """
    # Tokens that have already been used
    use_words = set()
    # Resulting suggest entries
    suggest = []
    # Tokenize each text with its weight
    for text, weight in args:
        # Call the _analyze API to tokenize the text
        words = es.indices.analyze(
            index=index,
            params={
                'filter': ['lowercase']
            },
            body={
                'analyzer': 'ik_max_word',
                'text': text
            }
        )
        analyzer_word = set([x['token'] for x in words['tokens']])
        print(analyzer_word)
        # Keep only the tokens that have not been used yet
        new_words = analyzer_word - use_words
        # Before being appended, these tokens do not yet exist in suggest
        suggest.append({'input': list(new_words), 'weight': weight})
        # Remember every token seen so far, not just the latest batch
        use_words.update(analyzer_word)
    # suggest therefore contains no duplicate tokens, e.g.:
    # [{'input': ['土豆', '豆', '逆', '袭'], 'weight': 10}, {'input': ['天蚕'], 'weight': 8}]
    return suggest


class QiShuItem(scrapy.Item):
    novel_name = scrapy.Field()
    novel_author = scrapy.Field()
    novel_content = scrapy.Field()

    # Save this item into the ES search server
    def save_es(self):
        novel = Field()
        # Copy the values out of the incoming item
        novel.name = self['novel_name']
        novel.author = self['novel_author']
        novel.content = self['novel_content']
        # Tokenize selected fields and store the result in the suggest field
        # (the novel name is weighted higher than the author for autocomplete)
        novel.suggest = conduct_suggest('novels', (novel.name, 10), (novel.author, 8))
        novel.save()
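
For completeness, this is roughly how the suggest field built above is queried for autocomplete on the search side (a sketch using elasticsearch_dsl's completion suggester; the suggestion name 'novel_suggest' and the query text '土豆' are made up for illustration):

from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['127.0.0.1'])

s = Search(index='novels')
# Ask the completion suggester for up to 5 completions of the user's input
s = s.suggest('novel_suggest', '土豆', completion={'field': 'suggest', 'size': 5})
response = s.execute()
for option in response.suggest.novel_suggest[0].options:
    print(option.text)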


5. As long as your Scrapy spider runs without errors, the scraped data will be saved into ES!
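
A quick way to confirm that documents are actually arriving (a sketch; run it after a crawl, and adjust the import to wherever es_model.py lives in your project):

from elasticsearch_dsl.connections import connections
from es_model import Field  # the DocType defined in step 1 (import path is an assumption)

connections.create_connection(hosts=['127.0.0.1'])
# Number of novels indexed so far
print(Field.search().count())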

Reprinted from blog.csdn.net/qq_41664526/article/details/80543207