Python insert operations with Elasticsearch

When crawling with Scrapy, the scraped data needs to be stored in Elasticsearch (ES). I found two approaches online; you can copy or adapt them, so here they are:

First install ES; an older release, version 5.6.1, is used here.

Then install the corresponding Python package with pip; the package version must match the ES version:

pip install elasticsearch-dsl==5.1.0
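
As a quick sanity check that the server and client versions actually correspond, a minimal sketch (my addition, assuming the ES node at 192.168.52.138:9200 used later in this article is reachable):

import elasticsearch
from elasticsearch import Elasticsearch

# Compare the server version with the installed client version
es = Elasticsearch(['192.168.52.138:9200'])
print(es.info()['version']['number'])   # server version, e.g. "5.6.1"
print(elasticsearch.VERSION)            # client version tuple, should be a 5.x release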

Method one:

The complete pipelines.py module follows:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import chardet

class SinafinancespiderPipeline(object):
    def process_item(self, item, spider):
        return item


# To write into ES, enable the ExchangeratespiderESPipeline class below in settings
# Requires pip install elasticsearch-dsl==5.1.0 -- note that the version must correspond to ES
from elasticsearch_dsl import Date, Nested, Boolean, analyzer, Completion, Keyword, Text, Integer, DocType
from elasticsearch_dsl.connections import connections
connections.create_connection(hosts=['192.168.52.138'])
from elasticsearch import Elasticsearch
es = Elasticsearch()

class AticleType(DocType):
    page_from = Keyword()
    # domain raised an error here
    domain=Keyword()
    cra_url=Keyword()
    spider = Keyword()
    cra_time = Keyword()
    page_release_time = Keyword()
    page_title = Text(analyzer="ik_max_word")
    page_content = Text(analyzer="ik_max_word")

    class Meta:
        index = "scrapy"
        doc_type = "sinafinance"
        # The settings and mappings below did not take effect, but are noted here anyway
        settings = {
            "number_of_shards": 3,
        }
        mappings = {
            '_id': {'path': 'cra_url'}
        }


class ExchangeratespiderESPipeline(object):
    from elasticsearch5 import Elasticsearch
    ES = ['192.168.52.138:9200']
    es = Elasticsearch(ES, sniff_on_start=True)

    def process_item(self, item, spider):

        spider.logger.info("-----enter into insert ES")
        article = AticleType()

        article.page_from=item['page_from']
        article.domain=item['domain']
        article.cra_url =item['cra_url']
        article.spider =item['spider']
        article.cra_time =item['cra_time']
        article.page_release_time =item['page_release_time']
        article.page_title =item['page_title']
        article.page_content =item['page_content']

        article.save()
        return item
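
One note on this DocType approach: elasticsearch-dsl normally creates the index and its mapping from the class definition through the DocType's init() classmethod. A minimal sketch (my addition, reusing the 192.168.52.138 connection from above), run once before the first article.save():

from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['192.168.52.138'])
# Creates the "scrapy" index with the fields declared on AticleType
AticleType.init()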

The code above does write data into ES, but repeated crawls insert duplicate documents: ES generates the "_id" primary key itself, and I could not find a way to pass a custom _id in through this interface. So I gave it up.

Method two: write with a custom primary key, so that re-inserting overwrites the existing document

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from elasticsearch5 import Elasticsearch

class SinafinancespiderPipeline(object):
    def process_item(self, item, spider):
        return item


# To write into ES, enable the SinafinancespiderESPipeline class below in settings
# Requires pip install elasticsearch-dsl==5.1.0 -- note that the version must correspond to ES
class SinafinancespiderESPipeline(object):
    def __init__(self):
        self.ES = ['192.168.52.138:9200']
        # create the es client
        self.es = Elasticsearch(
            self.ES,
            # sniff the cluster nodes before starting
            sniff_on_start=True,
            # refresh the node list when a connection to a node fails
            sniff_on_connection_fail=True,
            # refresh the node list every 60 seconds
            sniffer_timeout=60
        )


    def process_item(self, item, spider):
        spider.logger.info("-----enter into insert ES")
        doc = {
            'page_from': item['page_from'],
            'domain': item['domain'],
            'spider': item['spider'],
            'page_release_time': item['page_release_time'],
            'page_title': item['page_title'],
            'page_content': item['page_content'],
            'cra_url': item['cra_url'],
            'cra_time': item['cra_time']
        }
        self.es.index(index='scrapy', doc_type='sinafinance', body=doc, id=item['cra_url'])

        return item
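
Because the document _id is now the crawled URL, re-crawling the same page overwrites the existing document instead of adding a duplicate. One optional variation (my own addition, not from the original code): URLs can be long, so the _id could also be a hash of the URL, for example:

import hashlib

# Hypothetical variant: use an MD5 hash of the URL as the document id
doc_id = hashlib.md5(item['cra_url'].encode('utf-8')).hexdigest()
self.es.index(index='scrapy', doc_type='sinafinance', body=doc, id=doc_id)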

How to query the data:

# The query body is given as a dictionary
query = {'query': {'bool': {'must': [{'match': {'_all': 'python web'}}], 'filter': [{'term': {'status': 2}}]}}}
ret = es.search(index='articles', doc_type='article', body=query)

# query data
data = es.search(index='articles', doc_type='article', body=query)
print(data)
# insert
es.index(...)
# update
es.update(...)
# delete
es.delete(...)
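
For reference, a concrete sketch of those calls with the index and doc_type used in this article (the id value is just an illustrative URL):

# insert or overwrite a document with a known id
es.index(index='scrapy', doc_type='sinafinance', id='http://example.com/a', body={'page_title': 'a title'})
# partially update fields of an existing document
es.update(index='scrapy', doc_type='sinafinance', id='http://example.com/a', body={'doc': {'page_title': 'new title'}})
# delete a document
es.delete(index='scrapy', doc_type='sinafinance', id='http://example.com/a')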

 

Wrapping up

Register the custom pipeline class in settings.py:

ITEM_PIPELINES = {
    # 'sinafinancespider.pipelines.SinafinancespiderPipeline': 300,
    'sinafinancespider.pipelines.SinafinancespiderESPipeline': 300,
}
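
Once the pipeline is registered and a crawl has run, a quick way to confirm that documents landed in the index (a sketch using the count API against the same host):

from elasticsearch import Elasticsearch

es = Elasticsearch(['192.168.52.138:9200'])
# Number of documents in the scrapy/sinafinance type, e.g. {'count': 123, ...}
print(es.count(index='scrapy', doc_type='sinafinance'))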


Source: www.cnblogs.com/yoyowin/p/12209706.html