When crawling with Scrapy, the scraped data sometimes needs to be stored in Elasticsearch (ES). I found two approaches online; feel free to copy or adapt them. Both are written up below:
First install ES; I used version 5.6.1, an earlier release.
Then install the matching Python package with pip. Note that the elasticsearch-dsl version must correspond to your ES version:
pip install elasticsearch-dsl==5.1.0
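Before wiring up the pipeline, it helps to confirm the client can actually reach the cluster. A minimal sanity check, assuming the same ES host used in the pipeline code below (192.168.52.138):

from elasticsearch import Elasticsearch

# Host taken from the pipeline code in this article; replace with your own node
es = Elasticsearch(['192.168.52.138:9200'])
print(es.info())  # prints cluster name and version info if the node is reachable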
Method one:
The complete pipelines.py module for this approach:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from elasticsearch_dsl import DocType, Keyword, Text
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['192.168.52.138'])


class SinafinancespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class AticleType(DocType):
    page_from = Keyword()
    domain = Keyword()  # 'domain' raised an error at one point
    cra_url = Keyword()
    spider = Keyword()
    cra_time = Keyword()
    page_release_time = Keyword()
    page_title = Text(analyzer="ik_max_word")
    page_content = Text(analyzer="ik_max_word")

    class Meta:
        index = "scrapy"
        doc_type = "sinafinance"
        # The settings and mappings below did not take effect; kept for reference
        settings = {
            "number_of_shards": 3,
        }
        mappings = {
            '_id': {'path': 'cra_url'}
        }


# Writes items into ES; enable this class in settings.py.
# Requires elasticsearch-dsl==5.1.0 -- the package version must match your ES version.
class ExchangeratespiderESPipeline(object):
    def process_item(self, item, spider):
        spider.logger.info("-----enter into insert ES")
        article = AticleType()
        article.page_from = item['page_from']
        article.domain = item['domain']
        article.cra_url = item['cra_url']
        article.spider = item['spider']
        article.cra_time = item['cra_time']
        article.page_release_time = item['page_release_time']
        article.page_title = item['page_title']
        article.page_content = item['page_content']
        article.save()
        return item
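One detail worth knowing: elasticsearch_dsl can create the index and its mapping explicitly instead of relying on dynamic mapping when the first document arrives. A short sketch, assuming elasticsearch-dsl 5.x and the AticleType class defined above:

# Run once (e.g. when the spider starts) to create the index with the declared mapping
AticleType.init()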
The approach above does write data into ES, but repeated crawls insert duplicate documents: ES generates the _id primary key itself, and I could not find an entry point for supplying a custom _id. So I gave up on it.
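That said, elasticsearch_dsl documents can take an explicit id through their meta attribute, which might have avoided the duplicates. A hedged sketch, untested in this project:

# Assumption: setting meta.id before save() indexes the document under that id
# (supported in elasticsearch-dsl 5.x), so a re-crawl overwrites in place
article.meta.id = item['cra_url']
article.save()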
Method two: write with a custom primary key, so that re-inserting the same page overwrites the existing document.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from elasticsearch5 import Elasticsearch


class SinafinancespiderPipeline(object):
    def process_item(self, item, spider):
        return item


# Writes items into ES; enable this class in settings.py.
# Requires elasticsearch-dsl==5.1.0 -- the package version must match your ES version.
class SinafinancespiderESPipeline(object):
    def __init__(self):
        self.ES = ['192.168.52.138:9200']
        # Create the ES client
        self.es = Elasticsearch(
            self.ES,
            # Sniff the cluster's nodes before starting
            sniff_on_start=True,
            # Refresh node info when a connection to a node fails
            sniff_on_connection_fail=True,
            # Refresh node info every 60 seconds
            sniffer_timeout=60
        )

    def process_item(self, item, spider):
        spider.logger.info("-----enter into insert ES")
        doc = {
            'page_from': item['page_from'],
            'domain': item['domain'],
            'spider': item['spider'],
            'page_release_time': item['page_release_time'],
            'page_title': item['page_title'],
            'page_content': item['page_content'],
            'cra_url': item['cra_url'],
            'cra_time': item['cra_time'],
        }
        # Using cra_url as the document id means a re-crawl overwrites the same doc
        self.es.index(index='scrapy', doc_type='sinafinance',
                      body=doc, id=item['cra_url'])
        return item
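To confirm the overwrite behavior, a document can be fetched back under the same id. A small sketch, assuming an Elasticsearch client es like the one built in __init__ above and the same scrapy/sinafinance index scheme:

# 'some_cra_url' is hypothetical: use whatever cra_url the item was indexed under
res = es.get(index='scrapy', doc_type='sinafinance', id='some_cra_url')
print(res['_source']['page_title'])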
Querying the stored data:
# Provide the query body as a dictionary
query = {
    'query': {
        'bool': {
            'must': [
                {'match': {'_all': 'python web'}}
            ],
            'filter': [
                {'term': {'status': 2}}
            ]
        }
    }
}

# Query the data
data = es.search(index='articles', doc_type='article', body=query)
print(data)
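The response is a plain dict; the matched documents live under hits.hits. A short sketch of walking through them:

# Each hit carries its id, relevance score, and the stored document
for hit in data['hits']['hits']:
    print(hit['_id'], hit['_score'], hit['_source'])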
# Insert
es.index(...)
# Update
es.update(...)
# Delete
es.delete(...)
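Filled-in versions of those calls, as a sketch only, reusing the scrapy index and the cra_url-as-id convention from method two (doc_id here is hypothetical):

# Hypothetical id: whatever cra_url value the document was indexed under
doc_id = 'some_cra_url'
# Partial update: only the fields under 'doc' change
es.update(index='scrapy', doc_type='sinafinance', id=doc_id,
          body={'doc': {'page_title': 'updated title'}})
# Remove the document entirely
es.delete(index='scrapy', doc_type='sinafinance', id=doc_id)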
Finally, register the custom pipeline class in settings.py:
ITEM_PIPELINES = {
    # 'sinafinancespider.pipelines.SinafinancespiderPipeline': 300,
    'sinafinancespider.pipelines.SinafinancespiderESPipeline': 300,
}