Persistent storage with the Scrapy framework

Hard disk storage

(1) Based on terminal commands

* Make sure the parse method returns an iterable object (one holding the parsed page content)
* Use a terminal command to write that data to a disk file
     scrapy crawl <spider file> -o <disk file>.<suffix>
     for example: scrapy crawl qiubai -o qiubai.json (json, jsonlines, csv and xml suffixes work out of the box)

def parse(self, response):
    # xpath parsing is recommended (xpath is integrated into the framework's interfaces)
    div_list = response.xpath('//div[@id="content-left"]/div')
    # store the parsed page data
    data_list = []
    for div in div_list:
        # xpath returns the specified content wrapped in Selector objects
        # extract() pulls the data values stored in those Selector objects back out
        author = div.xpath('./div/a[2]/h2/text()').extract_first()
        # extract_first() is equivalent to extract()[0]
        content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
        data_dict = {
            'author': author,
            'content': content
        }
        data_list.append(data_dict)
    return data_list

(2) Based on pipelines

* items.py: holds the parsed page data
* pipelines.py: handles the processing related to persistent storage
* code flow:

  1. Store the parsed page data in an Item object
  2. Use the yield keyword to submit the item to the pipeline file for processing
  3. Write the data storage code in the pipeline file
  4. Enable the pipeline in the configuration file (a minimal settings.py sketch follows the pipeline code below)
import scrapy
from spiderqiubai.items import SpiderqiubaiItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # xpath parsing is recommended (xpath is integrated into the framework's interfaces)
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # xpath returns the specified content wrapped in Selector objects
            # extract() pulls the data values stored in those Selector objects back out
            author = div.xpath('./div/a[2]/h2/text()').extract_first()
            # extract_first() is equivalent to extract()[0]
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()

            # store the parsed values (author and content) in an item object
            item = SpiderqiubaiItem()
            item['author'] = author
            item['content'] = content
            # submit the item to the pipeline
            yield item
qiubai.py
import scrapy
class SpiderqiubaiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
items.py
class SpiderqiubaiPipeline(object):
    fp = None

    # called only once, when the spider starts
    def open_spider(self, spider):
        print('spider started')
        self.fp = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    # receives the item objects submitted by the spider and persists the page data stored in them
    # parameters: item is the received Item object
    # executed once for every item the spider file submits to the pipeline
    def process_item(self, item, spider):
        # take the data values stored in the item object
        author = item['author']
        content = item['content']
        # persistent storage
        self.fp.write(author + ":" + content + "\n\n\n")
        return item

    # called only once, when the spider finishes
    def close_spider(self, spider):
        print("spider finished")
        self.fp.close()
pipelines.py
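
Step 4 in the flow above means enabling this pipeline in settings.py. A minimal sketch for just this one pipeline (the spiderqiubai project name is taken from the full ITEM_PIPELINES block at the end of this post):

# settings.py - enable the pipeline; the number is its priority
ITEM_PIPELINES = {
    'spiderqiubai.pipelines.SpiderqiubaiPipeline': 300,
}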

 

Database storage

* Code flow:

  1. Store the parsed page data in an Item object
  2. Use the yield keyword to submit the item to the pipeline file for processing
  3. Write the data storage code in the pipeline file (this time storing into a database)
  4. Enable the pipeline in the configuration file
import pymysql


class SpiderqiubaiPipeline(object):
    conn = None

    # called only once, when the spider starts
    def open_spider(self, spider):
        # connect to the database
        self.conn = pymysql.Connect(host='192.168.1.10', port=3306, user='root', password='cs1993413', db='qiubai')

    # receives the item objects submitted by the spider and persists the page data stored in them
    # parameters: item is the received Item object
    # executed once for every item the spider file submits to the pipeline
    def process_item(self, item, spider):
        # 1. connect to the database (done once in open_spider)
        # 2. execute the sql statement
        sql = 'insert into qiubai values("%s", "%s")' % (item['author'], item['content'])
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            # 3. commit the transaction
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
        return item

    # called only once, when the spider finishes
    def close_spider(self, spider):
        self.conn.close()
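
The insert statement above assumes a qiubai table with two string columns already exists in the qiubai database; the post never shows that schema, so the one-time setup below is only a sketch with assumed column names and types:

# one-time setup sketch: create the table the pipeline inserts into
# (column names and types are assumptions; the post does not show the real schema)
import pymysql

conn = pymysql.Connect(host='192.168.1.10', port=3306, user='root', password='cs1993413', db='qiubai')
cursor = conn.cursor()
cursor.execute('create table if not exists qiubai(author varchar(100), content text)')
conn.commit()
conn.close()

When adapting this, it is also safer to pass the author and content values as the second argument of cursor.execute instead of formatting them into the sql string, so quotes inside the scraped text cannot break the statement.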

Redis storage

import json
import redis


class SpiderqiubaiPipeline(object):
    conn = None

    # called only once, when the spider starts
    def open_spider(self, spider):
        # connect to the database
        self.conn = redis.Redis(host='192.168.1.10', port=6379)

    def process_item(self, item, spider):
        data_dict = {
            'author': item['author'],
            'content': item['content']
        }
        # recent redis-py versions only accept bytes/str/numbers as values,
        # so the dict is serialized to a json string before being pushed
        self.conn.lpush('data', json.dumps(data_dict))
        return item
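
A quick way to check what this pipeline wrote (a sketch reusing the same host and the same 'data' list key as above):

# read back the entries pushed by the pipeline
import json
import redis

conn = redis.Redis(host='192.168.1.10', port=6379)
for raw in conn.lrange('data', 0, -1):
    print(json.loads(raw))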

Pipeline advanced operations

Storing the data locally, in the MySQL database and in Redis at the same time

# write the data values to a local disk file
class QiubaiByFiels(object):
    fp = None

    def open_spider(self, spider):
        print('spider started')
        self.fp = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        self.fp.write(author + ":" + content + "\n\n\n")
        return item

    def close_spider(self, spider):
        print("spider finished")
        self.fp.close()

Writing the data values to the MySQL database

class QiubaiByMysql(object):
    conn = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='192.168.1.10', port=3306, user='root', password='cs1993413', db='qiubai')

    def process_item(self, item, spider):
        sql = 'insert into qiubai values("%s", "%s")' % (item['author'], item['content'])
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)

            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.conn.close()

settings.py

ITEM_PIPELINES = {
   'spiderqiubai.pipelines.SpiderqiubaiPipeline': 300,
   'spiderqiubai.pipelines.QiubaiByMysql': 200,
   'spiderqiubai.pipelines.QiubaiByFiels': 100,
}
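
The numbers in ITEM_PIPELINES are priorities: lower values run first, so each item passes through QiubaiByFiels (100), then QiubaiByMysql (200), then SpiderqiubaiPipeline (300, presumably the Redis version from the previous section, since the file and MySQL classes are listed separately). The chaining only works because every process_item method returns the item, which is what hands it on to the next pipeline.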

 
