Hard disk storage
(1) Based on terminal commands
* Make sure the parse method returns an iterable object (containing the parsed page data)
* Use the terminal command below to write the returned data to a disk file
scrapy crawl <spider name> -o <file name>.<suffix>
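For example, assuming the spider is named qiubai (as in the pipeline example further down), the parsed data can be exported to a JSON file like this:

scrapy crawl qiubai -o qiubai.json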
def parse(self, response):
    # xpath parsing is recommended (the framework integrates an xpath interface)
    div_list = response.xpath('//div[@id="content-left"]/div')
    # holds the parsed page data
    data_list = []
    for div in div_list:
        # xpath returns Selector objects that wrap the matched content
        # the extract() method pulls the stored data value out of the Selector object
        author = div.xpath('./div/a[2]/h2/text()').extract_first()  # extract_first() == extract()[0]
        content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
        data_dict = {
            'author': author,
            'content': content
        }
        data_list.append(data_dict)
    return data_list
(2) Based on pipelines
* items.py: holds the parsed page data
* pipelines.py: handles the persistent storage operations
* Code flow:
- Store the parsed page data in an item object
- Use the yield keyword to submit the item object to the pipeline file
- Write the persistent storage code in the pipeline file
- Enable the pipeline in the settings file (see the sketch after this list)
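A minimal sketch of that last step, assuming the project is named spiderqiubai (the same registration, with more pipelines, appears in the settings.py example at the end of this section); the number is the pipeline's priority, and lower values run first:

ITEM_PIPELINES = {
    'spiderqiubai.pipelines.SpiderqiubaiPipeline': 300,
}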
# spider file
import scrapy
from spiderqiubai.items import SpiderqiubaiItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # xpath parsing is recommended (the framework integrates an xpath interface)
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # xpath returns Selector objects that wrap the matched content
            # the extract() method pulls the stored data value out of the Selector object
            author = div.xpath('./div/a[2]/h2/text()').extract_first()  # extract_first() == extract()[0]
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            # store the parsed values (author and content) in an item object
            item = SpiderqiubaiItem()
            item['author'] = author
            item['content'] = content
            # submit the item object to the pipeline
            yield item
# items.py
import scrapy


class SpiderqiubaiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
# pipelines.py
class SpiderqiubaiPipeline(object):
    fp = None

    # called only once, when the spider starts
    def open_spider(self, spider):
        print('spider started')
        self.fp = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    # receives the item objects submitted by the spider and persists the page data stored in them
    # parameter: item is the received item object
    # called once for every item the spider submits to the pipeline
    def process_item(self, item, spider):
        # take the data values out of the item object
        author = item['author']
        content = item['content']
        # persistent storage
        self.fp.write(author + ":" + content + "\n\n\n")
        return item

    # called only once, when the spider finishes
    def close_spider(self, spider):
        print('spider finished')
        self.fp.close()
Database storage
* Code flow:
- Store the parsed page data in an item object
- Use the yield keyword to submit the item object to the pipeline file
- Write the persistent storage code in the pipeline file (store the data in the database)
- Enable the pipeline in the settings file
import pymysql


class SpiderqiubaiPipeline(object):
    conn = None

    # called only once, when the spider starts
    def open_spider(self, spider):
        # connect to the database
        self.conn = pymysql.Connect(host='192.168.1.10', port=3306, user='root',
                                    password='cs1993413', db='qiubai')

    # receives the item objects submitted by the spider and persists the page data stored in them
    # parameter: item is the received item object
    # called once for every item the spider submits to the pipeline
    def process_item(self, item, spider):
        # 1. take the data values out of the item object and build the sql statement
        sql = 'insert into qiubai values("%s", "%s")' % (item['author'], item['content'])
        self.cursor = self.conn.cursor()
        try:
            # 2. execute the sql statement
            self.cursor.execute(sql)
            # 3. commit the transaction
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
        return item

    # called only once, when the spider finishes
    def close_spider(self, spider):
        self.conn.close()
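The insert statement above assumes a qiubai table with two text columns already exists in the qiubai database. A minimal sketch for creating it with pymysql; the column names author and content are assumptions, since the insert does not name its columns:

import pymysql

conn = pymysql.Connect(host='192.168.1.10', port=3306, user='root',
                       password='cs1993413', db='qiubai')
with conn.cursor() as cursor:
    # two text columns matching the two values the pipeline inserts
    cursor.execute('CREATE TABLE IF NOT EXISTS qiubai (author VARCHAR(255), content TEXT)')
conn.commit()
conn.close()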
Redis storage
import json

import redis


class SpiderqiubaiPipeline(object):
    conn = None

    # called only once, when the spider starts
    def open_spider(self, spider):
        # connect to the database
        self.conn = redis.Redis(host='192.168.1.10', port=6379)

    def process_item(self, item, spider):
        data_dict = {
            'author': item['author'],
            'content': item['content']
        }
        # recent redis-py versions only accept bytes, strings, and numbers, so serialize the dict first
        self.conn.lpush('data', json.dumps(data_dict))
        return item
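A quick way to check the stored records, assuming the same Redis host; the key name 'data' comes from the lpush call in the pipeline above:

import json

import redis

conn = redis.Redis(host='192.168.1.10', port=6379)
# lrange returns every element of the list; each one is the JSON string pushed by the pipeline
for raw in conn.lrange('data', 0, -1):
    print(json.loads(raw))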
Pipeline advanced operations
Store the data to the local disk, the MySQL database, and Redis at the same time. Each pipeline class handles one target, and every process_item must return the item so the pipeline that runs next also receives it.
# store the data values to the local disk
class QiubaiByFiels(object):
    fp = None

    def open_spider(self, spider):
        print('spider started')
        self.fp = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        self.fp.write(author + ":" + content + "\n\n\n")
        return item

    def close_spider(self, spider):
        print('spider finished')
        self.fp.close()
Store the data values in the MySQL database
class QiubaiByMysql(object):
    conn = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='192.168.1.10', port=3306, user='root',
                                    password='cs1993413', db='qiubai')

    def process_item(self, item, spider):
        sql = 'insert into qiubai values("%s", "%s")' % (item['author'], item['content'])
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.conn.close()
settings.py
ITEM_PIPELINES = {
    # lower values mean higher priority: that pipeline's process_item runs first
    'spiderqiubai.pipelines.SpiderqiubaiPipeline': 300,
    'spiderqiubai.pipelines.QiubaiByMysql': 200,
    'spiderqiubai.pipelines.QiubaiByFiels': 100,
}