Incremental crawling

When browsing related websites, we find that some sites regularly add batches of new data pages on top of their existing data. For example, a movie site may publish a number of recently popular films, and a novel site may publish the latest chapters as the author writes them. When we run into this kind of situation while crawling, shouldn't we be able to make the program crawl, on a regular schedule, only the data the site has recently added?

1. Incremental crawlers

  • Concept: a crawler that monitors a website for data updates, so that it can crawl the newly added data on that site.

  • How incremental crawling works:

    • Before sending a request, check whether this URL has already been crawled

    • After parsing the content, check whether this part has already been crawled

    • Before writing to the storage medium, check whether the content already exists in the medium

      • Analysis:

        It is not hard to see that the core of incremental crawling is de-duplication. At which step the de-duplication takes effect is a trade-off, and each option has its own pros and cons. In my opinion, the first two ideas should be chosen (and possibly combined) according to the actual situation. The first idea suits sites where new pages keep appearing, such as new chapters of a novel or the latest daily news; the second approach suits pages whose existing content gets updated in place. The third idea acts as the last line of defense and achieves de-duplication to the greatest extent.

  • De-duplication methods

    • Store the URLs generated during crawling in a Redis set. Before the next crawl initiates a request for a URL, check it against the set of stored URLs: if it is not there, send the request; otherwise skip it.

    • Build a unique identifier for the crawled page content and store that identifier in a Redis set. On the next crawl, before persisting newly parsed data, first check whether its unique identifier already exists in the Redis set, and then decide whether to persist it. A minimal sketch of both methods follows this list.
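
A minimal, framework-independent sketch of both ideas, assuming a local Redis server on 127.0.0.1:6379; the helper names url_seen and record_seen and the set names urls and data_id are illustrative only, not taken from the post.

 import hashlib
 from redis import Redis

 conn = Redis(host='127.0.0.1', port=6379)

 def url_seen(url):
     # sadd returns 1 if the url was newly added, 0 if it was already in the set
     return conn.sadd('urls', url) == 0

 def record_seen(author, content):
     # fingerprint the record content, then apply the same set trick to the hash value
     fingerprint = hashlib.sha256((author + content).encode()).hexdigest()
     return conn.sadd('data_id', fingerprint) == 0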

2. Projects

- Demand: crawl all the movie detail data from the 4567tv website.

Spider file:

 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 from redis import Redis
 from incrementPro.items import IncrementproItem
 class MovieSpider(CrawlSpider):
     name = 'movie'
     # allowed_domains = ['www.xxx.com']
     start_urls = ['http://www.4567tv.tv/frim/index7-11.html']
     rules = (
         Rule(LinkExtractor(allow=r'/frim/index7-\d+\.html'), callback='parse_item', follow=True),
     )
     # Create a Redis connection object
     conn = Redis(host='127.0.0.1',port=6379)
     def parse_item(self, response):
         li_list = response.xpath('//li[@class="p1 m1"]')
         for li in li_list:
             # Get the detail page url
             detail_url = 'http://www.4567tv.tv'+li.xpath('./a/@href').extract_first()
             # Try to add the detail page url to the Redis set
             ex = self.conn.sadd('urls',detail_url)
             if ex == 1:
                 print('this url has not been crawled before, its data can be crawled')
                 yield scrapy.Request(url=detail_url,callback=self.parse_detail)
             else:
                 print('the data has not been updated, there is no new data to crawl!')
     # Parse the movie name and kind from the detail page, then persist them
     def parse_detail(self,response):
         item = IncrementproItem()
         item['name'] = response.xpath('//dt[@class="name"]/text()').extract_first()
         item['kind'] = response.xpath('//div[@class="ct-c"]/dl/dt[4]//text()').extract()
         item['kind'] = ''.join(item['kind'])
         yield item
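
The items file for this project is not shown in the post; judging from the fields the spider assigns (name and kind), it presumably looks like the sketch below.

 # Presumed items.py for incrementPro (not shown in the original post);
 # the fields match what the spider assigns.
 import scrapy

 class IncrementproItem(scrapy.Item):
     name = scrapy.Field()   # movie name
     kind = scrapy.Field()   # movie kind / genre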

Pipeline file:

 import json
 from redis import Redis
 class IncrementproPipeline(object):
     conn = None
     def open_spider(self,spider):
         self.conn = Redis(host='127.0.0.1',port=6379)
     def process_item(self, item, spider):
         dic = {
             'name':item['name'],
             'kind':item['kind']
         }
         print(dic)
         # serialize the dict before pushing: newer redis-py versions cannot push a dict directly
         self.conn.lpush('movieData',json.dumps(dic))
         return item
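
For the pipeline to run, it also has to be enabled in the project's settings.py. A sketch, assuming the default layout generated by scrapy startproject (the module path incrementPro.pipelines is that assumption):

 # settings.py
 ITEM_PIPELINES = {
     'incrementPro.pipelines.IncrementproPipeline': 300,
 }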

- Demand: crawl the joke content and author data from the Qiushibaike website.

Spider file:

 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 from incrementByDataPro.items import IncrementbydataproItem
 from redis import Redis
 import hashlib
 class QiubaiSpider(CrawlSpider):
     name = 'qiubai'
     # allowed_domains = ['www.xxx.com']
     start_urls = ['https://www.qiushibaike.com/text/']
     rules = (
         Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
         Rule(LinkExtractor(allow=r'/text/$'), callback='parse_item', follow=True),
     )
     # Create a Redis connection object
     conn = Redis(host='127.0.0.1',port=6379)
     def parse_item(self, response):
         div_list = response.xpath('//div[@id="content-left"]/div')
         for div in div_list:
             item = IncrementbydataproItem()
             item['author'] = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
             item['content'] = div.xpath('.//div[@class="content"]/span/text()').extract_first()
             # Generate a unique identifier from the parsed data, to be stored in Redis
             source = item['author']+item['content']
             source_id = hashlib.sha256(source.encode()).hexdigest()
             # Store the unique identifier of the parsed content in the Redis set data_id
             ex = self.conn.sadd('data_id',source_id)
             if ex == 1:
                 print('this piece of data has not been crawled before, it will be crawled......')
                 yield item
             else:
                 print('this piece of data has already been crawled, no need to crawl it again!')
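
This project's items file is not shown either; based on the fields the spider assigns, a sketch of it would be:

 # Presumed items.py for incrementByDataPro (not shown in the original post)
 import scrapy

 class IncrementbydataproItem(scrapy.Item):
     author = scrapy.Field()
     content = scrapy.Field()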

Pipeline file:

 import json
 from redis import Redis
 class IncrementbydataproPipeline(object):
     conn = None
     def open_spider(self, spider):
         self.conn = Redis(host='127.0.0.1', port=6379)
     def process_item(self, item, spider):
         dic = {
             'author': item['author'],
             'content': item['content']
         }
         # print(dic)
         # serialize the dict before pushing: newer redis-py versions cannot push a dict directly
         self.conn.lpush('qiubaiData', json.dumps(dic))
         return item

Example 3 (URL de-duplication)

  • Spider file

 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 from redis import Redis  # used to connect to the Redis database
 from increment1_Pro.items import Increment1ProItem
 class MovieSpider(CrawlSpider):
     name = 'movie'
     # allowed_domains = ['www.xxx.com']
     start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']
     rules = (
         Rule(LinkExtractor(allow=r'/index.php/vod/show/id/7/page/\d+\.html'), callback='parse_item', follow=True),
     )
     def parse_item(self, response):
         # Connection object
         conn = Redis(host='127.0.0.1',port=6379)
         detail_url_list = ['https://www.4567tv.tv'+href for href in response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]/div/a/@href').extract()]
         # Before sending a request, check whether the url has already been crawled; the approach is to keep the urls in the database
         # Store each url in a Redis set: if the insert succeeds, sadd returns 1
         for url in detail_url_list:
             # ex == 1: the set did not contain this url yet
             ex = conn.sadd('movies_url', url)  # sadd adds one or more members to a Redis set
             # If ex equals 1, the url was not in the set before, so it is safe to request it
             if ex == 1:
                 yield scrapy.Request(url=url,callback=self.parse_detail)
             else:
                 print('the site has not updated its data, there is no new data to crawl!')
     def parse_detail(self,response):
         item = Increment1ProItem()
         # Assign the parsed movie name to the item's name field
         item['name'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
         item['actor'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()').extract_first()
         yield item
  • item

 import scrapy
 class Increment1ProItem(scrapy.Item):
     # define the fields for your item here like:
     name = scrapy.Field()
     actor = scrapy.Field()
  • pipeline

 import json
 from redis import Redis
 class Increment1ProPipeline(object):
     conn = None
     # Connection object
     def open_spider(self,spider):
         self.conn = Redis(host='127.0.0.1',port=6379)
     # The data item is stored in a database
     def process_item(self, item, spider):
         # Wrap the two values from the item into a dictionary
         dic = {
             'name': item['name'],   # movie name
             'actor': item['actor']  # actor
         }
         print('new data has been crawled, storing it......')
         # the queue is named 'movie_data'
         # push the dict (serialized), not the raw item: newer redis-py versions cannot push a dict directly
         self.conn.lpush('movie_data',json.dumps(dic))
         return item
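
After a crawl, the stored results can be checked with a few lines of Python (a sketch, assuming the same local Redis instance; the key names match the code above):

 from redis import Redis

 conn = Redis(host='127.0.0.1', port=6379)
 print(conn.smembers('movies_url'))       # urls that have already been crawled
 print(conn.lrange('movie_data', 0, -1))  # stored movie records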

Example 4 (data de-duplication)

  • Spider file

 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 from increment2_Pro.items import Increment2ProItem
 from redis import Redis
 import hashlib
 class QiubaiSpider(CrawlSpider):
     name = 'qiubai'
     # allowed_domains = ['www.xxx.com']
     start_urls = ['https://www.qiushibaike.com/text/']
     rules = (
         Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
     )
     def parse_item(self, response):
         div_list = response.xpath('//div[@class="article block untagged mb15 typs_hot"]')
         conn = Redis(host='127.0.0.1',port=6379)
         for div in div_list:
             item = Increment2ProItem()
             item['content'] = div.xpath('.//div[@class="content"]/span//text()').extract()
             # Join the list into a single string
             item['content'] = ''.join(item['content'])
             # the second xpath handles posts from anonymous users
             item['author'] = div.xpath('./div/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
             source = item['author']+item['content']
             # Build our own fingerprint for this piece of data
             # encode() turns the string into bytes, because hashlib only accepts bytes
             hashValue = hashlib.sha256(source.encode()).hexdigest()
             # Store the fingerprint in a Redis set named qiubai_hash
             ex = conn.sadd('qiubai_hash',hashValue)
             if ex == 1:
                 yield item
             else:
                 print('there is no updated data to crawl!')
  • item

 import scrapy
 class Increment2ProItem(scrapy.Item):
     # define the fields for your item here like:
     content = scrapy.Field()
     author = scrapy.Field()
  • pipeline

 import json
 from redis import Redis
 class Increment2ProPipeline(object):
     conn = None
     def open_spider(self,spider):
         self.conn = Redis(host='127.0.0.1',port=6379)
     def process_item(self, item, spider):
         # The item cannot be stored into Redis directly, so wrap its values into a dictionary
         dic = {
             'author':item['author'],
             'content':item['content']
         }
         # the queue is named 'qiubaiData'
         # push the dict (serialized), not the item directly: newer redis-py versions cannot push a dict
         self.conn.lpush('qiubaiData', json.dumps(dic))
         print('a piece of data has been crawled and is being stored......')
         return item
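
One caveat with the fingerprint approach used in both qiubai examples: the author xpath may not match every post, so item['author'] can be None, in which case the string concatenation in the spider raises a TypeError. A defensive variant of the fingerprint computation (a sketch, not part of the original post):

 import hashlib

 def fingerprint(author, content):
     # fall back to an empty string when a field is missing so the hash never fails on None
     source = (author or '') + (content or '')
     return hashlib.sha256(source.encode()).hexdigest()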
