使用 Scrapy 爬取酷狗音乐歌手及歌曲名并存入 MongoDB 中

备注还没来得及写。共爬取八千多名歌手,按每名歌手平均三十首歌曲算,大概有二十多万首歌曲。

run.py

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 __author__ = 'Zqf'
 4 from dingdian_simple.spiders.dingdian_spider import DingdianSimpleSpider
 5 from scrapy.crawler import CrawlerProcess
 6 from scrapy.utils.project import get_project_settings
 7 
 8 # 获取settings.py模块的设置
 9 settings = get_project_settings()
10 process = CrawlerProcess(settings=settings)
11 
12 # 可以添加多个spider
13 process.crawl(DingdianSimpleSpider)
14 
15 # 启动爬虫,会阻塞,直到爬取完成
16 process.start()

kugou.py

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 import re
 4 
 5 __author__ = 'Zqf'
 6 
 7 import scrapy
 8 from kugoumusic.items import KugoumusicItem
 9 from scrapy.linkextractors import LinkExtractor
10 from scrapy.spiders import Rule
11 
12 
class KugouSpiders(scrapy.spiders.CrawlSpider):
    """Crawl kugou.com: follow singer index pages, scrape each singer's songs.

    Yields one KugoumusicItem per singer page with the singer name and the
    list of song titles shown on that page.
    """
    name = 'kugou'

    start_urls = ['http://www.kugou.com/']

    rules = (
        # Follow the singer index page and its per-letter pagination pages.
        # FIX: use raw strings and escape the dots — in the original patterns
        # '.' matched any character (e.g. '1.html' also matched '1xhtml').
        Rule(LinkExtractor(allow=[r'http://www\.kugou\.com/yy/html/singer\.html',
                                  r'http://www\.kugou\.com/yy/singer/index/\d-([a-z]|null)-1\.html'])),
        # Singer home pages carry the actual data; hand them to parse_item.
        Rule(LinkExtractor(allow=[r'http://www\.kugou\.com/yy/singer/home/\d+\.html']),
             callback='parse_item')
    )

    def parse_item(self, response):
        """Extract singer name and song titles from a singer home page."""
        singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
        songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()

        item = KugoumusicItem()
        item['singer'] = singer
        item['songs'] = songs

        yield item

items.py

 1 # -*- coding: utf-8 -*-
 2 
 3 # Define here the models for your scraped items
 4 #
 5 # See documentation in:
 6 # https://doc.scrapy.org/en/latest/topics/items.html
 7 
 8 import scrapy
 9 
10 
 11 class KugoumusicItem(scrapy.Item):
 12     # singer: the artist's display name (a single string, may be None)
 13     # songs: list of song-title strings scraped from the artist page
 14     singer = scrapy.Field()
 15     songs = scrapy.Field()

pipelines.py

 1 # -*- coding: utf-8 -*-
 2 
 3 # Define your item pipelines here
 4 #
 5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 from pymongo import MongoClient
 8 
 9 
class KugoumusicPipeline(object):
    """Buffer scraped items and bulk-insert them into MongoDB in batches.

    Items accumulate in an in-memory list and are written with insert_many
    once the batch threshold is reached; the remainder is flushed when the
    spider closes.
    """

    BATCH_SIZE = 100  # flush to MongoDB once this many items are buffered

    def open_spider(self, spider):
        """Connect once per crawl; data goes to student_db.kugou."""
        self.client = MongoClient(host='127.0.0.1', port=27017)
        self.coll = self.client['student_db']['kugou']
        self.li = []

    def close_spider(self, spider):
        """Flush any remaining buffered items, then disconnect."""
        self.insert()
        self.client.close()

    def insert(self):
        """Bulk-insert the buffer and clear it.

        FIX: guard against an empty buffer — insert_many raises
        InvalidOperation when given an empty document list, which the
        original code could hit in close_spider.
        """
        if self.li:
            self.coll.insert_many(self.li)
            self.li = []

    def process_item(self, item, spider):
        """Buffer the item; flush when the batch threshold is reached.

        FIX: the original only appended in the else-branch, so the item
        that triggered a flush was silently dropped — losing one record
        per hundred. Append unconditionally, then flush.
        """
        self.li.append(dict(item))
        if len(self.li) >= self.BATCH_SIZE:
            self.insert()
            print("成功插入100条数据-------------------------------------")
        return item

settings.py

  1 # -*- coding: utf-8 -*-
  2 
  3 # Scrapy settings for kugoumusic project
  4 #
  5 # For simplicity, this file contains only settings considered important or
  6 # commonly used. You can find more settings consulting the documentation:
  7 #
  8 #     https://doc.scrapy.org/en/latest/topics/settings.html
  9 #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 10 #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 11 
 12 BOT_NAME = 'kugoumusic'
 13 
 14 SPIDER_MODULES = ['kugoumusic.spiders']
 15 NEWSPIDER_MODULE = 'kugoumusic.spiders'
 16 
 17 # MONGO_CONFIG = ['192.168.62.35:1806, '
 18 #               '192.168.62.240:1806, '
 19 #               '192.168.62.23:1806, '
 20 #               '192.168.62.32:1806, '
 21 #               '192.168.62.25:1806, '
 22 #               '192.168.62.28:1806, '
 23 #               '192.168.62.241:1806']
 24 
 25 # MONGO_CONFIG = {
 26 #     'host': '127.0.0.1',
 27 #     'port': 27017
 28     # 'user': 'root',
 29     # 'password': '123456',
 30     # 'db': 's1806',
 31     # 'charset': 'utf8'
 32 # }
 33 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 34 #USER_AGENT = 'kugoumusic (+http://www.yourdomain.com)'
 35 
 36 # Obey robots.txt rules (disabled here; robots middleware would otherwise filter requests)
 37 ROBOTSTXT_OBEY = False
 38 
 39 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 40 #CONCURRENT_REQUESTS = 32
 41 
 42 # Configure a delay for requests for the same website (default: 0)
 43 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 44 # See also autothrottle settings and docs
 45 #DOWNLOAD_DELAY = 3
 46 # The download delay setting will honor only one of:
 47 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 48 #CONCURRENT_REQUESTS_PER_IP = 16
 49 
 50 # Disable cookies (enabled by default)
 51 #COOKIES_ENABLED = False
 52 
 53 # Disable Telnet Console (enabled by default)
 54 #TELNETCONSOLE_ENABLED = False
 55 
 56 # Override the default request headers: browser-like Chrome UA and Accept headers
 57 DEFAULT_REQUEST_HEADERS = {
 58     'Connection': 'keep-alive',
 59     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
 60     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
 61     'Accept-Encoding': 'gzip, deflate, br',
 62     'Accept-Language': 'zh-CN,zh;q=0.9',
 63 }
 64 
 65 # Enable or disable spider middlewares
 66 # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 67 #SPIDER_MIDDLEWARES = {
 68 #    'kugoumusic.middlewares.KugoumusicSpiderMiddleware': 543,
 69 #}
 70 
 71 # Enable or disable downloader middlewares
 72 # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 73 #DOWNLOADER_MIDDLEWARES = {
 74 #    'kugoumusic.middlewares.KugoumusicDownloaderMiddleware': 543,
 75 #}
 76 
 77 # Enable or disable extensions
 78 # See https://doc.scrapy.org/en/latest/topics/extensions.html
 79 #EXTENSIONS = {
 80 #    'scrapy.extensions.telnet.TelnetConsole': None,
 81 #}
 82 
 83 # Configure item pipelines: KugoumusicPipeline buffers items and bulk-inserts them into MongoDB
 84 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 85 ITEM_PIPELINES = {
 86    'kugoumusic.pipelines.KugoumusicPipeline': 300,
 87 }
 88 
 89 # Enable and configure the AutoThrottle extension (disabled by default)
 90 # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
 91 #AUTOTHROTTLE_ENABLED = True
 92 # The initial download delay
 93 #AUTOTHROTTLE_START_DELAY = 5
 94 # The maximum download delay to be set in case of high latencies
 95 #AUTOTHROTTLE_MAX_DELAY = 60
 96 # The average number of requests Scrapy should be sending in parallel to
 97 # each remote server
 98 #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 99 # Enable showing throttling stats for every response received:
100 #AUTOTHROTTLE_DEBUG = False
101 
102 # Enable and configure HTTP caching (disabled by default)
103 # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
104 #HTTPCACHE_ENABLED = True
105 #HTTPCACHE_EXPIRATION_SECS = 0
106 #HTTPCACHE_DIR = 'httpcache'
107 #HTTPCACHE_IGNORE_HTTP_CODES = []
108 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

猜你喜欢

转载自www.cnblogs.com/tttzqf/p/9638545.html