Crawling jobbole with Scrapy

\ArticleSpider\spiders\jobbole.py (the spider code)

# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse
from ..items import JobboleArticleItem,ArticleItemLoader
from ..utils.common import get_md5
import datetime
from scrapy.loader import ItemLoader


class JobbleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):

        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url,post_url),meta={"front_image_url":image_url},callback=self.parse_detail)

        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)


    def parse_detail(self, response):
        # The two commented-out blocks below are earlier versions that filled the
        # item field by field, first with XPath selectors and then with CSS selectors.
        # article_item = JobboleArticleItem()
        # title = response.xpath('//*[@id="post-112048"]/div[1]/h1/text()').extract_first()
        # create_date = response.xpath('//*[@id="post-112048"]/div[2]/p/text()[1]').extract()[0].strip().replace('.','')
        # praise_nums = response.xpath('//span[contains(@class," btn-bluet-bigger href-style vote-post-up   register-user-only ")]/h10/text()').extract()[0]
        # fav_nums =  response.xpath('//span[contains(@class,"btn-bluet-bigger href-style bookmark-btn  register-user-only ")]/text()').extract()[0]
        # match_re = re.match(".*(\d+).*",fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums =  response.xpath('//span[contains(@class,"btn-bluet-bigger href-style hide-on-480")]/text()').extract()[0]
        # match_re = re.match(".*(\d+).*",comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        # content =  response.xpath('//div[@class="entry"]').extract()[0]
        # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)

        # CSS selector version
        # front_image_url = response.meta.get("front_image_url","")
        # title =  response.css('.entry-header h1::text').extract()[0]
        # create_date =  response.css('p.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('.','')
        # praise_nums = response.css('.vote-post-up h10::text').extract()[0]
        # fav_nums = response.css('.bookmark-btn::text').extract()[0]
        # match_re = re.match(".*(\d+).*",fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums =  response.css('a[href="#article-comment"] span::text').extract()[0]
        # match_re = re.match(".*(\d+).*",comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        # content = response.css('div.entry').extract()[0]
        # tags = response.css('p.entry-meta-hide-on-mobile a::text').extract()[0]
        # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        #
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["title"] = title
        # article_item["url"] = response.url
        # try:
        #     create_date = datetime.datetime.strptime(create_date,"%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content


        # load the item via an ItemLoader
        front_image_url = response.meta.get("front_image_url","")
        item_loader = ArticleItemLoader(item=JobboleArticleItem(),response=response)
        item_loader.add_css("title",".entry-header h1::text")
        item_loader.add_value("url",response.url)
        item_loader.add_value("url_object_id",get_md5(response.url))
        item_loader.add_css("create_date","p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url",[front_image_url])
        item_loader.add_css("praise_nums",".vote-post-up h10::text")
        item_loader.add_css("comment_nums",'a[href="#article-comment"] span::text')
        item_loader.add_css("fav_nums",".bookmark-btn::text")
        item_loader.add_css("tags","p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content","div.entry")

        article_item = item_loader.load_item()
        yield article_item
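
A quick aside on the parse method above: parse.urljoin is used because the post links extracted from the list page may be relative paths; it resolves relative URLs against the current page and leaves absolute URLs untouched. A minimal standalone check (the article URL is just a sample):

from urllib import parse

# Absolute URLs pass through unchanged...
print(parse.urljoin("http://blog.jobbole.com/all-posts/", "http://blog.jobbole.com/112048/"))
# http://blog.jobbole.com/112048/

# ...while relative paths are resolved against the current page.
print(parse.urljoin("http://blog.jobbole.com/all-posts/", "/112048/"))
# http://blog.jobbole.com/112048/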

\ArticleSpider\utils\common.py

import hashlib

def get_md5(url):
    if isinstance(url,str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()

if __name__=="__main__":
    print(get_md5("http://blog.jobbole.com".encode("utf-8")))
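
get_md5 always returns a 32-character hex digest for a given URL, which is why it is used as the fixed-width url_object_id that uniquely identifies an article. For example (the article URL here is only an illustration):

print(get_md5("http://blog.jobbole.com/112048/"))       # 32-character hex string
print(len(get_md5("http://blog.jobbole.com/112048/")))  # 32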

\ArticleSpider\items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import MapCompose,TakeFirst,Join

import datetime
from scrapy.loader import ItemLoader
import re

class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    pass


# helper that appends a suffix to a value; not used by the item definitions below
def add_jobbole(value):
    return value + "-jhy"

def date_convert(value):
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    # non-greedy match; a greedy ".*(\d+)" would only capture the last digit of "12"
    match_re = re.match(r".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums

def remove_comment_tags(value):
    # drop the "N 评论" entry that gets extracted together with the tags
    if "评论" in value:
        return ""
    else:
        return value


def return_value(value):
    # identity processor: overrides the default TakeFirst so the value stays a list
    return value


class ArticleItemLoader(ItemLoader):
    # custom ItemLoader: keep only the first extracted value by default
    default_output_processor = TakeFirst()

class JobboleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert)
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        # keep the URL list intact; ImagesPipeline expects a list of URLs
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()
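
To see how these processors combine, here is a minimal, self-contained sketch (not part of the project): the input processor declared in the Field metadata runs on every extracted value, and the loader's default_output_processor (TakeFirst) then keeps only the first one.

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst

class DemoItem(scrapy.Item):
    nums = scrapy.Field(input_processor=MapCompose(str.strip, int))

class DemoLoader(ItemLoader):
    default_output_processor = TakeFirst()

loader = DemoLoader(item=DemoItem())
loader.add_value("nums", [" 3 ", " 7 "])  # MapCompose strips and converts each value
print(loader.load_item())                  # {'nums': 3} -- TakeFirst keeps the first value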



\ArticleSpider\pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.images import ImagesPipeline
import codecs
import json
from scrapy.exporters import JsonItemExporter
import MySQLdb
from twisted.enterprise import adbapi
import MySQLdb.cursors

class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class JsonWithEncodingPipeline(object):
    # custom JSON export: write one JSON line per item
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        self.file.close()

Storing to MySQL synchronously

class MysqlPipeline(object):
    # synchronous insert: blocks the Scrapy engine while MySQL executes
    def __init__(self):
        self.conn = MySQLdb.connect('localhost', 'root', 'jhy', 'article_spider', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item
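
The insert above assumes a jobbole_article table already exists; the original post does not show its schema. A plausible minimal definition covering just the columns used here (column names match the insert, types are assumptions) could be created once like this:

import MySQLdb

# Hypothetical schema -- only the four columns the pipelines actually insert.
create_sql = """
CREATE TABLE IF NOT EXISTS jobbole_article (
    title       VARCHAR(200) NOT NULL,
    url         VARCHAR(300) NOT NULL,
    create_date DATE,
    fav_nums    INT DEFAULT 0
) ENGINE=InnoDB DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect('localhost', 'root', 'jhy', 'article_spider', charset="utf8", use_unicode=True)
conn.cursor().execute(create_sql)
conn.commit()
conn.close()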

Storing to MySQL asynchronously

class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # connection parameters come from settings.py (the MYSQL_* values below)
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # use Twisted to turn the MySQL insert into an asynchronous operation
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # handle insert errors
        return item

    def handle_error(self, failure):
        print(failure)

    def do_insert(self, cursor, item):
        # run the actual insert inside the thread pool
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))
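
A small optional refinement (not in the original post): Twisted passes any extra addErrback arguments to the errback after the Failure, so forwarding the item and spider makes failed inserts much easier to trace. The two methods below would replace their counterparts in MysqlTwistedPipline:

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # extra args are forwarded
        return item

    def handle_error(self, failure, item, spider):
        # log which item failed instead of just printing the bare Failure
        spider.logger.error("MySQL insert failed for %s: %s", item.get("url"), failure)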

class JsonExporterPipleLine(object):
    # export items to a JSON file with scrapy's built-in JsonItemExporter
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        if "front_image_url" in item:
            for ok, value in results:
                image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item
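
One caveat with item_completed as written: if the cover image fails to download, results contains no successful entry, image_file_path is never assigned, and the next line raises an UnboundLocalError. A defensive variant (an assumption, not the author's code) that simply skips the path in that case:

class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        if "front_image_url" in item:
            # keep only the successfully downloaded images
            image_paths = [value["path"] for ok, value in results if ok]
            if image_paths:
                item["front_image_path"] = image_paths[0]
        return item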


\ArticleSpider\settings.py

# -*- coding: utf-8 -*-

import os

# Scrapy settings for ArticleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ArticleSpider'

SPIDER_MODULES = ['ArticleSpider.spiders']
NEWSPIDER_MODULE = 'ArticleSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'ArticleSpider.pipelines.JsonExporterPipleLine': 2,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.ArticleImagePipeline': 1,
    'ArticleSpider.pipelines.MysqlTwistedPipline': 1,
}
IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, "images")

IMAGES_MIN_HEIGHT = 100
IMAGES_MIN_WIDTH = 100

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

MYSQL_HOST = "localhost"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "jhy"




\main.py

from scrapy.cmdline import execute
import sys
import os

# add the project root to sys.path so the spider can be launched/debugged from an IDE
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])


Reposted from blog.csdn.net/jiahonhyu0609/article/details/79573017