Scrapy in Practice: Storing Data in MySQL

Crawl the data from my own CSDN blog (https://blog.csdn.net/lixinkuan328/article/list/1) and save it in MySQL.

  • Use the Scrapy commands to generate the project scaffold and the spider class:

scrapy startproject csdn

scrapy genspider csdnspider blog.csdn.net
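
These two commands produce the standard Scrapy layout, roughly:

csdn/
├── scrapy.cfg
└── csdn/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── csdnspider.py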

Coding steps

1. items.py

# -*- coding: utf-8 -*-

import scrapy

class CsdnItem(scrapy.Item):
    # article title
    name = scrapy.Field()
    # article URL
    csdnUrl = scrapy.Field()
    # content summary
    content = scrapy.Field()
    # publish date
    creatTime = scrapy.Field()
    # read count
    readNum = scrapy.Field()
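
A scrapy.Item behaves like a dict, which is exactly what the pipeline below relies on. A quick sketch (the values here are made up for illustration):

from csdn.items import CsdnItem

item = CsdnItem()
item['name'] = 'Scrapy in practice'
item['readNum'] = '1024'
print(item['name'])   # field access works like a dict lookup
print(dict(item))     # and it converts cleanly to a plain dict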

2. spiders/csdnspider.py

# -*- coding: utf-8 -*-
import scrapy
from csdn.items import CsdnItem

class CsdnspiderSpider(scrapy.Spider):
    name = 'csdnspider'
    allowed_domains = ['blog.csdn.net']
    offset = 1
    url = "https://blog.csdn.net/lixinkuan328/article/list/"
    start_urls = (
        url + str(offset),
    )

    def parse(self, response):
        articles = response.xpath("//div[@class='article-item-box csdn-tracking-statistics']")

        for each in articles:
            # Create a fresh item per article so earlier yields are not overwritten.
            item = CsdnItem()
            # The title <a> mixes several text nodes; index 2 is the visible title in this layout.
            item['name'] = each.xpath("./h4/a//text()").extract()[2].strip()
            item['csdnUrl'] = each.xpath("./h4/a/@href").extract()[0].strip()
            item['content'] = each.xpath("./p[@class='content']/a/text()").extract()[0].strip()
            item['creatTime'] = each.xpath("./div//span[@class='date']/text()").extract()[0].strip()
            item['readNum'] = each.xpath("./div//span[@class='num']/text()").extract()[0].strip()
            yield item

        # Follow the next list page; this blog had 13 pages at the time of writing.
        if self.offset < 13:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
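
The XPath expressions above are tied to CSDN's page markup at the time of writing and will break whenever the layout changes; scrapy shell is the quickest way to re-verify them:

scrapy shell "https://blog.csdn.net/lixinkuan328/article/list/1"
>>> articles = response.xpath("//div[@class='article-item-box csdn-tracking-statistics']")
>>> len(articles)                                # should be > 0 if the selector still matches
>>> articles[0].xpath("./h4/a//text()").extract()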

3. pipelines.py

# -*- coding: utf-8 -*-
import pymysql

class CsdnPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='192.168.18.102',
            port=3306,
            db='lxk',
            user='root',
            password='123',
            charset='utf8',
            use_unicode=True
        )
        # Cursor for executing SQL statements.
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Plain pymysql usage; with charset='utf8' the driver encodes
        # Python str values itself, so no manual .encode() is needed.
        self.cursor.execute(
            """insert into csdn(name, csdnurl, content, creattime, readnum)
               values (%s, %s, %s, %s, %s)""",
            (item['name'], item['csdnUrl'], item['content'],
             item['creatTime'], item['readNum']))
        # Commit the transaction.
        self.connect.commit()
        # A pipeline must return the item so later pipelines can process it.
        return item

    def close_spider(self, spider):
        # Release the connection when the spider finishes.
        self.cursor.close()
        self.connect.close()
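
The pipeline assumes a csdn table already exists in the lxk database. A minimal sketch to create it with pymysql (the column types are my assumptions, not from the original post; adjust to your data):

import pymysql

conn = pymysql.connect(host='192.168.18.102', port=3306, db='lxk',
                       user='root', password='123', charset='utf8')
try:
    with conn.cursor() as cursor:
        # Assumed schema matching the five fields the pipeline inserts.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS csdn (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255),
                csdnurl VARCHAR(255),
                content TEXT,
                creattime VARCHAR(64),
                readnum VARCHAR(32)
            ) DEFAULT CHARSET=utf8
        """)
    conn.commit()
finally:
    conn.close()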

4. settings.py

ITEM_PIPELINES = {
    'csdn.pipelines.CsdnPipeline': 300,
}
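
Depending on the target site, the defaults may not be enough. These are commonly needed additions (the values are assumptions, not from the original post; tune them yourself):

# Assumed additions:
ROBOTSTXT_OBEY = False    # skip robots.txt if it blocks the list pages
USER_AGENT = 'Mozilla/5.0'  # placeholder; a full browser UA string helps avoid 403s
DOWNLOAD_DELAY = 1        # throttle requests to be polite to the server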

5. Launcher script start.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from scrapy import cmdline
cmdline.execute("scrapy crawl csdnspider".split())
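
start.py simply invokes the Scrapy CLI in-process; running scrapy crawl csdnspider from the project root is equivalent, but a launcher script is convenient for IDE debugging.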

Verify the result as follows:
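
The original post showed the result as a screenshot, which is not reproduced here. A quick way to check that rows actually landed in MySQL (same connection parameters as the pipeline):

import pymysql

conn = pymysql.connect(host='192.168.18.102', port=3306, db='lxk',
                       user='root', password='123', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM csdn")
    print("rows stored:", cursor.fetchone()[0])
conn.close()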



Reposted from blog.csdn.net/lixinkuan328/article/details/104361936