[Python Crawler] Scraping NetEase Domestic News with the Scrapy Framework

The launcher file, main.py

from scrapy.cmdline import execute

execute('scrapy crawl wangyi'.split())
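
If main.py is launched from a directory other than the project root, Scrapy may not find scrapy.cfg. A minimal variant of the launcher (assuming main.py sits next to scrapy.cfg) that first switches to its own directory:

import os
from scrapy.cmdline import execute

# Change to the directory that holds main.py (and scrapy.cfg) so the
# script can be started from anywhere
os.chdir(os.path.dirname(os.path.abspath(__file__)))
execute('scrapy crawl wangyi'.split())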

The spider file under the spiders directory

# -*- coding: utf-8 -*-
import scrapy, re
from ..piaot import *       # import the author's custom helper package (provides pa())
from ..items import wangye_mysql


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    allowed_domains = ['news.163.com']

    # Initialization: build the start requests
    def start_requests(self):

        # Ask for the number of pages to crawl
        ye = int(input('Number of pages: '))

        # Loop over the requested pages
        for i in range(1,ye+1):

            if i == 1:

                url='http://temp.163.com/special/00804KVA/cm_guonei.js?callback=data_callback'

            else:

                if i <10:

                    i='0'+str(i)

                url='http://temp.163.com/special/00804KVA/cm_guonei_{}.js?callback=data_callback'.format(i)

            # Add request headers
            form={
                "User-Agent":pa(),
            }

            # Send a GET request for the list page
            req=scrapy.Request(url=url,callback=self.nryemian,headers=form)

            # Yield each request; Scrapy calls nryemian() with the response
            yield req

    # Walk through the returned list data and pull out the URL of each article page
    def nryemian(self,response):

        # The response body is GBK-encoded, so decode it with GBK (if you inspect the reported encoding it may show up as cp1252)
        html = response.body.decode('GBK')

        # The response has the form 'data_callback([{...}])'; strip the callback name so that only '([{...}])' is left
        req = html.replace("data_callback", "")

        # Evaluate the remaining literal into a Python list of dicts
        req = eval(req)

        # Loop over the parsed entries
        for i in req:

            # Take out the article URL we need
            nr_url = i['docurl']

            # Add request headers
            form = {
                "User-Agent": pa(),
            }
            # Send a GET request for the article page
            nr = scrapy.Request(url=nr_url, callback=self.parse,headers=form)

            # Hand the response over to parse()
            yield nr


    def parse(self, response):
        # Instantiate the wangye_mysql item class defined in items.py
        mysql = wangye_mysql()

        # Use XPath to pull out the title
        name = response.xpath('//*[@id="epContentLeft"]/h1/text()').extract_first()

        # Use XPath to pull out the article body paragraphs
        nr = response.xpath('//*[@id="endText"]/p/text()').extract()
        a = ''
        # Walk through the paragraphs and strip whitespace and punctuation (\n, \r, \t and the like)
        for i in nr:
            ll = re.sub(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——【】~@#¥%&*]+", "", i)
            if ll != '':
                a += ll

        mysql['name'] = name
        mysql['nr'] = a

        # Send the populated item on to pipelines.py
        yield mysql
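
eval() will execute any Python expression that comes back in the response, so a safer drop-in for the parsing step in nryemian() is ast.literal_eval, which only accepts plain literals. A minimal sketch, assuming the same data_callback([...]) payload format as above:

import ast

def parse_data_callback(text):
    # Same stripping step as in nryemian(), but parsed with ast.literal_eval
    # so arbitrary code embedded in the response cannot run
    return ast.literal_eval(text.replace("data_callback", ""))

# parse_data_callback('data_callback([{"docurl": "http://news.163.com/a.html"}])')
# -> [{'docurl': 'http://news.163.com/a.html'}]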

The items.py file

import scrapy


# A custom item class named wangye_mysql
class wangye_mysql(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()   # article title
    nr = scrapy.Field()     # article body
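
Scrapy items behave like dicts, which is why the spider writes mysql['name'] / mysql['nr'] and the pipeline later reads item['name'] / item['nr']. For example (hypothetical values):

item = wangye_mysql(name='some title', nr='some body text')
print(item['name'])   # 'some title'
print(dict(item))     # {'name': 'some title', 'nr': 'some body text'}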

The settings.py configuration file

Only the settings that were changed are listed below; anything not mentioned was left at its default.

# Import the custom helper package (piaot.py sits inside the project package,
# so from settings.py it needs a single-dot relative import)
from .piaot import *

# Whether to obey the site's robots.txt rules
# Change ROBOTSTXT_OBEY to False
ROBOTSTXT_OBEY = False

# Enable the default request headers
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': pa(),     # pa() is the User-Agent header helper imported from piaot
}

ITEM_PIPELINES = {
  'wanyi.pipelines.Mysql': 300,
}
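
The piaot helper package itself is never shown in the post; judging from how it is used, pa() returns a User-Agent string for the request headers. A hypothetical stand-in (the real piaot may well differ) could look like this:

# piaot.py -- hypothetical stand-in, not the author's actual module
import random

_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
]

def pa():
    # Pick a random User-Agent string
    return random.choice(_USER_AGENTS)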

The storage file, pipelines.py

# -*- coding: utf-8 -*-
import pymysql,datetime
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class WanyiPipeline(object):
    def process_item(self, item, spider):
        return item


class Mysql(object):
    def process_item(self, item, spider):

        # Get today's date
        req = datetime.date.today()

        # Build the SQL statement (parameterized, so quotes in the text cannot break it)
        sql = "insert into xq_3 values(NULL, %s, %s, %s)"

        # Open the database connection
        db = pymysql.connect(host="192.168.43.128", user="root", password="123456",
                             database="xq", charset='utf8')
        # Create a cursor object with the cursor() method
        cursor = db.cursor()

        # execute() runs the INSERT and returns the number of affected rows
        data = cursor.execute(sql, (item['name'], item['nr'], req))

        # Number of rows inserted; 1 means success
        print("Rows inserted: %s" % data)

        # Commit to the database and close the connection
        db.commit()
        db.close()

        return item
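
The post never shows the xq_3 table. From the INSERT statement (NULL for an auto-increment id, then the title, the body and the date), a plausible schema would be something like the sketch below; the column names here are guesses:

# Hypothetical one-off script to create the xq_3 table; column names are guesses
import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS xq_3 (
    id    INT AUTO_INCREMENT PRIMARY KEY,
    name  VARCHAR(255),
    nr    TEXT,
    dt    DATE
) DEFAULT CHARSET=utf8
"""

db = pymysql.connect(host="192.168.43.128", user="root", password="123456",
                     database="xq", charset="utf8")
with db.cursor() as cursor:
    cursor.execute(ddl)
db.commit()
db.close()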


Reposted from blog.csdn.net/Black_God1/article/details/82146955