Launcher file: main.py
from scrapy.cmdline import execute
execute('scrapy crawl wangyi'.split())
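main.py simply hands the command-line string to Scrapy's execute(). As an alternative, the same spider can be started programmatically with CrawlerProcess; this is just a sketch and assumes it is run from the project directory so the project settings can be located:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py and run the 'wangyi' spider in-process
process = CrawlerProcess(get_project_settings())
process.crawl('wangyi')
process.start()   # blocks until the crawl finishes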
The crawl file under the spiders directory
# -*- coding: utf-8 -*-
import scrapy, re
from ..piaot import *            # import the custom package (provides pa(), a User-Agent helper)
from ..items import wangye_mysql

class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    allowed_domains = ['news.163.com']

    # Initialization: build the start requests
    def start_requests(self):
        # Ask how many pages to crawl
        ye = int(input('请输入页数:'))
        # Loop over the pages
        for i in range(1, ye + 1):
            if i == 1:
                url = 'http://temp.163.com/special/00804KVA/cm_guonei.js?callback=data_callback'
            else:
                # Pages 2-9 are zero-padded in the URL, e.g. cm_guonei_02.js
                if i < 10:
                    i = '0' + str(i)
                url = 'http://temp.163.com/special/00804KVA/cm_guonei_{}.js?callback=data_callback'.format(i)
            # Build the request headers
            form = {
                "User-Agent": pa(),
            }
            # Send a GET request; the response is handled by nryemian()
            req = scrapy.Request(url=url, callback=self.nryemian, headers=form)
            # Yield each request so Scrapy schedules it
            yield req

    # Walk the returned list and extract the URL of each article page
    def nryemian(self, response):
        # response.body holds the raw bytes; decode them with GBK.
        # Note: if you inspect the declared encoding it appears to be cp1252, but the content is GBK.
        html = response.body.decode('GBK')
        # The body looks like 'data_callback([{...}])'; strip the callback name
        # so that only the literal '([{...}])' remains
        req = html.replace("data_callback", "")
        # Evaluate the remaining literal into a Python list of dicts
        req = eval(req)
        # Iterate over the parsed entries
        for i in req:
            # Pull out the article URL we need
            nr_url = i['docurl']
            # Build the request headers
            form = {
                "User-Agent": pa(),
            }
            # Send a GET request for the article page
            nr = scrapy.Request(url=nr_url, callback=self.parse, headers=form)
            # Hand it off to parse()
            yield nr

    def parse(self, response):
        # Instantiate the wangye_mysql item defined in items.py
        mysql = wangye_mysql()
        # Match the title with XPath
        name = response.xpath('//*[@id="epContentLeft"]/h1/text()').extract_first()
        # Match the body paragraphs with XPath
        nr = response.xpath('//*[@id="endText"]/p/text()').extract()
        a = ''
        # Walk the paragraphs and strip whitespace, punctuation and the like (\n, \r, \t, ...)
        for i in nr:
            ll = re.sub("[\s+\.\!\/_,$%^*(+\"\')]+|[+——【】~@#¥%&*]+", "", i)
            if ll != '':
                a += ll
        mysql['name'] = name
        mysql['nr'] = a
        # Yield the item so it is sent on to pipelines.py
        yield mysql
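The replace-then-eval trick above strips the data_callback(...) wrapper, but eval() will execute whatever the string contains. A small standalone sketch of the same parsing step, using ast.literal_eval instead; the sample payload and the safer parser are assumptions, not part of the original project, and literal_eval only works if the feed body is a plain list/dict literal:

import ast

# Hypothetical sample of what the feed body looks like after decoding (assumption)
html = "data_callback([{'docurl': 'http://news.163.com/a.html', 'title': 'demo'}])"

# Strip the callback name, exactly as the spider does
literal = html.replace("data_callback", "")

# literal_eval only accepts plain literals (lists, dicts, strings, numbers),
# so it cannot run arbitrary code the way eval() can
entries = ast.literal_eval(literal)

for entry in entries:
    print(entry['docurl'])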
The items file
import scrapy

# Define a custom item class named wangye_mysql
class wangye_mysql(scrapy.Item):
    # define the fields for your item here like:
    # declare the fields the spider fills in
    name = scrapy.Field()
    nr = scrapy.Field()
The settings.py configuration file
Only the settings that were changed are listed below; anything not shown was left at its default.
# Import the custom package (provides pa())
from ..piaot import *

# Whether to obey robots.txt; change ROBOTSTXT_OBEY to False
ROBOTSTXT_OBEY = False

# Enable default request headers
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': pa(),   # pa() is the header helper imported from piaot
}

ITEM_PIPELINES = {
    'wanyi.pipelines.Mysql': 300,
}
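piaot is the author's own helper package and is not shown in this post; judging by how it is used, pa() returns a User-Agent string. A minimal stand-in, purely an assumption about its behavior:

import random

# Hypothetical stand-in for piaot.pa(): return a random User-Agent string (assumption)
def pa():
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
        'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    ]
    return random.choice(user_agents)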
The storage file: pipelines.py
# -*- coding: utf-8 -*-
import pymysql, datetime
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

class WanyiPipeline(object):
    def process_item(self, item, spider):
        return item

class Mysql(object):
    def process_item(self, item, spider):
        # Get today's date
        req = datetime.date.today()
        # Build the SQL statement
        sql = "insert into xq_3 values(NULL,'{}','{}','{}')".format(item['name'], item['nr'], req)
        # Open the database connection
        db = pymysql.connect("192.168.43.128", "root", "123456", "xq", charset='utf8')
        # Use cursor() to create a cursor object
        cursor = db.cursor()
        # execute() runs the INSERT and returns the number of affected rows
        data = cursor.execute(sql)
        # 1 means one row was inserted successfully
        print("rows inserted: %s" % data)
        # Commit the transaction and close the connection
        db.commit()
        db.close()
        return item
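The INSERT above expects a table xq_3 with four columns: an auto-increment id (the NULL placeholder), the title, the body, and the date. The original post does not show the schema, so the column names and types below are assumptions; this is only a plausible setup script:

import pymysql

# Assumed schema for xq_3: the INSERT passes NULL, name, nr, date in that order (assumption)
ddl = """
CREATE TABLE IF NOT EXISTS xq_3 (
    id INT PRIMARY KEY AUTO_INCREMENT,
    name VARCHAR(255),
    nr TEXT,
    riqi DATE
) DEFAULT CHARSET=utf8
"""

db = pymysql.connect("192.168.43.128", "root", "123456", "xq", charset='utf8')
cursor = db.cursor()
cursor.execute(ddl)
db.commit()
db.close()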