Scrapy framework: scraping job data from the new Zhilian Zhaopin site (for learning and reference only), and getting familiar with jsonpath

1. Create the project:

Run scrapy startproject <project name>, cd into the project directory, then create the spider file either by hand or with the command scrapy genspider <spider name> <domain>.
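
For this project, the concrete commands would presumably be (names taken from the tree and the spider code below):

scrapy startproject zhilian
cd zhilian
scrapy genspider zhilianzhaopin zhaopin.com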

The tree structure looks like this:

│  main.py
│  scrapy.cfg
│  __init__.py
├─zhilian
│  │  items.py
│  │  middlewares.py
│  │  MYmiddlewares.py
│  │  pipelines.py
│  │  settings.py
│  │  __init__.py
│  │
│  ├─spiders
│  │  │  zhilianzhaopin.py
│  │  │  __init__.py
│  │  │
│  │  └─__pycache__
│  │          zhilianzhaopin.cpython-36.pyc
│  │          __init__.cpython-36.pyc
│  │
│  └─__pycache__
│          items.cpython-36.pyc
│          MYmiddlewares.cpython-36.pyc
│          pipelines.cpython-36.pyc
│          settings.cpython-36.pyc
│          __init__.cpython-36.pyc
└─__pycache__
        __init__.cpython-36.pyc
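
The tree also shows a main.py at the project root. The post never lists it, but a minimal runner sketch (assuming it exists only to launch the spider from an IDE) would be:

# main.py - minimal runner sketch; assumes it just starts the 'zhilian' spider
from scrapy import cmdline

cmdline.execute('scrapy crawl zhilian'.split())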


The spider file zhilianzhaopin.py, explained:

import scrapy
import jsonpath
import json
import requests
from urllib import parse
from zhilian.items import ZhilianItem

class Zhilian(scrapy.Spider):
   name = 'zhilian'
   allowed_domains = ['zhaopin.com']
   start_urls=['https://www.zhaopin.com/']
   # Defaults to nationwide (cityId=489 is "全国")
   base_url = 'https://fe-api.zhaopin.com/c/i/sou?start=%d&pageSize=60&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&{}&kt=3'
   # base_url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=60&cityId=%d&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%s&kt=3'
   custom_settings = {
      'DOWNLOADER_MIDDLEWARES': {
         # 'zhilian.MYmiddlewares.MiddleAgent': 800,
         'zhilian.MYmiddlewares.RandomProxyMysql': 900,
      },
      'COOKIES_ENABLED':False,
      # 'RETRY_TIMES': 2,       # downloader retry count
      # 'DOWNLOAD_TIMEOUT': 3   # give up on a request after 3 seconds

   }
   headers = {
      "Accept": "application/json, text/plain, */*",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Host": " fe-api.zhaopin.com",
      "Origin": " https://sou.zhaopin.com",
      "Referer": " https://sou.zhaopin.com/?kw=Java%E5%BC%80%E5%8F%91&jl=489&",
      "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
      "Cookie":"adfbid2=0; _jzqa=1.779382411042367400.1530518801.1530518801.1530518801.1; _jzqy=1.1530518801.1530518801.1.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98.-; sts_deviceid=1645a088bea4aa-007ca90908f653-444a022e-1440000-1645a088beb327; campusOperateJobUserInfo=d9d69bc3-ce60-4a6e-9ecb-113ce0efab90; zg_did=%7B%22did%22%3A%20%221645a09b07c5f7-09e8cc190016e5-444a022e-15f900-1645a09b07d347%22%7D; ZP_OLD_FLAG=false; __xsptplus30=30.3.1530519350.1530519350.1%231%7Cother%7Ccnt%7C121113803%7C%7C%23%23rd9x5AceBDmDAF408t-VLcB57sdbGjOo%23; urlfrom2=121126445; adfcid2=none; LastCity=%E5%85%A8%E5%9B%BD; LastCity%5Fid=489; dywez=95841923.1530598259.5.2.dywecsr=sou.zhaopin.com|dyweccn=(referral)|dywecmd=referral|dywectr=undefined|dywecct=/; __utmz=269921210.1530598259.5.2.utmcsr=sou.zhaopin.com|utmccn=(referral)|utmcmd=referral|utmcct=/; urlfrom=121126445; adfcid=none; adfbid=0; dywec=95841923; sts_sg=1; __utmc=269921210; dywea=95841923.3939452883772670000.1530518798.1530632108.1530634535.8; __utma=269921210.1361494379.1530518800.1530632108.1530634535.8; sts_sid=16460ed82513b7-0cb69607cd02ee-444a022e-1440000-16460ed82523e7; __utmt=1; zg_08c5bcee6e9a4c0594a5d34b79b9622a=%7B%22sid%22%3A%201530635395576%2C%22updated%22%3A%201530635434073%2C%22info%22%3A%201530518941872%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fxiaoyuan.zhaopin.com%2F%22%7D; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCZ478053280J00059255513.htm; dyweb=95841923.6.10.1530634535; __utmb=269921210.6.10.1530634535; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1530518800,1530518811,1530519350,1530635448; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1530635448; ZL_REPORT_GLOBAL={%22sou%22:{%22actionIdFromSou%22:%22c3a9c233-0074-42ea-a976-9c82a8802d14-sou%22%2C%22funczone%22:%22smart_matching%22}}; GUID=52e3369c691f4672a808374ad5d77426; sts_evtseq=9",
   }
   # Only crawl the IT category
   def parse(self, response):
      # Keyword list from the first (IT) column of the job navigation bar
      keywords = response.xpath('//ol[@class="zp-jobNavigater-list"]/li[1]//div[@class="zp-jobNavigater-pop-list"]/a/text()').extract()
      for kw in keywords:
         qs = parse.urlencode({'kw': kw})
         # Probe the first page synchronously to learn the total hit count
         first_url = (self.base_url % 0).format(qs)
         probe = json.loads(requests.get(first_url).text)
         total = jsonpath.jsonpath(probe, '$..numFound')[0]
         # Schedule one request per 60-record page
         for start in range(0, total, 60):
            fullurl = (self.base_url % start).format(qs)
            yield scrapy.Request(fullurl, callback=self.parse_page, headers=self.headers)


   def parse_page(self, response):
      data = response.text
      data = json.loads(data)
      result = jsonpath.jsonpath(data, '$..results[*]')

      for i in result:
         item = ZhilianItem()
         company_job = jsonpath.jsonpath(i,'$..jobName')[0]
         company = jsonpath.jsonpath(i,'$..company.name')[0]
         money = jsonpath.jsonpath(i,'$..salary')[0]
         adress = jsonpath.jsonpath(i,'$..city.display')[0]
         date_time = jsonpath.jsonpath(i,'$..createDate')[0]
         tag_list = jsonpath.jsonpath(i,'$..welfare')[0]
         tag_list= ','.join(tag_list)
         point_1 = jsonpath.jsonpath(i,'$..workingExp.name')[0]
         point_2 = jsonpath.jsonpath(i,'$..eduLevel.name')[0]
         point = point_1 +','+ point_2
         pos_url = jsonpath.jsonpath(i, '$..positionURL')[0]
         item['spidername'] = 'zhilianzhaopin'
         item['company_job'] = company_job
         item['company'] = company
         item['money'] = money
         item['adress'] = adress
         item['date_time'] = date_time
         item['tag_list'] = tag_list
         item['point'] = point
         item['url'] = pos_url
         yield scrapy.Request(pos_url, callback=self.parse_detail,meta={'item':item})



   def parse_detail(self, response):
      item = response.meta['item']
      info = response.css('div.tab-inner-cont p::text').extract()
      info = ','.join(info).strip().strip(',').strip()
      item['info'] = info
      yield item

Main approach:

1. Experimenting shows the data is not in the page source, so you have to capture the URL that actually carries the data with Fiddler (analyze it carefully and account for every URL parameter, e.g. the page offset and the keyword) until you have a URL that can return all of the JSON data. (In the URL above only the start offset and the keyword are variables; every other parameter is fixed, so only part of the data gets crawled.)

2. Parse the JSON with jsonpath to pull out the fields you want plus the detail-page URL, which parse_detail then handles (see the jsonpath sketch after this list).

3. On the detail page, much of the needed data (e.g. salary and the description) is not in the elements you would expect; inspect the page source yourself to find the exact tags and positions, then extract them with CSS selectors or similar. Note: extract from the JSON whenever you can; it saves a lot of work.
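
Since half the point of this post is jsonpath, here is a small self-contained demo of the two expressions used in the spider, run against an invented fragment shaped like the API's JSON (the sample values are made up for illustration):

import jsonpath

# Invented fragment mimicking the shape of the API response (illustration only)
data = {
    "data": {
        "numFound": 130,
        "results": [
            {"jobName": "Java开发", "company": {"name": "某公司"},
             "salary": "10K-15K", "city": {"display": "北京"}}
        ]
    }
}

total = jsonpath.jsonpath(data, '$..numFound')[0]        # -> 130
for job in jsonpath.jsonpath(data, '$..results[*]'):     # one dict per posting
    print(jsonpath.jsonpath(job, '$..company.name')[0])  # -> 某公司

# Pagination exactly as parse() schedules it: offsets 0, 60, 120
for start in range(0, total, 60):
    print(start)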

Anti-crawling countermeasures:

1. Define your own request headers and write your own downloader middleware;

2. Build a proxy pool and pick a proxy at random;

3. Configure the relevant options in the settings file.


The custom downloader middleware MYmiddlewares.py:

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from fake_useragent import UserAgent
import random, base64

from scrapy.conf import settings  # note: scrapy.conf is deprecated in newer Scrapy
import pymysql
# Defining a middleware just means defining a class

# Pick random request headers (User-Agent / Cookie)
class MiddleAgent(object):
    cookie = [
        {
            "Cookie": "adfbid2=0; _jzqa=1.779382411042367400.1530518801.1530518801.1530518801.1; _jzqy=1.1530518801.1530518801.1.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98.-; sts_deviceid=1645a088bea4aa-007ca90908f653-444a022e-1440000-1645a088beb327; campusOperateJobUserInfo=d9d69bc3-ce60-4a6e-9ecb-113ce0efab90; zg_did=%7B%22did%22%3A%20%221645a09b07c5f7-09e8cc190016e5-444a022e-15f900-1645a09b07d347%22%7D; ZP_OLD_FLAG=false; __xsptplus30=30.3.1530519350.1530519350.1%231%7Cother%7Ccnt%7C121113803%7C%7C%23%23rd9x5AceBDmDAF408t-VLcB57sdbGjOo%23; urlfrom2=121126445; adfcid2=none; LastCity=%E5%85%A8%E5%9B%BD; LastCity%5Fid=489; dywez=95841923.1530598259.5.2.dywecsr=sou.zhaopin.com|dyweccn=(referral)|dywecmd=referral|dywectr=undefined|dywecct=/; __utmz=269921210.1530598259.5.2.utmcsr=sou.zhaopin.com|utmccn=(referral)|utmcmd=referral|utmcct=/; urlfrom=121126445; adfcid=none; adfbid=0; dywec=95841923; sts_sg=1; __utmc=269921210; dywea=95841923.3939452883772670000.1530518798.1530632108.1530634535.8; __utma=269921210.1361494379.1530518800.1530632108.1530634535.8; sts_sid=16460ed82513b7-0cb69607cd02ee-444a022e-1440000-16460ed82523e7; __utmt=1; zg_08c5bcee6e9a4c0594a5d34b79b9622a=%7B%22sid%22%3A%201530635395576%2C%22updated%22%3A%201530635434073%2C%22info%22%3A%201530518941872%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fxiaoyuan.zhaopin.com%2F%22%7D; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCZ478053280J00059255513.htm; dyweb=95841923.6.10.1530634535; __utmb=269921210.6.10.1530634535; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1530518800,1530518811,1530519350,1530635448; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1530635448; ZL_REPORT_GLOBAL={%22sou%22:{%22actionIdFromSou%22:%22c3a9c233-0074-42ea-a976-9c82a8802d14-sou%22%2C%22funczone%22:%22smart_matching%22}}; GUID=52e3369c691f4672a808374ad5d77426; sts_evtseq=9"},
        {
            "Cookie": "adfbid2=0; _jzqa=1.779382411042367400.1530518801.1530518801.1530518801.1; _jzqy=1.1530518801.1530518801.1.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98.-; sts_deviceid=1645a088bea4aa-007ca90908f653-444a022e-1440000-1645a088beb327; campusOperateJobUserInfo=d9d69bc3-ce60-4a6e-9ecb-113ce0efab90; zg_did=%7B%22did%22%3A%20%221645a09b07c5f7-09e8cc190016e5-444a022e-15f900-1645a09b07d347%22%7D; ZP_OLD_FLAG=false; __xsptplus30=30.3.1530519350.1530519350.1%231%7Cother%7Ccnt%7C121113803%7C%7C%23%23rd9x5AceBDmDAF408t-VLcB57sdbGjOo%23; urlfrom2=121126445; adfcid2=none; LastCity=%E5%85%A8%E5%9B%BD; LastCity%5Fid=489; dywez=95841923.1530598259.5.2.dywecsr=sou.zhaopin.com|dyweccn=(referral)|dywecmd=referral|dywectr=undefined|dywecct=/; __utmz=269921210.1530598259.5.2.utmcsr=sou.zhaopin.com|utmccn=(referral)|utmcmd=referral|utmcct=/; urlfrom=121126445; adfcid=none; adfbid=0; dywec=95841923; sts_sg=1; __utmc=269921210; dywea=95841923.3939452883772670000.1530518798.1530632108.1530634535.8; __utma=269921210.1361494379.1530518800.1530632108.1530634535.8; sts_sid=16460ed82513b7-0cb69607cd02ee-444a022e-1440000-16460ed82523e7; zg_08c5bcee6e9a4c0594a5d34b79b9622a=%7B%22sid%22%3A%201530635395576%2C%22updated%22%3A%201530635434073%2C%22info%22%3A%201530518941872%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fxiaoyuan.zhaopin.com%2F%22%7D; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCZ478053280J00059255513.htm; dyweb=95841923.6.10.1530634535; __utmb=269921210.6.10.1530634535; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1530518800,1530518811,1530519350,1530635448; GUID=0bdb4e3bfb2845bc834bd9ef6b9341af; ZL_REPORT_GLOBAL={%22sou%22:{%22actionIdFromSou%22:%22a6e039d1-dce0-48c9-bf18-47123bd339f8-sou%22%2C%22funczone%22:%22smart_matching%22}}; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1530635458; sts_evtseq=10"},
        {
            "cookie": "adfbid2=0; _jzqa=1.779382411042367400.1530518801.1530518801.1530518801.1; _jzqy=1.1530518801.1530518801.1.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98.-; sts_deviceid=1645a088bea4aa-007ca90908f653-444a022e-1440000-1645a088beb327; campusOperateJobUserInfo=d9d69bc3-ce60-4a6e-9ecb-113ce0efab90; zg_did=%7B%22did%22%3A%20%221645a09b07c5f7-09e8cc190016e5-444a022e-15f900-1645a09b07d347%22%7D; ZP_OLD_FLAG=false; __xsptplus30=30.3.1530519350.1530519350.1%231%7Cother%7Ccnt%7C121113803%7C%7C%23%23rd9x5AceBDmDAF408t-VLcB57sdbGjOo%23; urlfrom2=121126445; adfcid2=none; LastCity=%E5%85%A8%E5%9B%BD; LastCity%5Fid=489; dywez=95841923.1530598259.5.2.dywecsr=sou.zhaopin.com|dyweccn=(referral)|dywecmd=referral|dywectr=undefined|dywecct=/; __utmz=269921210.1530598259.5.2.utmcsr=sou.zhaopin.com|utmccn=(referral)|utmcmd=referral|utmcct=/; BLACKSTRIP=yes; urlfrom=121126445; adfcid=none; adfbid=0; dywec=95841923; sts_sg=1; Hm_lvt_80e552e101e24fe607597e5f45c8d2a2=1530519653,1530519725,1530598259,1530632108; __utmc=269921210; dywea=95841923.3939452883772670000.1530518798.1530632108.1530634535.8; __utma=269921210.1361494379.1530518800.1530632108.1530634535.8; sts_sid=16460ed82513b7-0cb69607cd02ee-444a022e-1440000-16460ed82523e7; Hm_lpvt_80e552e101e24fe607597e5f45c8d2a2=1530635319; stayTimeCookie=0; referrerUrl=https%3A//jobs.zhaopin.com/CZ478053280J00059255513.htm; zg_08c5bcee6e9a4c0594a5d34b79b9622a=%7B%22sid%22%3A%201530635395576%2C%22updated%22%3A%201530635434073%2C%22info%22%3A%201530518941872%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fxiaoyuan.zhaopin.com%2F%22%7D; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCZ478053280J00059255513.htm; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1530518800,1530518811,1530519350,1530635448; GUID=0bdb4e3bfb2845bc834bd9ef6b9341af; ZL_REPORT_GLOBAL={%22sou%22:{%22actionIdFromSou%22:%22a6e039d1-dce0-48c9-bf18-47123bd339f8-sou%22%2C%22funczone%22:%22smart_matching%22}}; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1530635458; sts_evtseq=11; dyweb=95841923.7.10.1530634535; __utmt=1; __utmb=269921210.7.10.1530634535"}
    ]
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Random User-Agent plus a randomly chosen cookie on every request
        request.headers['User-Agent'] = self.ua.random
        cookie = random.choice(self.cookie)
        request.headers['Cookie'] = list(cookie.values())[0]

# Random proxy middleware mixing free and authenticated proxies (database-backed)
class RandomProxyMysql(object):
    def __init__(self):
        my = settings['MYSQL']
        self.conn = pymysql.connect(host=my['host'], user=my['user'], password=my['password'], db=my['db'], charset='utf8')
        self.cursor = self.conn.cursor()
    # Runs before each request is sent
    def process_request(self, request, spider):
        # Fetch a random proxy from the database
        proxy = self.random_proxy()
        print(proxy)
        # Attach the proxy; row layout assumed to be (id, ip, port, ...)
        request.meta['proxy'] = 'http://%s:%s' % (proxy[1], proxy[2])

    # Runs after each response is received
    def process_response(self, request, response, spider):
        print(response.status)
        return response

    def random_proxy(self):
        sql = 'select * from py09_proxy ORDER BY rand() limit 1'
        self.cursor.execute(sql)
        proxy = self.cursor.fetchone()
        return proxy
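
The class comment mentions mixing in authenticated proxies, and base64 is imported at the top, yet process_request only ever sets request.meta['proxy']. A hedged sketch of how an authenticated proxy would be attached (the user/password columns are an assumption about the proxy table's layout):

    # Sketch: attach credentials for an authenticated proxy
    # (assumes the proxy row also carries user and password fields)
    def set_auth_proxy(self, request, host, port, user, password):
        request.meta['proxy'] = 'http://%s:%s' % (host, port)
        creds = base64.b64encode(('%s:%s' % (user, password)).encode()).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + creds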




The pipeline file pipelines.py:


# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class ZhilianPipeline(object):
    def process_item(self, item, spider):
        return item

# Write the data to the database

class MysqlPipeline(object):
    def open_spider(self, spider):
        # Open the database connection
        print('opening database connection')
        self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                    password='123456', db='temp', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Store the item
        try:
            sql, data = item.get_sql()
            self.cursor.execute(sql, data)
            self.conn.commit()
            print('row written')
        except Exception as e:
            print('------', e)

        return item
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


The item definition file items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy



class ZhilianItem(scrapy.Item):
    spidername = scrapy.Field()
    company_job = scrapy.Field()
    company = scrapy.Field()
    money = scrapy.Field()
    adress = scrapy.Field()
    date_time = scrapy.Field()
    tag_list = scrapy.Field()
    point = scrapy.Field()
    url = scrapy.Field()
    info = scrapy.Field()

    def get_sql(self):
        sql = 'insert into py09_zhilian(spidername,company_job,company ,money ,adress ,date_time ,tag_list,point ,url,info) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        data = (self['spidername'],self['company_job'],self['company'],self['money'],self['adress'],self['date_time'],self['tag_list'],self['point'],self['url'],self['info'])
        return sql,data
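
get_sql assumes a py09_zhilian table already exists. A sketch of a matching schema, creatable once with pymysql (column types and lengths are guesses, not taken from the post):

# One-off table creation matching get_sql's column list
# (types and lengths are assumptions; adjust as needed)
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root',
                       password='123456', db='temp', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS py09_zhilian (
            id INT AUTO_INCREMENT PRIMARY KEY,
            spidername VARCHAR(50), company_job VARCHAR(100),
            company VARCHAR(100), money VARCHAR(50), adress VARCHAR(50),
            date_time VARCHAR(50), tag_list VARCHAR(255),
            point VARCHAR(100), url VARCHAR(255), info TEXT
        ) DEFAULT CHARSET=utf8
    ''')
conn.commit()
conn.close()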

The project-wide settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for zhilian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zhilian'

SPIDER_MODULES = ['zhilian.spiders']
NEWSPIDER_MODULE = 'zhilian.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'zhilian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zhilian.middlewares.ZhilianSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zhilian.middlewares.ZhilianDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'zhilian.pipelines.ZhilianPipeline': 300,
   'zhilian.pipelines.MysqlPipeline': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
MYSQL = {
    'host' : '127.0.0.1',
    'user' : 'root',
    'password' : '123456',
    'db' : 'temp'
}

Conclusion:

    When scraping a site, don't rush into writing code; the key is analyzing the page. The coding logic is fairly routine, but page structures and data vary constantly.
Reposted from blog.csdn.net/weixin_41218014/article/details/80909623