Using Scrapy to simulate a login to a POS system and scrape the post-login customer list pages into an Excel file (the site address is replaced with XXXX)

1. Create the Scrapy project

scrapy startproject PosClient

2. Enter the project directory and use the genspider command to create a Spider

scrapy genspider posclient XXXX.com
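
The two commands above produce the usual Scrapy project layout (shown below as generated by the default templates; the exact file list can vary slightly with the Scrapy version). All of the following steps edit files inside this tree:

PosClient/
├── scrapy.cfg              # project configuration file
└── PosClient/
    ├── __init__.py
    ├── items.py            # item definitions (step 3)
    ├── middlewares.py      # middlewares (not used in this example)
    ├── pipelines.py        # item pipeline that writes the Excel file (step 5)
    ├── settings.py         # project settings (step 6)
    └── spiders/
        ├── __init__.py
        └── posclient.py    # the spider created by genspider (step 4)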

3. Define the data to scrape (items.py)

# -*- coding: utf-8 -*-
import scrapy

class PosclientItem(scrapy.Item):
    # Serial number
    number_list = scrapy.Field()
    # Customer phone number
    client_phone = scrapy.Field()
    # Customer name
    client_name = scrapy.Field()
    # Customer address
    client_add = scrapy.Field()
    # Registration time
    client_date = scrapy.Field()
    # Purchase amount
    client_sale = scrapy.Field()
    # Number of purchases
    client_sale_num = scrapy.Field()
    # Planting area
    client_area = scrapy.Field()
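
An Item behaves like a dictionary: fields are read and written by key, and assigning a key that was not declared above raises a KeyError, which catches typos early. A minimal sketch with made-up values (e.g. in a Python shell inside the project):

from PosClient.items import PosclientItem

item = PosclientItem()
item['client_name'] = '张三'            # hypothetical sample value
item['client_phone'] = '13800000000'    # hypothetical sample value
print(dict(item))    # {'client_name': '张三', 'client_phone': '13800000000'}
# item['foo'] = 1    # would raise KeyError: the field was never declared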

4. Write the Spider that extracts the item data (spiders/posclient.py)

# -*- coding: utf-8 -*-
import scrapy
from PosClient.items import PosclientItem

class PosclientSpider(scrapy.Spider):
    name = 'posclient'
    allowed_domains = ['XXXX.com']
    # URL of the login page
    login_page = 'https://pos.XXXX.com/login.html'
    # Page number to start from
    offset = 1
    # First half of the URL of the pages that can only be crawled after logging in
    url = 'https://pos.XXXX.com/client/p='
    # Full URL of the first page (not used here, because start_requests() is overridden below)
    start_urls = [url + str(offset)]
    username = input("请输入账号:")
    password = input("请输入密码:")
    # Do not rename this method: Scrapy calls start_requests() to kick off the crawl
    def start_requests(self):
        yield scrapy.Request(url=self.login_page, callback=self.login)
    # Log in by filling in and submitting the login form
    def login(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"j_username": self.username, "j_password": self.password},
            callback=self.parse_page
        )

    # Check whether login succeeded, then visit pages that require an authenticated session
    def parse_page(self, response):
        if "loginerror" in response.body.decode('utf-8'):
            print("登录失败,错误的手机号码或密码!")
        if "</span>首页" in response.body.decode('utf-8'):
            print("欢迎您'%s',成功登陆POS管理系统!" % self.username)
            # After a successful login, request the customer list page and let parse() handle the data
            full_url = self.url + str(self.offset)
            yield scrapy.Request(full_url, callback=self.parse)

    def parse(self, response):
        # Collect the links to the following pages
        next_url_list = response.xpath('//div[@class="dataTables_paginate paging_full_numbers"]/span/span/a/@href').extract()
        for each in response.xpath('//div[@class="dataTables_wrapper"]'):
            # Serial number (has leading/trailing whitespace)
            number_list = each.xpath('.//td[1]/text()').extract()
            # Customer phone number
            client_phone = each.xpath('.//td[2]/a[1]/text()').extract()
            # Customer name
            client_name = each.xpath('.//td[2]/a[2]/text()').extract()
            # Customer address
            client_add = each.xpath('.//td[3]/a/text()').extract()
            # Registration time
            client_date = each.xpath('.//tbody//td[4]/a/text()').extract()
            # Purchase amount (has leading/trailing whitespace)
            client_sale = each.xpath('.//tbody//td[5]/a/text()').extract()
            # Number of purchases
            client_sale_num = each.xpath('.//tbody//td[6]/a/text()').extract()
            # Planting area (has leading/trailing whitespace)
            client_area = each.xpath('.//tbody//td[7]/text()').extract()
            for i in range(len(client_phone)):
                # Build a fresh item for every row instead of reusing a single instance
                item = PosclientItem()
                item['number_list'] = number_list[i].strip()
                item['client_phone'] = client_phone[i].strip()
                item['client_name'] = client_name[i].strip()
                item['client_add'] = client_add[i].strip()
                # The date ends with ".0" (e.g. 2017-11-10 11:04:40.0), so drop the last two characters
                item['client_date'] = client_date[i].strip()[:-2]
                item['client_sale'] = client_sale[i].strip()
                item['client_sale_num'] = client_sale_num[i].strip()
                item['client_area'] = client_area[i].strip()
                yield item
        # Follow the next-page links
        for url in next_url_list:
            full_url = 'https://pos.XXXX.com/client.html' + str(url)
            yield scrapy.Request(url=full_url, callback=self.parse)
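
A note on the login step: FormRequest.from_response() parses the <form> on the login page, keeps whatever hidden fields it contains (for example a CSRF token), and only overrides j_username and j_password; it also accepts formname/formnumber arguments if the page holds more than one form. If the POST endpoint of the form is already known, the credentials can be posted directly instead. The sketch below is a drop-in alternative for the login() method above; the endpoint URL is only an assumed placeholder, not taken from the real site:

    # Alternative sketch: post the credentials directly (endpoint URL is hypothetical)
    def login(self, response):
        yield scrapy.FormRequest(
            url='https://pos.XXXX.com/j_spring_security_check',  # assumed login endpoint
            formdata={"j_username": self.username, "j_password": self.password},
            callback=self.parse_page
        )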

5. Write the pipeline that saves the data to a file (pipelines.py)

# -*- coding: utf-8 -*-
import json
from openpyxl import Workbook

# Bytes-to-str encoder, subclassing json.JSONEncoder (not needed by the Excel pipeline below)
class MyEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)

class PosclientPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Write the header row
        self.ws.append(['序号','客户手机号','客户姓名','客户地址','注册时间','采购金额','采购次数','种植面积'])

    def process_item(self, item, spider):
        text = [item['number_list'], item['client_phone'], item['client_name'], item['client_add'],
                item['client_date'], item['client_sale'], item['client_sale_num'], item['client_area']]
        self.ws.append(text)
        return item

    def close_spider(self, spider):
        self.wb.save('pos_client.xlsx')
        print("数据处理完毕,谢谢使用!")

6. Configure the settings file (settings.py)

# Obey robots.txt rules; see https://blog.csdn.net/z564359805/article/details/80691677 for what this means
ROBOTSTXT_OBEY = False

# Download delay (seconds between requests)
DOWNLOAD_DELAY = 2
# Override the default request headers: add a User-Agent
DEFAULT_REQUEST_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
  # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  # 'Accept-Language': 'en',
}

# Configure item pipelines: uncomment this block to enable the pipeline
ITEM_PIPELINES = {
   'PosClient.pipelines.PosclientPipeline': 300,
}

# Optionally write the log to a local file
LOG_FILE = "stats.log"
LOG_LEVEL = "DEBUG"
# Also write print() output into the log file
LOG_STDOUT = True
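
One setting that matters here is not listed above because it only needs its default value: Scrapy's cookie handling is enabled by default, and it is what carries the session cookie from the login response to the customer-list requests, so it must not be switched off:

# Cookies are required to keep the login session alive across requests;
# COOKIES_ENABLED defaults to True, so just make sure it is never set to False.
COOKIES_ENABLED = True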

7. With everything above in place, start the crawl: run the crawl command to launch the Spider:

scrapy crawl posclient
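
Run the command from the project root (the directory that contains scrapy.cfg). Because LOG_STDOUT = True is set, the print() output, including the login greeting, ends up in stats.log rather than on the console. A typical run leaves two files in the working directory:

cd PosClient              # project root, where scrapy.cfg lives
scrapy crawl posclient
# after the spider closes:
#   pos_client.xlsx       # the exported customer list
#   stats.log             # the crawl log, including the print() messages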

Reposted from blog.csdn.net/z564359805/article/details/80874045