爬虫-建设银行信用卡商家优惠数据抓取(2018-11-19)

文章目录

爬虫地址
爬虫环境和技术
爬虫代码
致谢

爬虫地址

http://creditcard.ccb.com/cn/creditcard/creditFavarite.html#card_province=1020&card_city=196&startNum=1&endNum=10

爬虫环境和技术

Python 3.6（依赖：requests、xlrd、xlwt、xlutils）
技术使用参照另外两篇文章：
爬虫-中国银行卡-优惠商户活动数据（2018-11-15）
爬虫-新浪财经-信用卡优惠商店数据（2018-11-15）

爬虫代码

# -*-coding:utf-8-*-
import json
import os
import sys

import requests
import xlrd
import xlwt
from xlutils.copy import copy


def get_page(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response on success.

    Args:
        url: Address to request.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the whole crawl.

    Returns:
        The ``requests.Response`` when the status code is 200; ``None`` when
        the request fails or the server answers with any other status.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failure (DNS, connection, timeout, bad URL, ...).
        return None
    if response.status_code == 200:
        return response
    return None


def write_data(sheet, row, lst):
    """Write the records in *lst* into *sheet*, one record per row.

    Args:
        sheet: Object exposing ``write(row_index, col_index, value)``.
        row: Index of the first row to write to.
        lst: Iterable of records; each record is an iterable of cell values
            written left to right starting at column 0.
    """
    for row_index, record in enumerate(lst, start=row):
        for col_index, value in enumerate(record):
            sheet.write(row_index, col_index, value)


def save(file_name, data):
    """Append *data* to the ``.xls`` workbook at *file_name*, creating it if needed.

    Args:
        file_name: Path of the workbook file.
        data: Iterable of rows (each an iterable of cell values) to write.

    When the file already exists its first sheet is copied and the rows are
    appended after the last used row. Otherwise a new workbook with a header
    row is created and the rows are written starting at row 1.
    """
    if os.path.exists(file_name):
        # Open the existing workbook and find out how many rows it already has.
        existing = xlrd.open_workbook(file_name, formatting_info=True)
        used_rows = existing.sheets()[0].nrows
        # xlrd workbooks are read-only; copy into a writable xlwt workbook.
        writable = copy(existing)
        sheet = writable.get_sheet(0)
        write_data(sheet, used_rows, data)
        # Saving overwrites the file, so it is not deleted beforehand:
        # removing it first would lose every existing row if the save failed.
        writable.save(file_name)
    else:
        # One title per data column (18 in total), in the same order as the
        # rows produced by main().
        header = ['biz_id', 'biz_name', 'cate_id', 'cate_name', 'catechild_id', 'catechild_name', 'province_id',
                  'province', 'city_id', 'city', 'biz_addr', 'biz_desc', 'start_level', 'life_id', 'life',
                  'biz_phone', 'biz_cmsg', 'url']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('建设银行卡-优惠商户活动数据')
        # Header goes into row 0.
        for col, title in enumerate(header):
            sheet.write(0, col, title)
        # Data starts right below the header.
        write_data(sheet, 1, data)
        book.save(file_name)


# Keys read from each merchant record, in spreadsheet column order.
_COMPANY_FIELDS = (
    'biz_id', 'biz_name', 'cate_id', 'cate_name', 'catechild_id', 'catechild_name',
    'province_id', 'province', 'city_id', 'city', 'biz_addr', 'biz_desc',
    'start_level', 'life_id', 'life', 'biz_phone', 'biz_cmsg',
)


def _company_row(company):
    """Build one spreadsheet row from a merchant record: its fields plus the detail-page URL."""
    row = [company.get(field) for field in _COMPANY_FIELDS]
    biz_id = company.get('biz_id')
    if biz_id is None:
        # Without an id there is no detail page to link to.
        detail_url = ''
    else:
        # format() also copes with ids that are not strings.
        detail_url = 'http://creditcard.ccb.com/cn/creditcard/favorable/{}.html'.format(biz_id)
    row.append(detail_url)
    return row


def main():
    """Crawl merchant offers for every province/city and append them to an ``.xls`` file.

    Downloads the province/city list, asks the user for an output directory,
    then requests the merchant list of each city and saves it via ``save``.
    Cities whose request fails or whose response is not valid JSON are skipped.
    """
    cities_js = 'http://creditcard.ccb.com/cn/creditcard/v3/js/citys.js'
    cities_resp = get_page(cities_js)
    if cities_resp is None:
        print('获取城市列表失败：' + cities_js)
        return
    try:
        # Presumably the script is a single `name = <JSON>` assignment; take
        # everything after the first '=' so a '=' inside the data cannot
        # truncate the payload.
        _, _, payload = cities_resp.content.decode('utf-8').partition('=')
        provinces = json.loads(payload.strip().rstrip(';'))
    except ValueError:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        print('城市列表解析失败：' + cities_js)
        return

    path = input('请输入要保存的地址(例如：C:\\Users\\xhdong1\\Desktop\\)，不输入则保存到当前地址：\n')
    # os.path.join adds the separator only when the user left it out.
    file_name = os.path.join(path.strip(), '建设银行卡-优惠商户活动数据.xls')
    print(file_name)

    province_total = len(provinces)
    for province in provinces:
        prov_code = province['prov_code']
        prov_name = province['prov_name']
        cities = province['citys']
        city_total = len(cities)
        for city_entry in cities:
            city_code = city_entry['city_code']
            city_name = city_entry['city_name']
            print('总共有【{province_total}】个省份，现在正在爬取【{prov_name}】，该省份一共有【{city_total}】个城市，现在正在爬取【{city}】的数据'.format(
                province_total=province_total, prov_name=prov_name, city_total=city_total, city=city_name))
            # A very large endNum requests the whole merchant list in one call.
            list_url = 'http://creditcard.ccb.com/webtran/get_crd_info.gsp?table_type=2&card_province={prov_code}&card_city={city_code}&startNum=1&endNum=1000000'.format(
                prov_code=prov_code, city_code=city_code)
            list_resp = get_page(list_url)
            if list_resp is None:
                # Request failed; move on to the next city instead of crashing.
                continue
            try:
                companies_json = json.loads(list_resp.content.decode('utf-8'))
            except ValueError:
                continue
            if not companies_json or not isinstance(companies_json, dict):
                continue
            companies = companies_json.get('obj') or []
            rows = [_company_row(company) for company in companies]
            save(file_name, rows)
    print('爬完')


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

致谢

感谢生活。

爬虫-建设银行信用卡商家优惠数据抓取(2018-11-19)

文章目录

爬虫地址

爬虫环境和技术

爬虫代码

致谢

猜你喜欢