Scraper: 金采网 (cfcpn.com) Data (2018-11-19)


Target URL

http://www.cfcpn.com/plist/caigou?pageNo=1&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo=
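Only two parts of this URL vary in the script below: the path segment after /plist/ (caigou for procurement announcements, jieguo for result announcements) and the pageNo query parameter; the remaining filter parameters are left empty. For example:

base = 'http://www.cfcpn.com/plist/{type}?pageNo={page_num}&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo='
print(base.format(type='caigou', page_num=1))  # first page of procurement announcements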

Environment

  1. Python 3.6.5
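Besides the interpreter, the script needs a few third-party packages, installable with pip install requests lxml xlrd xlwt xlutils. The xlutils.copy helper is used to append rows to an existing .xls file, since xlwt on its own can only create new workbooks.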

Scraper Code

# -*- coding: utf-8 -*-
import os
import re
import time

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy


def get_page(url):
    """Fetch a list page and return its decoded HTML, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Decode by hand and drop the copyright sign that sneaks into the markup
            return response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
    except requests.ConnectionError:
        return None


def parse_page(html):
    """Extract the detail-page URLs from a list page."""
    # Detail links are hrefs that end in a numeric id
    pattern1 = r'<.*?(href=".*?/\d+").*?'
    href_url = re.findall(pattern1, html, re.I)
    url_list = []
    for url in href_url:
        # Turn href="/path/123" into an absolute URL
        url1 = url.replace('href=', 'http://www.cfcpn.com').replace('"', '')
        url_list.append(url1)
    return url_list


def get_detail_page(url):
    """Fetch a detail page and return it parsed as an lxml tree, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Strip the copyright sign and non-breaking spaces before parsing
            html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'').replace(u'\xa0', u'')
            return etree.HTML(html)
    except requests.ConnectionError:
        return None


def parse_detail_page(detail_html):
    """Extract project name, type, purchase method, company, amount and date from a detail page."""
    item_name = ""
    purchase = ""
    item_type = ""
    money = ""
    date = ""
    company = ""
    p_list = detail_html.xpath('//*[@id="news_content"]/p')
    item_name_list = detail_html.xpath('//*[@id="news_head"]/p[1]//text()')
    for item_name_src in item_name_list:
        # Split the head line at '目' (or '案') to separate the project name
        # from the announcement type that follows it
        index = item_name_src.find('目')
        if index == -1:
            index = item_name_src.find('案')
        item_type = item_name_src[index + 1:]
        if index < 1:
            item_name = item_name_src
        else:
            item_name = item_name_src[:index + 1]
    date_list = detail_html.xpath('//*[@id="news_head"]/p[2]//text()')

    for dt in date_list:
        # The publication date sits at a fixed offset in the second head line
        date = dt[5:16]

    for p in p_list:
        # Flatten each paragraph to plain text, stripping layout characters
        p_content = p.xpath('.//text()')
        context = ""
        for text in p_content:
            context = context + text.strip().replace(u'\xa0', u'').replace(u'\xa5', u'').replace('\r\n', '')
        # Paragraphs mentioning 元/金额/价格 carry the amount
        if '元' in context or '金额' in context or '价格' in context:
            money = context + money
        # Paragraphs mentioning 中标/供应商/中选单位/公司 name the winning company
        if '中标' in context or '供应商' in context or '中选单位' in context or '公司' in context:
            company = company + context
        # Take the part after the colon on 采购方式 lines; fall back to the whole line
        if '采购方式' in context:
            try:
                purchase = context.split(":")[1]
            except IndexError:
                purchase = context

    info_list = [item_name, item_type, purchase, company, money, date]
    return info_list


def write_data(sheet, row, lst):
    """Write the rows in lst into the sheet, starting at the given row index."""
    for data_infos in lst:
        for j, data in enumerate(data_infos):
            sheet.write(row, j, data)
        row += 1


def save(file_name, data):
    if os.path.exists(file_name):
        # Open the existing workbook; formatting_info keeps the cell styles
        rb = xlrd.open_workbook(file_name, formatting_info=True)
        # Number of rows already present, so new rows are appended after them
        rn = rb.sheets()[0].nrows
        # xlrd objects are read-only, so copy the workbook into a writable one
        wb = copy(rb)
        # First sheet of the writable copy
        sheet = wb.get_sheet(0)
        # Append the new rows
        write_data(sheet, rn, data)
        # Replace the old file with the updated workbook
        os.remove(file_name)
        wb.save(file_name)
    else:
        # Column headers matching the fields returned by parse_detail_page, plus the source URL
        header = ['item_name', 'item_type', 'purchase_method', 'company', 'amount', 'date', 'url']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('金采网')
        # Write the header row
        for h in range(len(header)):
            sheet.write(0, h, header[h])
        # Write the data rows below the header
        write_data(sheet, 1, data)
        book.save(file_name)


def main():
    print('*' * 80)
    print('\t\t\t\t金采网 data download')
    print('Author: 谢华东  2018.11.8')
    print('--------------')
    table = int(float(input('Enter the announcement type (1: procurement announcements, 2: result announcements):\n')))
    while table < 1 or table > 2:
        table = int(float(input('Invalid announcement type, please re-enter:\n')))
    path = input('Enter the directory to save to (e.g. C:\\Users\\xhdong1\\Desktop\\), or press Enter for the current directory:\n')
    dict_tables = {1: 'caigou', 2: 'jieguo'}
    file_name = path + '金采网' + dict_tables[table] + '.xls'

    # Page range to crawl (inclusive on both ends)
    minfrom = int(input('Enter the first page to crawl:\n'))
    maxto = int(input('Enter the last page to crawl:\n'))
    for i in range(minfrom, maxto + 1):
        print('Crawling page ' + str(i) + ' of ' + dict_tables[table])
        base = 'http://www.cfcpn.com/plist/{type}?pageNo={page_num}&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo='
        url = base.format(type=dict_tables[table], page_num=i)
        time.sleep(1)  # be polite to the server
        all_info_list = []
        html = get_page(url)
        if html is None:
            print('No data on this page')
            continue
        url_list = parse_page(html)
        for url in url_list:
            detail_html = get_detail_page(url)
            if detail_html is None:
                continue
            context_list = parse_detail_page(detail_html)
            context_list.append(url)
            all_info_list.append(context_list)
        save(file_name, all_info_list)


if __name__ == '__main__':
    main()
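The list page is parsed with a regular expression; since lxml is already a dependency, the same extraction can also be done with XPath. A minimal sketch, not verified against the live page and assuming the detail links are ordinary anchors with relative hrefs ending in a numeric id (the same pattern the regex above matches):

import re
from lxml import etree

def parse_page_xpath(html):
    # Collect every anchor href and keep those ending in /<digits>,
    # mirroring parse_page above
    tree = etree.HTML(html)
    hrefs = tree.xpath('//a/@href')
    return ['http://www.cfcpn.com' + h for h in hrefs if re.search(r'/\d+$', h)]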

Acknowledgements

Thanks to myself.
