Python crawler demo

This script crawls the public member directory on cmispub.cicpa.org.cn: it walks the paged index, extracts each person's detail-page code from the result links, downloads the detail page, appends the parsed fields to a CSV, and keeps the raw HTML for later re-parsing.

# coding: utf-8

import datetime
import urllib.parse
import urllib.request
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
import re
import os


def get_html(url, values):
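    """POST values to url and return (html, status_code); retries once on a network failure."""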
    html = ''
    status_code = 200
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values).encode(encoding='UTF8')
    for i in range(1, 3):  # at most two attempts
        req = urllib.request.Request(url=url, headers=headers, data=data)
        try:
            response = urllib.request.urlopen(req)
        except HTTPError as e:
            print(url, values)
            print("The server couldn't fulfill the request.")
            print('HTTP error, code:', e.code)
            status_code = int(e.code)
            break  # a definite HTTP status; retrying won't change it
        except URLError as e:
            # URLError carries no HTTP status code (only HTTPError does), so record 0
            status_code = 0
            print('We failed to reach the server. Reason:', e.reason)
            print('url: %s, status code: %d, retry count: %d' % (url + '?' + data.decode(), status_code, i))
        else:
            html = response.read().decode('gbk')  # the site serves GBK-encoded pages
            break

    return html, status_code


def request_page(page):
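    """Fetch one page of the member index via the index-query form."""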
    url = 'http://cmispub.cicpa.org.cn/cicpa2_web/PersonIndexAction.do'
    values = {
        'method': 'indexQuery',
        'queryType': '2',
        'isStock': '00',
        'pageSize': '',
        'pageNum': page,
        'offName': '',
        'ascGuid': '',
        'perCode': '0',
        'perName': ''
    }
    return get_html(url, values)


def parse_cicpa_page(html):
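    """Return the anchor tags in the result table; each carries a viewDetail() code."""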
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.select("#tabDetail a")
    return items


def request_detail(code):
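    """Fetch the static detail page for one member code."""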
    print('request code:', code)
    url = 'http://cmispub.cicpa.org.cn/cicpa2_web/07/' + code + '.shtml'
    values = {}
    return get_html(url, values)


def parse_detail_header(html):
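    """Join the field-name cells (td.tdl) of the detail table into one CSV line."""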
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.select("#detailtb td.tdl")
    return ','.join(cell.get_text().strip() for cell in cells)


def parse_detail_content(html):
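    """Join the field-value cells (td.data_tb_content) of the detail table into one CSV line."""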
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.select("#detailtb td.data_tb_content")
    return ','.join(cell.get_text().strip() for cell in cells)


def create_file(filepath, header):
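    """Create filepath (and its parent directories) if missing, writing header as the first line when given."""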
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    if not os.path.exists(filepath):
        with open(filepath, 'w') as f:
            if len(header) > 0:
                f.write(header + '\n')


def is_down_exists(code):
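    """Placeholder: meant to check whether a detail page is already downloaded; the main loop calls os.path.exists directly instead."""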
    return False


def main():
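    """Crawl every index page, save each member's detail page, and append the parsed fields to a CSV."""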
    start_time = datetime.datetime.now()

    html_dir = 'D:/crawl_data/cicpa/html/'
    if not os.path.isdir(html_dir):
        os.makedirs(html_dir)

    header_file = 'D:/crawl_data/cicpa/header.csv'
    need_header = not os.path.exists(header_file)

    datafile = 'D:/crawl_data/cicpa/data_%s.csv' % start_time.strftime("%Y%m%d_%H%M%S_%f")
    page_error_file = 'D:/crawl_data/cicpa/error_page_%s.txt' % start_time.strftime("%Y%m%d_%H%M%S_%f")
    detail_error_file = 'D:/crawl_data/cicpa/error_detail_%s.txt' % start_time.strftime("%Y%m%d_%H%M%S_%f")

    create_file(datafile, '')
    create_file(page_error_file, 'page,status')
    create_file(detail_error_file, 'code,status')

    # append so the headers written by create_file are preserved
    data_file_object = open(datafile, 'a')
    page_error_file_object = open(page_error_file, 'a')
    detail_error_file_object = open(detail_error_file, 'a')

    for i in range(1, 6912):  # number of index pages at crawl time; adjust as needed
        print('request:', i)
        result, status = request_page(i)
        if status != 200:
            page_error_file_object.write(str(i) + ',' + str(status) + '\n')
            page_error_file_object.flush()
            continue
        items = parse_cicpa_page(result)
        for item in items:
            matches = re.findall(r"javascript:viewDetail\('(\w+?)',", str(item))
            if not matches:
                continue  # not a detail link
            code = matches[0]
            html_file_path = html_dir + code + '.html'
            if os.path.exists(html_file_path):
                continue  # detail page already downloaded on an earlier run
            detail_html, status = request_detail(code)
            if len(detail_html) == 0:
                detail_error_file_object.write(code + ',%d\n' % status)
                detail_error_file_object.flush()
                continue
            if need_header:
                header = parse_detail_header(detail_html)
                with open(header_file, 'w') as f:
                    f.write(header + '\n')
                need_header = False
            # save base data
            line = parse_detail_content(detail_html)
            data_file_object.write(line + '\n')
            data_file_object.flush()
            # save html
            with open(html_file_path, 'w') as html_file_object:
                html_file_object.write(detail_html + '\n')
            print(line)

    data_file_object.close()
    page_error_file_object.close()
    detail_error_file_object.close()
    elapsed = datetime.datetime.now() - start_time
    # .microseconds would only report the sub-second component, so use total_seconds()
    print('finished in %.1f s' % elapsed.total_seconds())

if __name__ == '__main__':
    main()
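
When a run finishes with entries in the error files, they can drive a rerun. Below is a minimal sketch, assuming the functions above are importable and the directory layout is unchanged; retry_error_pages and the example file name are hypothetical:

# Re-request the index pages recorded in an error_page_*.txt file.
def retry_error_pages(error_file):
    with open(error_file) as f:
        next(f)  # skip the 'page,status' header line
        pages = [int(line.split(',')[0]) for line in f if line.strip()]
    for page in pages:
        html, status = request_page(page)
        print('retried page %d, status %d' % (page, status))

# e.g. retry_error_pages('D:/crawl_data/cicpa/error_page_20200101_000000_000000.txt')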

Reposted from www.cnblogs.com/zhaohz/p/12117167.html