Crawling website data with Python and generating an Excel file

Disclaimer: This is an original article by the blogger and may not be reproduced without the blogger's permission. https://blog.csdn.net/lierwang2017/article/details/83585754

# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import json
import time
import xlwt
import urllib
import urllib2
import shutil


from pyExcelerator import *

# TLS certificate validation is switched off for every request made with this
# context.
# NOTE(review): ssl._create_unverified_context() is a private API and disables
# certificate checks entirely; acceptable only for a throwaway scraper.
context = ssl._create_unverified_context()

# Number of posts requested per page (the original comment states an upper
# limit of 60).
perPage = 60

# Next worksheet row to be written. Starts at 1 because row 0 holds the
# column titles.
count = 1

# Create a workbook
import xlwt 

# Directory the generated workbook is saved into
filePath = 'd:/Reptilian/health/xxxx/'

# Workbook and the single sheet that every row is written to
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('sheet1')

# Base URL of the site's hot-timeline endpoint (per the original comment it was
# found with the browser developer tools). The `{}` placeholder is filled with
# the page index; the query string must not contain spaces.
xxxxUrl = (
    'https://api.xxxxx.com/social/v2/timeline/hot?startPage={}&perPage='
    + str(perPage)
    + '&lastId=5aee63773c549f58fa1c3bb1'
)


# Request headers sent with every API call.
# Long values are built with implicit string concatenation: a backslash
# continuation inside a literal would embed the next line's indentation in the
# header value.
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;'
        'q=0.9,image/webp,image/apng,*/*;q=0.8'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}

def _postPhotos(user):
    """Return the column-3 values for one post, one entry per worksheet row.

    A post without an 'images' key gets a single empty cell. A post whose
    'images' list is empty falls back to its 'photo' field.
    """
    if 'images' not in user:
        return ['']
    imgs = user['images']
    if imgs:
        return list(imgs)
    return [user['photo']]


def _writePost(user, nickname, row):
    """Write one post starting at ``row`` and return the next free row.

    Columns: 0 nickname, 1 gender, 2 avatar, 3 image, 4 creation time,
    5 post text. A post with several images spans one row per image, and
    every column except 3 is merged across those rows.
    """
    author = user['author']
    cells = (
        (0, nickname),
        (1, author['gender']),
        (2, author['avatar']),
        (4, getCorrectDate(str(user['created']))),
        (5, getCorrectContent(user['content'])),
    )
    photos = _postPhotos(user)
    lastRow = row + len(photos) - 1

    for col, value in cells:
        if lastRow > row:
            worksheet.write_merge(row, lastRow, col, col, value)
        else:
            worksheet.write(row, col, value)
    for offset, photo in enumerate(photos):
        worksheet.write(row + offset, 3, photo)
    return lastRow + 1


def getUserInfo(page):
    """Fetch one page of the hot-timeline feed and append it to the worksheet.

    Args:
        page: Page index substituted into the ``startPage`` query parameter
            of ``xxxxUrl``.

    Returns:
        None. Rows are written to the module-level ``worksheet`` and the
        module-level ``count`` (next free row) is advanced. Nothing is
        written when the response's success flag is falsy.
    """
    global count

    url = xxxxUrl.format(str(page))
    # The message is pure ASCII, so the Python 2 UTF-8 -> GBK re-encoding the
    # script used for console output would be a no-op and is omitted.
    print('crawl content (each' + str(perPage) + 'section) Address:' + url)

    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req, context=context)
    try:
        result = resp.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails
        resp.close()
    jsonData = json.loads(result)

    # NOTE(review): the key names 'ok' and 'data' are reconstructed from a
    # garbled source listing -- confirm against a real API response.
    if not jsonData['ok']:
        print('interface call failed')
        return

    for user in jsonData['data']:
        nickname = getCorrectName(user['author']['username'])
        if nickname == '':
            # NOTE(review): this abandons the rest of the page at the first
            # post whose nickname sanitises to ''. Kept as in the original;
            # `continue` may have been intended -- confirm.
            break
        count = _writePost(user, nickname, count)

def mkdir(path):
    """Create directory ``path``, including missing parents, if it is absent.

    Surrounding whitespace and trailing backslashes are stripped from
    ``path`` first. When the path already exists a notice is printed and
    nothing else happens.

    Args:
        path: Directory path to create.
    """
    path = path.strip().rstrip("\\")
    if os.path.exists(path):
        # ASCII-only message: no console re-encoding is needed
        print('directory already exists, do not need to be repeated to create!')
        return
    os.makedirs(path)

def getCorrectName(name):
    """Return ``name`` with only Chinese characters, digits and ASCII letters.

    Every character outside U+4E00-U+9FA5, 0-9, A-Z and a-z is removed. The
    character class is negated so that those characters are the ones retained,
    which is what the caller relies on: it writes the result as the nickname
    and treats an empty result as an unusable name.

    Args:
        name: Raw nickname text.

    Returns:
        The sanitised nickname; '' when no character qualifies.
    """
    # Non-raw unicode literal so the \u escapes are resolved by the string
    # literal itself, independent of the regex engine's escape support.
    return re.sub(u'[^\u4e00-\u9fa50-9A-Za-z]', '', name)

def getCorrectContent(content):
    """Return the post text with '#', '@', 'xxxx' and 'XXXX' removed.

    Args:
        content: Raw text of a post.

    Returns:
        ``content`` with every occurrence of those four tokens deleted.
    """
    return re.sub(r'#|xxxx|XXXX|@', '', content)

def getCorrectDate(time):
    """Turn a timestamp like '2018-10-31T08:15:30.123Z' into '2018-10-31 08:15:30'.

    Every 'T' is replaced by a space, then a trailing 'Z' is dropped together
    with the fractional seconds in front of it, if there are any. Text
    without a trailing 'Z' only has its 'T' characters replaced.

    Args:
        time: Timestamp text (presumably ISO 8601 in UTC -- confirm against
            the API). The name shadows the ``time`` module inside this
            function only.

    Returns:
        The reformatted timestamp string.
    """
    spaced = time.replace('T', ' ')
    # The fractional part is optional so that '...:30Z' is cleaned as well
    return re.sub(r'(?:\.[0-9]*)?Z$', '', spaced)


if __name__ == '__main__':
    # Make sure the output directory exists before anything is saved
    mkdir(filePath)

    # Column titles go into row 0; the literals are ASCII, so they are written
    # directly as unicode text without any GBK round trip.
    columnTitles = (
        u'Nickname',
        u'gender',
        u'user Profile',
        u'user Photo',
        u'dynamically created time',
        u'dynamic user',
    )
    for column, title in enumerate(columnTitles):
        worksheet.write(0, column, title)

    try:
        # Pages 0..199 are requested unconditionally
        for i in range(0, 200):
            getUserInfo(i)
    finally:
        # Save whatever was collected, even if a page request raised
        workbook.save(filePath + 'content.xls')

Guess you like

Source: blog.csdn.net/lierwang2017/article/details/83585754