# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import json
import time
import xlwt
import urllib
import urllib2
import shutil
from pyExcelerator import *
# Certificate validation is deliberately disabled for requests made with this
# context.
# NOTE(review): an unverified context leaves the HTTPS connection open to
# man-in-the-middle attacks; switch to ssl.create_default_context() if the
# endpoint's certificate chain validates.
context = ssl._create_unverified_context()
# Number of posts requested per page (up to 60)
perPage = 60
# Worksheet row index of the next record to write; starts at 1
count = 1
# Create the workbook and the single sheet every record is written to
import xlwt
# File storage path
filePath = 'd:/Reptilian/health/xxxx/'
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('sheet1')
# Crawler base address: the site's "hot" timeline endpoint, found by inspecting
# the web page's requests with the browser developer tools. The `{}`
# placeholder is filled with the page index when a request is made.
# The query string must not contain spaces, otherwise the URL is invalid.
xxxxUrl = (
    'https://api.xxxxx.com/social/v2/timeline/hot?startPage={}&perPage='
    + str(perPage)
    + '&lastId=5aee63773c549f58fa1c3bb1'
)
# Request header definition.
# Long values are built with implicit string concatenation so that
# re-indenting this literal can never inject whitespace into a header value.
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;'
        'q=0.9,image/webp,image/apng,*/*;q=0.8'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}
# Get user information
def getUserInfo(page):
    """Fetch one page of the hot timeline and append its posts to the worksheet.

    Every post produces one worksheet row per uploaded image. When a post has
    several images, the name/gender/avatar/date/content cells are merged
    vertically across those rows and each image goes into column 3 of its own
    row. A post with an empty image list falls back to ``user['photo']``; a
    post without an ``images`` key gets an empty photo cell. The module-level
    ``count`` row cursor is advanced once per row written.

    Posts whose author name is empty after ``getCorrectName`` are skipped.

    Args:
        page: Page index substituted into ``xxxxUrl``.

    Returns:
        None. Returns early without writing anything when the response's
        success flag is falsy.
    """
    global count

    url = xxxxUrl.format(str(page))
    message = 'crawl content (each' + str(perPage) + 'section) Address:' + url
    print(message.decode('utf-8').encode('gbk'))

    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req, context=context)
    try:
        result = resp.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        resp.close()
    jsonData = json.loads(result)

    # NOTE(review): the 'ok' / 'data' key spellings were reconstructed from a
    # garbled original -- confirm against a real API response.
    if not jsonData['ok']:
        # The interface call failed
        print('interface call failed!'.decode('utf-8').encode('gbk'))
        return

    # The interface call succeeded; tolerate a null/empty payload.
    for user in jsonData['data'] or []:
        author = user['author']
        name = getCorrectName(author['username'])
        if name == '':
            # Skip only this post; the rest of the page is still processed.
            continue

        # Collect the list of uploaded images (one worksheet row each).
        if 'images' not in user:
            photos = ['']
        elif user['images']:
            photos = user['images']
        else:
            photos = [user['photo']]

        lastRow = count + len(photos) - 1
        cells = (
            (0, name),
            (1, author['gender']),
            (2, author['avatar']),
            (4, getCorrectDate(str(user['created']))),
            (5, getCorrectContent(user['content'])),
        )
        for col, value in cells:
            if lastRow > count:
                # Several images: span the shared cells over all image rows.
                worksheet.write_merge(count, lastRow, col, col, value)
            else:
                worksheet.write(count, col, value)

        for photo in photos:
            worksheet.write(count, 3, photo)
            count += 1
# Create the given directory if it does not exist yet
def mkdir(path):
    """Create *path*, including missing parent directories, if it is absent.

    Surrounding whitespace and trailing backslashes are removed from *path*
    before it is checked. If the directory already exists, a notice is printed
    and nothing is created.

    Args:
        path: Directory path to create.
    """
    path = path.strip()
    path = path.rstrip("\\")
    if os.path.exists(path):
        print('directory already exists, do not need to be repeated to create!'.decode('utf-8').encode('gbk'))
        return
    os.makedirs(path)
# Sanitize a name, retaining only Chinese characters, digits and English letters
def getCorrectName(name):
    """Return *name* with every disallowed character removed.

    Allowed characters are U+4E00-U+9FA5 (CJK ideographs), ``0-9``, ``A-Z``
    and ``a-z``. Everything else (spaces, punctuation, symbols) is dropped,
    so the result may be an empty string.

    Args:
        name: Text to sanitize (expected to be unicode text).

    Returns:
        The sanitized text.
    """
    # A non-raw unicode literal is used so the \uXXXX escapes are resolved by
    # the Python parser itself; the leading ^ negates the class so that the
    # listed ranges are kept rather than deleted.
    return re.sub(
        u'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]', u'', name
    )
# Remove special tokens from a post's content
def getCorrectContent(content):
    """Return *content* with ``#``, ``@``, ``xxxx`` and ``XXXX`` removed.

    Args:
        content: Post text.

    Returns:
        The text with every occurrence of those tokens deleted.
    """
    # The alternatives must not be padded with spaces, otherwise only
    # space-surrounded tokens would match.
    return re.sub(r'#|xxxx|XXXX|@', '', content)
# Get the correct time format
def getCorrectDate(time):
    """Reformat a timestamp such as ``2018-05-06T02:11:35.123Z``.

    Every ``T`` is replaced by a space, then a trailing
    ``.<digits>Z`` suffix is removed, giving ``2018-05-06 02:11:35``.
    Input without that suffix only has its ``T`` characters replaced.

    Args:
        time: Timestamp string. The name shadows the ``time`` module inside
            this function only.

    Returns:
        The reformatted timestamp string.
    """
    spaced = time.replace('T', ' ')
    # NOTE(review): suffix pattern reconstructed from a garbled original --
    # confirm it matches the timestamps the API actually returns.
    return re.sub(r'\.[0-9]*Z$', '', spaced)
if __name__ == '__main__':
    # Create the output directory
    mkdir(filePath)
    # Create the table header (row 0), one title per column
    titles = (
        'Nickname',
        'gender',
        'user Profile',
        'user Photo',
        'dynamically created time',
        'dynamic user',
    )
    for col, title in enumerate(titles):
        # Decoding once yields the same unicode text as the former
        # utf-8 -> gbk -> unicode round trip.
        worksheet.write(0, col, title.decode('utf-8'))
    try:
        # Get user information page by page and write it to the sheet
        for i in range(0, 200):
            getUserInfo(i)
    finally:
        # Save the xls file even if a request fails part-way through, so the
        # rows collected so far are not lost.
        workbook.save(filePath + 'content.xls')