# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import urllib
import urllib2
import shutil
from pyExcelerator import *
# SECURITY NOTE: certificate validation is deliberately switched off for
# every request made with this context; do not reuse it for sensitive traffic.
context = ssl._create_unverified_context()

# URL template of a listing page; ``{}`` is replaced by the page number.
baseUrl = 'https://www.qiushibaike.com/text/page/{}/'

# Regular expression matching a user's avatar <img> tag.  findall() yields
# 4-tuples: index 0 is the scheme-less avatar URL and index 3 is the nickname
# (the ``alt`` text); indexes 1 and 2 are helper groups.
# NOTE(review): this pattern was reconstructed after stray spaces had been
# injected into the literal -- confirm it against the live page markup.
pattern = (r'<img src="//([^\s;]+/(\w|/)*(\.jpg|\.JPEG)'
           r'\?imageView2/1/w/90/h/90)" alt="(.*?)">')

# Root output directory; the spreadsheet is saved directly inside it.
resourcePath = 'D:/Reptilian/Health/Content/'

# Directory the downloaded avatar images are stored in.
imgPath = resourcePath + 'img/'
# Workbook and its single sheet (named '1') that collects the scraped rows.
w = Workbook()
ws = w.add_sheet('1')
# Next free sheet row (row 0 holds the column headers).  The same counter is
# also used to name the downloaded image files.
count = 1
# Request headers sent with each page fetch; the values imitate a desktop
# Chrome browser.
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': ('text/html,application/xhtml+xml,application/xml;'
               'q=0.9,image/webp,image/apng,*/*;q=0.8'),
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'),
}
def buildUrl(page):
    """Return the URL of the listing page to crawl.

    :param page: page number substituted into the ``baseUrl`` template.
    :return: the formatted URL string.
    """
    return baseUrl.format(page)
def getUserInfo(page):
    """Scrape one listing page and record every user found on it.

    For each avatar tag matched by ``pattern`` the avatar URL (column 1) and
    the nickname (column 0) are written to row ``count`` of ``ws``, the
    counter is advanced, and the avatar is downloaded via ``downloadImg``.

    :param page: number of the listing page to fetch.
    """
    global count

    url = buildUrl(page)
    print(url)
    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req, context=context)
    try:
        html = resp.read()
    finally:
        # Release the connection even when reading the body fails.
        resp.close()

    userList = re.findall(pattern, html)
    # The summary line is re-encoded from UTF-8 to GBK before printing
    # (presumably for a GBK Windows console -- confirm).
    print(('第' + str(page) + '页动态条数:' + str(len(userList))).decode('utf-8').encode('gbk'))

    for user in userList:
        imgUrl = 'http://' + user[0]
        ws.write(count, 1, imgUrl)
        # Decode straight to unicode: a detour through GBK adds nothing and
        # raises UnicodeEncodeError for nicknames GBK cannot represent.
        ws.write(count, 0, user[3].decode('utf-8'))
        # Statement order kept as-is: downloadImg names the file after the
        # already-incremented counter.
        count += 1
        downloadImg(imgUrl)
def mkDir(path):
    """Create directory ``path`` (with missing parents) unless it exists.

    Surrounding whitespace and trailing backslashes are removed from ``path``
    before it is used.  When the path already exists nothing is created and
    a notice is printed instead.

    :param path: directory to create.
    """
    path = path.strip().rstrip("\\")
    if os.path.exists(path):
        print('directory exists!')
        return
    os.makedirs(path)
def downloadImg(url):
    """Download the image at ``url`` into the ``imgPath`` directory.

    The file is named ``<count>.jpg`` after the current value of the
    module-level ``count``.

    :param url: URL of the image to fetch.
    """
    # Both globals are only read here, so no ``global`` statement is needed.
    urllib.urlretrieve(url, imgPath + str(count) + '.jpg')
# One-time setup: make sure the output directories exist and write the
# header row (nickname, avatar URL).  None of this depends on the page
# number, so it is done once before the crawl loop.
mkDir(resourcePath)
mkDir(imgPath)
ws.write(0, 0, '用户昵称'.decode('utf-8'))
ws.write(0, 1, '用户头像地址'.decode('utf-8'))

# Crawl listing pages 1 through 5.
for i in range(1, 6):
    getUserInfo(i)

# Persist everything that was collected.
w.save(resourcePath + 'content.xls')