Scraping User Avatars and Nicknames with Python

Copyright notice: this is an original article by the author and may not be reproduced without permission. https://blog.csdn.net/lierwang2017/article/details/83585903

# -*- coding: utf-8 -*-
# NOTE: this script targets Python 2 (urllib2, pyExcelerator)
import os
import re
import ssl
import urllib
import urllib2

from pyExcelerator import *

# Build an SSL context that skips certificate verification
context = ssl._create_unverified_context()

# Base URL of the listing pages; the page number is filled in by buildUrl()
baseUrl = 'https://www.qiushibaike.com/text/page/{}/'

# Regular expression that captures the avatar URL and the nickname (alt text)
pattern = r'<img src="//([^\s;]+\.(\w|/)*(.jpg|.JPEG)?\?imageView2/1/w/90/h/90)" alt="(.*?)">'
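# For illustration only (the host and path below are assumed, not taken from
# the original post): the pattern above is written to match an avatar tag of
# roughly this shape, and findall() then yields 4-tuples in which index 0 is
# the avatar URL without the scheme and index 3 is the nickname from alt:
#
#   <img src="//pic.qiushibaike.com/system/avtnew/xxxx.jpg?imageView2/1/w/90/h/90" alt="SomeUser">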

# Output directory for the crawl results (Excel file and images)
resourcePath = 'd:/Reptilian/health/content/'
# Directory where the downloaded avatar images are stored
imgPath = resourcePath + 'img/'

# Create the Excel workbook and worksheet; count tracks the current row
# (row 0 is reserved for the header)
w = Workbook()
ws = w.add_sheet('1')
count = 1

# Request headers that mimic a desktop Chrome browser
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': ('text/html,application/xhtml+xml,application/xml;'
               'q=0.9,image/webp,image/apng,*/*;q=0.8'),
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'),
}

# Build the full URL of the page to crawl
def buildUrl(page):
    return baseUrl.format(str(page))
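# e.g. buildUrl(1) returns 'https://www.qiushibaike.com/text/page/1/'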

# Fetch one listing page and write each user's nickname and avatar URL
# to the worksheet, downloading the avatar image as well
def getUserInfo(page):
    url = buildUrl(page)
    print(url)
    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req, context=context)
    rStr = resp.read()
    userList = re.compile(pattern).findall(rStr)

    print('Page ' + str(page) + ': ' + str(len(userList)) + ' posts found')
    global count
    for user in userList:
        imgUrl = 'http://' + user[0]
        ws.write(count, 1, imgUrl)
        # user[3] is the alt text (the nickname), utf-8 encoded in the page source
        ws.write(count, 0, user[3].decode('utf-8'))
        downloadImg(imgUrl)  # download first so the file name matches this row
        count += 1
        # print('Avatar URL: ' + imgUrl)
        # print('Nickname: ' + user[3])

# Create the storage directory if it does not already exist
def mkDir(path):
    path = path.strip()
    path = path.rstrip("\\")
    isExist = os.path.exists(path)

    if not isExist:
        os.makedirs(path)
    else:
        print('Directory already exists!')
    
# Download one avatar image; the file is named after the current worksheet row
def downloadImg(url):
    urllib.urlretrieve(url, imgPath + str(count) + '.jpg')


# Prepare the output directories and write the header row once
mkDir(resourcePath)
mkDir(imgPath)
ws.write(0, 0, 'Nickname')
ws.write(0, 1, 'Avatar URL')

# Crawl pages 1-5, saving the workbook after every page so partial
# results survive a failed request
for i in range(1, 6):
    getUserInfo(i)
    w.save(resourcePath + 'content.xls')
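
The script above targets Python 2, where the request logic lives in urllib2. As a rough sketch only, the same fetch-and-extract step could be written for Python 3 as follows; the simplified regular expression and the single User-Agent header are assumptions for illustration, not part of the original post:

# Python 3 sketch of the request step (assumed equivalent, for illustration)
import re
import ssl
import urllib.request

context = ssl._create_unverified_context()        # skip certificate checks
url = 'https://www.qiushibaike.com/text/page/1/'
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(req, context=context).read().decode('utf-8')

# Simplified pattern: capture the avatar URL and the alt text (nickname)
pattern = r'<img src="//([^\s;]+?\?imageView2/1/w/90/h/90)" alt="(.*?)">'
for img, nickname in re.findall(pattern, html):
    print(nickname, 'https://' + img)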
