Scraping User Avatars and Nicknames with Python

Copyright notice: this is an original article by the author and may not be reproduced without permission. https://blog.csdn.net/lierwang2017/article/details/83585903

# -*- coding: utf-8 -*-
# NOTE: this script targets Python 2 (urllib2, pyExcelerator)
import os
import re
import ssl
import urllib
import urllib2

from pyExcelerator import *

# Build an SSL context that skips certificate verification
context = ssl._create_unverified_context()

# Base URL of the listing pages; the page number is filled in by buildUrl()
baseUrl = 'https://www.qiushibaike.com/text/page/{}/'

# Regular expression that captures the avatar URL and the nickname (alt text)
pattern = r'<img src="//([^\s;]+\.(\w|/)*(.jpg|.JPEG)?\?imageView2/1/w/90/h/90)" alt="(.*?)">'
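# For illustration only (the host and path below are assumed, not taken from
# the original post): the pattern above is written to match an avatar tag of
# roughly this shape, and findall() then yields 4-tuples in which index 0 is
# the avatar URL without the scheme and index 3 is the nickname from alt:
#
#   <img src="//pic.qiushibaike.com/system/avtnew/xxxx.jpg?imageView2/1/w/90/h/90" alt="SomeUser">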

# Output directory for the crawl results (Excel file and images)
resourcePath = 'd:/Reptilian/health/content/'
# Directory where the downloaded avatar images are stored
imgPath = resourcePath + 'img/'

# Create the Excel workbook and worksheet; count tracks the current row
# (row 0 is reserved for the header)
w = Workbook()
ws = w.add_sheet('1')
count = 1

# Request headers that mimic a desktop Chrome browser
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': ('text/html,application/xhtml+xml,application/xml;'
               'q=0.9,image/webp,image/apng,*/*;q=0.8'),
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'),
}

# Build the full URL of the page to crawl
def buildUrl(page):
    return baseUrl.format(str(page))
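# e.g. buildUrl(1) returns 'https://www.qiushibaike.com/text/page/1/'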

# Fetch one listing page and write each user's nickname and avatar URL
# to the worksheet, downloading the avatar image as well
def getUserInfo(page):
    url = buildUrl(page)
    print(url)
    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req, context=context)
    rStr = resp.read()
    userList = re.compile(pattern).findall(rStr)

    print('Page ' + str(page) + ': ' + str(len(userList)) + ' posts found')
    global count
    for user in userList:
        imgUrl = 'http://' + user[0]
        ws.write(count, 1, imgUrl)
        # user[3] is the alt text (the nickname), utf-8 encoded in the page source
        ws.write(count, 0, user[3].decode('utf-8'))
        downloadImg(imgUrl)  # download first so the file name matches this row
        count += 1
        # print('Avatar URL: ' + imgUrl)
        # print('Nickname: ' + user[3])

# Create the storage directory if it does not already exist
def mkDir(path):
    path = path.strip()
    path = path.rstrip("\\")
    isExist = os.path.exists(path)

    if not isExist:
        os.makedirs(path)
    else:
        print('Directory already exists!')
    
# Download one avatar image; the file is named after the current worksheet row
def downloadImg(url):
    urllib.urlretrieve(url, imgPath + str(count) + '.jpg')


# Prepare the output directories and write the header row once
mkDir(resourcePath)
mkDir(imgPath)
ws.write(0, 0, 'Nickname')
ws.write(0, 1, 'Avatar URL')

# Crawl pages 1-5, saving the workbook after every page so partial
# results survive a failed request
for i in range(1, 6):
    getUserInfo(i)
    w.save(resourcePath + 'content.xls')
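
The script above targets Python 2, where the request logic lives in urllib2. As a rough sketch only, the same fetch-and-extract step could be written for Python 3 as follows; the simplified regular expression and the single User-Agent header are assumptions for illustration, not part of the original post:

# Python 3 sketch of the request step (assumed equivalent, for illustration)
import re
import ssl
import urllib.request

context = ssl._create_unverified_context()        # skip certificate checks
url = 'https://www.qiushibaike.com/text/page/1/'
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(req, context=context).read().decode('utf-8')

# Simplified pattern: capture the avatar URL and the alt text (nickname)
pattern = r'<img src="//([^\s;]+?\?imageView2/1/w/90/h/90)" alt="(.*?)">'
for img, nickname in re.findall(pattern, html):
    print(nickname, 'https://' + img)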
