Python: crawling user avatars and nicknames

Disclaimer: this is the blogger's original article and may not be reproduced without permission. https://blog.csdn.net/lierwang2017/article/details/83585903

# -*- coding: utf-8 -*-
import os
import re
import sys
import ssl
import urllib
import urllib2
import shutil

from pyExcelerator import *

# --- Module-level configuration (Python 2 script) ---

# Disable TLS certificate validation so urlopen() accepts the site's cert.
# NOTE(review): this is insecure; acceptable only for a throwaway crawler.
context = ssl._create_unverified_context()

# Crawler base address; {} is replaced with the page number.
baseUrl = 'https://www.qiushibaike.com/text/page/{}/'

# Regex matching one avatar <img> tag:
#   group 1 -> avatar URL (scheme-less, starts after "//")
#   group 4 -> user nickname from the alt attribute
# (reconstructed from the garbled blog paste — verify against live markup)
pattern = r'<img src="//([^\s;]+\.(\w|/)*(jpg|JPEG)\?imageView2/1/w/90/h/90)" alt="(.*?)">'

# Root directory for crawled content.
resourcePath = 'D:/Reptilian/Health/Content/'
# Directory for downloaded avatar images.
imgPath = resourcePath + 'img/'

# Workbook collecting one (nickname, avatar URL) row per user.
w = Workbook()
ws = w.add_sheet('1')
count = 1  # next worksheet row to write (row 0 holds the headers)

# Request headers mimicking a desktop Chrome browser.  Values are built by
# adjacent-literal concatenation so no stray whitespace leaks into them
# (the original backslash-continued strings embedded the indentation).
headers = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;'
              'q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# Access to resources to address crawling
# Build the address of one listing page to crawl.
def buildUrl(page):
    """Return the crawl URL for the given 1-based page number."""
    return baseUrl.format(str(page))

# 获取用户信息方法
# Scrape one listing page: extract (avatar URL, nickname) pairs.
def getUserInfo(page):
    """Fetch listing page *page*, write each user's nickname and avatar URL
    into the worksheet, and download the avatar image.

    Increments the module-level row counter ``count`` once per user.
    Python 2 only (urllib2).  Raises urllib2.URLError on network failure.
    """
    url = buildUrl(page)
    print(url)
    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req, context=context)
    rStr = str(resp.read())
    userList = re.compile(pattern).findall(rStr)

    # decode/encode roundtrip renders the UTF-8 source text on a GBK console
    print(('第' + str(page) + '页动态条数:' + str(len(userList))).decode('utf-8').encode('gbk'))
    global count
    for user in userList:
        # user[0] is regex group 1 (scheme-less avatar URL); user[3] is
        # group 4 (the nickname from the alt attribute).
        imgUrl = 'http://' + user[0]
        ws.write(count, 1, imgUrl)
        ws.write(count, 0, user[3].decode('utf-8').encode('gbk').decode('gbk'))
        count += 1
        downloadImg(imgUrl)

# Create a directory to store
# Create a storage directory (named mkDir to match the call sites below).
def mkDir(path):
    """Create directory *path* (and any missing parents) if absent.

    Trailing backslashes are stripped first so os.makedirs does not see
    an empty final path component on Windows-style paths.
    """
    path = path.strip()
    path = path.rstrip('\\')

    if not os.path.exists(path):
        os.makedirs(path)
    else:
        print('directory exists: ' + path)
    
# Download one avatar image into imgPath, named after the current row counter.
def downloadImg(url):
    """Save the image at *url* as ``imgPath + '<count>.jpg'``.

    Reads the module-level ``count`` and ``imgPath``; Python 2 only
    (urllib.urlretrieve).
    """
    urllib.urlretrieve(url, imgPath + str(count) + '.jpg')


# Entry point: prepare output directories and the header row once, then
# crawl pages 1-5 and save the workbook.  (The original re-created the
# directories and rewrote the headers on every loop iteration.)
mkDir(resourcePath)
mkDir(imgPath)
ws.write(0, 0, u'用户昵称')      # column header: "user nickname"
ws.write(0, 1, u'用户头像地址')  # column header: "user avatar URL"
for i in range(1, 6):
    getUserInfo(i)
w.save(resourcePath + 'content.xls')

Related posts

Origin: blog.csdn.net/lierwang2017/article/details/83585903