python3爬虫例子01(获取个人博客园的粉丝)

#!/usr/bin/env python
# -*- coding:UTF-8 -*-

import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time


class GetFansName:
    """Scrape the follower ("fans") names of a cnblogs user into a text file.

    Cookies are taken from a Firefox session started with an existing
    profile and copied into a ``requests`` session, which is then used to
    fetch the follower pages.

    Every public method is best-effort: failures are printed and do not
    propagate to the caller.
    """

    # Number of followers listed per page, as assumed by the original
    # pagination arithmetic.
    PAGE_SIZE = 45

    def __init__(self, profiles, url, ses, sleepTime, fansNameFile):
        """Store the configuration values.

        Args:
            profiles: Path of the Firefox profile directory to load.
            url: Base URL of the user's home page (no trailing slash).
            ses: ``requests`` session used for the follower requests.
            sleepTime: Seconds to wait after the browser opens the page.
            fansNameFile: Path of the file follower names are appended to.
        """
        self.profiles = profiles
        self.url = url
        self.ses = ses
        self.sleepTime = sleepTime
        self.fansNameFile = fansNameFile

    @classmethod
    def _page_count(cls, fan_total):
        """Return how many pages are needed for ``fan_total`` followers.

        Uses ceiling division so an exact multiple of ``PAGE_SIZE`` does not
        yield a trailing empty page; the result is never less than 1.
        """
        return max(1, (fan_total + cls.PAGE_SIZE - 1) // cls.PAGE_SIZE)

    def _page_url(self, page):
        """Return the follower-list URL for the 1-based page number."""
        base = self.url + "/relation/followers"
        if page <= 1:
            return base
        return base + "?page=%s" % str(page)

    def get_cookies(self):
        """Open the followers page in Firefox and return the browser cookies.

        Returns:
            The list returned by ``driver.get_cookies()``, or ``None`` when
            anything fails (the error is printed).
        """
        driver = None
        try:
            # Load the existing Firefox profile.
            profiles = webdriver.FirefoxProfile(self.profiles)
            driver = webdriver.Firefox(profiles)
            driver.get(self.url + "/followers")

            # Wait before reading the cookies -- presumably so the page
            # (and its logged-in session) has finished loading.
            time.sleep(self.sleepTime)

            return driver.get_cookies()
        except Exception as msg:
            print("get_cookies error:%s" % str(msg))
            return None
        finally:
            # Always close the browser, even when loading the page failed,
            # so no Firefox process is left behind.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as msg:
                    print("get_cookies error:%s" % str(msg))

    def add_cookies(self, cookies):
        """Copy browser cookies into the ``requests`` session.

        Args:
            cookies: Iterable of dicts with ``"name"`` and ``"value"`` keys,
                as returned by ``get_cookies``. ``None`` (a failed
                ``get_cookies``) is treated as "no cookies".
        """
        try:
            jar = requests.cookies.RequestsCookieJar()
            for cookie in cookies or ():
                jar.set(cookie["name"], cookie["value"])

            # Update the session's cookies.
            self.ses.cookies.update(jar)
        except Exception as msg:
            print("add_cookies error:%s" % str(msg))

    def get_fansNum(self):
        """Return the number of follower pages.

        Reads the follower total from the element with class
        ``current_nav`` on the followers page and converts it to a page
        count.

        Returns:
            The page count (at least 1). Returns 1 when the request or the
            parsing fails (the error is printed).
        """
        try:
            # Request the follower list.
            fansres = self.ses.get(self.url + "/relation/followers")
            fanssoup = BeautifulSoup(fansres.content, "html.parser")

            # Extract the follower total from the navigation label.
            # get_text() is used instead of .string because .string is None
            # when the tag contains nested markup.
            tempfansnum = fanssoup.find_all(class_="current_nav")
            strfansnum = re.findall(
                r"我的粉丝\((.+?)\)", tempfansnum[0].get_text()
            )
            print(u"我的粉丝数量:%s" % str(strfansnum[0]))

            # Number of follower pages.
            fansnum = self._page_count(int(strfansnum[0]))
            print(u"总的分页:%s" % str(fansnum))

            return fansnum
        except Exception as msg:
            print("get_fansNum error:%s" % str(msg))
            return 1

    def get_fansName(self, fansnum):
        """Fetch one follower page and append the names to the output file.

        Args:
            fansnum: 1-based page number; values <= 1 fetch the first page.
        """
        try:
            url_page = self._page_url(fansnum)
            print("正在抓取页面:%s" % url_page)

            # TLS verification is left enabled (the original passed
            # verify=False here only): the session carries login cookies,
            # and get_fansNum already talks to the same host with
            # verification on.
            fansnameres = self.ses.get(url_page)
            fansnamesoup = BeautifulSoup(fansnameres.content, "html.parser")

            names = [
                tag.get_text().replace("\n", " ").strip(" ")
                for tag in fansnamesoup.find_all(class_="avatar_name")
            ]

            # Write all names of this page with a single open/close.
            if names:
                with open(self.fansNameFile, 'a', encoding="utf-8") as file:
                    file.writelines(name + "\n" for name in names)
        except Exception as msg:
            print("get_fansName error:%s" % str(msg))


if __name__ == '__main__':
    # Path of the Firefox profile directory to load.
    profiles = r"C:\Users\Administrator\AppData\Roaming\Mozilla\Firefox\Profiles\wv0f79j4.default"

    # Home-page URL of the user whose followers are scraped.
    url = "https://home.cnblogs.com/u/NiceTime"

    # File the follower names are appended to.
    fansNameFile = "fansNameFile.txt"

    # Seconds to wait after the browser has been opened.
    sleepTime = 5

    # Use the session as a context manager so its pooled connections are
    # closed when the script finishes.
    with requests.Session() as ses:
        fansName = GetFansName(profiles, url, ses, sleepTime, fansNameFile)

        # Copy the browser's cookies into the requests session.
        cookies = fansName.get_cookies()
        fansName.add_cookies(cookies)

        # Fetch every follower page, starting at page 1.
        fansNums = fansName.get_fansNum()
        for fansNum in range(1, fansNums + 1):
            fansName.get_fansName(fansNum)



猜你喜欢

转载自www.cnblogs.com/NiceTime/p/10070139.html