selenium + BeautifulSoup + requests: scraping my cnblogs followers
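The approach: selenium launches Chrome with a user profile that is already logged in to cnblogs, reads the login cookies out of the browser, then hands them to a requests session so the rest of the scraping runs without a browser; BeautifulSoup parses the returned HTML.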

#coding:utf-8
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import re
import time

# path to the Chrome user-data directory (a profile already logged in to cnblogs)
user_data_dir = r"--user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data\Default"
# load the profile into ChromeOptions
option = webdriver.ChromeOptions()
option.add_argument(user_data_dir)

driver = webdriver.Chrome(options=option)  # chrome_options= is deprecated in newer Selenium
driver.get("https://home.cnblogs.com/u/xiaohuhu/followers/")
time.sleep(3)  # crude wait for the page (and login state) to load
cookies = driver.get_cookies()  # export the logged-in session cookies
print(cookies)
driver.quit()

# create a requests session and copy the browser cookies into it
s = requests.session()
c = requests.cookies.RequestsCookieJar()
for i in cookies:
  c.set(i["name"], i["value"])
s.cookies.update(c)
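# Note: only name/value pairs are copied; RequestsCookieJar.set() fills in
# default domain/path, which is enough here since every request goes to cnblogs.com.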

# GET the followers page with the authenticated session
r1 = s.get("https://home.cnblogs.com/u/xiaohuhu/followers/")
soup = BeautifulSoup(r1.content, "html.parser")
# grab the follower-count nav element; find_all returns a list, take the first
fensinub = soup.find_all(class_="current_nav")
print(fensinub[0].string)

# regex the count out of text like "我的粉丝(3)" ("my followers (3)"); returns a list, take the first
num = re.findall(r"我的粉丝\((.+?)\)", fensinub[0].string)
print(num[0])

# work out the number of pages (45 followers per page), for when there is more than one page...
#ye = int(int(num[0])/45) + 1
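# Note: the formula above over-counts by one page when the total is an exact
# multiple of 45 (e.g. 45 followers -> 2). Ceiling division avoids the edge case:
#ye = -(-int(num[0]) // 45)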

# scrape the follower names and append them to a text file
fensi = soup.find_all(class_="avatar_name")
for i in fensi:
  name = i.string.replace("\n","").replace(" ","")
  print(name)
  with open("d:\\name.txt","a",encoding="utf-8") as f:
    f.write(name+"\n")
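# Two caveats with the loop above: Tag.string is None when the tag has child
# elements, so i.get_text() is the safer call; and opening name.txt once before
# the loop would avoid reopening the file for every follower.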

'''
# scrape page 2 onward, if there is more than one page
# (note: the original sample points at a different user, yoyoketang, here)
for page in range(2, ye+1):
  r2 = s.get("https://home.cnblogs.com/u/yoyoketang/relation/followers?page=%s"%str(page))
  soup = BeautifulSoup(r2.content, "html.parser")  # parse the new page, not r1
  # grab the follower names on this page
  fensi = soup.find_all(class_="avatar_name")
  for i in fensi:
    name = i.string.replace("\n", "").replace(" ","")
    print(name)
    with open("name.txt","a",encoding="utf-8") as f:
      f.write(name+"\n")
'''
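# To paginate, uncomment the ye calculation above and strip the triple quotes
# around this block.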

Reposted from www.cnblogs.com/xiaohuhu/p/9392858.html