Baijiahao crawler (collecting creator appids by category)

This is a learning exercise in crawling and data analysis; the page-parsing approach is rather clumsy and is kept here for the record.

Because of a Baidu restriction, at most 760 ids can be collected per category. The script has three parts: name_first() records the first account name returned for a category; get_appid() then pages through Baidu's mobile search results ten accounts at a time via the pn offset, stopping when that first name reappears (once the results are exhausted, Baidu wraps back to the first page); appid_list_excel() writes the collected rows to an .xlsx file.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


from urllib.parse import quote
from urllib import request
from bs4 import BeautifulSoup
from urllib import error
from openpyxl import Workbook
import time

# A few User-Agent strings to rotate between requests
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
]


# Once the account list is exhausted, Baidu's results wrap back to the first page,
# so record the first account name and use it as the stop marker.
def name_first(field):
    url = ('https://www.baidu.com/sf?word=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'
           + quote(field) + '&pd=cambrian_list&atn=index&title=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'
           + quote(field) + '&lid=9080249029523443283&ms=1&frsrcid=206&frorder=1&pn=0&data_type=json%20---------------------%20')
    req = request.Request(url, headers=hds[0])
    response = request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(response, 'lxml')
    name_1 = soup.find('div', class_='c-color-link c-font-big sfc-cambrian-list-subscribe-title c-line-clamp1').string.strip()
    print(name_1)
    return name_1
    
def appid_list_excel(appid_list, field):
    # Dump the collected rows into <field>.xlsx with a header row.
    wb = Workbook()
    ws = wb.active
    ws.append(['name', 'field', 'appid', 'smallfont', 'vip_info'])
    for row in appid_list:
        ws.append(row)
    wb.save(field + '.xlsx')


# Collect Baijiahao account info for one category from Baidu mobile search.
def get_appid(field, name_1):
    # pn=<number> in the URL is the result offset; each XHR response returns
    # 10 accounts starting at pn, so the offset advances in steps of 10.
    number = 0
    appid_list = []
    name = 'name'

    while number <= 10000 and name != name_1:

        url = ('https://www.baidu.com/sf?word=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'
               + quote(field) + '&pd=cambrian_list&atn=index&title=%E7%99%BE%E5%AE%B6%E5%8F%B7%2B'
               + quote(field) + '&lid=9080249029523443283&ms=1&frsrcid=206&frorder=1&pn='
               + str(number) + '&data_type=json%20---------------------%20')

        subscribes = []  # keep the loop below safe if the request fails
        try:
            req = request.Request(url, headers=hds[number % len(hds)])
            response = request.urlopen(req).read().decode('utf-8')
            soup = BeautifulSoup(response, 'lxml')
            subscribes = soup.find_all('div', class_='sfc-cambrian-list-subscribe')
        except error.HTTPError as e:
            print('HTTPError')
            print(e.code)
        except error.URLError as e:
            print('URLError')
            print(e.reason)

        for subscribe in subscribes:
            smallfont = subscribe.find('div', class_='c-font-small c-gray c-line-clamp1').string.strip()
            name = subscribe.find('div', class_='c-color-link c-font-big sfc-cambrian-list-subscribe-title c-line-clamp1').string.strip()
            img_info = subscribe.find_all('img')  # the appid and VIP flag are embedded in the image URLs
            try:
                appid_info = str(img_info[0])
                appid = appid_info[appid_info.find('_') + 1:appid_info.find('.jpeg')]
            except IndexError:
                appid = 'missing'
            try:
                vip_info = str(img_info[1])[str(img_info[1]).find('vip'):str(img_info[1]).find('vip') + 5]
            except IndexError:
                vip_info = 'none'
            if number >= 10 and name == name_1:  # first name reappeared: results wrapped around
                break
            appid_list.append([name, field, appid, smallfont, vip_info])

        number += 10
        print('%s==%d' % (field, number))
        time.sleep(1)  # pause between requests to be polite

    return appid_list
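
# Note: slicing the <img> tag string between '_' and '.jpeg' (above) is brittle; if
# the avatar filename format changes, the appid comes out wrong. A regex alternative,
# as a sketch (hypothetical pattern, assuming the appid is the chunk between the
# final '_' and the file extension; would also need `import re` at the top):
#     m = re.search(r'_([^_.]+)\.jpe?g', str(img_info[0]))
#     appid = m.group(1) if m else 'missing'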

if __name__ == '__main__':
    # Category keywords to crawl; the commented-out lists cover the other categories.
#    field_list = ['娱乐','体育','财经']
#    field_list = ['人文','科技','互联网','数码','社会']
#    field_list = ['汽车','房产','旅游','女人','情感','时尚','星座','美食','生活']
#    field_list = ['育儿','影视','音乐','动漫','搞笑','教育','文化','宠物','游戏','家居']
#    field_list = ['悦读','艺术','摄影','健康','养生','科学','三农','职场','综合','百科','学术']
    field_list = ['其它']
    for field in field_list:
        name_1 = name_first(field)
        appid_list = get_appid(field, name_1)
        appid_list_excel(appid_list, field)
    print('ok')
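
To sanity-check a run, the workbook can be read back with openpyxl (a minimal sketch, assuming the run above produced 其它.xlsx in the working directory):

from openpyxl import load_workbook

wb = load_workbook('其它.xlsx')
ws = wb.active
for row in ws.iter_rows(min_row=2, max_row=6, values_only=True):
    print(row)  # (name, field, appid, smallfont, vip_info)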
