Python + PyQuery: scraping company data from Baidu Enterprise Credit (xin.baidu.com)

Main steps:
1. Build the list of company names you want to crawl.
2. For each name, request its search URL on Baidu Enterprise Credit (xin.baidu.com).
Note: the site's anti-crawling measures are aggressive. Sleep a random few seconds between requests; catch exceptions when a request fails so the crawl keeps running; and if you use proxies, only paid ones are reliable. A sketch of these precautions follows right below.
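To make that note concrete, here is a minimal sketch of the three precautions (random pauses, exception handling, a paid proxy). PROXY_URL and fetch_with_retry are illustrative placeholders, not part of the original script:

import random
import time

import requests

PROXY_URL = 'http://user:password@proxy.example.com:8000'  # placeholder for your paid proxy

def fetch_with_retry(url, headers, max_retries=3):
    # Send the request through the paid proxy; on failure, sleep a random
    # few seconds and try again so the crawl can keep going.
    proxies = {'http': PROXY_URL, 'https': PROXY_URL}
    for attempt in range(max_retries):
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=10)
        except requests.RequestException as e:
            print('attempt', attempt + 1, 'failed:', e)
            time.sleep(random.randint(6, 9))
    return None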

# coding=utf-8
import os
import random
import time

import requests
from pyquery import PyQuery as pq
import urllib.parse
import pandas as pd

from utils.read_write import writeOneTxt, writeOneCsv


def write_cnames(cnames, filename):
    # Dump a list of company names to CSV.
    # '单位名称' ("company name") is kept as the column header used by the source data.
    df = pd.DataFrame(cnames, columns=['单位名称'])
    df.to_csv(filename, index=False, encoding='utf-8')

def search(cname, rest):
    try:
        headers1 = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/66.0.3359.181 Safari/537.36',
            'Referer': 'https://xin.baidu.com/',
        }
        base_url = 'https://xin.baidu.com/s?q={}&t=1'.format(urllib.parse.quote(cname))
        r1 = requests.get(base_url, headers=headers1)
        p = pq(r1.text)
        if p('em.zx-result-counter').text() == '0':  # zero hits: the company cannot be found
            print(cname + ', not found')
            writeOneCsv([cname, 'not found'], csv_dir + 'crawl_record.csv')
            # rest.append(cname)
        else:
            writeOneTxt(r1.text, saveDir + cname + '.Txt')
            print(cname + ', found')
            writeOneCsv([cname, 'found'], csv_dir + 'crawl_record.csv')
    except Exception as e:
        print(cname + ', blocked:', e)
        writeOneCsv([cname, 'blocked'], csv_dir + 'crawl_record.csv')
        rest.append(cname)  # keep blocked names so they can be retried later



if __name__ == '__main__':
    saveDir = 'D:\\data0\\'
    shiyanDir = 'D:\\data\\Json\\'
    csv_dir = 'D:\\dat\\'
    company_file = 'company.csv'
    save1 = pd.read_csv(csv_dir + company_file)
    # '企业名称' ("company name") is the column header in company.csv.
    cnames = list(set(save1['企业名称']))
    rest = []
    for cname in cnames:
        file = saveDir + cname + '.Txt'
        shiyan = shiyanDir + cname + '.Txt'
        # Crawl only names that are not saved yet but do appear in the shiyan directory.
        if not os.path.exists(file) and os.path.exists(shiyan):
            search(cname, rest)
            time.sleep(random.randint(6, 9))  # random pause against the anti-crawler
        else:
            print(cname + ', already saved')
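Note that writeOneTxt and writeOneCsv come from a utils.read_write module that the post does not show. A minimal sketch of what they might look like, assuming (from the call sites above) that writeOneTxt saves one page's text and writeOneCsv appends one row to the record CSV:

# utils/read_write.py -- assumed implementations, not the author's original
import csv

def writeOneTxt(text, filepath):
    # Write one page's HTML text to a file.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(text)

def writeOneCsv(row, filepath):
    # Append one row (a list of fields) to a CSV log.
    with open(filepath, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(row)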

If you need the data or help processing it, feel free to message me.

Reposted from blog.csdn.net/qq_30803353/article/details/107436259