主要步骤:
1、获取你需要爬取的公司名列表
2、根据公司名访问百度企信的url
注意:反爬虫很厉害,记得随机休息几秒;代码停了要捕获异常,那样才能持续不断地爬取数据;代理要付费代理才有用。
#coding=gbk
import os
import random
import time
import requests
from pyquery import PyQuery as pq
import urllib.parse
import pandas as pd
from utils.read_write import writeOneTxt, writeOneCsv
def write_cnames(cnames, filename):
    """Save the company-name list *cnames* to *filename* as a one-column UTF-8 CSV.

    The single column is headed '单位名称'; no index column is written.
    """
    pd.DataFrame({'单位名称': list(cnames)}).to_csv(
        filename, index=False, encoding='utf-8')
def search(cname, rest):
    """Fetch the xin.baidu.com search page for company *cname* and record the outcome.

    When the company is found, the raw HTML is saved to ``saveDir`` (module
    global); in every case a status row is appended to the crawl-log CSV in
    ``csv_dir``.  On any error (network failure, anti-crawler block, parse
    error) the name is appended to *rest* so the caller can retry it later.

    :param cname: company name to search for
    :param rest: list collecting names that failed and need a retry (mutated in place)
    """
    try:
        headers1 = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/66.0.3359.181 Safari/537.36',
            # BUG FIX: the header name was misspelled 'refer'; the real HTTP
            # header is 'Referer', so the server never received it.
            'Referer': 'https://xin.baidu.com/',
        }
        base_url = 'https://xin.baidu.com/s?q={}&t=1'.format(urllib.parse.quote(cname))
        # timeout added so a stalled connection cannot hang the crawl loop forever
        r1 = requests.get(base_url, headers=headers1, timeout=30)
        p = pq(r1.text)
        if p('em.zx-result-counter').text() == '0':  # zero hits -> company not in the index
            print(cname + ',未找到')
            writeOneCsv([cname, '未找到'], csv_dir + '企业爬取记录.csv')
        else:
            writeOneTxt(r1.text, saveDir + cname + '.Txt')
            print(cname + ',已找到')
            writeOneCsv([cname, '已找到'], csv_dir + '企业爬取记录.csv')
    except Exception as e:
        # Broad catch is deliberate: the crawl must survive anti-crawler blocks
        # and keep running.  Log the actual error instead of discarding it.
        print(cname + ',被屏蔽', e)
        writeOneCsv([cname, '被屏蔽'], csv_dir + '企业爬取记录.csv')
        rest.append(cname)
if __name__ == '__main__':
    # FIX: doubled every backslash — '\d' and '\J' in the originals were
    # invalid escape sequences (DeprecationWarning today, SyntaxError later).
    saveDir = 'D:\\data0\\'       # destination for fetched HTML pages
    shiyanDir = 'D:\\data\\Json\\'  # companies must already exist here to be crawled
    csv_dir = 'D:\\dat\\'         # crawl-log CSV directory + input CSV location
    # FIX: renamed from 'company' — the old name was immediately shadowed by
    # the loop variable below.
    company_file = 'company.csv'
    save1 = pd.read_csv(csv_dir + company_file)
    # de-duplicate the company names before crawling (order is not preserved)
    cnames = list(set(save1['企业名称']))
    rest = []  # collects names that were blocked, for a later retry
    for company in cnames:
        file = saveDir + company + '.Txt'
        shiyan = shiyanDir + company + '.Txt'
        # crawl only companies not yet saved locally but present in the Json dir
        if not os.path.exists(file) and os.path.exists(shiyan):
            search(company, rest)
            # random pause to reduce the chance of triggering the anti-crawler
            time.sleep(random.randint(6, 9))
        else:
            print(company + ',已存挡')
如需数据或帮忙处理数据请私聊我。。。