#encoding=utf-8
import requests
import re
from lxml import etree
infos=[]  # accumulated result rows: one list per job posting (filled by parseListHtml)
count=0   # progress counter: number of "next page" links followed so far
def getHtmlContext(addr, timeout=30):
    """GET *addr* and return the page parsed as an lxml HTML element tree.

    The target site serves GBK-encoded pages, so the raw bytes are decoded
    explicitly. *timeout* (seconds, new backward-compatible parameter) keeps
    requests.get from blocking forever — requests has no default timeout.
    """
    response = requests.get(addr, timeout=timeout)
    # Decode response.content directly; the original also set
    # response.encoding='gbk', but that only affects response.text,
    # which is never used here (dead code, removed).
    return etree.HTML(response.content.decode("gbk"))
def parseListHtml(htm):
    """Parse one 51job search-result page.

    Appends one row per listing to the global ``infos`` list
    ([job, company, location, salary, *detail-page fields]) and recurses
    into the next result page when the pager has a "next" link.
    """
    # Detail-page URL, job title and company name for every listing row.
    nextUrl = htm.xpath(".//div[@class='dw_table']/div[@class='el']/p/span/a/@href")
    jobName = htm.xpath(".//div[@class='dw_table']/div[@class='el']/p/span/a/@title")
    compName = htm.xpath(".//div[@class='dw_table']/div[@class='el']/span/a/@title")
    # Work location (span.t3) and salary (span.t4) text for each row;
    # either list may be shorter than jobName when cells are empty.
    ad = htm.xpath(".//div[@class='dw_table']/div[@class='el']/span[@class='t3']/text()")
    sal = htm.xpath(".//div[@class='dw_table']/div[@class='el']/span[@class='t4']/text()")
    global infos
    for i, v in enumerate(jobName):
        # BUG FIX: the original used ad[0]/sal[0] for every row, so each job
        # on a page got the first job's location and salary. Index per row,
        # falling back to the old placeholders when the cell is missing.
        # NOTE(review): if the table header also emits a t3/t4 text node the
        # indices would be off by one — confirm against a live page.
        addr = ad[i] if i < len(ad) else "default"
        salary = sal[i] if i < len(sal) else "0"
        # Fetch the listing's detail page and collect its extra fields.
        others = parseDetailHtml(getHtmlContext(nextUrl[i]), i)
        infos.append([v, compName[i], addr, salary] + others)
    # The last <li> of the pager carries the "next page" href when one exists.
    np = htm.xpath(".//div[@class='dw_page']/div/div/div/ul/li[last()]/a/@href")
    if np:
        global count
        count += 1
        print(count)  # crude progress output: pages followed so far
        parseListHtml(getHtmlContext(np[0]))
def parseDetailHtml(htm, ind):
    """Parse a 51job detail page into a flat list of strings.

    Returns the requirement fields split from the header's @title attribute
    (e.g. location / experience / education, separated by "\\xa0\\xa0|\\xa0\\xa0")
    followed by the whitespace-stripped job description. ``ind`` is the
    caller's row index (unused here; kept for interface compatibility).
    """
    exp = htm.xpath(".//div[@class='cn']/p[@class='msg ltype']/@title")
    if not exp:
        # No header on this page: return a flat placeholder. (The original
        # returned a nested [[" "]], producing CSV rows of inconsistent shape.)
        return [" "]
    fields = exp[0].split("\xa0\xa0|\xa0\xa0")
    # Job-description text nodes. (Originally bound to a local named
    # ``requests``, shadowing the imported module — renamed.)
    desc_parts = htm.xpath(".//div[@class='bmsg job_msg inbox']/*/text()")
    # BUG FIX: the old re.sub ran over str(list) with the class
    # r"[\\r\\n[\s\]'【】]", which contains the literal letters 'r' and 'n'
    # and therefore deleted every r/n from the description text. Join the
    # text nodes directly and strip only whitespace and bracket/quote noise.
    desc = re.sub(r"[\s'\[\]【】]", "", "".join(desc_parts))
    return fields + [desc]
def dataWriteCsv(rows=None, path="e:/job.csv"):
    """Append *rows* to *path* as CSV.

    Backward-compatible generalization: called with no arguments it behaves
    exactly as before (writes the global ``infos`` to e:/job.csv). ``rows``
    and ``path`` let callers/tests supply their own data and destination.
    ``newline=''`` is required so the csv module controls line endings.
    """
    import csv
    if rows is None:
        rows = infos  # module-level accumulator filled by parseListHtml
    with open(path, 'a', newline='') as file:
        writer = csv.writer(file)
        for info in rows:
            writer.writerow(tuple(info))
if __name__ == '__main__':
    # Search URL for keyword "电竞" (double URL-encoded), result page 1.
    # NOTE(review): "°reefrom=99" looks like a mangled "&degreefrom=99"
    # (HTML-entity corruption from the blog source) — verify the URL works.
    address = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%2594%25B5%25E7%25AB%259E,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    # Crawl all result pages (recursive), then dump the rows to CSV.
    parseListHtml(getHtmlContext(address))
    dataWriteCsv()
# 【Python】Python 爬虫实战之某1job职位信息爬取
# Adapted from: blog.csdn.net/beautiful_huang/article/details/103994681