【Python】Python 爬虫实战之某1job职位信息爬取

#encoding=utf-8
import requests
import re
from lxml import etree
# Accumulated result rows; each row is
# [job name, company, location, salary, <detail-page fields...>].
infos=[]
# Progress counter: number of "next page" links followed so far.
count=0

def getHtmlContext(addr):
	"""Fetch *addr* over HTTP and return the page as a parsed lxml HTML tree.

	The target site serves GBK-encoded pages, so the raw response bytes are
	decoded explicitly before parsing.

	Fixes vs. original:
	- dropped the dead ``a.encoding='gbk'`` assignment (we decode ``.content``
	  bytes ourselves, so ``response.encoding`` was never used);
	- added a timeout so one hung request cannot stall the whole crawl;
	- decode with ``errors="replace"`` so a single bad byte no longer aborts
	  the run with UnicodeDecodeError.
	"""
	resp = requests.get(addr, timeout=30)
	return etree.HTML(resp.content.decode("gbk", errors="replace"))

def parseListHtml(htm):
	"""Walk the 51job search-result pages starting at *htm*, appending one
	row per posting to the global ``infos`` list.

	Each row is [job name, company, location, salary] plus the fields
	returned by parseDetailHtml() for that posting's detail page.

	Fixes vs. original:
	- location/salary were taken as ``ad[0]``/``sal[0]`` once per page, so
	  EVERY job on a page got the first row's location and salary; they are
	  now indexed per row (same "default"/"0" fallbacks when missing);
	- pagination recursion replaced by a loop, so a long result set cannot
	  hit Python's recursion limit.
	"""
	global infos, count
	base = ".//div[@class='dw_table']/div[@class='el']"
	while htm is not None:
		# Parallel lists: detail-page links, job titles, company names.
		nextUrl = htm.xpath(base + "/p/span/a/@href")
		jobName = htm.xpath(base + "/p/span/a/@title")
		compName = htm.xpath(base + "/span/a/@title")
		# Per-row location (t3) and salary (t4) texts.
		ad = htm.xpath(base + "/span[@class='t3']/text()")
		sal = htm.xpath(base + "/span[@class='t4']/text()")
		for i, name in enumerate(jobName):
			addr = ad[i] if i < len(ad) else "default"
			salary = sal[i] if i < len(sal) else "0"
			# Follow the posting link for experience/education/description.
			others = parseDetailHtml(getHtmlContext(nextUrl[i]), i)
			infos.append([name, compName[i], addr, salary] + others)
		# "Next page" link lives in the last <li> of the pager; stop when absent.
		np = htm.xpath(".//div[@class='dw_page']/div/div/div/ul/li[last()]/a/@href")
		if not np:
			break
		count += 1
		print(count)
		htm = getHtmlContext(np[0])
	# print(infos)

def parseDetailHtml(htm, ind):
	"""Extract the *ind*-th posting's detail-page fields.

	Returns a flat list: the requirement fields (experience, education, ...)
	split from the header's ``title`` attribute, plus one cleaned-up job
	description string.  When the page has no recognisable header a
	single-element placeholder list is returned so the caller's CSV row
	still lines up.

	Fixes vs. original:
	- the empty branch returned ``[[" "]]`` (a nested list) while the normal
	  branch returned a flat list, which put a raw Python list inside the
	  CSV row after the caller's concatenation; now returns ``[" "]``;
	- renamed the local ``requests`` variable, which shadowed the imported
	  ``requests`` module.
	"""
	exp = htm.xpath(".//div[@class='cn']/p[@class='msg ltype']/@title")
	if not exp:
		return [" "]
	# Header fields are separated by non-breaking spaces around a pipe.
	fields = exp[0].split("\xa0\xa0|\xa0\xa0")
	# Job-description text nodes.
	desc = htm.xpath(".//div[@class='bmsg job_msg inbox']/*/text()")
	# Strip whitespace, brackets and quote noise from the stringified list
	# (same character class as the original regex, behavior preserved).
	return fields + [re.sub(r"[\\r\\n[\s\]'【】]", "", str(desc))]

	
	

def dataWriteCsv(path="e:/job.csv", rows=None):
	"""Append the collected job rows to *path* as CSV.

	Parameters
	----------
	path : str
		Output CSV file; the original hard-coded location is kept as the
		default for backward compatibility.
	rows : list | None
		Rows to write; defaults to the global ``infos``.

	Fix vs. original: the file was opened with the platform's locale
	encoding, which raises UnicodeEncodeError for the Chinese job data on
	non-GBK locales; utf-8-sig is explicit and lets Excel open it correctly.
	"""
	import csv
	if rows is None:
		rows = infos
	# newline='' stops the csv module doubling line endings on Windows.
	with open(path, "a", newline="", encoding="utf-8-sig") as fh:
		writer = csv.writer(fh)
		for row in rows:
			writer.writerow(tuple(row))



if __name__ == '__main__':
	# Entry point: crawl the 51job search results for the keyword
	# (URL-encoded "电竞"), then dump the accumulated rows to CSV.
	start_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%2594%25B5%25E7%25AB%259E,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
	first_page = getHtmlContext(start_url)
	parseListHtml(first_page)
	dataWriteCsv()

发布了94 篇原创文章 · 获赞 110 · 访问量 5037

猜你喜欢

转载自blog.csdn.net/beautiful_huang/article/details/103994681