一:URL 的处理
import urllib.request

from lxml import etree


def bo_url(url):
    """Open *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address to request.

    Returns:
        The object returned by ``urllib.request.urlopen``; it is handed back
        unread, so the caller is the one who reads it.
    """
    request = urllib.request.Request(
        url=url,
        headers={
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/66.0.3359.117 Safari/537.36"
            ),
        },
    )
    return urllib.request.urlopen(request)
二:爬取并匹配数据
def bo_spider(res):
    """Parse a job-list page and collect the text of each listed field.

    Args:
        res: Response-like object whose ``read()`` returns the page HTML.

    Returns:
        A dict mapping each field label to the list of text nodes found for
        it. Results from every matched ``<ul>`` are merged, so no list is
        lost when the page contains more than one. The dict is empty when no
        ``<ul>`` matches.
    """
    # (label, XPath relative to one job-list <ul>) pairs.
    field_xpaths = (
        ('职位类型', './li//h3//div[@class="job-title"]/text()'),  # job type
        ('待遇', './li//h3//span[@class="red"]/text()'),  # salary
        ('发布时间', './li//div[@class="info-publis"]//p/text()'),  # publish time
        ('公司名称', './li//div[@class="company-text"]//a/text()'),  # company name
        ('地点', './li//div[@class="info-primary"]//p/text()'),  # location
    )

    tree = etree.HTML(res.read())
    bo_dict = {}
    for job_ul in tree.xpath('//div[@class="job-list"]//ul'):
        found = {label: job_ul.xpath(path) for label, path in field_xpaths}
        print(found)
        # Extend instead of assigning: plain assignment kept only the fields
        # of the last <ul> and silently dropped every earlier one.
        for label, texts in found.items():
            bo_dict.setdefault(label, []).extend(texts)
    return bo_dict
三:用 JSON 存储匹配的数据
def xiazai(bo_dict):
    """Append *bo_dict* to ``boos.json`` as one JSON document per line.

    Args:
        bo_dict: JSON-serializable mapping of scraped fields.

    Returns:
        The JSON text of *bo_dict* (the record written, without the trailing
        newline).
    """
    bo_list = json.dumps(bo_dict)
    # Write the encoded text itself: passing it through json.dumps a second
    # time stores a quoted, escaped string instead of the object.
    # The file is opened in append mode, so each record ends with a newline
    # to keep the output of successive calls separable.
    with open("boos.json", "a", encoding="utf-8") as fp:
        fp.write(bo_list + "\n")
    return bo_list
四:控制函数
def main():
    """Ask for a job title, fetch the matching listing page and parse it.

    Returns:
        Whatever ``bo_spider`` returns for the fetched page.
    """
    from urllib.parse import quote

    work = input("请输入你要爬取的岗位名称")
    # Percent-encode the typed keyword: a raw non-ASCII title or one with
    # spaces cannot be placed in the request URL as-is.
    url = (
        "https://www.zhipin.com/job_detail/?query="
        + quote(work, safe="")
        + "%E7%88%AC%E8%99%AB&scity=101280600&industry=&position="
    )
    # Only fetch and parse at this step; persisting the result through
    # xiazai() is deliberately not done here.
    text = bo_spider(bo_url(url))
    return text


if __name__ == '__main__':
    main()

# Part 5: complete code
import json
import urllib.request
from urllib.parse import quote

from lxml import etree


def bo_url(url):
    """Open *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address to request.

    Returns:
        The object returned by ``urllib.request.urlopen``; it is handed back
        unread, so the caller is the one who reads it.
    """
    request = urllib.request.Request(
        url=url,
        headers={
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/66.0.3359.117 Safari/537.36"
            ),
        },
    )
    return urllib.request.urlopen(request)


def bo_spider(res):
    """Parse a job-list page and collect the text of each listed field.

    Args:
        res: Response-like object whose ``read()`` returns the page HTML.

    Returns:
        A dict mapping each field label to the list of text nodes found for
        it. Results from every matched ``<ul>`` are merged, so no list is
        lost when the page contains more than one. The dict is empty when no
        ``<ul>`` matches.
    """
    # (label, XPath relative to one job-list <ul>) pairs.
    field_xpaths = (
        ('职位类型', './li//h3//div[@class="job-title"]/text()'),  # job type
        ('待遇', './li//h3//span[@class="red"]/text()'),  # salary
        ('发布时间', './li//div[@class="info-publis"]//p/text()'),  # publish time
        ('公司名称', './li//div[@class="company-text"]//a/text()'),  # company name
        ('地点', './li//div[@class="info-primary"]//p/text()'),  # location
    )

    tree = etree.HTML(res.read())
    bo_dict = {}
    for job_ul in tree.xpath('//div[@class="job-list"]//ul'):
        found = {label: job_ul.xpath(path) for label, path in field_xpaths}
        print(found)
        # Extend instead of assigning: plain assignment kept only the fields
        # of the last <ul> and silently dropped every earlier one.
        for label, texts in found.items():
            bo_dict.setdefault(label, []).extend(texts)
    return bo_dict


def xiazai(bo_dict):
    """Append *bo_dict* to ``boos.json`` as one JSON document per line.

    Args:
        bo_dict: JSON-serializable mapping of scraped fields.

    Returns:
        The JSON text of *bo_dict* (the record written, without the trailing
        newline).
    """
    bo_list = json.dumps(bo_dict)
    # Write the encoded text itself: passing it through json.dumps a second
    # time stores a quoted, escaped string instead of the object.
    # The file is opened in append mode, so each record ends with a newline
    # to keep the output of successive calls separable.
    with open("boos.json", "a", encoding="utf-8") as fp:
        fp.write(bo_list + "\n")
    return bo_list


def main():
    """Ask for a job title, then fetch, parse and save the listing page.

    Returns:
        The JSON text returned by ``xiazai`` for the parsed page.
    """
    work = input("请输入你要爬取的岗位名称")
    # Percent-encode the typed keyword: a raw non-ASCII title or one with
    # spaces cannot be placed in the request URL as-is.
    url = (
        "https://www.zhipin.com/job_detail/?query="
        + quote(work, safe="")
        + "%E7%88%AC%E8%99%AB&scity=101280600&industry=&position="
    )
    text = xiazai(bo_spider(bo_url(url)))
    return text


if __name__ == '__main__':
    main()