XPath 语法运用实例【爬取 Boss 直聘】

一:URL 的处理

import json
import urllib.request

from lxml import etree
def bo_url(url):
    """Open *url* while presenting a desktop-browser User-Agent.

    Args:
        url: Address of the page to request.

    Returns:
        The response object produced by ``urllib.request.urlopen``; the
        caller is responsible for reading it.
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"
    request = urllib.request.Request(url=url, headers={"User-Agent": user_agent})
    return urllib.request.urlopen(request)

二:爬取并匹配数据

def bo_spider(res):
    """Extract the job-posting text fields from a search-result page.

    Args:
        res: Response-like object whose ``read()`` returns the page HTML.

    Returns:
        A dict mapping each field label to the list of strings matched for
        that field, in document order. The dict is empty when the page has
        no ``<ul>`` inside a ``div.job-list``.
    """
    # Field label -> XPath evaluated relative to each job-list <ul>.
    field_paths = {
        '职位类型': './li//h3//div[@class="job-title"]/text()',
        '待遇': './li//h3//span[@class="red"]/text()',
        '发布时间': './li//div[@class="info-publis"]//p/text()',
        '公司名称': './li//div[@class="company-text"]//a/text()',
        '地点': './li//div[@class="info-primary"]//p/text()',
    }
    tree = etree.HTML(res.read())
    bo_dict = {}
    for bo in tree.xpath('//div[@class="job-list"]//ul'):
        for label, path in field_paths.items():
            # Accumulate instead of assigning: plain assignment would let
            # each later <ul> silently replace the results of earlier ones.
            bo_dict.setdefault(label, []).extend(bo.xpath(path))
        print(bo_dict)
    return bo_dict

三:用 JSON 存储匹配的数据

def xiazai(bo_dict):
    """Serialize *bo_dict* to JSON, append it to ``boos.json`` and return it.

    Args:
        bo_dict: JSON-serializable mapping of scraped fields.

    Returns:
        The JSON text that was appended to the file (without the trailing
        newline).
    """
    bo_list = json.dumps(bo_dict)
    # The ``with`` block closes the file on exit, so no explicit close().
    with open("boos.json", 'a', encoding="utf-8") as fp:
        # Write the serialized text once. Encoding it a second time would
        # store a quoted, backslash-escaped JSON *string* instead of the
        # object. The newline keeps repeated appends parseable line by line.
        fp.write(bo_list + "\n")
    return bo_list

四:控制函数

def main():
    """Prompt for a job keyword, fetch the matching search page and parse it.

    Returns:
        The dict returned by ``bo_spider`` for the fetched page.
    """
    from urllib.parse import quote

    work = input("请输入你要爬取的岗位名称")
    # Percent-encode the keyword: raw non-ASCII characters or spaces cannot
    # be placed in the request line and make the request fail.
    # NOTE(review): the fixed "%E7%88%AC%E8%99%AB" suffix (the percent-encoded
    # Chinese word for "crawler") is always appended to the keyword -- confirm
    # that this is intended.
    url = "https://www.zhipin.com/job_detail/?query=" + quote(work, safe="") + "%E7%88%AC%E8%99%AB&scity=101280600&industry=&position="
    # This variant only parses and prints; pass the result through xiazai()
    # to also save it to disk.
    text = bo_spider(bo_url(url))
    return text


if __name__ == '__main__':
    main()
五:整体代码

import json
import urllib.request
from lxml import etree
def bo_url(url):
    """Request *url* with a browser-like User-Agent header and return the reply.

    Args:
        url: Address of the page to fetch.

    Returns:
        The open response from ``urllib.request.urlopen``; it is handed
        back unread.
    """
    browser_headers = dict([
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"),
    ])
    return urllib.request.urlopen(
        urllib.request.Request(url=url, headers=browser_headers)
    )

def bo_spider(res):
    """Extract the job-posting text fields from a search-result page.

    Args:
        res: Response-like object whose ``read()`` returns the page HTML.

    Returns:
        A dict mapping each field label to the list of strings matched for
        that field, in document order. The dict is empty when the page has
        no ``<ul>`` inside a ``div.job-list``.
    """
    # Field label -> XPath evaluated relative to each job-list <ul>.
    field_paths = {
        '职位类型': './li//h3//div[@class="job-title"]/text()',
        '待遇': './li//h3//span[@class="red"]/text()',
        '发布时间': './li//div[@class="info-publis"]//p/text()',
        '公司名称': './li//div[@class="company-text"]//a/text()',
        '地点': './li//div[@class="info-primary"]//p/text()',
    }
    tree = etree.HTML(res.read())
    bo_dict = {}
    for bo in tree.xpath('//div[@class="job-list"]//ul'):
        for label, path in field_paths.items():
            # Accumulate instead of assigning: plain assignment would let
            # each later <ul> silently replace the results of earlier ones.
            bo_dict.setdefault(label, []).extend(bo.xpath(path))
        print(bo_dict)
    return bo_dict

def xiazai(bo_dict):
    """Serialize *bo_dict* to JSON, append it to ``boos.json`` and return it.

    Args:
        bo_dict: JSON-serializable mapping of scraped fields.

    Returns:
        The JSON text that was appended to the file (without the trailing
        newline).
    """
    bo_list = json.dumps(bo_dict)
    # The ``with`` block closes the file on exit, so no explicit close().
    with open("boos.json", 'a', encoding="utf-8") as fp:
        # Write the serialized text once. Encoding it a second time would
        # store a quoted, backslash-escaped JSON *string* instead of the
        # object. The newline keeps repeated appends parseable line by line.
        fp.write(bo_list + "\n")
    return bo_list

def main():
    """Prompt for a job keyword, then fetch, parse and save the search page.

    Returns:
        The value returned by ``xiazai`` for the parsed page.
    """
    from urllib.parse import quote

    work = input("请输入你要爬取的岗位名称")
    # Percent-encode the keyword: raw non-ASCII characters or spaces cannot
    # be placed in the request line and make the request fail.
    # NOTE(review): the fixed "%E7%88%AC%E8%99%AB" suffix (the percent-encoded
    # Chinese word for "crawler") is always appended to the keyword -- confirm
    # that this is intended.
    url = "https://www.zhipin.com/job_detail/?query=" + quote(work, safe="") + "%E7%88%AC%E8%99%AB&scity=101280600&industry=&position="
    text = xiazai(bo_spider(bo_url(url)))
    return text


if __name__ == '__main__':
    main()


猜你喜欢

转载自blog.csdn.net/mjp_erhuo/article/details/80236844