xpath 语法运用实例【爬取智联】

智联招聘网的工作岗位爬取

一:页数的处理

def zhi_page(url, startpage, endpage):
    """Build the result-page URL for the first page of the requested range.

    BUG FIX: the original body was ``for page in range(startpage, endpage):
    ... return page`` — the ``return`` fired on the first iteration, so the
    loop was dead and only *startpage* was ever used.  The loop is replaced
    by the equivalent guard; behaviour is unchanged, including returning
    ``None`` when the range is empty (startpage >= endpage), which matched
    the original fall-through.

    :param url: base URL ending with the page-number query parameter
    :param startpage: first page number (the only one actually used)
    :param endpage: exclusive upper bound; only gates whether a URL is built
    :return: ``url + str(startpage)``, or ``None`` for an empty range
    """
    if startpage < endpage:
        return url + str(startpage)
    return None

二:响应数据的获取

import urllib.request
import urllib.parse
from lxml import etree
def zhi_url(page):
    """Fetch *page* with a desktop-Chrome User-Agent and return the open response."""
    ua = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36")
    request = urllib.request.Request(url=page, headers={"User-Agent": ua})
    return urllib.request.urlopen(request)

三:网页上想要数据的匹配

def zhi_spider(res):
    """Parse one search-result response and collect the job-listing columns.

    BUG FIX: the original assigned to the same dict keys on every loop pass,
    so only the LAST result table survived; ``setdefault(...).extend(...)``
    accumulates the text nodes from every table instead.  An empty match
    still yields ``{}``, as before.

    :param res: an open HTTP response whose body is the search-result HTML
    :return: dict mapping column name -> list of extracted text strings
    """
    html = res.read()
    tree = etree.HTML(html)
    zhi_list = tree.xpath('//div[@id="newlist_list_content_table"]/table')
    zhis = {}
    for zhi in zhi_list:
        zhis.setdefault('职位类型', []).extend(zhi.xpath('.//div/a/text()'))
        zhis.setdefault('公司名称', []).extend(zhi.xpath('.//td[@class="gsmc"]//text()'))
        zhis.setdefault('待遇', []).extend(zhi.xpath('.//td[@class="zwyx"]/text()'))
        zhis.setdefault('工作地点', []).extend(zhi.xpath('.//td[@class="gzdd"]/text()'))
        # NOTE(review): identical XPath to 工作地点 — looks like a copy-paste;
        # the publish-date cell presumably has its own class. Verify against
        # the page markup before changing it.
        zhis.setdefault('发布时间', []).extend(zhi.xpath('.//td[@class="gzdd"]/text()'))
    return zhis

四:主函数

def main():
    """Prompt for search parameters, build the query URL, and scrape one page.

    :return: the dict of job-listing data produced by ``zhi_spider``
    """
    dd = input('输入工作的地点')
    dd = urllib.parse.quote(dd)
    work = input('输入工作的类型')
    # BUG FIX: the job keyword is user input too (usually Chinese) and must be
    # percent-encoded like the location; quote() leaves plain ASCII unchanged,
    # so behaviour for English keywords is identical.
    work = urllib.parse.quote(work)
    startpage = int(input('输入起始页'))
    endpage = int(input('输入结束页'))
    url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=" + dd + "&kw=" + work + "&sm=0&p="
    text = zhi_spider(zhi_url(zhi_page(url, startpage, endpage)))
    return text

# Script entry point: run the scraper only when executed directly.
if __name__ == '__main__':
    main()

整体代码的样式

import urllib.request
import urllib.parse
from lxml import etree
def zhi_url(page):
    """Fetch *page* with a desktop-Chrome User-Agent and return the open response."""
    ua = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36")
    request = urllib.request.Request(url=page, headers={"User-Agent": ua})
    return urllib.request.urlopen(request)

def zhi_page(url, startpage, endpage):
    """Build the result-page URL for the first page of the requested range.

    BUG FIX: the original body was ``for page in range(startpage, endpage):
    ... return page`` — the ``return`` fired on the first iteration, so the
    loop was dead and only *startpage* was ever used.  The loop is replaced
    by the equivalent guard; behaviour is unchanged, including returning
    ``None`` when the range is empty (startpage >= endpage), which matched
    the original fall-through.

    :param url: base URL ending with the page-number query parameter
    :param startpage: first page number (the only one actually used)
    :param endpage: exclusive upper bound; only gates whether a URL is built
    :return: ``url + str(startpage)``, or ``None`` for an empty range
    """
    if startpage < endpage:
        return url + str(startpage)
    return None

def zhi_spider(res):
    """Parse one search-result response and collect the job-listing columns.

    BUG FIX: the original assigned to the same dict keys on every loop pass,
    so only the LAST result table survived; ``setdefault(...).extend(...)``
    accumulates the text nodes from every table instead.  An empty match
    still yields ``{}``, as before.

    :param res: an open HTTP response whose body is the search-result HTML
    :return: dict mapping column name -> list of extracted text strings
    """
    html = res.read()
    tree = etree.HTML(html)
    zhi_list = tree.xpath('//div[@id="newlist_list_content_table"]/table')
    zhis = {}
    for zhi in zhi_list:
        zhis.setdefault('职位类型', []).extend(zhi.xpath('.//div/a/text()'))
        zhis.setdefault('公司名称', []).extend(zhi.xpath('.//td[@class="gsmc"]//text()'))
        zhis.setdefault('待遇', []).extend(zhi.xpath('.//td[@class="zwyx"]/text()'))
        zhis.setdefault('工作地点', []).extend(zhi.xpath('.//td[@class="gzdd"]/text()'))
        # NOTE(review): identical XPath to 工作地点 — looks like a copy-paste;
        # the publish-date cell presumably has its own class. Verify against
        # the page markup before changing it.
        zhis.setdefault('发布时间', []).extend(zhi.xpath('.//td[@class="gzdd"]/text()'))
    return zhis

def main():
    """Prompt for search parameters, build the query URL, and scrape one page.

    :return: the dict of job-listing data produced by ``zhi_spider``
    """
    dd = input('输入工作的地点')
    dd = urllib.parse.quote(dd)
    work = input('输入工作的类型')
    # BUG FIX: the job keyword is user input too (usually Chinese) and must be
    # percent-encoded like the location; quote() leaves plain ASCII unchanged,
    # so behaviour for English keywords is identical.
    work = urllib.parse.quote(work)
    startpage = int(input('输入起始页'))
    endpage = int(input('输入结束页'))
    url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=" + dd + "&kw=" + work + "&sm=0&p="
    text = zhi_spider(zhi_url(zhi_page(url, startpage, endpage)))
    return text

# Script entry point: run the scraper only when executed directly.
if __name__ == '__main__':
    main()



猜你喜欢

转载自blog.csdn.net/mjp_erhuo/article/details/80245733