Scraping Job Listings from Liepin (liepin.com)

The script works in three steps:

Visit the Liepin homepage and collect the URLs of the six top-level categories.
Visit each category page and collect the URL of every job type under it.
Visit each job type's listing page and extract the first posting (to keep all postings, or just the first few, adjust the regular expression; see the sketch after the code).

The code stores results in MongoDB. If you don't have MongoDB installed, or don't want to configure it, skip the cuncu function and uncomment the marked lines to write a CSV file instead; a standalone sketch of that fallback follows.
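As a minimal standalone sketch of that CSV fallback (save_csv is a hypothetical helper name; the data.csv filename and gb18030 encoding come from the commented-out lines in the code below), it amounts to:

import csv

def save_csv(row, path="data.csv"):  # hypothetical helper, not in the original code
    # Append one record; gb18030 matches the encoding used in the commented-out lines
    with open(path, "a", encoding="gb18030", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(row)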

import requests
import re
import time
import csv
import pymongo
from requests.exceptions import RequestException
from lxml import etree

def pipei(url):  # extract the job information from a listing page
    html = gethtml(url)
    if html is None:
        return
    pattern = re.compile('<ul class="sojob-list">.*?li>.*?<div class="sojob-item-main clearfix.*?div class="job-info".*?h3 title="(.*?)">.*?class="text-warning">(.*?)</span>.*?area.*?>(.*?)</.*?span class="edu">(.*?)</span>.*?an>(.*?)</span>.*?<div class="company-info nohover">.*?class="company-name">.*?a.*?>(.*?)</a>.*?<p class="field-financing">.*?an.*?a class="industry-link" href=.*?>(.*?)</a>', re.S)
    result = re.findall(pattern, html)
    if result:  # some job types currently have no postings
        cuncu(result[0])
#        with open("data.csv", "a", encoding='gb18030', newline='') as csvfile:
#            writer = csv.writer(csvfile)
#            writer.writerow(result[0])
#        print(result[0])  # the job record
    else:
#        with open("data.csv", "a", encoding='gb18030', newline='') as csvfile:
#            writer = csv.writer(csvfile)
#            writer.writerow(["null"])
        print("null")
def gethtml(url):
    # Fetch a page; retry on network errors instead of crashing
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        time.sleep(1)  # brief pause, then retry
        return gethtml(url)
def one_page(url):  # collect the job-type links inside one top-level category
    page = gethtml(url)
    if page is None:
        return
    html = etree.HTML(page)
    result = html.xpath('//li/dl/dd/a[contains(@target,"_blank") and @rel="nofollow"]/@href')
    for path in result:
        urll = "https://www.liepin.com" + path
        match = re.match(r'(.*?)dqs', urll)  # keep the URL only up to the "dqs" parameter
        if match:
#            print(match.group(0))  # the listing-page URL
            pipei(match.group(0))
def shouye(url):  # collect the six top-level category URLs from the homepage
    page = gethtml(url)
    if page is None:
        return
    pattern = re.compile('<a .*?target="_blank" href="https://www.liepin.com/(.*?)">')
    six = re.findall(pattern, page)
    for i in six:
        wangzhi = "https://www.liepin.com/" + i
        one_page(wangzhi)
def cuncu(result):  # store one record in MongoDB (database "test", collection "jobs")
    client = pymongo.MongoClient(host='localhost', port=27017)
    db = client.test
    collection = db.jobs
    results = {
        'Position': result[0],
        'salary': result[1],
        'area': result[2],
        'edu': result[3],
        'work time': result[4],
        'company': result[5],
        'company-type': result[6]
    }
    collection.insert_one(results)
def main():
#    CSV fallback: uncomment to write the header row once before scraping
#    with open("data.csv", "a", encoding='gb18030', newline='') as csvfile:
#        writer = csv.writer(csvfile)
#        writer.writerow(["Position", "salary", "area", "education", "work time", "company", "company type"])
    url = 'https://www.liepin.com/'
    shouye(url)

if __name__ == '__main__':
    main()
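As written, each job-type page contributes only its first posting, because pipei keeps result[0] alone. A minimal sketch of the change the note at the top refers to, storing every match that findall returns (same cuncu function, just a loop inside pipei):

    if result:  # some job types currently have no postings
        for record in result:
            cuncu(record)  # one insert per posting instead of result[0] only
    else:
        print("null")

To confirm the inserts landed, a quick count such as db.jobs.count_documents({}) in pymongo (3.7+) works against the same "test" database and "jobs" collection the script writes to.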


Reposted from blog.csdn.net/weixin_43323333/article/details/90170941