# python 爬取智联招聘信息 (scrape Zhaopin job listings with Python)

import random
import re
from time import sleep
import requests
from tqdm import tqdm
import user_agents
import csv

def get_page(city, keyword, page):
    """Fetch one page of Zhaopin search results.

    :param city: city name to search in (e.g. '成都')
    :param keyword: job keyword to search for
    :param page: result page number passed as the ``p`` query parameter
    :return: the response HTML text, or None on any request failure
    """
    # Query-string parameters understood by the search endpoint.
    paras = {
        'jl': city,     # search city
        'kw': keyword,  # search keyword
        'isadv': 0,
        'isfilter': 1,
        'p': page       # result page number
    }
    # Full search URL (parameters are appended by requests via `params`).
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?'
    # Rotate the User-Agent on every request to look less like a bot.
    headers = {
        'User-Agent': random.choice(user_agents.agents),
        'Host': 'sou.zhaopin.com',
        'Referer': 'https://zhaopin.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
    }

    try:
        # timeout prevents the crawler from hanging forever on a dead connection
        response = requests.get(url, params=paras, headers=headers, timeout=10)
        # A 200 status code means the page was fetched successfully.
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        # Narrow catch: only network/HTTP errors, not programming errors.
        return None

def parse_page(html):
    """Extract job postings from one search-result HTML page.

    :param html: HTML text of a result page (as returned by get_page),
        may be None/empty when the request failed
    :yield: dict per listing with job link, job name, response rate,
        company link, company name, salary, address and post time
    """
    # Guard: get_page returns None on failure; re.findall(None) would crash.
    if not html:
        return
    # Regex over the result table; re.S lets .*? span newlines.
    pattern = re.compile(
                         '<td class="zwmc".*? href="(.*?)" target="_blank">(.*?)</a>.*?' # job link and job title
                         '<td.*? class="fk_lv".*?<span>(.*?)</span>.*?'                  # response rate
                         '<td class="gsmc".*? href="(.*?)" target="_blank">(.*?)</a>.*?'  # company link and company name
                         '<td class="zwyx">(.*?)</td>.*?'                                # monthly salary
                         '<td class="gzdd">(.*?)</td>.*?'                                # location
                         '<td class="gxsj".*?<span>(.*?)</span>.*?'                      # post time
                         , re.S)
    # All tuples matching the pattern, in page order.
    data = pattern.findall(html)

    # Skip the first 4 pinned/promoted rows (site-specific; adjust per search).
    # Slicing never raises, unlike the original starred unpacking which
    # crashed with ValueError when a page had fewer than 4 matches.
    items = data[4:]
    for item in items:
        # Highlighted keywords are wrapped in <b>…</b>; strip the tags.
        job_name = item[1].replace('<b>', '').replace('</b>', '')
        yield {
            'zhiweilianjie': item[0],
            'jobname': job_name,
            'Response Rate': item[2],
            'gongshilianjie': item[3],
            'company': item[4],
            'salary': item[5],
            'address': item[6],
            'time': item[7]
        }

def write_file_header(file_name, headers):
    """Append the CSV header row (column names) to the output file.

    :param file_name: path of the CSV file, opened in append mode
    :param headers: ordered list of column names
    :return: None
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'a', encoding='utf-8', newline='') as handle:
        writer = csv.DictWriter(handle, headers)
        writer.writeheader()

def write_file_rows(file_name, headers, rows):
    """Append data rows to the CSV file.

    :param file_name: path of the CSV file, opened in append mode
    :param headers: ordered list of column names / dict keys
    :param rows: iterable of dicts, one per CSV row
    :return: None
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'a', encoding='utf-8', newline='') as handle:
        csv.DictWriter(handle, headers).writerows(rows)


def main(city, keyword, page, out_dir='/Users/xiongxing/Desktop/'):
    """Crawl *page* pages of search results and save them to one CSV file.

    :param city: city to search in
    :param keyword: job keyword to search for
    :param page: number of result pages to fetch
    :param out_dir: directory the CSV file is written to
        (default keeps the original hard-coded Desktop path)
    :return: None
    """
    file_name = out_dir + '智联' + city + keyword + '.csv'
    headers = ['zhiweilianjie', 'jobname', 'Response Rate', 'gongshilianjie', 'company', 'salary', 'address', 'time']
    write_file_header(file_name, headers)
    # The site's `p` parameter is 1-based; the original loop started at 0.
    for i in tqdm(range(1, page + 1)):
        html = get_page(city, keyword, i)
        sleep(0.1)  # throttle requests to be polite to the server
        if html is None:
            # Request failed for this page: skip it instead of crashing
            # when the parser receives None.
            continue
        write_file_rows(file_name, headers, list(parse_page(html)))
   

if __name__ == '__main__':
    main('成都', 'python', 1) #可更换搜索条件

# Source: reposted from blog.csdn.net/xx117501/article/details/80356297