Scraping Zhaopin job listings with Python

The script below searches Zhaopin (sou.zhaopin.com) for a given city and keyword, parses each page of results with a regular expression, and appends the extracted fields to a CSV file.

import csv
import random
import re
from time import sleep

import requests
from tqdm import tqdm

import user_agents  # local module exposing a list of User-Agent strings as user_agents.agents


def get_page(city, keyword, page):
    """Request one page of search results and return its HTML, or None on failure."""
    # Query parameters for the search request
    paras = {
        'jl': city,       # city to search in
        'kw': keyword,    # search keyword
        'isadv': 0,
        'isfilter': 1,
        'p': page         # page number
    }
    # Base URL of the search endpoint
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?'
    # Request headers; the User-Agent is rotated on every call
    headers = {
        'User-Agent': random.choice(user_agents.agents),
        'Host': 'sou.zhaopin.com',
        'Referer': 'https://zhaopin.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
    }
    try:
        response = requests.get(url, params=paras, headers=headers)
        # Use the status code to decide whether the request succeeded
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


def parse_page(html):
    """Extract job records from one page of HTML and yield them as dicts."""
    # Regular expression matching the fields of one result row
    pattern = re.compile(
        '<td class="zwmc".*? href="(.*?)" target="_blank">(.*?)</a>.*?'   # job link and job title
        '<td.*? class="fk_lv".*?<span>(.*?)</span>.*?'                    # response rate
        '<td class="gsmc".*? href="(.*?)" target="_blank">(.*?)</a>.*?'   # company link and company name
        '<td class="zwyx">(.*?)</td>.*?'                                  # monthly salary
        '<td class="gzdd">(.*?)</td>.*?'                                  # location
        '<td class="gxsj".*?<span>(.*?)</span>.*?',                       # posting date
        re.S)
    # Find every row that matches the pattern
    data = re.findall(pattern, html)
    # Drop the pinned (promoted) rows at the top of the results;
    # adjust the count manually if it changes for a different search
    _, _, _, _, *items = data
    for item in items:
        # Strip the <b> highlighting tags around the matched keyword
        job_name = item[1]
        job_name = job_name.replace('<b>', '')
        job_name = job_name.replace('</b>', '')
        yield {
            'zhiweilianjie': item[0],   # job link
            'jobname': job_name,
            'Response Rate': item[2],
            'gongshilianjie': item[3],  # company link
            'company': item[4],
            'salary': item[5],
            'address': item[6],
            'time': item[7]
        }


def write_file_header(file_name, headers):
    """Write the header row (first line) of the CSV file."""
    with open(file_name, 'a', encoding='utf-8', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()


def write_file_rows(file_name, headers, rows):
    """Append the extracted records to the CSV file."""
    with open(file_name, 'a', encoding='utf-8', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writerows(rows)


def main(city, keyword, page):
    file_name = '/Users/xiongxing/Desktop/' + '智联' + city + keyword + '.csv'
    headers = ['zhiweilianjie', 'jobname', 'Response Rate', 'gongshilianjie',
               'company', 'salary', 'address', 'time']
    write_file_header(file_name, headers)
    for i in tqdm(range(page)):
        job = []
        html = get_page(city, keyword, i + 1)  # result pages are numbered from 1
        if html is None:
            continue  # skip pages that failed to download
        sleep(0.1)  # brief pause between requests
        contents = parse_page(html)
        for item in contents:
            job.append(item)
        write_file_rows(file_name, headers, job)


if __name__ == '__main__':
    main('成都', 'python', 1)  # search criteria can be changed here
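The script imports user_agents, a helper module that isn't shown in the post. Judging from the random.choice(user_agents.agents) call, it only needs to expose an agents list of User-Agent strings. A minimal sketch of such a module might look like this; the entries below are illustrative placeholders, not the original author's list:

# user_agents.py - minimal sketch of the helper module the script imports.
# Any reasonably current browser User-Agent strings will do.
agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
]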
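Because parse_page discards the first four matches (the pinned rows at the top of the real result page), a quick offline check needs at least five rows. The fragment below is hypothetical markup shaped only to satisfy the regular expression, not the site's actual HTML; run it in the same file or session as parse_page to confirm the pattern and the <b>-stripping behave as expected:

# Hypothetical row matching the regex; %d / %% are placeholders for % formatting.
row = ('<td class="zwmc"><a href="http://example.com/job/%d" target="_blank">'
       '<b>python</b> developer</a></td>'
       '<td class="fk_lv"><span>80%%</span></td>'
       '<td class="gsmc"><a href="http://example.com/co/%d" target="_blank">ExampleCo</a></td>'
       '<td class="zwyx">8000-12000</td>'
       '<td class="gzdd">成都</td>'
       '<td class="gxsj"><span>05-17</span></td>')

# Five rows: four stand in for the pinned entries that parse_page discards,
# the fifth is the one we expect back.
fake_html = ''.join(row % (i, i) for i in range(5))

for record in parse_page(fake_html):
    print(record['jobname'], record['salary'])  # -> python developer 8000-12000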
Reposted from blog.csdn.net/xx117501/article/details/80356297