[Python爬虫]前程无忧(51job)招聘

import requests
import re
import json
import time
from openpyxl import workbook
from openpyxl import load_workbook
from pymongo import MongoClient
from requests.exceptions import RequestException
import csv
import xlwt
def get_one_page(url, timeout=10):
    """Download ``url`` and return its body decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole run.

    Returns:
        The page text when the server answers with HTTP 200; ``None`` for
        any other status code or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Host':'search.51job.com',
        'Cookie':'partner=www_baidu_com; guid=992c8a3fa4140d299ad06533b8965bdd; 51job=cenglish%3D0%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60040000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA04%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA05%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60030200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Best-effort fetch: network errors and timeouts are reported to the
        # caller as "no page" rather than raised.
        return None
    if response.status_code != 200:
        return None
    # The response is decoded as GBK instead of the encoding requests guesses.
    response.encoding = 'gbk'
    return response.text

# Compiled once at import time. Raw strings keep ``\s``/``\S`` from being
# treated as (invalid) string escapes. The five capture groups are, in order:
# the first anchor's title, the href and text of the anchor inside
# ``span.t2``, the text of ``span.t3`` and the text of ``span.t4``.
_JOB_PATTERN = re.compile(
    r'<a target="_blank" title="(.*?)"[.\s\S]*?<span class="t2"><a target="_blank" title=.*?href="(.*?)">'
    r'(.*?)</a></span>[.\s\S]*?<span class="t3">(.*?)</span>[.\s\S]*?<span class="t4">(.*?)</span>',
    re.S,
)


def parse_one_page(html):
    """Extract job rows from the HTML of one search-results page.

    Args:
        html: Page markup as a string, or ``None``/empty when the download
            failed.

    Returns:
        A list of 5-tuples of strings, one per match, in the same order as
        the ``name, web, company, location, salary`` header used by the
        savers in this file. Returns an empty list when ``html`` is empty
        or ``None`` instead of raising ``TypeError``.
    """
    if not html:
        return []
    return _JOB_PATTERN.findall(html)

def write_to_file(content):
    """Append ``content`` to ``jobs.txt`` as one JSON document per line.

    Args:
        content: Any JSON-serialisable value. Non-ASCII characters are
            written as-is (``ensure_ascii=False``) in UTF-8.
    """
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('jobs.txt', 'a', encoding='utf-8') as f:
        f.write(line)
# Module-level MongoDB handles used by save_to_mongo(): a client created with
# default connection settings, the 'jobs' database, and its 'jobs' collection.
client = MongoClient()
db = client['jobs']
collections = db['jobs']

def save_to_mongo(result):
    """Insert ``result`` into the module-level ``collections`` collection.

    ``Collection.insert`` is deprecated in PyMongo 3 and removed in
    PyMongo 4, so the call is dispatched to its replacements.

    Args:
        result: A single document (``dict``) or an iterable of documents.
    """
    if isinstance(result, dict):
        outcome = collections.insert_one(result)
    else:
        outcome = collections.insert_many(result)
    if outcome.acknowledged:
        print('Saved to Mongo')

def save_to_csv(result):
    """Append one job record to ``data.csv``.

    The header row is written only when the file is still empty, so
    repeated calls produce a single header followed by the data rows.

    Args:
        result: Either a ``dict`` keyed by the column names below, or a
            sequence of values in column order (such as the tuples returned
            by ``parse_one_page``).
    """
    fieldnames = ['name', 'web', 'company', 'location', 'salary']
    if not isinstance(result, dict):
        # Positional rows are mapped onto the column names in order.
        result = dict(zip(fieldnames, result))
    # newline='' lets the csv module control line endings; without it each
    # row is followed by a blank line on Windows.
    with open('data.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Move to end-of-file (whence=2) so tell() reports whether the file
        # already has content.
        csvfile.seek(0, 2)
        if csvfile.tell() == 0:
            writer.writeheader()
        writer.writerow(result)
def save_to_excel(result):
    """Append ``result`` as a new row to the module-level worksheet ``ws``.

    ``ws`` is created by the script entry point at the bottom of this file;
    it is only read here, so no ``global`` declaration is required.
    """
    ws.append(result)
def main(page):
    """Scrape one search-results page and append its rows to the worksheet.

    Args:
        page: Page number, inserted into the search URL before ``.html``.

    A page that could not be downloaded is reported and skipped instead of
    being handed to the parser.
    """
    # The keyword segment is double percent-encoded UTF-8 for "培训机构".
    url = (
        'https://search.51job.com/list/010000,000000,0000,00,9,09,%25E5%259F%25B9%25E8%25AE%25AD%25E6%259C%25BA%25E6%259E%2584,2,'
        + str(page)
        + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    html = get_one_page(url)
    if html is None:
        # get_one_page() returns None on a non-200 status or request error.
        print('Failed to fetch page', page)
        return
    for content in parse_one_page(html):
        print(content)
        save_to_excel(content)
if __name__ == '__main__':
    # wb/ws are module-level names: save_to_excel() appends rows to ws.
    wb = workbook.Workbook()
    ws = wb.active
    fieldnames = ['name', 'web', 'company', 'location', 'salary']
    ws.append(fieldnames)
    try:
        # Scrape result pages 1 and 2, pausing one second after each page.
        for i in range(1, 3):
            main(i)
            time.sleep(1)
    finally:
        # Save even if a page raised, so rows collected so far are not lost.
        wb.save('data.xlsx')

猜你喜欢

转载自www.cnblogs.com/lightmonster/p/11602988.html