#! /usr/bin/env python # -*- coding: utf-8 -*- """ @author: ligang @contact: [email protected] @software: PyCharm IDEA @file: company_position_count.py @create at: 2018-09-04 10:40 """ from mf_utils.core import BaseInitCore from mf_utils.logger import Logger from mf_utils.decorates import cls_catch_exception from datetime import datetime # from mf_utils.sql.redis_m import get_redis_client from mf_utils.sql.mysql import MysqlHandle from collections import OrderedDict import json, re, xlrd, codecs class CompanyPosition(BaseInitCore): def __init__(self): super(CompanyPosition, self).__init__() self.logger = Logger.file_logger() self.mysql_handle = MysqlHandle(host='127.0.0.1', user="root", passwd='mysql', db='ligang', port=3306, charset='utf8') def get_zhi_lian_position_list(self, company_name, start=0, res_lst=None): try: url = 'https://fe-api.zhaopin.com/c/i/sou?' \ 'start={start}&pageSize=60&cityId=489' \ '&kw={company_name}' \ '&kt=2'.format(start=start, company_name=company_name) res = self.html_downloader.download(url) data_lst = json.loads(res.text).get('data').get('results') total = int(json.loads(res.text).get('data').get('numFound')) current_page = int(re.findall( '(?<=start=).*?(?=&)', res.url)[0]) / 60 + 1 self.logger.debug('current_page - %s' % current_page) for data in data_lst: position_info = dict() position_info['site'] = 'ZHI_LIAN' position_info['city'] = data.get('city').get('display') position_info['jobName'] = data.get('jobName') if len(res_lst) >= 60: return res_lst res_lst.append(position_info) start = current_page * 60 if (current_page - 1) * 60 < total: self.get_zhi_lian_position_list( company_name, start=start, res_lst=res_lst) return res_lst except Exception as e: self.logger.exception(e) return res_lst def get_five_one_position_list(self, company_name, page=1, res_lst=None): try: url = 'https://search.51job.com/list/000000,' \ '000000,0000,00,9,99,{company_name}' \ ',1,{page}.html'.format(company_name=company_name, page=page) res = 
self.html_downloader.download(url) soups = self.html_parser.parser(res.content) current_page = int(soups.find( 'div', class_='p_in').find('li', class_='on').text) total_page = int(re.findall('\d+', soups.find( 'div', class_='p_in').find('span', class_='td').text)[0]) self.logger.debug('current_page - %s' % current_page) data_lst = soups.find( 'div', id='resultList').find_all('div', class_='el')[1:] for data in data_lst: position_info = dict() position_info['site'] = 'FIVE_ONE' position_info['jobName'] = data.find('a').get('title') if len(res_lst) >= 30: return res_lst res_lst.append(position_info) if current_page < total_page: page += 1 self.get_five_one_position_list( company_name, page=page, res_lst=res_lst) return res_lst except Exception as e: self.logger.exception(e) return res_lst @cls_catch_exception def get_zhi_lian_position_detail(self, job_id): url = 'https://jobs.zhaopin.com/{}.htm'.format(job_id) headers = { 'Cookie': 'ZP_OLD_FLAG=false;' } res = self.html_downloader.download(url, headers=headers) self.logger.debug('get detail {}'.format(job_id)) soups = self.html_parser.parser(res.content) position_desc = soups.find('div', class_='pos-ul').text.strip() return position_desc @cls_catch_exception def get_five_one_position_detail(self, job_id): url = 'https://jobs.51job.com/all/{}.html'.format(job_id) res = self.html_downloader.download(url) self.logger.debug('get detail {}'.format(job_id)) soups = self.html_parser.gbk_parser(res.content) city, exp, degree = soups.find( 'p', class_='msg ltype').text.strip().replace(u' ', '').split('|')[:3] if u'招' in degree: degree = '' position_desc_lst = soups.find( 'div', class_='bmsg job_msg inbox').find_all('p', recursive=False) position_desc = ''.join( map(lambda x: x.text.strip(), position_desc_lst)).replace('\n', ' ') return city, exp, degree, position_desc def main(): cp = CompanyPosition() cp.logger.info('start') positions = [ 'FIVE_ONE|天津津天连达贸易有限公司', 'FIVE_ONE|上海丽享贸易有限公司' ] for task in positions: site, 
company_name = task.split("|") cp.logger.info('start_task: {} | {}'.format(site, company_name)) if site == "ZHI_LIAN": res_lst = cp.get_zhi_lian_position_list(company_name, res_lst=[]) elif site == "FIVE_ONE": res_lst = cp.get_five_one_position_list(company_name, res_lst=[]) else: res_lst = [] sql = 'insert into ligang.lg_position(company_name,city, position, source,publis_time) values(%s,%s,%s,%s,%s)' for res in res_lst: print res data = (company_name, res.get('city'), res.get('jobName'), site, datetime.now()) cp.mysql_handle.save(sql=sql, data=data) print json.dumps(res_lst, ensure_ascii=False, indent=4) cp.logger.info( 'match position : {} ,TOTAL' '{}'.format(site, len(res_lst)) ) if __name__ == '__main__': main()
# ===== mysql.py =====
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: ligang
@contact: [email protected]
@software: PyCharm
@file: mysql.py
@create at: 2017-12-15 14:20

Thin MySQL helper built on a DBUtils ``PersistentDB`` connection pool.
"""
from mf_utils.exceptions import MfImportException, MfDbException
from mf_utils.decorates import cls2singleton

try:
    import MySQLdb
except ImportError:
    raise MfImportException(
        "You should install MySQLdb first! try: pip install "
        "mysql-python")
try:
    from DBUtils.PersistentDB import PersistentDB
except ImportError:
    raise MfImportException(
        "You should install DBUtils first! try: pip install "
        "dbutils")


@cls2singleton
class MysqlHandle(object):
    """Singleton wrapper around a persistent MySQL connection pool.

    All connection keyword arguments (host, user, passwd, db, port,
    charset, ...) are forwarded to ``PersistentDB`` / ``MySQLdb.connect``.
    """

    def __init__(self, **kwargs):
        # One pooled, thread-local persistent connection per thread.
        self._mysql_pool = PersistentDB(MySQLdb, **kwargs)

    def query_by_sql(self, sql):
        """Run a read-only SQL statement and return all rows as tuples.

        :param sql: complete SQL text (caller is responsible for escaping;
            prefer parameterized queries via :meth:`save` for writes).
        :raises MfDbException: wrapping any driver-level error.
        """
        conn = self._mysql_pool.connection()
        cur = conn.cursor()
        try:
            cur.execute(sql)
            result = cur.fetchall()
            return result
        except Exception as e:
            raise MfDbException(e)
        finally:
            # Always release the cursor and return the connection to the pool.
            cur.close()
            conn.close()

    def save(self, sql, data, many=False, get_last_insert_id=False):
        """Execute a parameterized write statement and commit.

        :param sql: SQL with %s placeholders.
        :param data: parameter tuple, or an iterable of tuples when *many*.
        :param many: use ``executemany`` for batch inserts.
        :param get_last_insert_id: when True, return the AUTO_INCREMENT id
            of the inserted row; otherwise return the affected-row count.
        :raises MfDbException: wrapping any driver-level error.
        """
        conn = self._mysql_pool.connection()
        cur = conn.cursor()
        try:
            if many is False:
                cur.execute(sql, data)
            else:
                cur.executemany(sql, data)
            conn.commit()
            # BUG FIX: the original branch was inverted -- with
            # get_last_insert_id=True it called fetchone() without ever
            # executing the SELECT (so fetchone() returned None and None[0]
            # raised); with False it executed the SELECT and returned its
            # rowcount (always 1).
            if get_last_insert_id:
                cur.execute("select last_insert_id()")
                return cur.fetchone()[0]
            return cur.rowcount
        except Exception as e:
            raise MfDbException(e)
        finally:
            # Always release the cursor and return the connection to the pool.
            cur.close()
            conn.close()