#! /usr/bin/env python # -*- coding: utf-8 -*- """ @author: ligang @contact: [email protected] @software: PyCharm IDEA @file: company_position_count.py @create at: 2018-09-04 10:40 """ from mf_utils.core import BaseInitCore from mf_utils.logger import Logger from mf_utils.decorates import cls_catch_exception from datetime import datetime # from mf_utils.sql.redis_m import get_redis_client from mf_utils.sql.mysql import MysqlHandle from collections import OrderedDict import json, re, xlrd, codecs class CompanyPosition(BaseInitCore): def __init__(self): super(CompanyPosition, self).__init__() self.logger = Logger.file_logger() self.mysql_handle = MysqlHandle(host='127.0.0.1', user="root", passwd='mysql', db='ligang', port=3306, charset='utf8') def get_zhi_lian_position_list(self, company_name, start=0, res_lst=None): try: url = 'https://fe-api.zhaopin.com/c/i/sou?' \ 'start={start}&pageSize=60&cityId=489' \ '&kw={company_name}' \ '&kt=2'.format(start=start, company_name=company_name) res = self.html_downloader.download(url) data_lst = json.loads(res.text).get('data').get('results') total = int(json.loads(res.text).get('data').get('numFound')) current_page = int(re.findall( '(?<=start=).*?(?=&)', res.url)[0]) / 60 + 1 self.logger.debug('current_page - %s' % current_page) for data in data_lst: position_info = dict() position_info['site'] = 'ZHI_LIAN' position_info['city'] = data.get('city').get('display') position_info['jobName'] = data.get('jobName') if len(res_lst) >= 60: return res_lst res_lst.append(position_info) start = current_page * 60 if (current_page - 1) * 60 < total: self.get_zhi_lian_position_list( company_name, start=start, res_lst=res_lst) return res_lst except Exception as e: self.logger.exception(e) return res_lst def get_five_one_position_list(self, company_name, page=1, res_lst=None): try: url = 'https://search.51job.com/list/000000,' \ '000000,0000,00,9,99,{company_name}' \ ',1,{page}.html'.format(company_name=company_name, page=page) res = 
self.html_downloader.download(url) soups = self.html_parser.parser(res.content) current_page = int(soups.find( 'div', class_='p_in').find('li', class_='on').text) total_page = int(re.findall('\d+', soups.find( 'div', class_='p_in').find('span', class_='td').text)[0]) self.logger.debug('current_page - %s' % current_page) data_lst = soups.find( 'div', id='resultList').find_all('div', class_='el')[1:] for data in data_lst: position_info = dict() position_info['site'] = 'FIVE_ONE' position_info['jobName'] = data.find('a').get('title') if len(res_lst) >= 30: return res_lst res_lst.append(position_info) if current_page < total_page: page += 1 self.get_five_one_position_list( company_name, page=page, res_lst=res_lst) return res_lst except Exception as e: self.logger.exception(e) return res_lst @cls_catch_exception def get_zhi_lian_position_detail(self, job_id): url = 'https://jobs.zhaopin.com/{}.htm'.format(job_id) headers = { 'Cookie': 'ZP_OLD_FLAG=false;' } res = self.html_downloader.download(url, headers=headers) self.logger.debug('get detail {}'.format(job_id)) soups = self.html_parser.parser(res.content) position_desc = soups.find('div', class_='pos-ul').text.strip() return position_desc @cls_catch_exception def get_five_one_position_detail(self, job_id): url = 'https://jobs.51job.com/all/{}.html'.format(job_id) res = self.html_downloader.download(url) self.logger.debug('get detail {}'.format(job_id)) soups = self.html_parser.gbk_parser(res.content) city, exp, degree = soups.find( 'p', class_='msg ltype').text.strip().replace(u' ', '').split('|')[:3] if u'招' in degree: degree = '' position_desc_lst = soups.find( 'div', class_='bmsg job_msg inbox').find_all('p', recursive=False) position_desc = ''.join( map(lambda x: x.text.strip(), position_desc_lst)).replace('\n', ' ') return city, exp, degree, position_desc def main(): cp = CompanyPosition() cp.logger.info('start') positions = [ 'FIVE_ONE|天津津天连达贸易有限公司', 'FIVE_ONE|上海丽享贸易有限公司' ] for task in positions: site, 
company_name = task.split("|") cp.logger.info('start_task: {} | {}'.format(site, company_name)) if site == "ZHI_LIAN": res_lst = cp.get_zhi_lian_position_list(company_name, res_lst=[]) elif site == "FIVE_ONE": res_lst = cp.get_five_one_position_list(company_name, res_lst=[]) else: res_lst = [] sql = 'insert into ligang.lg_position(company_name,city, position, source,publis_time) values(%s,%s,%s,%s,%s)' for res in res_lst: print res data = (company_name, res.get('city'), res.get('jobName'), site, datetime.now()) cp.mysql_handle.save(sql=sql, data=data) print json.dumps(res_lst, ensure_ascii=False, indent=4) cp.logger.info( 'match position : {} ,TOTAL' '{}'.format(site, len(res_lst)) ) if __name__ == '__main__': main()
# ===== mysql.py =====
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: ligang
@contact: [email protected]
@software: PyCharm
@file: mysql.py
@create at: 2017-12-15 14:20

Thin MySQL helper built on a DBUtils ``PersistentDB`` connection pool.
"""
from mf_utils.exceptions import MfImportException, MfDbException
from mf_utils.decorates import cls2singleton

try:
    import MySQLdb
except ImportError:
    raise MfImportException(
        "You should install MySQLdb first! try: pip install "
        "mysql-python")
try:
    from DBUtils.PersistentDB import PersistentDB
except ImportError:
    raise MfImportException(
        "You should install DBUtils first! try: pip install "
        "dbutils")


@cls2singleton
class MysqlHandle(object):
    """Singleton wrapper around a persistent MySQL connection pool.

    All connection keyword arguments (host, user, passwd, db, port,
    charset, ...) are forwarded to ``PersistentDB`` / ``MySQLdb.connect``.
    """

    def __init__(self, **kwargs):
        # One pooled, thread-local persistent connection per thread.
        self._mysql_pool = PersistentDB(MySQLdb, **kwargs)

    def query_by_sql(self, sql):
        """Run a read-only SQL statement and return all rows as tuples.

        :param sql: complete SQL text (caller is responsible for escaping;
            prefer parameterized queries via :meth:`save` for writes).
        :raises MfDbException: wrapping any driver-level error.
        """
        conn = self._mysql_pool.connection()
        cur = conn.cursor()
        try:
            cur.execute(sql)
            result = cur.fetchall()
            return result
        except Exception as e:
            raise MfDbException(e)
        finally:
            # Always release the cursor and return the connection to the pool.
            cur.close()
            conn.close()

    def save(self, sql, data, many=False, get_last_insert_id=False):
        """Execute a parameterized write statement and commit.

        :param sql: SQL with %s placeholders.
        :param data: parameter tuple, or an iterable of tuples when *many*.
        :param many: use ``executemany`` for batch inserts.
        :param get_last_insert_id: when True, return the AUTO_INCREMENT id
            of the inserted row; otherwise return the affected-row count.
        :raises MfDbException: wrapping any driver-level error.
        """
        conn = self._mysql_pool.connection()
        cur = conn.cursor()
        try:
            if many is False:
                cur.execute(sql, data)
            else:
                cur.executemany(sql, data)
            conn.commit()
            # BUG FIX: the original branch was inverted -- with
            # get_last_insert_id=True it called fetchone() without ever
            # executing the SELECT (so fetchone() returned None and None[0]
            # raised); with False it executed the SELECT and returned its
            # rowcount (always 1).
            if get_last_insert_id:
                cur.execute("select last_insert_id()")
                return cur.fetchone()[0]
            return cur.rowcount
        except Exception as e:
            raise MfDbException(e)
        finally:
            # Always release the cursor and return the connection to the pool.
            cur.close()
            conn.close()