Python Crawler (Scraping Job Listings from a Recruitment Site)

This post implements a Python web crawler that scrapes information from the shixiseng (实习僧) internship site and stores it in MongoDB. It also includes a server and a client: the client sends the server the job keyword to look up, the server queries the database and returns the results to the client. The server is implemented with Flask.

The project is split into three files: crawler.py, server.py and client.py. crawler.py is responsible for crawling the site's content.

The source code follows:
crawler.py
The crawler uses multiprocessing together with a thread pool (ThreadPoolExecutor) to improve efficiency. Every page fetch spends time waiting on I/O, so running the fetches in multiple threads raises overall throughput. Multiprocessing is used on top of that to raise CPU utilization and speed things up: a Python thread must hold the GIL to run, so multithreading alone cannot increase CPU utilization, which is why multiple processes are used as well.
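
To make the idea concrete, here is a minimal sketch of the pattern (illustrative only, not part of the project; the names fetch() and URLS are made up): several processes are started, and each one runs its own thread pool over I/O-bound downloads.

from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
from urllib.request import urlopen

URLS = ['https://www.shixiseng.com/'] * 4   # example workload

def fetch(url):
    # I/O-bound step: the thread releases the GIL while waiting on the network
    with urlopen(url, timeout=10) as resp:
        return len(resp.read())

def worker():
    # one thread pool per process, four concurrent downloads each
    with ThreadPoolExecutor(max_workers=4) as executor:
        print(list(executor.map(fetch, URLS)))

if __name__ == '__main__':
    processes = [Process(target=worker) for _ in range(2)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()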

MongoDB is accessed from Python through the pymongo library.
The collection is named 'craw'.
A MongoConnector class wraps the commonly used database operations.
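
As a quick illustration of what MongoConnector wraps, the raw pymongo calls look roughly like this (a standalone sketch assuming a MongoDB instance on localhost:27017; the inserted record is made-up sample data):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
collection = client.mydatabase['craw']          # the collection used by the crawler
collection.insert_one({'title': 'Python 实习生 (sample)', 'url': 'https://www.shixiseng.com/'})
for row in collection.find({'title': {'$regex': 'Python'}}):
    print(row['title'], row['url'])
client.close()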

The logging module is wrapped in a Logger class, which builds a logger object and exposes a handle to it.

In the MyCrawler class, links are crawled in a loop starting from the seed page, and the job title found at each link is stored in the database; links that are already stored are skipped.

getPageSource fetches a page and parses its source with BeautifulSoup; getTitles then extracts the title from the parsed page.
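
Stripped of the proxy and error handling that getPageSource adds, the core of that step boils down to a few lines (a sketch only):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://www.shixiseng.com/', timeout=10).read().decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
print(soup.title.string if soup.title else 'no <title> found')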

The crawl is breadth-first with a maximum depth of 3. This part of the code is adapted from another post: https://blog.csdn.net/weixin_34613450/article/details/72810595

from pymongo import MongoClient
import multiprocessing
from multiprocessing import Process
from concurrent.futures import ThreadPoolExecutor
import time
import random
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
from bs4 import BeautifulSoup
import logging

class MongoConnector:
    def __init__(self):
        # create the connection with mongodb
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.mydatabase
    # return the 'craw' collection
    def GetCollection(self):
        self.collection = self.db['craw']
        return self.collection
    # insert one record
    def Insert(self,pair):
        self.client.mydatabase.craw.insert_one(pair)
    # close the database connection
    def Close(self):
        self.client.close()
    # find matching records and return a cursor
    def Find(self,name,value):
        row = self.collection.find({name:value})
        return row

class Logger():
    def __init__(self, filename):
        # module-level logger, INFO and above
        self.logger = logging.getLogger(__name__)
        log_format = '[%(asctime)s] [%(levelname)s] [%(processName)s] [%(threadName)s] : %(message)s'
        # basicConfig attaches a stderr handler with this format
        logging.basicConfig(format=log_format, level=logging.INFO)
        # add a handler that writes the same format to the log file
        self.file_handler = logging.FileHandler(filename)
        self.file_handler.setLevel(logging.INFO)
        self.file_handler.setFormatter(logging.Formatter(log_format))
        self.logger.addHandler(self.file_handler)

    # return the underlying logger object
    def getLogger(self):
        return self.logger

# custom URL queue class
class linkQuence:
    def __init__(self):
        # list of URLs that have been visited
        self.visted = []
        # list of URLs waiting to be visited
        self.unVisited = []

    # return the visited-URL queue
    def getVisitedUrl(self):
        return self.visted

    # return the unvisited-URL queue
    def getUnvisitedUrl(self):
        return self.unVisited

    # add a URL to the visited queue
    def addVisitedUrl(self, url):
        self.visted.append(url)

    # remove a URL from the visited queue
    def removeVisitedUrl(self, url):
        self.visted.remove(url)

    # pop an unvisited URL off the queue
    def unVisitedUrlDeQuence(self):
        try:
            return self.unVisited.pop()
        except IndexError:
            return None

    # make sure every URL is visited only once
    def addUnvisitedUrl(self, url):
        if url != "" and url not in self.visted and url not in self.unVisited:
            self.unVisited.insert(0, url)

    # number of visited URLs
    def getVisitedUrlCount(self):
        return len(self.visted)

    # number of unvisited URLs
    def getUnvistedUrlCount(self):
        return len(self.unVisited)

    # check whether the unvisited queue is empty
    def unVisitedUrlsEnmpy(self):
        return len(self.unVisited) == 0

class MyCrawler:
    def __init__(self,seeds):
        # current crawl depth
        self.current_deepth = 1
        # build the proxy opener used for all requests
        self.proxy = '119.28.142.148:8888'
        self.proxy_handler = ProxyHandler({
            'http': 'http://' + self.proxy,
            'https': 'https://' + self.proxy
        })
        self.opener = build_opener(self.proxy_handler)
        # build a linkQuence object and seed the URL queue
        self.linkQuence = linkQuence()
        if isinstance(seeds, str):
            self.linkQuence.addUnvisitedUrl(seeds)
        if isinstance(seeds, list):
            for seed in seeds:
                self.linkQuence.addUnvisitedUrl(seed)
        # print("Add the seeds url %s to the unvisited url list" % str(self.linkQuence.unVisited))
        # logger.info(("Add the seeds url {} to the unvisited url list").format(str(self.linkQuence.unVisited)))

    # single-pass variant: crawl only the seed page (not used by the main flow below)
    def crawling1(self, seeds):
        # fetch the hyperlinks on the seed page
        links = self.getHyperLinks(seeds)
        logger.info(("Get {} new links").format(len(links)))

    # main crawl loop: breadth-first, limited by crawl_deepth
    def crawling(self, seeds, crawl_deepth):
        # keep going while the crawl depth does not exceed crawl_deepth
        while self.current_deepth <= crawl_deepth:
            # links discovered at this depth, to be crawled at the next depth
            new_links = set()
            # process every URL that is still unvisited at the current depth
            while not self.linkQuence.unVisitedUrlsEnmpy():
                # pop a URL off the unvisited queue
                visitUrl = self.linkQuence.unVisitedUrlDeQuence()
                if visitUrl is None or visitUrl == "":
                    continue
                # fetch the hyperlinks on that page
                links = self.getHyperLinks(visitUrl)
                logger.info(("Get {} new links").format(len(links)))
                new_links.update(links)
                # mark the URL as visited
                self.linkQuence.addVisitedUrl(visitUrl)

            # enqueue everything found at this depth for the next round
            for link in new_links:
                self.linkQuence.addUnvisitedUrl(link)

            self.current_deepth += 1

    # extract the hyperlinks from a page's source
    def getHyperLinks(self, url):
        links = set()
        soup = self.getPageSource(url)
        if soup is None:
            return links

        for link in soup.find_all('a'):
            href = str(link.get('href')).split()
            # skip empty, javascript: and mailto: style links
            if 'None' in href or 'javascript' in href or href == '/' or 'Mailto:[email protected]' in href:
                continue
            got_link = None
            if len(href) == 1:
                if 'http:' in href[0] or 'www' in href[0] or 'shixiseng' in href[0]:
                    got_link = href[0]
                else:
                    # relative link: prepend the site root
                    got_link = 'https://www.shixiseng.com' + href[0]
            elif len(href) == 2:
                got_link = 'https://www.shixiseng.com' + href[0] + href[1]
            if got_link is not None:
                links.add(got_link)

        logger.info(('Get {} new links').format(len(links)))
        self.getTitles(links)
        return links

    # fetch a page and return its parsed source (or None on failure)
    def getPageSource(self, url):
        try:
            html = self.opener.open(url)
            soup = BeautifulSoup(html.read().decode('utf-8'), 'lxml')
            return soup
        except URLError as e:
            logger.error(('URLError:{}').format(e.reason))
            return None
        except UnicodeError as e:
            logger.error(('UnicodeError:{}').format(e))
            return None

    def getTitles(self, links):
        for link in links:
            # skip links that are already stored in the database
            row = mongo.Find('url', link)
            if next(row, None) is not None:
                logger.info(('URL {} already in database.').format(link))
            else:
                try:
                    html = self.opener.open(link)
                    soup = BeautifulSoup(html.read().decode('utf-8'), 'lxml')
                    title = str(list(soup.title.children))
                    pair = {'title': title[2:-2], 'url': link}
                    mongo.Insert(pair)
                    logger.info(('Insert title:{},url:{} into database.').format(title[2:-2], link))
                except Exception as e:
                    logger.error(('Failed to fetch {}: {}').format(link, e))

def main(seeds):
    craw = MyCrawler(seeds)
    craw.crawling(seeds,3)

def TargetFunction(socket_queue, locker, seed_link):
    with ThreadPoolExecutor(max_workers=4) as executor:
        while True:
            # pause for a random fraction of a second so the processes do not all hit the site at once
            seconds = random.random()
            time.sleep(seconds)
            # take and release the shared lock (placeholder for work that needs exclusive access)
            locker.acquire()
            locker.release()
            # submit a crawl of the seed link to the thread pool
            executor.submit(main, seed_link)

if __name__ == '__main__':
    logger = Logger('crawler_log.txt').getLogger()

    seed_link = "https://www.shixiseng.com/"

    # create variables for multiprocessing
    processes = []
    socket_queue = multiprocessing.Queue()
    locker = multiprocessing.Lock()

    mongo = MongoConnector()
    collection = mongo.GetCollection()
    # create child processes
    for _ in range(4):
        child_process = Process(target=TargetFunction, args=(socket_queue, locker, seed_link,))
        child_process.start()
        processes.append(child_process)

    # wait for the crawler processes before closing the shared connection
    for child_process in processes:
        child_process.join()
    mongo.Close()
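
With a local MongoDB instance running (and the proxy configured in MyCrawler reachable), the crawler is started directly and keeps running until it is interrupted, writing its progress to crawler_log.txt:

python crawler.py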

server.py

from flask import Flask
from flask import jsonify
from pymongo import MongoClient
import logging

class MongoConnector:
    def __init__(self):
        # create the connection with mongodb
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.mydatabase
    # return the 'craw' collection (the one the crawler writes to)
    def GetCollection(self):
        self.collection = self.db['craw']
        return self.collection
    # insert one record
    def Insert(self,pair):
        self.client.mydatabase.craw.insert_one(pair)
    # close the database connection
    def Close(self):
        self.client.close()
    # find matching records and return a cursor
    def Find(self,name,value):
        row = self.collection.find({name:value})
        return row

class Logger():
    def __init__(self, filename):
        # module-level logger, INFO and above
        self.logger = logging.getLogger(__name__)
        log_format = '[%(asctime)s] [%(levelname)s] [%(processName)s] [%(threadName)s] : %(message)s'
        # basicConfig attaches a stderr handler with this format
        logging.basicConfig(format=log_format, level=logging.INFO)
        # add a handler that writes the same format to the log file
        self.file_handler = logging.FileHandler(filename)
        self.file_handler.setLevel(logging.INFO)
        self.file_handler.setFormatter(logging.Formatter(log_format))
        self.logger.addHandler(self.file_handler)

    # return the underlying logger object
    def getLogger(self):
        return self.logger

app = Flask(__name__)

@app.route('/position/<position_info>', methods=['GET'])
def search(position_info):
    links = set()
    row = collection.find({'title': {'$regex': position_info}})
    result = {}
    for item in row:
        if item['url'] not in links:
            logger.info(('Find a record about {}, url is {}').format(item['title'],item['url']))
            #print('item', item)  # item is a dict
            links.add(item['url'])
            result[item['title']] = item['url']
    #print('result',result)
    response = jsonify(result)
    return response

if __name__ == '__main__':
    logger = Logger('server_log.txt').getLogger()
    mongo = MongoConnector()
    collection = mongo.GetCollection()
    try:
        app.run(host="localhost", port=5002)
    except ValueError as e:
        logger.error('Error: {}'.format(e.args))
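
Once the server is running, the endpoint can also be exercised without the client, for example with curl (the keyword 'python' here is just an example); the response is a JSON object mapping each matching title to its URL:

curl http://localhost:5002/position/python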

client.py

import sys
import json
import requests
import logging
import multiprocessing
from multiprocessing import Process
import time
import random

class Logger():
    def __init__(self, file_name):
        # module-level logger, INFO and above
        self.logger = logging.getLogger(__name__)
        log_format = '[%(asctime)s] [%(levelname)s] [%(processName)s] [%(threadName)s] : %(message)s'
        # basicConfig attaches a stderr handler with this format
        logging.basicConfig(format=log_format, level=logging.INFO)
        # add a handler that writes the same format to the log file
        self.file_handler = logging.FileHandler(file_name)
        self.file_handler.setLevel(logging.INFO)
        self.file_handler.setFormatter(logging.Formatter(log_format))
        self.logger.addHandler(self.file_handler)

    # return the underlying logger object
    def getLogger(self):
        return self.logger

def TargetFunction(socket_queue, locker, position_info):
    # stagger the requests from the four processes slightly
    seconds = random.random()
    time.sleep(seconds)
    locker.acquire()
    locker.release()
    try:
        url = 'http://{}:{}/position/{}'.format(ipAddress, portNum, position_info)
        response = requests.get(url)
        result = json.dumps(response.json(), indent=4, sort_keys=True)
        logger.info(('Request:/position/{}').format(position_info))
        print(("Search results of '{}':").format(position_info))
        try:
            d = json.loads(result)
            for key, value in d.items():
                print(key, '  You can visit the position here:', value)
                logger.info(('Searched {},{} for {}').format(key, value, position_info))
        except UnboundLocalError as e:
            logger.error('Error: {}'.format(e.args))
    except json.decoder.JSONDecodeError as e:
        logger.error('Error: {}'.format(e.args))

if __name__ == "__main__":
    logger = Logger('client_log.txt').getLogger()
    position_info = str(sys.argv[1])
    ipAddress = 'localhost'
    portNum = 5002

    # create variables for multiprocessing
    processes = []
    socket_queue = multiprocessing.Queue()
    locker = multiprocessing.Lock()

    # create child processes
    for _ in range(4):
        child_process = Process(target=TargetFunction, args=(socket_queue, locker, position_info.strip(),))
        child_process.start()
        processes.append(child_process)

    # wait for all query processes to finish
    for child_process in processes:
        child_process.join()
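
The client takes the keyword to search for as its first command-line argument, for example:

python client.py python

It starts four processes that each send the same query to the server, print the titles and URLs that come back, and log them to client_log.txt.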

Reposted from blog.csdn.net/ninnyyan/article/details/80177058