Scraping Lagou job listings with Selenium and parsing them with BeautifulSoup, OOP style, with automatic pagination

Scraping job listings from Lagou
Scrape listings for the keyword "python" located in Beijing's Chaoyang district (朝阳区), then pick out the positions posted today. URL:
https://www.lagou.com/jobs/list_python/p-city_2?px=default&district=朝阳区#filterBox
OOP design: Spider (fetches the raw page HTML) ---- Parser (parses the raw HTML into job records) ---- Job (one job-posting object)
from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
import time
import pandas as pd   # for saving the results to CSV


class Spider():
    def __init__(self, target_url):
        self.url = target_url
        self.raw_pages = []   # raw HTML of every result page
        self.boot()

    def boot(self):
        # Selenium 3 style: the chromedriver path is passed straight to the driver
        self.chrome = Chrome(executable_path='C:/Users/lori/Desktop/python52project/chromedriver_win32/chromedriver.exe')

    def crawl(self):
        self.chrome.get(self.url)
        print('loading webpage...')
        time.sleep(5)
        while True:
            raw_page = self.chrome.page_source
            self.raw_pages.append(raw_page)
            time.sleep(2)
            next_page_eles = self.next_page()   # look up the clickable "next page" button once
            if next_page_eles:
                self.click_ele(next_page_eles[0])
                print('loading the page after the click...')
                time.sleep(5)
            else:
                break

    def click_ele(self, ele):
        self.chrome.execute_script("arguments[0].scrollIntoView(false);", ele)  # scroll so the element's bottom edge lines up with the bottom of the viewport
        time.sleep(2)
        self.chrome.execute_script("arguments[0].click();", ele)  # a JS click fires even if the element is covered by another element
        print('clicked "next page"')

    def next_page(self):
        sel = 'span[class="pager_next "]'    # the clickable "next page" button; the trailing space in the class distinguishes it from the greyed-out "next page" element
        return self.chrome.find_elements_by_css_selector(sel)  # returns a list, empty on the last page
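
The two calls above use Selenium 3 APIs: the executable_path keyword and find_elements_by_css_selector were both removed in Selenium 4. A minimal sketch of drop-in replacements for boot() and next_page() inside Spider, assuming the same chromedriver path:

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

    def boot(self):
        # Selenium 4: the driver path is wrapped in a Service object
        self.chrome = Chrome(service=Service('C:/Users/lori/Desktop/python52project/chromedriver_win32/chromedriver.exe'))

    def next_page(self):
        # Selenium 4: find_elements(By.CSS_SELECTOR, ...) replaces the removed method
        return self.chrome.find_elements(By.CSS_SELECTOR, 'span[class="pager_next "]')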

class Parser():
    def __init__(self, raw_pages):
        self.raw_pages = raw_pages
        self.jobs = []

    def parse(self):
        for html in self.raw_pages:
            soup = BeautifulSoup(html, 'html.parser')
            pos_sel = 'div.position h3'          # job title (text)
            time_sel = 'span.format-time'        # posting time (text)
            money_sel = 'span.money'             # salary (text)
            company_sel = 'div.company_name>a'   # company name (text)
            link_sel = 'a.position_link'         # link to the job detail page (href)

            pos_eles = soup.select(pos_sel)
            time_eles = soup.select(time_sel)
            money_eles = soup.select(money_sel)
            company_eles = soup.select(company_sel)
            link_eles = soup.select(link_sel)

            for p, t, m, c, l in zip(pos_eles, time_eles, money_eles, company_eles, link_eles):
                cell = {
                    'position': p.text,
                    'time': t.text,
                    'money': m.text,
                    'company': c.text,
                    'link': l.get('href')
                }
                self.jobs.append(cell)

    def get_jobs(self):
        return [Job(j) for j in self.jobs]
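
Note that parse() relies on zip() keeping the five element lists aligned: if one listing is missing, say, its salary tag, every later row silently shifts by one. A per-card variant avoids this; the following is a minimal sketch in which the li.con_list_item card selector is an assumption about Lagou's list markup, not something taken from the code above:

    def parse_per_card(self):
        # Walk one job card at a time instead of zipping parallel lists
        for html in self.raw_pages:
            soup = BeautifulSoup(html, 'html.parser')
            for card in soup.select('li.con_list_item'):   # assumed card selector
                pos = card.select_one('div.position h3')
                t = card.select_one('span.format-time')
                money = card.select_one('span.money')
                company = card.select_one('div.company_name>a')
                link = card.select_one('a.position_link')
                if not all((pos, t, money, company, link)):
                    continue   # skip incomplete cards instead of misaligning rows
                self.jobs.append({'position': pos.text, 'time': t.text,
                                  'money': money.text, 'company': company.text,
                                  'link': link.get('href')})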

class Job():
    def __init__(self, data_dict):
        self.position = data_dict.get('position')  # dict.get('key') beats dict['key'] here: it returns None instead of raising KeyError
        self.time = data_dict.get('time')
        self.money = data_dict.get('money')
        self.company = data_dict.get('company')
        self.link = data_dict.get('link')

    def is_today(self):
        # Lagou shows a clock time (e.g. '11:30') for jobs posted today and a date for older ones
        return ':' in self.time
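
With that display convention, the check behaves like this:

Job({'time': '11:30'}).is_today()       # True  -> clock time, posted today
Job({'time': '2020-03-10'}).is_today()  # False -> date, posted earlier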


target_url = 'https://www.lagou.com/jobs/list_python/p-city_2?px=default&district=朝阳区#filterBox'
s = Spider(target_url)
s.crawl()
print('number of pages in s.raw_pages:', len(s.raw_pages))
p = Parser(s.raw_pages)
p.parse()
print('-' * 40)
jobs = p.get_jobs()
num = 0
for job in jobs:
    if job.is_today():
        num += 1
        print(num, job.position, job.time, job.company)

jobs_df = pd.DataFrame(p.jobs)
jobs_df.to_csv('./lagouwang.csv', index=False, mode='w', encoding='utf-8-sig')
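
The utf-8-sig encoding writes a BOM so that Excel opens the Chinese text correctly. To keep only today's postings in the CSV, the same clock-time heuristic works on the DataFrame; a small sketch:

today_df = jobs_df[jobs_df['time'].str.contains(':', regex=False)]   # rows posted today
today_df.to_csv('./lagouwang_today.csv', index=False, encoding='utf-8-sig')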

  


Reposted from www.cnblogs.com/djlbolgs/p/12513516.html