Scraping job postings with Python

Pitfalls when using Selenium

  1. Selenium is a good way to get around anti-scraping measures, because while the program runs it behaves exactly like a real browser, and a client that fully matches browser behaviour is not easily blocked. Even so, problems remain in practice.
  2. The spider can only grab what it can see, i.e. the data you point it to. As long as you give it a suitable locator it will find the data, but front-end developers sometimes change the tags when they rework a page, so your locators stop matching and the spider is at risk of crashing.
  3. Workarounds (see the short sketch after this list):
    1. Before writing any code, open the page in a browser and inspect it; the main thing to look at is where and how the content you want is displayed.
    2. Use exception handling so the spider does not die; the exceptions it reports also tell you which locators are broken, so you can fix the code promptly.
    3. Add a reasonable wait between requests. Crawling becomes slightly slower, but stable data collection reduces rework.
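
As an illustration of points 2 and 3, here is a minimal sketch (not part of the original script; the helper name safe_text and the 20-second timeout are made up for the example) that combines an explicit wait with exception handling, so a changed tag yields a placeholder value instead of killing the spider:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def safe_text(driver, xpath, timeout=20, default='无'):
    # Wait up to `timeout` seconds for the element to appear (point 3),
    # and return a default instead of crashing if the locator no longer matches (point 2).
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
        return element.text
    except (TimeoutException, NoSuchElementException) as e:
        print(e)  # the printed exception tells you which locator broke
        return default

# usage sketch:
# from selenium import webdriver
# driver = webdriver.Chrome()
# driver.get('https://www.liepin.com/zhaopin/')
# print(safe_text(driver, '//h1'))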

Worked example:

Fetching Python job listings from Liepin (for practice and code testing only)

The data is collected by driving Chrome automatically.

The main fields collected are the company name, job title, salary, and job requirements (the code below also records the work location).

Finally, the data is stored in a MySQL database. The helper module below (imported later as save_data) handles the database side; after it come the spider itself and a word-cloud script.

# save_data module: creates the database and table, and provides the insert and text-dump helpers
import pymysql

def save(table):
    # Connect to MySQL, create the lp database if it does not exist,
    # and create a table named after the job keyword.
    print('------------------------------')
    global conn
    conn = pymysql.connect(host='127.0.0.1',
                           user='root',
                           passwd='XXX',
                           port=8080,
                           charset='utf8')
    global cur
    cur = conn.cursor()
    print('cursor obtained')
    try:
        cur.execute("create database lp character set utf8;")
    except Exception as e:
        print(e)  # the database may already exist
    cur.execute('use lp;')
    try:
        cur.execute("create table " + table + "(id int,company char(100),job char(200),"
                    "address char(100),salary char(100),ask varchar(5000))character set utf8;")
    except Exception as e:
        print(e)  # the table may already exist
    print('table created')

def inser_data(table, id, company, job, address, salary, ask):
    # Insert one job record; the values are passed as parameters to execute()
    sql_insert = 'insert into ' + table + '(id,company,job,address,salary,ask) values (%s,%s,%s,%s,%s,%s);'
    try:
        cur.execute(sql_insert, [id, company, job, address, salary, ask])
    except Exception as e:
        print(e)
    conn.commit()


def my_txt(table, ask):
    # Append the requirement text to a local txt file; it is used later to build the word cloud
    with open(table + '.txt', 'a+', encoding='utf-8') as f:
        f.write(ask)

'''
All of the job-requirement text is written to a local txt file, which is later used to build the word cloud.
The company name, job title and salary fields are stored in the database.
Because the salary is displayed as a range of the form "XX-XX", it is stored as a string.
'''
from selenium import webdriver
from time import sleep
import random
import re
from lp_spider import save_data
# from  lp_spider import py_cloud
start_url = 'https://www.liepin.com/zhaopin/'

def open_url():
    global driver
    driver = webdriver.Chrome()
    driver.get(start_url)
    driver.maximize_window()

def get_page(type):
    # Implicit wait so the page can finish loading
    driver.implicitly_wait(20)
    # Type the job keyword into the search box
    driver.find_element_by_xpath('//*[@id="sojob"]/div[1]/form/div[1]/div/div/div[1]/input').send_keys(type)
    # Click the search button
    driver.find_element_by_xpath('//*[@id="sojob"]/div[1]/form/div[1]/div/div/div[1]/button').click()
    # Scroll down the page
    driver.execute_script('window.scrollBy(0, 500)')



def get_info(table):
    global id  # record number
    id = 0
    for j in range(1, 101):        # result pages
        for i in range(1, 41):     # listings on the current page

            global company  # company name
            global job  # job title
            global salary  # salary
            global Ask  # job requirements
            try:
                # The badge in front of each listing marks its source type (企, 猎, 直, 优, or none)
                ty = driver.find_element_by_xpath('//*[@id="sojob"]/div[2]/div/div[1]/div[1]/ul/li['+str(i)+']/i/b').text
            except:
                ty = '无'
            print(ty)
            if ty == '企':
                #sleep(random.choice(range(5, 15)))
                # only listings posted directly by an employer (企) are scraped
                try:
                    # open the listing's detail page (it opens in a new tab)
                    driver.find_element_by_xpath('//*[@id="sojob"]/div[2]/div/div[1]/div[1]/ul/li['+str(i)+']/div/div[1]/h3/a').click()
                    #print(i)
                    # switch to the newly opened tab
                    print('page url:', end=' ')
                    print(driver.current_url)
                    handles = driver.window_handles
                    driver.switch_to.window(handles[len(handles)-1])
                    #print(driver.current_url)
                    driver.implicitly_wait(20)
                    # start extracting the fields
                    try:
                        company = driver.find_element_by_xpath(
                            '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[1]/h3/a[@title]').text
                    except Exception as e:
                        print(e)
                        try:
                            company = driver.find_element_by_xpath(
                                '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[1]/h3').text
                        except Exception as e:
                            print(e)
                            company = driver.find_element_by_xpath(
                                '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[1]/h1[@title]').text

                    #print(company)
                    try:
                        job = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[1]/h1').text
                    except Exception as e:
                        print(e)
                        job = driver.find_element_by_xpath('//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[1]/h1[@title]').text
                    #print(job)
                    #sleep(random.choice(range(1,5)))
                    try:
                        salary = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[1]').text
                        # extract any Chinese characters; 面议 means the salary is negotiable
                        salary_m = re.findall('[\u4e00-\u9fa5]+', salary)
                        if (salary_m[0] == '面议'):
                            salary = ['面议']
                        else:
                            salary = driver.find_element_by_xpath(
                                '//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[1]').text
                            if (len(salary)<8):
                                salary = [salary]
                            else:
                                salary = re.findall('[0-9]*.[0-9]*.[\u4e00-\u9fa5]+', salary)

                    except Exception as e:
                        print(e)
                        salary = driver.find_element_by_xpath(
                            '//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[2]/div/div/p[1]').text
                        if (len(salary) < 8):
                            salary = [salary]
                        else:
                            salary = re.findall('[0-9]*.[0-9]*.[\u4e00-\u9fa5]+', salary)
                    #print(salary)  # after the processing above, salary is a list; salary[0] is stored later
                    try:

                        address = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[2]/span/a').text
                    except Exception as e:
                        print(e)
                        try:
                            address = driver.find_element_by_xpath('//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[2]/div/div/p[2]/span').text
                        except Exception as e:
                            print(e)
                            # an XPath ending in /text() is not a valid element locator, so fall back to the span element itself
                            address = driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/p[2]/span').text
                    #print(address)
                    # scroll the page down
                    driver.execute_script('window.scrollBy(0,400)')
                    #sleep(10)
                    try:
                        Ask= driver.find_element_by_xpath('//*[@id="job-view-enterprise"]/div[1]/div[1]/div[1]/div[1]/div[3]/div').text

                    except Exception as e:
                        Ask = driver.find_element_by_xpath('//*[@id="job-hunter"]/div[1]/div[1]/div[1]/div[1]/div/div[3]/div').text
                    #Ask = Ask.replace("\n",'')
                    # strip the common section headers from the requirement text
                    # (str.replace never raises, so the try/except wrappers are not needed)
                    for header in ("任职要求:", "岗位职责:", "职位描述:",
                                   "岗位要求:", "职责描述:", "任职资格:"):
                        Ask = Ask.replace(header, "")
                    # print(Ask)
                    driver.close()
                    handles = driver.window_handles
                    sleep(random.choice(range(1, 5)))
                    # switch back to the search-results tab (the detail tab has just been closed)
                    driver.switch_to.window(handles[len(handles)-2])
                    # # scroll down
                    # driver.execute_script('window.scrollBy(0, 145)')
                    print(j, end='.')
                    print(i)
                    #print('————————————————————————————————————————————————————————————————————————' * 10)
                    save_data.inser_data(table,str(id), company, job, address, salary[0], Ask)
                    save_data.my_txt(table,Ask)
                    id = id + 1
                except:
                    pass
            else:
                print(j, end='.')
                print(i,end='完成')
                #print('————————————————————————————————————————————————————————————————————————'*10)
            if i<40:
                # scroll down by the height of one listing; the height differs slightly by badge type
                if ty == '企':
                    driver.execute_script('window.scrollBy(0, 145)')
                if ty == '猎':
                    driver.execute_script('window.scrollBy(0,141)')
                if ty == '直':
                    driver.execute_script('window.scrollBy(0,145)')
                if ty == '无':
                    driver.execute_script('window.scrollBy(0,137)')
                if ty == '优':
                    driver.execute_script('window.scrollBy(0,139)')
        try:
            # go to the next page of results
            driver.find_element_by_xpath('//*[@id="sojob"]/div[2]/div/div[1]/div[1]/div/div/a[8]').click()
        except:
            driver.execute_script('window.scrollTo(0,0)')  # scroll back to the top of the page
            driver.execute_script('window.scrollBy(0,{})'.format(145 * 42))
            driver.find_element_by_xpath('//*[@id="sojob"]/div[2]/div/div[1]/div[1]/div/div/a[8]').click()
        sleep(random.choice(range(3,5)))
        driver.execute_script('window.scrollBy(0, 500)')
    save_data.cur.close()
    save_data.conn.close()
if __name__ == '__main__':
    while True:
        print('Enter the job category to crawl, then press Enter --> ', end='')
        ty = input()
        save_data.save(ty)
        open_url()
        get_page(ty)
        get_info(ty)
        #py_cloud.make_cloud('python')
        print('finished crawling')
# Word cloud: build a word cloud image from the saved requirement text


from wordcloud import WordCloud
import cv2
import jieba

# read the requirement text that the spider saved locally
with open('lp.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# jieba segments the Chinese text; WordCloud expects space-separated tokens
cut_text = " ".join(jieba.cut(text))

# the mask image defines the shape of the word cloud
color_mask = cv2.imread('python1.jpg')

cloud = WordCloud(
    # set a Chinese font, otherwise the characters come out garbled
    font_path="C:\\Windows\\Fonts\\STXINGKA.TTF",
    # font_path=path.join(d,'simsun.ttc'),
    # background colour
    background_color='white',
    # shape of the cloud
    mask=color_mask,
    # maximum number of words
    max_words=10000,
    # maximum font size
    max_font_size=100
)

wCloud = cloud.generate(cut_text)
wCloud.to_file('cloud.png')

import matplotlib.pyplot as plt

plt.imshow(wCloud, interpolation='bilinear')
plt.axis('off')
plt.show()
