python爬虫使用selenium爬取动态网页信息——以智联招聘网站为例

python版本3.6
#导入所需模块
import codecs
import csv
import os
import time
from urllib.parse import urlencode

from openpyxl import Workbook
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
#存储csv的方法
class saving(object):
    """Append scraped rows to ``csv/zhilian.csv``.

    Creating an instance makes sure the ``csv`` and ``excel`` directories
    exist relative to the current working directory.
    """

    # Relative path of the CSV file every row is appended to.
    CSV_PATH = 'csv/zhilian.csv'

    def __init__(self):
        # exist_ok=True avoids the check-then-create race of
        # os.path.exists() followed by os.mkdir().
        for folder in ('csv', 'excel'):
            os.makedirs(folder, exist_ok=True)

    def savetocsv(self, data):
        """Append one row to the CSV file.

        Args:
            data: Sequence of cell values that make up a single row.
        """
        # newline='' leaves line-ending handling to the csv module, as the
        # csv documentation recommends for files passed to csv.writer.
        with open(self.CSV_PATH, 'a', encoding='utf-8', newline='') as csvfile:
            csv.writer(csvfile).writerow(data)
#设定爬取内容的方法
def _field_text(card, xpath):
    """Return the text of the element located at *xpath* relative to *card*."""
    return card.find_element(By.XPATH, xpath).text


def _extract_row(card, industry):
    """Build one CSV row from a single job-listing element.

    Every field except the company size is required: a missing element
    raises ``NoSuchElementException``. Only the company size falls back to
    an empty string.
    """
    company = _field_text(card, 'div/div/div/div[2]/a')
    release_time = _field_text(card, 'div/div/div[3]/div[2]/span')
    salary = _field_text(card, 'div/div/div[2]/div/p')
    address = _field_text(card, 'div/div/div[2]/div/ul/li[1]')
    experience = _field_text(card, 'div/div/div[2]/div/ul/li[2]')
    education = _field_text(card, 'div/div/div[2]/div/ul/li[3]')
    c_type = _field_text(card, 'div/div/div[2]/div[2]/span[1]')
    try:
        c_size = _field_text(card, 'div/div/div[2]/div[2]/span[2]')
    except NoSuchElementException:
        # Some listings have no company-size element.
        c_size = ''
    return [industry, company, release_time, salary, address,
            experience, education, c_type, c_size]


def getzhilian(url, city_name, industry, max_pages=25, page_size=60):
    """Scrape job listings from a Zhilian search-result page into the CSV.

    Opens *url* in a new Chrome session, walks through up to *max_pages*
    result pages, and appends one row per listing through ``saving``.

    Args:
        url: Search-result URL to open.
        city_name: City label, used only in progress messages.
        industry: Industry label, written as the first column of each row
            and used in progress messages.
        max_pages: Maximum number of result pages to visit.
        page_size: Number of listings on a full page; a page with fewer
            listings is treated as the last one.

    Raises:
        NoSuchElementException: If the listing container, a required field
            or the next-page button cannot be found.
    """
    writer = saving()
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # The pauses throughout are deliberate throttling; the original
        # author added them to reduce the risk of the IP being blocked.
        time.sleep(1)
        browser.maximize_window()
        time.sleep(1)
        for page in range(max_pages):
            # Scroll down in small steps before reading the page.
            for _ in range(3):
                time.sleep(2)
                browser.execute_script('window.scrollBy(0,300)', '')
            print('开始获取', city_name, industry, page + 1, '页数据')
            time.sleep(1)
            container = browser.find_element(By.ID, 'listContent')
            cards = container.find_elements(By.XPATH, 'div')
            for card in cards:
                time.sleep(1)
                row = _extract_row(card, industry)
                time.sleep(1)
                writer.savetocsv(row)
            print(city_name, industry, page + 1, '页数据获取完成')
            # A short page means there is no further page to load.
            if len(cards) < page_size:
                return
            browser.find_element(By.XPATH, './/div/button[2]').click()
            time.sleep(1)
    finally:
        # quit() ends the whole driver session on every exit path, so no
        # Chrome/chromedriver process is left behind per city/industry.
        browser.quit()
 #主函数设置
def main(filename):
    """Scrape listings for every configured city/industry combination.

    Args:
        filename: Search keyword sent as the ``kw`` query parameter.
            Despite its name, it is not used as a file path here.
    """
    # City and industry codes sent as the ``jl`` and ``in`` query parameters,
    # mapped to the labels used for logging and for the CSV rows.
    citys = {530: '北京', 538: '上海', 763: '广州', 765: '深圳'}
    industries = {
        10100: '互联网/IT',
        10200: '金融',
        10800: '房地产/建筑',
        10900: '商业服务',
        10300: '贸易/批发/零售',
        10400: '教育/艺术',
        10000: '服务业',
        11300: '文化/传媒/娱乐',
        999999: '其他',
    }
    for city_code, city_name in citys.items():
        for industry_code, industry in industries.items():
            # urlencode percent-encodes the keyword so non-ASCII text, '&'
            # or spaces cannot corrupt the query string.
            query = urlencode({
                'pageSize': 60,
                'jl': city_code,
                'in': industry_code,
                'kw': filename,
                'kt': 3,
            })
            url = 'https://sou.zhaopin.com/?' + query
            getzhilian(url, city_name, industry)
            # Report the city name (not its numeric code), matching the
            # other progress messages.
            print(city_name, industry, '存储数据完成')
if __name__ == '__main__':
    # The literal below ("job title") is passed to main() as the search
    # keyword; replace it with the position to search for.
    search_keyword = '职位名称'
    main(search_keyword)
注意事项：当selenium中的class_name与tag_name等方式无法获取内容时，不妨试试xpath的方法。
python爬虫使用selenium爬取动态网页信息——以智联招聘网站为例

猜你喜欢