PySpark configuration

Use with caution: if you point PYSPARK_PYTHON at python3.5 and use pyspark.ml, the executors' runtime environment must also be python3.5 — the driver and executor Python versions must match exactly, or the job will fail.

# Bootstrap Spark onto sys.path before any pyspark import can succeed.
import findspark

findspark.init()

import os
import pickle

import numpy as np
import pandas as pd

# Executors must run the same Python version as the driver (3.5 here);
# a mismatch makes pyspark.ml jobs fail at runtime.
os.environ["PYSPARK_PYTHON"] = "/home/q/conda/bin/python3.5"

# NOTE: original line read "from pyspark imports ..." — a syntax error; fixed.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

os.environ["PYSPARK_DRIVER_PYTHON"] = "python"

# local[20]

#import jieba

# jieba.initialize()

 

# Build the Spark configuration: run on YARN in the root.adhoc queue and
# ship the pyspark/py4j archives so executors can import them.
# NOTE: original line read "SparkConf conf = () \" (Java-style, invalid
# Python); fixed to a proper constructor call.
conf = SparkConf() \
    .setAppName("NLP_Project_youming.guo") \
    .setMaster("yarn") \
    .set('spark.yarn.queue', "root.adhoc") \
    .set('spark.yarn.dist.files',
         'file:/home/q/spark/python/lib/pyspark.zip,file:/home/q/spark/python/lib/py4j-0.10.4-src.zip') \
    .setExecutorEnv('PYTHONPATH', 'pyspark.zip:py4j-0.10.4-src.zip') \
    .set('PYSPARK_PYTHON', '/home/q/conda/bin/python3.5')

# Resource sizing; dynamic allocation caps the job at 5 executors.
conf.set("spark.executor.memory", "5g")
conf.set("spark.driver.memory", "10g")
conf.set("spark.executor.cores", "2")
conf.set("spark.dynamicAllocation.maxExecutors", "5")
conf.set("spark.driver.maxResultSize", "0")  # "0" = no limit on collect() size
conf.set("spark.dynamicAllocation.enabled", "true")
# The external shuffle service is required when dynamic allocation is on,
# so executors can be released without losing shuffle data.
conf.set("spark.shuffle.service.enabled", "true")
conf.set("spark.shuffle.service.port", "7338")

# Entry points: the legacy SQLContext is kept alongside the modern
# SparkSession for older DataFrame code paths.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

 

Originally published at www.cnblogs.com/Tw1st-Fate/p/11094344.html