# Use with caution: when using pyspark.ml, set PYSPARK_PYTHON to python3.5 and make
# sure the driver's runtime environment is also Python 3.5 — the executor and driver
# Python versions must match exactly, or the job will fail with errors.
# findspark locates the local Spark installation and puts pyspark on sys.path;
# it must run before any `pyspark` import succeeds.
import findspark
findspark.init()

import os
import pickle

import numpy as np
import pandas as pd

# Pin the executor Python so it matches the driver's version (3.5);
# a minor-version mismatch makes Spark workers crash at task launch.
os.environ["PYSPARK_PYTHON"] = "/home/q/conda/bin/python3.5"
os.environ["PYSPARK_DRIVER_PYTHON"] = "python"

# BUG FIX: original line read `from pyspark imports ...` (invalid keyword `imports`).
from pyspark import SparkContext, SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession, SQLContext

# local[20]
# import jieba
# jieba.initialize()
# BUG FIX: original line read `SparkConf conf = ()` — Java-style declaration
# syntax, a SyntaxError in Python. Build the SparkConf with a fluent chain.
#
# Runs the job on YARN, ships the pyspark/py4j zips to executors via
# spark.yarn.dist.files, and pins the executor Python to 3.5 so it matches
# the driver (see the PYSPARK_PYTHON env var set above).
conf = (
    SparkConf()
    .setAppName("NLP_Project_youming.guo")
    .setMaster("yarn")
    .set('spark.yarn.queue', "root.adhoc")
    .set('spark.yarn.dist.files',
         'file:/home/q/spark/python/lib/pyspark.zip,file:/home/q/spark/python/lib/py4j-0.10.4-src.zip')
    .setExecutorEnv('PYTHONPATH', 'pyspark.zip:py4j-0.10.4-src.zip')
    .set('PYSPARK_PYTHON', '/home/q/conda/bin/python3.5')
)
# Resource sizing and dynamic allocation; dynamic allocation requires the
# external shuffle service (enabled below, on port 7338).
conf.set("spark.executor.memory", "5g")
conf.set("spark.driver.memory", "10g")
conf.set("spark.executor.cores", "2")
conf.set("spark.dynamicAllocation.maxExecutors", "5")
conf.set("spark.driver.maxResultSize", "0")  # "0" means unlimited result size
conf.set("spark.dynamicAllocation.enabled", "true")
conf.set("spark.shuffle.service.enabled", "true")
conf.set("spark.shuffle.service.port", "7338")
# Create the driver-side entry points from the configuration above.
# Instantiating SparkContext connects to the YARN cluster, so ordering matters:
# all env vars and conf settings must be in place before this line runs.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)  # legacy entry point, kept for older DataFrame APIs
spark = SparkSession(sc)  # modern unified entry point wrapping the same context