Method 1: import pyspark directly inside a Python script.
# Make pyspark importable from a plain Python script by locating the Spark
# installation via SPARK_HOME and prepending its python directories to sys.path.
import os
import sys

# SPARK_HOME is the Spark installation directory (not the bin/ level),
# usually /usr/local/spark.
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    # Fail fast with a clear message instead of an ImportError later.
    raise ValueError('spark environment is not configured')

# sys.path is the list of directories Python searches when importing
# third-party packages; insert the paths that hold pyspark and py4j so the
# import below can resolve them.
# NOTE: this is equivalent to passing
#   --py-files=/path/to/my/python/packages.zip
# to spark-submit, which adds a zip of dependencies to the search path.
sys.path.insert(0, '/root/virtualenvs/my_envs/lib/python3.6/site-packages/')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
# sys.path.insert(0, os.path.join(spark_home, 'libexec/python'))
# sys.path.insert(0, os.path.join(spark_home, 'libexec/python/build'))

from pyspark import SparkConf, SparkContext
Method 2: set the Python runtime version used by pyspark.
Edit your shell configuration: vi ~/.bashrc
export PYSPARK_PYTHON=/usr/local/bin/python3
export PYSPARK_DRIVER_PYTHON=ipython3
Save the file and exit the editor, then reload the configuration:
source ~/.bashrc