安装spark-bench

git clone https://github.com/SparkTC/spark-bench.git

cd spark-bench/
mvn package install
[root@datanode01 bin]# pwd
/opt/spark-bench/bin
[root@datanode01 bin]# ./build-all.sh

[root@datanode01 spark-bench]# cat conf/env.sh
# ---------------------------------------------------------------------------
# Global spark-bench settings
# ---------------------------------------------------------------------------

# Master host; the commented-out URL templates below refer to it as ${master}.
master="namenode01"
# Machines on which the Spark cluster is running.
MC_LIST="namenode01"

# Fall back to the CDH parcel location only when HADOOP_HOME is empty or unset.
if [ -z "$HADOOP_HOME" ]; then
  export HADOOP_HOME=/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/
fi

# HDFS endpoint and local block size (536870912 = 512 * 1024 * 1024).
HDFS_URL="hdfs://namenode01:8020"
SPARK_HADOOP_FS_LOCAL_BLOCK_SIZE="536870912"

# Base directory for generated data sets. Other forms that can be used here:
#   hdfs://${master}:9000/SparkBench
#   file:///home/$(whoami)/SparkBench
DATA_HDFS="hdfs://namenode01:8020/SparkBench"

# Optional local data set directory under the current user's home.
DATASET_DIR="/home/$(whoami)/SparkBench/dataset"

# Spark release in use (1.5.1 was the value noted next to it before).
SPARK_VERSION="2.0.1"
# Fall back to the CDH parcel's Spark only when SPARK_HOME is empty or unset.
if [ -z "$SPARK_HOME" ]; then
  export SPARK_HOME=/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark
fi

# Master URL. Alternatives that are left disabled:
#   local | local[K] | local[*]
#   spark://HOST:PORT | spark://${master}:7077
#   mesos://HOST:PORT
#   yarn-client
SPARK_MASTER="yarn"
MASTER="yarn"
# client or cluster; this goes to spark-submit as --deploy-mode.
YARN_DEPLOY_MODE="client"
SPARK_RPC_ASKTIMEOUT="500"

# Spark config given as environment variables or spark-submit arguments:
#   SPARK_SERIALIZER            -> --conf spark.serializer
#   SPARK_RDD_COMPRESS          -> --conf spark.rdd.compress
#   SPARK_IO_COMPRESSION_CODEC  -> --conf spark.io.compression.codec
#   SPARK_DEFAULT_PARALLELISM   -> --conf spark.default.parallelism
SPARK_SERIALIZER="org.apache.spark.serializer.KryoSerializer"
SPARK_RDD_COMPRESS="false"
SPARK_IO_COMPRESSION_CODEC="lzf"

# Spark options given as system properties or spark-submit arguments:
#   SPARK_EXECUTOR_MEMORY         -> --conf spark.executor.memory
#   SPARK_STORAGE_MEMORYFRACTION  -> --conf spark.storage.memoryFraction
# Disabled alternatives:
#   SPARK_STORAGE_MEMORYFRACTION=0.5
#   export MEM_FRACTION_GLOBAL=0.005
SPARK_EXECUTOR_MEMORY="1g"

# Spark options in YARN client mode:
#   SPARK_DRIVER_MEMORY       -> --driver-memory
#   SPARK_EXECUTOR_INSTANCES  -> --num-executors
#   SPARK_EXECUTOR_CORES      -> --executor-cores
# Disabled alternatives:
#   export EXECUTOR_GLOBAL_MEM=2g
#   export executor_cores=2
SPARK_DRIVER_MEMORY="2g"
SPARK_EXECUTOR_INSTANCES="4"
SPARK_EXECUTOR_CORES="1"
export SPARK_DRIVER_MEMORY SPARK_EXECUTOR_INSTANCES SPARK_EXECUTOR_CORES

# Storage level; one of MEMORY_AND_DISK, MEMORY_AND_DISK_SER, MEMORY_ONLY,
# MEMORY_ONLY_SER or DISK_ONLY. See
# http://spark.apache.org/docs/latest/api/java/org/apache/spark/api/java/StorageLevels.html
STORAGE_LEVEL="MEMORY_AND_DISK"

# Partition count used for data generation.
NUM_OF_PARTITIONS="2"
# Number of trials used for running.
NUM_TRIALS="1"

[root@namenode01 bin]# pwd
/opt/spark-bench/KMeans/bin

[root@namenode01 bin]# vi ../conf/env.sh

# Parameters for KMeans data generation.
# 100 million points produce roughly 36 GB of data, so the 300 million points
# configured here come to roughly 100 GB.
# The size note lives in this comment on purpose: text placed after the value
# without a leading '#' is parsed by the shell as a command name, and the
# assignment then applies only to that (failing) command instead of setting
# the variable.
NUM_OF_POINTS=300000000
NUM_OF_CLUSTERS=2
DIMENSIONS=20
SCALING=0.6
NUM_OF_PARTITIONS=2

# Settings for the run itself.
MAX_ITERATION=2
NUM_RUN=1

# Workload-specific value for SPARK_STORAGE_MEMORYFRACTION.
SPARK_STORAGE_MEMORYFRACTION=0.48

猜你喜欢