目的:
使用 TeraSort 测试 Kubernetes 平台上的 Spark 计算过程,重点考察以下能力:
从HDFS读取能力
向HDFS写入能力
shuffle中网络读写能力
1:准备工作
获取代码
git clone https://github.com/ehiggs/spark-terasort.git
修改编译配置(pom.xml)
指定与目标集群匹配的 Spark 和 Scala 版本
<!-- Maven build properties for spark-terasort: pin the Scala/Spark versions
     to match the target cluster before building the jar. -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Scala version must match the Spark build: Spark 2.1.x ships with Scala 2.11. -->
<scala.version>2.11.8</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<spark.version>2.1.1</spark.version>
</properties>
运行
1: 在hdfs建立数据目录
hadoop fs -mkdir /terasort
2:建立spark运行的namespace和权限
admin-role.conf
# Bind the "spark-admin" ServiceAccount (namespace "spark") to the built-in
# cluster-admin ClusterRole so the Spark driver pod can create executor pods.
#
# Fixes vs. the original snippet:
#  - The manifest was pasted with no indentation, which is invalid YAML and
#    would be rejected by `kubectl apply`; structure restored below.
#  - rbac.authorization.k8s.io/v1beta1 was removed in Kubernetes 1.22;
#    the stable v1 API is used instead.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: spark-admin
  annotations:
    rbac.authorization.kubernetes.io/autoupdate: "true"
roleRef:
  kind: ClusterRole
  name: cluster-admin
  apiGroup: rbac.authorization.k8s.io
subjects:
- kind: ServiceAccount
  name: spark-admin
  namespace: spark
---
# The ServiceAccount the Spark driver runs as (referenced by the binding above
# and by spark.kubernetes.authenticate.driver.serviceAccountName).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: spark-admin
  namespace: spark
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
建立spark namespace 并绑定cluster-admin role
kubectl create namespace spark
kubectl apply -f admin-role.conf
3:运行TeraGen
run_spark_teragen.sh
# Submit a TeraGen job to the Kubernetes cluster: generate 50g of test data
# into HDFS under /terasort/50g_generated.
#
# Fixes vs. the original snippet:
#  - Comments after a trailing line-continuation backslash (`\ #...`) break
#    the command: the backslash escapes the following space instead of the
#    newline, so the continuation is lost. Comments moved above the command.
#  - Main class is TeraGen (capital G) in ehiggs/spark-terasort, not Teragen.
#  - Master URL, eventLog HDFS host (was a literal "xx.xx.xx.xx" placeholder)
#    and container image made consistent with the TeraSort/TeraValidate
#    scripts later in this document.
# Last two arguments: data size to generate, then the HDFS output directory.
/root/spark/bin/spark-submit \
  --master k8s://https://172.3.0.3:6443 \
  --deploy-mode cluster \
  --name spark-terasort \
  --class com.github.ehiggs.spark.terasort.TeraGen \
  --conf spark.kubernetes.namespace=spark \
  --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark-admin \
  --conf spark.kubernetes.container.image=172.3.0.3:5000/spark:terasort \
  --conf spark.eventLog.dir=hdfs://172.2.2.11:9000/eventLog \
  --conf spark.eventLog.enabled=true \
  --conf spark.executor.instances=50 \
  --conf spark.driver.cores=2 \
  --conf spark.driver.memory=8g \
  --conf spark.executor.memory=6g \
  --conf spark.executor.cores=1 \
  --proxy-user hadoop \
  local:///opt/spark/examples/jars/spark-terasort-1.1-SNAPSHOT-jar-with-dependencies.jar \
  50g \
  hdfs://172.2.2.11:9000/terasort/50g_generated
4:运行TeraSort
run_spark_terasort.sh
# Submit the TeraSort job: read the generated data from HDFS, sort it, and
# write the result back to HDFS. Arguments are collected in an array so the
# command line carries no fragile trailing backslashes.
submit_args=(
  --master k8s://https://172.3.0.3:6443
  --deploy-mode cluster
  --name spark-terasort
  --class com.github.ehiggs.spark.terasort.TeraSort
  --conf spark.kubernetes.namespace=spark
  --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark-admin
  --conf spark.kubernetes.container.image=172.3.0.3:5000/spark:terasort
  --conf spark.eventLog.dir=hdfs://172.2.2.11:9000/eventLog
  --conf spark.eventLog.enabled=true
  --conf spark.executor.instances=50
  --conf spark.driver.cores=2
  --conf spark.driver.memory=8g
  --conf spark.executor.memory=6g
  --conf spark.executor.cores=1
  --proxy-user hadoop
  # Application jar (inside the container image), then input and output dirs.
  local:///opt/spark/examples/jars/spark-terasort-1.1-SNAPSHOT-jar-with-dependencies.jar
  hdfs://172.2.2.11:9000/terasort/50g_generated
  hdfs://172.2.2.11:9000/terasort/50g_sorted
)
/root/spark/bin/spark-submit "${submit_args[@]}"
5:运行TeraValidate
run_spark_teravarify.sh
# Validate the sorted output with TeraValidate: reads the sorted dataset and
# writes the validation report back to HDFS. Repeated endpoints are hoisted
# into variables so a cluster move needs a one-line change.
K8S_MASTER="k8s://https://172.3.0.3:6443"
HDFS="hdfs://172.2.2.11:9000"
APP_JAR="local:///opt/spark/examples/jars/spark-terasort-1.1-SNAPSHOT-jar-with-dependencies.jar"

/root/spark/bin/spark-submit \
  --master "$K8S_MASTER" \
  --deploy-mode cluster \
  --name spark-terasort \
  --class com.github.ehiggs.spark.terasort.TeraValidate \
  --conf spark.kubernetes.namespace=spark \
  --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark-admin \
  --conf spark.kubernetes.container.image=172.3.0.3:5000/spark:terasort \
  --conf "spark.eventLog.dir=${HDFS}/eventLog" \
  --conf spark.eventLog.enabled=true \
  --conf spark.executor.instances=50 \
  --conf spark.driver.cores=2 \
  --conf spark.driver.memory=8g \
  --conf spark.executor.memory=6g \
  --conf spark.executor.cores=1 \
  --proxy-user hadoop \
  "$APP_JAR" \
  "${HDFS}/terasort/50g_sorted" \
  "${HDFS}/terasort/50g_varified"