Spark parameter configuration and tuning, Spark-SQL, Config

1. Hive-SQL/Spark-SQL parameter configuration and tuning

-- Session-level settings for running Hive SQL on the Spark execution engine.
-- Hive scripts only treat "--" as a comment marker, so every annotation
-- below uses that form and sits on its own line.

-- Select the execution engine.
set hive.execution.engine=spark;

-- Spark submit mode.
-- NOTE(review): the "yarn-cluster" master URL is deprecated since Spark 2.0
-- (spark.master=yarn plus spark.submit.deployMode=cluster replaces it);
-- confirm against the Spark/Hive versions deployed on the cluster.
set spark.master=yarn-cluster;

-- YARN queue the Spark job is submitted to.
set spark.yarn.queue=${queue_name};

-- Queue name used for MapReduce jobs.
set mapreduce.job.queuename=root.users.hdfs;

-- Job (application) name.
set spark.app.name=${job_name};

-- Total number of executor processes the Spark job runs with.
set spark.executor.instances=25;

-- CPU cores per executor.
set spark.executor.cores=4;

-- Memory per executor (the original line was missing its terminating ';').
set spark.executor.memory=8g;

-- Task parallelism (number of reduce tasks).
set mapred.reduce.tasks=600;

-- Off-heap (overhead) memory for each executor JVM.
set spark.yarn.executor.memoryOverhead=2048;

-- Memory fraction (Spark 2.0+).
set spark.memory.fraction=0.8;

-- Object serializer. The fully qualified class lives in the
-- org.apache.spark.serializer package and is spelled "Kryo".
set spark.serializer=org.apache.spark.serializer.KryoSerializer;

-- Dynamic partitioning.
-- Enable the dynamic-partition feature.
set hive.exec.dynamic.partition=true;
-- Allow every partition column to be dynamic.
set hive.exec.dynamic.partition.mode=nonstrict;
-- Maximum number of dynamic partitions each mapper/reducer may create.
set hive.exec.max.dynamic.partitions.pernode=1000;
-- Optional: overall cap on dynamic partitions created by one statement.
-- set hive.exec.max.dynamic.partitions=10000;
-- Example: insert using dynamic partitions.
insert overwrite table test partition(country,state) select * from test2;

2. Shell script spark-submit parameter configuration

# Submit the PySpark job (Python version) to YARN in client mode.
#
# Expects ${calc_date} to be set by the caller; it is passed to analysis.py
# as its single positional argument and is quoted so that an unexpected
# space or glob character cannot split or expand it.
#
# NOTE(review): spark.storage.memoryFraction / spark.shuffle.memoryFraction
# are legacy memory-manager settings (key names are case-sensitive); on
# Spark 1.6+ they are only read when spark.memory.useLegacyMode=true --
# confirm they are still wanted for the target Spark version.
spark-submit \
  --master yarn \
  --deploy-mode client \
  --driver-memory 24G \
  --driver-cores 8 \
  --num-executors 100 \
  --executor-cores 8 \
  --executor-memory 24G \
  --conf spark.driver.maxResultSize=24G \
  --conf spark.kubernetes.executor.limit.cores=12 \
  --conf spark.kryoserializer.buffer.max=1024m \
  --conf spark.kryoserializer.buffer=512m \
  --conf spark.dynamicAllocation.enabled=true \
  --conf spark.shuffle.service.enabled=true \
  --conf spark.sql.shuffle.partitions=3200 \
  --conf spark.default.parallelism=3200 \
  --conf spark.storage.memoryFraction=0.3 \
  --conf spark.shuffle.memoryFraction=0.3 \
  --conf spark.sql.hive.mergeFiles=true \
  --conf spark.blacklist.enabled=true \
  --conf spark.speculation=true \
  --conf spark.sql.sources.readWithSubdirectories.enabled=false \
  --conf spark.sql.autoBroadcastJoinThreshold=102400 \
  --py-files utils.py \
  --name analysis \
  analysis.py "${calc_date}"
# Submit the Scala job (packaged jar) to YARN in cluster mode.
#
# Expects ${basePath}, ${calculateDate} and ${cateCodes} to be set by the
# caller. Each expansion is quoted so paths or values containing spaces or
# glob characters reach the application as exactly one argument each.
#
# NOTE(review): spark.storage.memoryFraction / spark.shuffle.memoryFraction
# are legacy memory-manager settings (key names are case-sensitive); on
# Spark 1.6+ they are only read when spark.memory.useLegacyMode=true --
# confirm they are still wanted for the target Spark version.
spark-submit \
  --class com.ad.data.algorithms.model.runModel \
  --master yarn \
  --deploy-mode cluster \
  --driver-memory 16G \
  --conf spark.driver.maxResultSize=16G \
  --driver-cores 8 \
  --num-executors 100 \
  --executor-cores 8 \
  --executor-memory 16G \
  --conf spark.dynamicAllocation.enabled=true \
  --conf spark.shuffle.service.enabled=true \
  --conf spark.sql.shuffle.partitions=3200 \
  --conf spark.default.parallelism=3200 \
  --conf spark.storage.memoryFraction=0.4 \
  --conf spark.shuffle.memoryFraction=0.4 \
  --conf spark.sql.hive.mergeFiles=true \
  --conf spark.blacklist.enabled=true \
  --conf spark.speculation=true \
  --conf spark.hadoop.hive.exec.orc.split.strategy=ETL \
  --name segment-model \
  "${basePath}/../algorithms-model.jar" "${calculateDate}" "${cateCodes}"

3. Configuration parameters in sparkSession

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, expr

if __name__ == '__main__':
    # Imported locally: the original read a bare `argv` that was never
    # imported, which raises NameError as soon as the script starts.
    import sys

    # Exactly one command-line argument (the calculation date) is expected
    # after the script path; any other count fails the unpacking below.
    script, calc_date = sys.argv

    # Build (or reuse) a Hive-enabled session named "analysis" with the
    # broadcast-join threshold and driver result-size limit set explicitly.
    spark = (
        SparkSession.builder.appName("analysis")
        .config("spark.sql.autoBroadcastJoinThreshold", 102400)
        .config("spark.driver.maxResultSize", "24G")
        .enableHiveSupport()
        .getOrCreate()
    )

Guess you like

Source: blog.csdn.net/eylier/article/details/129152832