Spark:从oracle读取大表写入hive实践

oracle表数据量

60GB,数据条数:58475050

提交参数

# Submit the ETL job to a YARN cluster in client mode.
# NOTE(review): 10 executors x (3724M heap + 300M overhead) — presumably sized to
# fit ~4 GB YARN containers per executor; confirm against yarn.scheduler.maximum-allocation-mb.
# The Oracle JDBC driver jar must be shipped explicitly via --jars.
spark-submit --master yarn --deploy-mode client \
--class com.xjb.Test \
--jars ojdbc5-11.2.0.3.jar \
--conf spark.yarn.executor.memoryOverhead=300M \
--num-executors 10 --executor-cores 2 --executor-memory 3724M \
etl.jar

程序

import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import com.hs.xlzf.data.etl._

object Test {
  /**
   * Reads a large Oracle table in parallel over JDBC and writes it into a Hive
   * table of the same name (`db1.tb1`), overwriting any existing data.
   *
   * Parallel-read strategy: wrap the source table in a subquery that exposes
   * Oracle's ROWNUM pseudo-column as a regular column, then let Spark split the
   * read into range predicates on that column.
   *
   * NOTE(review): ROWNUM is assigned per query execution and the subquery has no
   * ORDER BY, so Oracle does not guarantee a stable numbering across the ten
   * partition queries — rows could in theory be duplicated or skipped if plans
   * differ or the table changes mid-job. Confirm the source table is static
   * while the job runs, or partition on a stable monotonic key instead.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Test")
      // Write parquet in the legacy format so Hive can read the output table.
      .config("spark.sql.parquet.writeLegacyFormat", true)
      .enableHiveSupport()
      .getOrCreate()

    try {
      // Total row count of the source table. Used only as the JDBC partitioning
      // upper bound — bounds affect partition stride, never filter out rows.
      val rowCount = 58475050
      // Expose ROWNUM so it can serve as the partitionColumn.
      val sourceQuery = "(select a.*, ROWNUM rownum__rn from db1.tb1 a) b"

      val table = spark.read
        .format("jdbc")
        .option("driver", "oracle.jdbc.driver.OracleDriver")
        .option("url", "jdbc:oracle:thin:@10.18.2.3:1521:etc")
        .option("user", "***")
        .option("password", "***")
        .option("dbtable", sourceQuery)
        // Large fetch size cuts driver round trips on a bulk scan.
        .option("fetchsize", 100000)
        .option("partitionColumn", "rownum__rn")
        // Fix: ROWNUM starts at 1, not 0, so use 1 as the lower bound to keep
        // the partition ranges evenly sized.
        .option("lowerBound", 1)
        .option("upperBound", rowCount)
        .option("numPartitions", 10)
        .load()
        .drop("rownum__rn") // helper column is not part of the target schema

      table
        // Spread the 10 input partitions into ~100 output files so no single
        // parquet file is oversized.
        .repartition(100)
        .write
        .mode(SaveMode.Overwrite)
        .saveAsTable("db1.tb1")
    } finally {
      // Fix: the session was never stopped, leaving YARN executors and the
      // application registration held until the client JVM happened to exit.
      spark.stop()
    }
  }
}

执行时间

25分钟

猜你喜欢

转载自blog.csdn.net/xuejianbest/article/details/86000784