Spark 加载 MySQL 数据库表中的数据进行分析

1.工程maven依赖包

<properties>
    <!-- Spark release; referenced by the spark-core, spark-sql and spark-yarn dependencies. -->
    <spark_version>2.3.1</spark_version>
    <!-- Elasticsearch server version. Not referenced by any dependency shown in this
         snippet; presumably used elsewhere in the full POM (verify before removing). -->
    <elasticsearch.version>5.5.2</elasticsearch.version>
    <!-- fastjson version. Not referenced by any dependency shown in this snippet. -->
    <fastjson.version>1.2.28</fastjson.version>
    <!-- elasticsearch-hadoop version. Not referenced by any dependency shown in this
         snippet. NOTE(review): 6.3.2 differs from the 5.5.2 used by the other
         Elasticsearch properties; confirm the mismatch is intended. -->
    <elasticsearch-hadoop.version>6.3.2</elasticsearch-hadoop.version>
    <!-- Version of the elasticsearch-spark connector declared in the dependencies. -->
    <elasticsearch-spark.version>5.5.2</elasticsearch-spark.version>
</properties>
<dependencies>
    <!-- Spark core (SparkConf / SparkContext). The _2.11 artifact suffix is the Scala
         binary version and must be the same across all Spark artifacts. -->
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark_version}</version>
    </dependency>
    <!-- Spark SQL (SparkSession, Dataset, DataFrameReader and the JDBC data source). -->
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark_version}</version>
    </dependency>
    <!-- Spark YARN integration. -->
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-yarn -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-yarn_2.11</artifactId>
        <version>${spark_version}</version>
    </dependency>
    <!-- Elasticsearch connector for Spark 2.x / Scala 2.11. Not used by the MySQL
         loading example in this article. -->
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-spark-20_2.11</artifactId>
        <version>${elasticsearch-spark.version}</version>
    </dependency>
    <!-- MySQL JDBC driver; supplies the com.mysql.jdbc.Driver class named in the
         JDBC reader's "driver" option. -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.46</version>
    </dependency>
</dependencies>

2.spark加载数据库中数据

public class GoodsFromMySQL {

    /**
     * 加载数据库数据
     *
     * @param sc           spark context
     * @param sparkSession spark session
     */
    public static void loadGoodsInfo(SparkContext sc, SparkSession sparkSession) {
        String url = "jdbc:mysql://x.x.x.x:3306/db-test";

        String sql = "(SELECT item_name as itemName, goods_category as goodsCategory FROM goods where dict_type='100203' and item_name " +
                "is not null) as my-goods";

        SQLContext sqlContext = SQLContext.getOrCreate(sc);
        DataFrameReader reader = sqlContext.read().format("jdbc").
                option("url", url).option("dbtable", sql).
                option("driver", "com.mysql.jdbc.Driver").
                option("user", "root").
                option("password", "xxxxx");


        Dataset<Row> goodsDataSet = reader.load();

        // Looks the schema of this DataFrame.
        goodsDataSet.printSchema();

        goodsDataSet.write().mode(SaveMode.Overwrite).json("/data/app/source_new.json");
    }


    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("my-app");
        SparkContext sc = new SparkContext(conf);

        SparkSession sparkSession = new SparkSession(sc);

        loadGoodsInfo(sc, sparkSession);
    }
}

3.spark支持加载多种数据库,仅需要用户依赖不同的数据库驱动包,并且代码进行微调即可

  根据以上 Java 代码,仅需修改 option("driver", ...) 这一行中的驱动类名,并将 url 改为对应数据库的 JDBC 连接串即可。

猜你喜欢

转载自blog.csdn.net/yangbosos/article/details/88620118
今日推荐