Spark SQL — Partition Discovery

Parquet 数据源自动分区推断

For example, when path/to/table/gender=male is the path of the data and users set basePath to path/to/table/, gender will be a partitioning column.
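关键是理解上面这段话:如果直接读取 path/to/table/gender=male 而不设置 basePath,gender 不会被当作分区列;只有把 basePath 设置为表的根目录 path/to/table/,路径中的 gender=male 才会被推断为分区列。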

package sql;

import java.util.ArrayList;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

/**
 * Demonstrates automatic partition discovery for the Parquet data source.
 *
 * <p>The example reads a single Parquet file that lives under the
 * {@code gender=male/country=US} sub-directories of a table root, while passing the table root
 * as the {@code basePath} read option. It then prints the schema and the rows of the resulting
 * DataFrame so the inferred columns can be inspected.
 */
public class SparkSqlLocalPartition1 {

    /** Root directory of the table; passed to the reader as the {@code basePath} option. */
    private static final String BASE_PATH = "D://learning_bigdata/sparkDemo/data2/users";

    /** The Parquet file that is actually read, located below {@link #BASE_PATH}. */
    private static final String PARTITION_FILE =
            BASE_PATH + "/gender=male/country=US/users.parquet";

    /**
     * Builds a local {@link SparkSession}, runs the example and always stops the session.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .master("local")
                .appName("Java Spark SQL data sources example")
                .config("spark.some.config.option", "some-value")
                .getOrCreate();

        try {
            runProgram(spark);
        } finally {
            // Release the underlying SparkContext even when the read fails.
            spark.stop();
        }
    }

    /**
     * Reads the partitioned Parquet file with {@code basePath} set to the table root and prints
     * its schema and contents.
     *
     * @param spark the active session used to read the data
     */
    private static void runProgram(SparkSession spark) {
        // Setting basePath tells partition discovery where the table starts, so the
        // key=value directory names between BASE_PATH and the file can be treated as columns.
        Dataset<Row> userDF = spark.read()
                .option("basePath", BASE_PATH)
                .parquet(PARTITION_FILE);
        userDF.printSchema();
        userDF.show();
    }
}

猜你喜欢

转载自blog.csdn.net/qq_29573903/article/details/88539448
今日推荐