1. Create the persons.txt test data file
Its contents are as follows:
1001,张三,23
1002,李四,33
1003,王五,18
1004,张三4,23
1005,李四6,33
1006,王五7,18
1007,张三8,23
1008,李四9,33
1009,王五10,18
1010,张三11,13
1011,李四12,33
1012,王五13,18
1013,张三15,23
1014,李四14,29
1015,王五16,18
2. Create a Maven project and add the required dependencies
The relevant fragment of pom.xml is shown below (it belongs inside the <project> element):
<properties>
    <scala.version>2.11.8</scala.version>
</properties>
<repositories>
    <repository>
        <id>repos</id>
        <name>Repository</name>
        <url>http://maven.aliyun.com/nexus/content/groups/public</url>
    </repository>
    <repository>
        <id>scala-tools.org</id>
        <name>Scala-Tools Maven2 Repository</name>
        <url>http://scala-tools.org/repo-releases</url>
    </repository>
</repositories>
<pluginRepositories>
    <pluginRepository>
        <id>repos</id>
        <name>Repository</name>
        <url>http://maven.aliyun.com/nexus/content/groups/public</url>
    </pluginRepository>
    <pluginRepository>
        <id>scala-tools.org</id>
        <name>Scala-Tools Maven2 Repository</name>
        <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
</pluginRepositories>
<dependencies>
    <dependency> <!-- Spark core -->
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.4.0</version>
    </dependency>
    <dependency> <!-- Spark SQL -->
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.4.0</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
</dependencies>
<build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
        <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <version>2.15.2</version>
            <executions>
                <execution>
                    <id>scala-compile-first</id>
                    <goals>
                        <goal>compile</goal>
                    </goals>
                    <configuration>
                        <includes>
                            <include>**/*.scala</include>
                        </includes>
                    </configuration>
                </execution>
                <execution>
                    <id>scala-test-compile</id>
                    <goals>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <appendAssemblyId>false</appendAssemblyId>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <archive>
                    <manifest>
                        <mainClass>org.jy.data.yh.bigdata.drools.scala.spark.dataframe.RDD2DataFrameByProgrammatically</mainClass>
                    </manifest>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-jar-plugin</artifactId>
            <version>2.4</version>
            <configuration>
                <archive>
                    <manifest>
                        <addClasspath>true</addClasspath>
                        <mainClass>org.jy.data.yh.bigdata.drools.scala.spark.dataframe.RDD2DataFrameByProgrammatically</mainClass>
                    </manifest>
                </archive>
            </configuration>
        </plugin>
    </plugins>
</build>
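With this configuration, mvn package produces a fat jar whose manifest points at the main class from section 3. To run it against the cluster mentioned in the code comments, something like the following should work (the jar file name is an assumption; substitute your project's actual artifactId and version):

mvn clean package
spark-submit \
  --class org.jy.data.yh.bigdata.drools.scala.spark.dataframe.RDD2DataFrameByProgrammatically \
  --master spark://centoshadoop1:7077 \
  target/<artifactId>-<version>.jar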
3. The complete code is as follows:
package org.jy.data.yh.bigdata.drools.scala.spark.dataframe;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.List;

/**
 * Building a DataFrame from an RDD programmatically.
 * Main steps:
 * 1. Read the text file and map every line to build an RDD<Row>.
 * 2. Construct the DataFrame's metadata (a StructType schema).
 * 3. Build the DataFrame from the metadata and the RDD<Row>.
 * 4. Register it as a temporary table.
 * 5. Query the temporary table with SQL.
 * 6. Process the result: convert the DataFrame back to an RDD<Row>, or persist it.
 */
public class RDD2DataFrameByProgrammatically {
    public static void main(String[] args) {
        // For a cluster deployment, use e.g. spark://centoshadoop1:7077,centoshadoop2:7077
        SparkConf sparkConf = new SparkConf().setMaster("local")
                .setAppName("RDD2DataFrameByProgrammatically");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        SQLContext sqlContext = new SQLContext(javaSparkContext);
        // On a cluster, change this to a path reachable by the executors
        JavaRDD<String> lines = javaSparkContext.textFile("D://jar/persons.txt");

        // Step 1: create an RDD of type Row on top of the RDD of lines
        JavaRDD<Row> personsRDD = lines.map(new Function<String, Row>() {
            @Override
            public Row call(String line) throws Exception {
                System.out.println("Currently parsing text line: " + line);
                String[] splited = line.split(",");
                return RowFactory.create(Integer.valueOf(splited[0]), splited[1], Integer.valueOf(splited[2]));
            }
        });

        // Step 2: construct the DataFrame's metadata dynamically. In general, the number
        // of columns and each column's type might come from a JSON file or from a database.
        List<StructField> structFields = new ArrayList<>();
        structFields.add(DataTypes.createStructField("id", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        // Build the StructType that describes the DataFrame's metadata
        StructType structType = DataTypes.createStructType(structFields);

        // Step 3: build the DataFrame from the metadata and the RDD<Row>
        Dataset<Row> personsDF = sqlContext.createDataFrame(personsRDD, structType);

        // Step 4: register it as a temporary table for subsequent SQL queries
        personsDF.registerTempTable("persons");

        // Step 5: analyze the data with SQL
        Dataset<Row> result = sqlContext.sql("select * from persons where age > 20");

        // Step 6: process the result, e.g. convert the DataFrame back to an RDD<Row> or persist it
        List<Row> listRow = result.javaRDD().collect();
        for (Row row : listRow) {
            System.out.println(row);
        }
        javaSparkContext.close();
    }
}
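Note that SQLContext and registerTempTable are Spark 1.x APIs; they still work in Spark 2.4 but are deprecated in favor of SparkSession and createOrReplaceTempView. A minimal sketch of the same flow using the Spark 2.x entry point follows; the class name is illustrative, and the input path is assumed to be the same local file:

package org.jy.data.yh.bigdata.drools.scala.spark.dataframe;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.Arrays;
import java.util.List;

/** Sketch: same RDD-to-DataFrame flow via SparkSession (Spark 2.x). */
public class RDD2DataFrameWithSparkSession {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local")
                .appName("RDD2DataFrameWithSparkSession")
                .getOrCreate();

        // Read the lines and map each one to a Row, as in the original program
        JavaRDD<Row> personsRDD = spark.read()
                .textFile("D://jar/persons.txt")
                .javaRDD()
                .map(line -> {
                    String[] splited = line.split(",");
                    return RowFactory.create(Integer.valueOf(splited[0]), splited[1], Integer.valueOf(splited[2]));
                });

        // The same programmatic schema as before
        List<StructField> fields = Arrays.asList(
                DataTypes.createStructField("id", DataTypes.IntegerType, true),
                DataTypes.createStructField("name", DataTypes.StringType, true),
                DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType schema = DataTypes.createStructType(fields);

        Dataset<Row> personsDF = spark.createDataFrame(personsRDD, schema);
        // Spark 2.x replacement for the deprecated registerTempTable
        personsDF.createOrReplaceTempView("persons");

        spark.sql("select * from persons where age > 20").show();

        spark.stop();
    }
}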
4. Run the program; the output is as follows:
20/04/12 22:36:33 INFO LineRecordReader: Found UTF-8 BOM and skipped it
Currently parsing text line: 1001,张三,23
Currently parsing text line: 1002,李四,33
Currently parsing text line: 1003,王五,18
Currently parsing text line: 1004,张三4,23
Currently parsing text line: 1005,李四6,33
20/04/12 22:36:33 INFO CodeGenerator: Code generated in 42.2667 ms
Currently parsing text line: 1006,王五7,18
Currently parsing text line: 1007,张三8,23
Currently parsing text line: 1008,李四9,33
Currently parsing text line: 1009,王五10,18
Currently parsing text line: 1010,张三11,13
Currently parsing text line: 1011,李四12,33
Currently parsing text line: 1012,王五13,18
Currently parsing text line: 1013,张三15,23
Currently parsing text line: 1014,李四14,29
Currently parsing text line: 1015,王五16,18
20/04/12 22:36:33 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 2859 bytes result sent to driver
20/04/12 22:36:33 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 268 ms on localhost (executor driver) (1/1)
20/04/12 22:36:33 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
20/04/12 22:36:33 INFO DAGScheduler: ResultStage 0 (collect at RDD2DataFrameByProgrammatically.java:60) finished in 0.344 s
20/04/12 22:36:33 INFO DAGScheduler: Job 0 finished: collect at RDD2DataFrameByProgrammatically.java:60, took 0.392485 s
[1001,张三,23]
[1002,李四,33]
[1004,张三4,23]
[1005,李四6,33]
[1007,张三8,23]
[1008,李四9,33]
[1011,李四12,33]
[1013,张三15,23]
[1014,李四14,29]
20/04/12 22:36:33 INFO SparkUI: Stopped Spark web UI at http://192.168.227.1:4040
20/04/12 22:36:33 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
20/04/12 22:36:33 INFO MemoryStore: MemoryStore cleared
20/04/12 22:36:33 INFO BlockManager: BlockManager stopped
20/04/12 22:36:33 INFO BlockManagerMaster: BlockManagerMaster stopped
20/04/12 22:36:33 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
20/04/12 22:36:33 INFO SparkContext: Successfully stopped SparkContext
20/04/12 22:36:33 INFO ShutdownHookManager: Shutdown hook called
20/04/12 22:36:33 INFO ShutdownHookManager: Deleting directory C:\Users\Administrator\AppData\Local\Temp\spark-b73dd7f4-1fe3-49b7-a06a-8c4982b8cedd