name_phone.txt bob 15700079421 amy 18700079458 alice 17730079427 tom 16700379451 lulu 18800074423 nick 14400033426 name_addr.txt bob shanghai#200000 amy beijing#100000 alice shanghai#200000 tom beijing#100000 lulu hangzhou#310000 nick shanghai#200000
当然,原文件和目标文件都在 HDFS 中。
代码
package sparkstreaming_action.rdd.operation

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Joins two whitespace-delimited HDFS text files — name_addr.txt
 * ("name city#postcode") and name_phone.txt ("name phone") — on the name
 * key, writes one HTML fragment per person to the "UserInfo" output
 * directory, and prints a count of people per postcode to stdout.
 *
 * NOTE: the Spark programming guide advises defining an explicit main()
 * instead of `extends App`: scala.App relies on DelayedInit, which can
 * leave fields uninitialized in closures shipped to executors.
 */
object RDDOperation {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("rddOperation")
      // Master is hard-coded for this tutorial; in production pass it via
      // `spark-submit --master` instead. The master host appears in the
      // Spark UI and distributes executors to the worker nodes.
      .setMaster("spark://192.168.199.120:7077")
    val sc = new SparkContext(conf)

    try {
      val txtNameAddr = "name_addr.txt"
      val txtNamePhone = "name_phone.txt"

      // rdd1: (name, "city#postcode"); cached because it is reused by
      // both the join (rdd3) and the postcode extraction (rdd5).
      val rddNameAddr = sc.textFile(txtNameAddr).map { record =>
        val tokens = record.split(" ")
        (tokens(0), tokens(1))
      }
      rddNameAddr.cache()

      // rdd2: (name, phone)
      val rddNamePhone = sc.textFile(txtNamePhone).map { record =>
        val tokens = record.split(" ")
        (tokens(0), tokens(1))
      }
      rddNamePhone.cache()

      // rdd3: inner join on the name key, no explicit "on" clause needed —
      // pair RDDs join by key: (k, a) join (k, b) => (k, (a, b)).
      val rddNameAddrPhone = rddNameAddr.join(rddNamePhone)

      // rdd4: render one HTML fragment per joined record.
      val rddHtml = rddNameAddrPhone.map { record =>
        val name = record._1
        val addr = record._2._1
        val phone = record._2._2
        s"<h2>姓名:${name}</h2><p>地址:${addr}</p><p>电话:${phone}</p>"
      }
      // saveAsTextFile returns Unit — the original bound it to an unused
      // val (`rddOutput`); it is an action, not a transformation.
      rddHtml.saveAsTextFile("UserInfo")

      // rdd5: extract the postcode from "city#postcode" and pair it with 1
      // for counting.
      val rddPostcode = rddNameAddr.map { record =>
        val postcode = record._2.split("#")(1)
        (postcode, 1)
      }

      // rdd6: people per postcode, printed on the driver.
      val rddPostcodeCount = rddPostcode.reduceByKey(_ + _)
      rddPostcodeCount.collect().foreach(println)
    } finally {
      // Always release cluster resources, even if a job above fails.
      sc.stop()
    }
  }
}
pom文件为
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.spark_smart</groupId> <artifactId>smart</artifactId> <version>1.0-SNAPSHOT</version> <dependencies> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_2.11</artifactId> <version>2.0.0</version> </dependency> <dependency><!-- Log --> <groupId>log4j</groupId> <artifactId>log4j</artifactId> <version>1.2.17</version> </dependency> <dependency> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> <version>1.7.12</version> </dependency> </dependencies> <!-- <build>--> <!-- <plugins>--> <!-- <!– mixed scala/java compile –>--> <!-- <plugin>--> <!-- <groupId>org.scala-tools</groupId>--> <!-- <artifactId>maven-scala-plugin</artifactId>--> <!-- <executions>--> <!-- <execution>--> <!-- <id>compile</id>--> <!-- <goals>--> <!-- <goal>compile</goal>--> <!-- </goals>--> <!-- <phase>compile</phase>--> <!-- </execution>--> <!-- <execution>--> <!-- <id>test-compile</id>--> <!-- <goals>--> <!-- <goal>testCompile</goal>--> <!-- </goals>--> <!-- <phase>test-compile</phase>--> <!-- </execution>--> <!-- <execution>--> <!-- <phase>process-resources</phase>--> <!-- <goals>--> <!-- <goal>compile</goal>--> <!-- </goals>--> <!-- </execution>--> <!-- </executions>--> <!-- </plugin>--> <!-- <plugin>--> <!-- <artifactId>maven-compiler-plugin</artifactId>--> <!-- <configuration>--> <!-- <source>1.7</source>--> <!-- <target>1.7</target>--> <!-- </configuration>--> <!-- </plugin>--> <!-- <!– for fatjar –>--> <!-- <plugin>--> <!-- <groupId>org.apache.maven.plugins</groupId>--> <!-- <artifactId>maven-assembly-plugin</artifactId>--> <!-- <version>2.2</version>--> <!-- <configuration>--> <!-- <descriptorRefs>--> <!-- <descriptorRef>jar-with-dependencies</descriptorRef>--> <!-- 
</descriptorRefs>--> <!-- </configuration>--> <!-- <executions>--> <!-- <execution>--> <!-- <id>assemble-all</id>--> <!-- <phase>package</phase>--> <!-- <goals>--> <!-- <goal>single</goal>--> <!-- </goals>--> <!-- </execution>--> <!-- </executions>--> <!-- </plugin>--> <!-- <plugin>--> <!-- <groupId>org.apache.maven.plugins</groupId>--> <!-- <artifactId>maven-jar-plugin</artifactId>--> <!-- <configuration>--> <!-- <archive>--> <!-- <manifest>--> <!-- <addClasspath>true</addClasspath>--> <!-- <mainClass>com.sparkstreaming.action.main.WordFreq</mainClass>--> <!-- </manifest>--> <!-- </archive>--> <!-- </configuration>--> <!-- </plugin>--> <!-- </plugins>--> <!-- </build>--> <!-- <repositories>--> <!-- <repository>--> <!-- <id>aliyunmaven</id>--> <!-- <url>http://maven.aliyun.com/nexus/content/groups/public/</url>--> <!-- </repository>--> <!-- </repositories>--> </project>
提交代码 ./bin/spark-submit --class sparkstreaming_action.rdd.operation.RDDOperation --num-executors 4 --driver-memory 512M --executor-memory 512M --executor-cores 1 --conf spark.default.parallelism=1000 /root/spark/spark/smart.jar
结果
[root@min01 spark]# hdfs dfs -cat /user/root/UserInfo/* 19/07/14 19:44:17 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable <h2>姓名:tom</h2><p>地址:beijing#100000</p><p>电话:16700379451</p> <h2>姓名:alice</h2><p>地址:shanghai#200000</p><p>电话:17730079427</p> <h2>姓名:nick</h2><p>地址:shanghai#200000</p><p>电话:14400033426</p> <h2>姓名:lulu</h2><p>地址:hangzhou#310000</p><p>电话:18800074423</p> <h2>姓名:amy</h2><p>地址:beijing#100000</p><p>电话:18700079458</p> <h2>姓名:bob</h2><p>地址:shanghai#200000</p><p>电话:15700079421</p>