1. Compute WordCount with the Spark shell
1.1 Start the Spark shell
bin/spark-shell
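When the shell starts, it creates a SparkContext and binds it to the variable sc (Spark 2.x also exposes a SparkSession as spark), which is why the command in 1.3 can call sc directly. The startup banner reports this with lines roughly like the following (paraphrased from memory, exact wording may differ by version):
Spark context available as 'sc' (master = local[*], app id = ...).
Spark session available as 'spark'.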
1.2 Create a directory to hold the files you want to process
mkdir input
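For example, put a small test file into the directory (the file name and contents here are only an illustration):
echo "hello spark hello scala" > input/word.txt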
1.3 Run the word-count computation
sc.textFile("input").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect
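The chain works as follows: textFile reads every file under input as an RDD of lines, flatMap splits each line into words, map pairs each word with a 1, reduceByKey sums the 1s per word, and collect brings the result back to the driver. The same pipeline spelled out step by step (the variable names are only for readability; the sample output assumes the test file above):
val lines = sc.textFile("input")           // one RDD element per line
val words = lines.flatMap(_.split(" "))    // split each line into words
val pairs = words.map((_, 1))              // pair each word with a count of 1
val counts = pairs.reduceByKey(_ + _)      // add up the counts per word
counts.collect                             // e.g. Array((scala,1), (hello,2), (spark,1))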
2. Compute WordCount in IDEA
2.1 Create a Maven project
After the project is created, edit its pom.xml as follows:
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.1.1</version>
    </dependency>
</dependencies>
<build>
    <finalName>WordCount</finalName>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.0.0</version>
            <configuration>
                <archive>
                    <manifest>
                        <mainClass>WordCount</mainClass>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
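With finalName set to WordCount and the jar-with-dependencies descriptor bound to the package phase, running mvn package should produce a fat jar under target/. The exact jar name below follows the assembly plugin's default naming and is an assumption; note also that the code in 2.2 hard-codes setMaster("local[8]"), which overrides any --master flag passed to spark-submit:
mvn package
bin/spark-submit --class WordCount target/WordCount-jar-with-dependencies.jar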
2.2 Create a Scala package and write the code
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount {
  def main(args: Array[String]): Unit = {
    // WordCount implemented in IDEA
    // Create a SparkConf: setMaster selects the cluster URL, setAppName names the application
    val conf = new SparkConf().setMaster("local[8]").setAppName("WordCount")
    // Create the Spark context
    val context = new SparkContext(conf)
    // Read the files under "data", one RDD element per line
    val lines: RDD[String] = context.textFile("data")
    // Split each line on spaces to get the individual words
    val words: RDD[String] = lines.flatMap(_.split(" "))
    // Map each word to a (word, 1) pair so the counts can be aggregated
    val wordToOne: RDD[(String, Int)] = words.map((_, 1))
    // Sum the counts for each word
    val wordToSum: RDD[(String, Int)] = wordToOne.reduceByKey(_ + _)
    // Collect the results to the driver and print them to the console
    wordToSum.collect().foreach(println)
    // Stop the context
    context.stop()
  }
}
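Assuming a data directory next to the project root containing the same sample text as in step 1.2, the console output would look something like the following (tuple order is not guaranteed):
(scala,1)
(hello,2)
(spark,1)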