A simple WordCount written in Scala (running in local mode)
package com.jiangnan.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object ScalaWordCount {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root")
    // Create the Spark configuration object and set the application name
    //val conf = new SparkConf().setAppName("ScalaWordCount")
    val conf = new SparkConf().setAppName("LocalScalaWordCount").setMaster("local[4]")
    // Get the Spark program entry point
    val sc = new SparkContext(conf)
    // Set the data source: read the input data into an RDD (Resilient Distributed Dataset)
    val lines: RDD[String] = sc.textFile(args(0))
    // Split each line into words
    val words: RDD[String] = lines.flatMap(_.split(" "))
    // Pair each word with the number 1 in a tuple
    val wordAndOne: RDD[(String, Int)] = words.map((_, 1))
    // Aggregate (reduce) the counts by key
    val reduced: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _)
    // Sort by count, descending (false = descending order)
    val sorted: RDD[(String, Int)] = reduced.sortBy(_._2, false)
    // Save the result
    sorted.saveAsTextFile(args(1))
    // Release resources
    sc.stop()
  }
}
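To sanity-check the pipeline without HDFS or input files, the same chain of transformations can be run on an in-memory collection. This is a minimal sketch, not part of the original program: the object name, the sample sentences, and the local[2] master are made up for illustration.

import org.apache.spark.{SparkConf, SparkContext}

object WordCountSmokeTest {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("WordCountSmokeTest").setMaster("local[2]"))
    // Build a tiny in-memory dataset instead of reading from args(0)
    val lines = sc.parallelize(Seq("spark spark hadoop", "hadoop spark"))
    val counts = lines
      .flatMap(_.split(" "))  // split lines into words
      .map((_, 1))            // pair each word with 1
      .reduceByKey(_ + _)     // sum the 1s per word
      .sortBy(_._2, false)    // most frequent first
      .collect()              // bring results to the driver for printing
    counts.foreach(println)   // expected: (spark,3) then (hadoop,2)
    sc.stop()
  }
}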
Of course, the whole job can also be expressed in a single chain of calls, which is where Scala's charm lies:
System.setProperty("HADOOP_USER_NAME", "root")
// Create the Spark configuration object and set the application name
val conf = new SparkConf().setAppName("LocalScalaWordCount").setMaster("local[4]")
// Get the Spark program entry point
val sc = new SparkContext(conf)
sc.textFile(args(0)).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).saveAsTextFile(args(1))
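Note that this one-liner drops the sort step from the full program. If you want to keep it, the sorted variant fits the same chained style. A sketch, assuming the same sc as above; take(10) and printing to the console are just for illustration, instead of saving to args(1):

sc.textFile(args(0))
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
  .sortBy(_._2, false)  // sort by count, descending
  .take(10)             // grab the ten most frequent words
  .foreach(println)     // print them on the driver instead of saving to a file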