Spark RDD--1 WordCount


A simple WordCount written in Scala (running in local mode)

package com.jiangnan.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object ScalaWordCount {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root")
    //create the Spark configuration object and set the app name
    //val conf = new SparkConf().setAppName("ScalaWordCount")
    val conf = new SparkConf().setAppName("LocalScalaWordCount").setMaster("local[4]")
    //get the entry point of the Spark program
    val sc = new SparkContext(conf)
    //set the data source: decide what data to read into the RDD (Resilient Distributed Dataset)

    //first, load the data into an RDD
    val line: RDD[String] = sc.textFile(args(0))
    //split each line into words
    val words: RDD[String] = line.flatMap(_.split(" "))
    //pair each word with a 1 in a tuple
    val wo: RDD[(String, Int)] = words.map((_, 1))
    //aggregate (reduce) by key
    val reduced: RDD[(String, Int)] = wo.reduceByKey(_ + _)
    //sort by count in descending order
    val sort = reduced.sortBy(_._2, false)
    //save the sorted result
    sort.saveAsTextFile(args(1))
    //release resources
    sc.stop()
  }
}
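
For quick local testing you can also pull the result back to the driver and print it instead of (or in addition to) writing it out. A minimal sketch, assuming the counted data is small enough to fit in driver memory:

    //bring the sorted (word, count) pairs back to the driver and print them
    //note: collect() loads the entire result into driver memory, so only use it on small test data
    sort.collect().foreach(println)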

Of course, the whole thing can also be done in a single step, which is where Scala really shines:

    System.setProperty("HADOOP_USER_NAME", "root")
    //create the Spark configuration object and set the app name
    val conf = new SparkConf().setAppName("LocalScalaWordCount").setMaster("local[4]")
    //get the entry point of the Spark program
    val sc = new SparkContext(conf)
    sc.textFile(args(0)).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).saveAsTextFile(args(1))
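
Note that this one-liner skips the sorting step from the full program. If you also want the counts in descending order, the same chain can be extended; a minimal sketch using the same API calls as above:

    sc.textFile(args(0))
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, false)
      .saveAsTextFile(args(1))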
