假设有如下测试数据,每行包含4个字段,依次为orderid、userid、payment、productid,要求计算出payment值最大的前3个(Top 3)。
1,1251,652,211
2,1253,254,326
3,5847,25,652
4,3562,259,1236
5,874,1658,654
6,123,588,1396
/**
 * TopN: prints the three largest `payment` values found in a comma-separated
 * text file whose rows are `orderid,userid,payment,productid`.
 *
 * The result is printed twice: first as bare values, then (after a separator
 * line) as `rank<TAB>value` pairs with ranks starting at 1.
 */
object topN {

  /**
   * Job entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("topN").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val lines: RDD[String] = sc.textFile("hdfs://hdp-1:9000/testFile/TopN.txt")

      // Keep only non-blank rows with exactly four fields and extract the third
      // field (payment) as an Int. Rows whose payment is not a valid integer are
      // dropped like any other malformed row instead of failing the whole job;
      // surrounding whitespace in the field is tolerated.
      val payments: RDD[Int] = lines.flatMap { line =>
        val fields = line.split(",")
        val parsed: Option[Int] =
          if (line.trim.nonEmpty && fields.length == 4) {
            scala.util.Try(fields(2).trim.toInt).toOption
          } else {
            None
          }
        parsed.iterator
      }

      // top(3) returns the three largest values in descending order without
      // sorting the whole data set, so no dummy (key, "") pairs are needed.
      val array: Array[Int] = payments.top(3)

      // First print style: one value per line.
      array.foreach(println)
      println("----------------------------------------------------")

      // Second print style: 1-based rank, a tab, then the value.
      array.zipWithIndex.foreach { case (payment, index) =>
        println(s"${index + 1}\t$payment")
      }
    } finally {
      // Always release the Spark context, even if the job fails.
      sc.stop()
    }
  }
}
两种打印方式: