Spark transformation operations and common action operations

Part One: Basic RDDs

1. Transformation operations

(1) map(), flatMap(), filter()

scala> val line = sc.textFile("/Users/Desktop/log.txt")
line: org.apache.spark.rdd.RDD[String] = /Users/qanfuhong/Desktop/log.txt MapPartitionsRDD[1] at textFile at <console>:24

scala> val erroRDD = line.filter(line => line.contains("error"))
erroRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at filter at <console>:26

scala> val errorRDD = line.filter(line => line.contains("error"))
errorRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[3] at filter at <console>:26

scala> val input =sc.parallelize(List(1,2,3,4))
input: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[4] at parallelize at <console>:24

scala> val result = input.map(x => x*x)
result: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[5] at map at <console>:26  

scala> println(result.collect().mkString(","))
1,4,9,16

scala> println(result.collect().mkString(":"))
1:4:9:16
                           
scala> val lines = sc.parallelize(List("hello jjames","hi"))
lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[6] at parallelize at <console>:24

scala> val woeds = lines.flatMap(x => x.split(" ")) 
woeds: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[7] at flatMap at <console>:26

scala> woeds.first()
res11: String = hello

Use collect() with caution in a production environment, since it pulls the entire RDD into the driver's memory.

 

2. Pseudo-set operations: rdd1.union(rdd2) — union; rdd.distinct() — deduplication; rdd1.intersection(rdd2) — intersection; rdd1.subtract(rdd2) — returns the elements present only in rdd1; rdd1.cartesian(rdd2) — Cartesian product.

scala> val lines = sc.parallelize(List(1,1,1,3,2,4,2))
lines: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[8] at parallelize at <console>:24

scala> val linesDistinct = lines.distinct()
linesDistinct: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[11] at distinct at <console>:26

scala> println(linesDistinct.collect().mkString(","))
1,2,3,4

scala> val rdd1 = sc.parallelize(List("coffee","coffee","pandas","monkey","tea"))
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[12] at parallelize at <console>:24

scala> val rdd2 = sc.makeRDD(List("coffee","money","kitty"))
rdd2: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[13] at makeRDD at <console>:24

scala> rdd1.distinct()
res13: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[16] at distinct at <console>:27

scala> println(rdd1.distinct().collect().mkString(","))
tea,pandas,monkey,coffee

scala> val rddUnion = rdd1.union(rdd2)
rddUnion: org.apache.spark.rdd.RDD[String] = UnionRDD[20] at union at <console>:28

scala> print(rddUnion.collect().mkString(","))
coffee,coffee,pandas,monkey,tea,coffee,money,kitty
scala> val rddInter = rdd1.intersection(rdd2)
rddInter: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[26] at intersection at <console>:28

scala> print(rddInter.collect().mkString(","))
coffee
scala> val rddSub = rdd1.subtract(rdd2)
rddSub: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[30] at subtract at <console>:28

scala> print(rddSub.collect().mkString(" "))
tea pandas monkey
scala> val rddCar = rdd1.cartesian(rdd2)
rddCar: org.apache.spark.rdd.RDD[(String, String)] = CartesianRDD[31] at cartesian at <console>:28

scala> print(rddCar.collect().mkString(","))
(coffee,coffee),(coffee,money),(coffee,kitty),(coffee,coffee),(coffee,money),(coffee,kitty),(pandas,coffee),(pandas,money),(pandas,kitty),(monkey,coffee),(monkey,money),(monkey,kitty),(tea,coffee),(tea,money),(tea,kitty)
scala> 

3. Action operations

Related posts

Origin blog.csdn.net/BD_fuhong/article/details/93402263