SparkCore series (2): RDD aggregation operations and aggregation operations between RDDs

1: RDD aggregation operations

count

            val conf = new SparkConf().setAppName("HelloWorld").setMaster("local")
            val sc = new JavaSparkContext(conf).sc

            val dataLength = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).count() // number of elements after splitting, i.e. the length of the array

            println(dataLength)
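
All of the file-based snippets below read /software/java/idea/data. Judging from the split("\\|") and toInt calls, the file is assumed to hold pipe-delimited integers; a minimal sketch that writes such a sample file so the examples can be run locally (assuming the directory exists and is writable):

            import java.io.PrintWriter

            // hypothetical sample content: two lines of pipe-delimited integers
            val writer = new PrintWriter("/software/java/idea/data")
            writer.write("1|2|3|2|1\n4|5|1|2|3")
            writer.close()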

countByValue

            val initialScores1: Array[(String, Double)] =
            Array(("A", 88.0), ("B", 95.0), ("C", 91.0), ("D", 93.0))
            val data1 = sc.parallelize(initialScores1)
            println(data1.countByValue()) // counts occurrences using each element as the key

reduce

            val conf = new SparkConf().setAppName("HelloWorld").setMaster("local")
            val sc = new JavaSparkContext(conf).sc

            val dataLength = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>x.toInt).reduce((x, y) => x + y) // sum of the values in the array

            println(dataLength)

reduceByKey

            val avg = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>(x.toInt,1))
                        .reduceByKey((x, y) => x + y).collect().map(x=>println(x)) // reduceByKey pre-aggregates on the map side, so it is commonly used in real development
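
The same pattern on an in-memory collection (a sketch with assumed sample values): each number becomes (number, 1) and the 1s are summed per key, giving the number of occurrences of each value.

            val counts = sc.parallelize(Seq(1, 2, 2, 3, 3, 3))
                        .map(x => (x, 1))
                        .reduceByKey((x, y) => x + y)
            counts.collect().foreach(println) // (1,1), (2,2), (3,3) in some order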

sortByKey

            val conf = new SparkConf().setAppName("HelloWorld").setMaster("local")
            val sc = new JavaSparkContext(conf).sc

            val data = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>(x.toInt,1)).sortByKey(true) // true = ascending, false = descending

            data.collect().map(x=>println(x))

countByKey (generally not used in production)

            val data = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>(x.toInt,1))
                        .countByKey() // returns a Map: key -> number of occurrences of that key

            println(data)

collectAsMap (generally not used in production)

            val data = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>(x.toInt,1))
                        .collectAsMap() // returns a Map on the driver

            println(data)
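
A small sketch on assumed in-memory pairs contrasting the two actions: countByKey counts how many times each key occurs, while collectAsMap keeps only a single value per key.

            val pairs = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 3)))
            println(pairs.countByKey())   // Map(a -> 2, b -> 1): number of occurrences per key
            println(pairs.collectAsMap()) // one value per key survives (which one is not guaranteed)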

fold

            val data = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>x.toInt)
                        .fold(100)((x, y) => x + y) // aggregation with an initial value

            println(data)
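
Note that fold applies the initial value once in every partition and once more when merging the partition results, so the outcome depends on the number of partitions. A minimal sketch with assumed data and an explicit partition count:

            // two partitions: (100+1+2) + (100+3+4), then a final 100 when merging = 310
            val folded = sc.parallelize(Seq(1, 2, 3, 4), 2).fold(100)((x, y) => x + y)
            println(folded)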

groupByKey

            val avg = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>(x.toInt,1))
                        .groupByKey().collect().map(x=>println(x)) // the value is an Iterable; loop over it when you need the individual values
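
As the comment above says, the value after groupByKey is an Iterable; a minimal sketch on assumed in-memory pairs that sums each group with mapValues instead of looping by hand:

            val grouped = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 3)))
                        .groupByKey()
                        .mapValues(values => values.sum)
            grouped.collect().foreach(println) // (a,3), (b,3)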

aggregate

            // custom aggregation functions
            // the first argument (2) is the initial value; it is applied in each partition and again when the partition results are merged
            // the second argument, pfun1, is the operation between rows within a partition
            // the third argument, pfun2, is the operation between the partition results
            val sum = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>x.toInt)
                        .aggregate(2)(pfun1, pfun2)
            println(sum)

            def pfun1(p1: Int, p2: Int): Int = { // operation between rows within a partition
                        println("p1:" + p1 + " p2:" + p2)
                        p1 * p2
            }
            def pfun2(p3: Int, p4: Int): Int = { // operation between the partition results
                        p3 + p4
            } // sum

 

            def pfun1(p1: Tuple2[Int, Int], p2: Int): Tuple2[Int, Int] = { // operation between rows within a partition
                        (p1._1 + 1, p1._2 + p2)
            }
            def pfun2(p1: Tuple2[Int, Int], p2: Tuple2[Int, Int]): Tuple2[Int, Int] = { // operation between the partition results
                        (p1._1 + p2._1, p1._2 + p2._2)
            }
            val avg = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>x.toInt)
                        .aggregate((0, 0))(pfun1, pfun2)
            println(avg._2 / avg._1) // average = sum / count
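
The same average written inline on an in-memory collection (a sketch with assumed sample values): the accumulator is a (count, sum) pair built up inside each partition and merged across partitions.

            val (count, total) = sc.parallelize(Seq(10, 20, 30, 40))
                        .aggregate((0, 0))(
                                    (acc, v) => (acc._1 + 1, acc._2 + v),  // within a partition
                                    (a, b) => (a._1 + b._1, a._2 + b._2)   // across partitions
                        )
            println(total / count) // 25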

combineByKey

            type MVType = (Int, Int)
            val avg = sc.textFile("/software/java/idea/data")
                        .flatMap(x=>x.split("\\|")).map(x=>(x.toInt,1))
                        .combineByKey(
                                    score => (score,1), // create the combiner the first time a key is seen: (sum, count)
                                    (c1: MVType, newScore) => (c1._1 + newScore, c1._2 + 1), // merge another value for a key already seen in this partition
                                    (c1: MVType, c2: MVType) => (c1._1 + c2._1, c1._2 + c2._2) // merge combiners for the same key from different partitions
                        ).collect().map(x=>println(x)) // each element is (key, (sum, count))

            // very similar in function to aggregate
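
The canonical use of combineByKey is a per-key average; a minimal sketch with assumed in-memory scores, where the combiner carries (sum, count):

            val scores = sc.parallelize(Seq(("A", 88.0), ("A", 92.0), ("B", 95.0)))
            val avgByKey = scores.combineByKey(
                        (v: Double) => (v, 1),                                 // first value seen for a key
                        (c: (Double, Int), v: Double) => (c._1 + v, c._2 + 1), // another value in the same partition
                        (c1: (Double, Int), c2: (Double, Int)) => (c1._1 + c2._1, c1._2 + c2._2) // merge partitions
            ).mapValues { case (sum, count) => sum / count }
            avgByKey.collect().foreach(println) // (A,90.0), (B,95.0)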

2: Aggregation operations between RDDs

union

            val initialScores1 = Array(("A", 88.0), ("B", 95.0), ("C", 91.0))
            val data1 = sc.parallelize(initialScores1)

            val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
            val data2 = sc.parallelize(initialScores2)

            data1.union(data2).collect().map(x=> println(x)) // like SQL UNION ALL: duplicates are kept
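
Since union keeps duplicates (UNION ALL semantics), adding distinct() afterwards gives SQL UNION behaviour; a small sketch reusing data1 and data2 from above:

            data1.union(data2).distinct().collect().map(x => println(x))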

intersection

            val initialScores1 = Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
            val data1 = sc.parallelize(initialScores1)

            val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
            val data2 = sc.parallelize(initialScores2)

            data1.intersection(data2).collect().map(x=> println(x)) // like SQL INTERSECT: elements present in both RDDs

join

            val initialScores1: Array[(String, Double)] =
            Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
            val data1 = sc.parallelize(initialScores1)

            val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
            val data2 = sc.parallelize(initialScores2)

            data1.join(data2).collect().map(x=>println(x))
            //SQL INNER JOIN
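
Each joined element has the shape (key, (leftValue, rightValue)); a small sketch that unpacks it, reusing data1 and data2 from above (only "D" appears in both):

            data1.join(data2).collect().foreach { case (key, (left, right)) =>
                        println(s"$key -> left=$left, right=$right")
            }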

subtract

            val initialScores1 = Array(("A", 88.0), ("B", 95.0), ("C", 91.0), ("D", 93.0))
            val data1 = sc.parallelize(initialScores1)
            
            val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
            val data2 = sc.parallelize(initialScores2)

            data1.subtract(data2).collect().map(x=> println(x)) // like a LEFT ANTI JOIN: removes elements of data1 that also appear in data2

subtractByKey

            val initialScores1: Array[(String, Double)] =
            Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
            val data1 = sc.parallelize(initialScores1)

            val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
            val data2 = sc.parallelize(initialScores2)

            data1.subtractByKey(data2).collect().map(x=>println(x))
            // removes elements of data1 whose key also appears in data2 (like subtract, but it compares only the keys)

rightOuterJoin

            val initialScores1: Array[(String, Double)] =
                        Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
            val data1 = sc.parallelize(initialScores1)

            val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
            val data2 = sc.parallelize(initialScores2)

            data1.rightOuterJoin(data2).collect().map(x=>println(x))
            // right outer join

leftOuterJoin

            val initialScores1: Array[(String, Double)] =
            Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
            val data1 = sc.parallelize(initialScores1)

            val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
            val data2 = sc.parallelize(initialScores2)

            data1.leftOuterJoin(data2).collect().map(x=>println(x))
            // left outer join
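
leftOuterJoin yields (key, (leftValue, Option[rightValue])); a small sketch that substitutes a default when the right side has no match, reusing data1 and data2 from above:

            data1.leftOuterJoin(data2)
                        .mapValues { case (left, right) => (left, right.getOrElse(0.0)) }
                        .collect().foreach(println)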

cartesian

            val initialScores1 = Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
            val data1 = sc.parallelize(initialScores1)
            val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
            val data2 = sc.parallelize(initialScores2)
            data1.cartesian(data2).collect().map(x=> println(x)) // Cartesian product: every element of data1 paired with every element of data2

cogroup

            val initialScores1: Array[(String, Double)] =
            Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
            val data1 = sc.parallelize(initialScores1)

            val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
            val data2 = sc.parallelize(initialScores2)

            data1.cogroup(data2).collect().map(x=>println(x))
            // groups the values from both RDDs by key: (key, (values from data1, values from data2))
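
cogroup yields (key, (Iterable of values from data1, Iterable of values from data2)) for every key that appears in either RDD; a small sketch printing the size of each side:

            data1.cogroup(data2).collect().foreach { case (key, (left, right)) =>
                        println(s"$key: left=${left.size}, right=${right.size}")
            }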

Origin www.cnblogs.com/wuxiaolong4/p/12046673.html