Big Data Tutorial: Transformation and Action Operator Demos

1. Transformation operator demo

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.rdd.RDD

    val conf = new SparkConf().setAppName("Test").setMaster("local")
    val sc = new SparkContext(conf)

    // parallelize: generate an RDD from a local collection
    val rdd = sc.parallelize(List(5,6,4,7,3,8,2,9,10))

    // map: multiply each element of the RDD by 2, then sort
    val rdd2: RDD[Int] = rdd.map(_ * 2).sortBy(x => x, true)
    // collect returns all elements of the dataset as an array (an Action operator)
    println(rdd2.collect().toBuffer)

    // filter: keep the elements for which the function returns true
    val rdd3: RDD[Int] = rdd2.filter(_ > 10)
    println(rdd3.collect().toBuffer)

    val rdd4 = sc.parallelize(Array("a b c", "d b c"))
    // flatMap: split each element of rdd4, then flatten the results
    val rdd5: RDD[String] = rdd4.flatMap(_.split(" "))
    println(rdd5.collect().toBuffer)
    // For nested collections such as List(List("a b", "b c"), List("e c", "i o")),
    // flatten with flatMap(_.flatMap(_.split(" "))), as in the sketch below
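    // A minimal sketch of the nested case just mentioned (rdd5_0 and flat are
    // names introduced here for illustration): the outer flatMap unwraps each
    // inner List, the inner flatMap splits and flattens each string.
    val rdd5_0 = sc.parallelize(List(List("a b", "b c"), List("e c", "i o")))
    val flat: RDD[String] = rdd5_0.flatMap(_.flatMap(_.split(" ")))
    println(flat.collect().toBuffer) // ArrayBuffer(a, b, b, c, e, c, i, o)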
    
    // sample: random sampling
    // withReplacement: whether sampled elements are put back;
    //   true means sampling with replacement, false means sampling without replacement
    // fraction: the expected sampling ratio, e.g. 0.3 for roughly 30%;
    //   it is a floating-point value, so the sample size is not exact
    // seed: the random number generator seed; a default is used if none is passed
    val rdd5_1 = sc.parallelize(1 to 10)
    val sample = rdd5_1.sample(false, 0.5)
    println(sample.collect().toBuffer)
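    // A sketch of passing the optional third argument, the seed (seeded is a
    // name introduced here): with a fixed seed the same elements are drawn on
    // every run, which is handy for reproducible tests.
    val seeded = rdd5_1.sample(false, 0.5, 10L)
    println(seeded.collect().toBuffer)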

    // union: union of two RDDs
    val rdd6 = sc.parallelize(List(5,6,7,8))
    val rdd7 = sc.parallelize(List(1,2,5,6))
    val rdd8 = rdd6 union rdd7
    println(rdd8.collect.toBuffer)

    // intersection: intersection of two RDDs
    val rdd9 = rdd6 intersection rdd7
    println(rdd9.collect.toBuffer)

    // distinct: remove duplicate elements
    println(rdd8.distinct.collect.toBuffer)

    // join: pairs with the same key are combined
    val rdd10_1 = sc.parallelize(List(("tom",1),("jerry",3),("kitty",2)))
    val rdd10_2 = sc.parallelize(List(("jerry",2),("tom",2),("dog",10)))
    val rdd10_3 = rdd10_1 join rdd10_2
    println(rdd10_3.collect().toBuffer)
    
    // left and right outer joins
    // the values from the non-base side are of type Option, because they may be absent (null)
    val rdd10_4 = rdd10_1 leftOuterJoin rdd10_2   // the left side is the base and is never null
    val rdd10_5 = rdd10_1 rightOuterJoin rdd10_2  // the right side is the base and is never null
    println(rdd10_4.collect().toList)
    println(rdd10_5.collect().toBuffer)
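    // A sketch of consuming those Option values (defaults is a name introduced
    // here): keys with no match on the right side come back as None, so
    // getOrElse supplies a default, e.g. ("kitty",(2,0)) for the data above.
    val defaults: RDD[(String, (Int, Int))] =
      rdd10_4.mapValues { case (left, rightOpt) => (left, rightOpt.getOrElse(0)) }
    println(defaults.collect().toList)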

    val rdd11_1 = sc.parallelize(List(("tom",1),("jerry",3),("kitty",2)))
    val rdd11_2 = sc.parallelize(List(("jerry",2),("tom",2),("dog",10)))
    // cartesian: Cartesian product of the two RDDs
    val rdd11_3 = rdd11_1 cartesian rdd11_2
    println(rdd11_3.collect.toBuffer)
  
    // groupBy: group by the key returned by the passed-in function
    val rdd11_4 = rdd11_1 union rdd11_2
    val rdd11_5_1 = rdd11_4.groupBy(_._1)
    println(rdd11_5_1.collect().toList)

    // groupByKey: group by the same key; the number of partitions can also be specified
    val rdd11_5_2 = rdd11_4.groupByKey
    println(rdd11_5_2.collect().toList)

    // cogroup: group by the same key [group first, then combine into tuples]
    // the difference between cogroup and groupByKey:
    //   cogroup first groups each dataset by key, then pairs up, per key, the
    //   grouped results of the different datasets
    //   groupByKey groups a single dataset by the same key
    val rdd11_6: RDD[(String, (Iterable[Int], Iterable[Int]))] = rdd11_1 cogroup rdd11_2
    println(rdd11_6.collect.toList)
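To make the cogroup result concrete, here is a minimal sketch (totals is a name introduced for illustration) that sums each key's values across both source datasets; with the data above it yields (tom,3), (jerry,5), (kitty,2), (dog,10).

    val totals: RDD[(String, Int)] = rdd11_6.mapValues { case (xs, ys) => xs.sum + ys.sum }
    println(totals.collect.toList)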

2. Action operator demo

    val conf = new SparkConf().setAppName("Test").setMaster("local[*]")
    val sc = new SparkContext(conf)
    /* Action operators */
    // aggregate the elements with reduce
    val rdd1 = sc.parallelize(List(2,1,3,6,5),2)
    val rdd1_1 = rdd1.reduce(_+_)
    println(rdd1_1)
    // return all elements of the dataset as an array
    println(rdd1.collect().toBuffer)
    // return the number of elements in the RDD
    println(rdd1.count())
    // take the given number of values, in descending order by default; passing 0 returns an empty array
    println(rdd1.top(3).toBuffer)
    // take the given number of values in their original order
    println(rdd1.take(3).toBuffer)
    // take the given number of values, in ascending order by default
    println(rdd1.takeOrdered(3).toBuffer)
    // get the first value, equivalent to take(1)
    println(rdd1.first())
    // write the processed data out as files (stored on HDFS or the local file system)
    //rdd1.saveAsTextFile("dir/file1")
    // count the occurrences of each key and produce a map: k is the key, v is the key's count
    val rdd2 = sc.parallelize(List(("key1",2),("key2",1),("key3",3),("key4",6),("key5",5)),2)
    val rdd2_1: collection.Map[String, Long] = rdd2.countByKey()
    println(rdd2_1)
    // iterate over the data
    rdd1.foreach(x => println(x))

    /* Other operators */
    // count the occurrences of each value; each whole element of the collection is treated as one value
    val value: collection.Map[(String, Int), Long] = rdd2.countByValue
    println(value)
    // filterByRange: filter the elements of the RDD by key, returning the data in the given key range
    val rdd3 = sc.parallelize(List(("e",5),("c",3),("d",4),("c",2),("a",1)))
    val rdd3_1: RDD[(String, Int)] = rdd3.filterByRange("c","e") // both bounds are inclusive
    println(rdd3_1.collect.toList)
    // flatMapValues: flatten the value of each pair
    val rdd3_2 = sc.parallelize(List(("a","1 2"),("b","3 4")))
    println( rdd3_2.flatMapValues(_.split(" ")).collect.toList)
    // foreachPartition iterates over the data one partition at a time
    // foreachPartition is typically used to persist data, e.g. writing to a database,
    // since the data can be stored partition by partition, as in the sketch below
    val rdd4 = sc.parallelize(List(1,2,3,4,5,6,7,8,9),3)
    rdd4.foreachPartition(x => println(x.reduce(_+_)))
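    // A minimal sketch of the per-partition persistence pattern mentioned above:
    // one "connection" is set up per partition and reused for every element in it.
    // Here a mutable buffer stands in for a real database connection.
    rdd4.foreachPartition { part =>
      val batch = scala.collection.mutable.ArrayBuffer[Int]() // open one connection per partition
      part.foreach(batch += _)                                // reuse it for each element
      println(s"would bulk-insert: ${batch.toList}")          // replace with a real bulk write
    }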
    // keyBy: build new tuples using the function's return value as the key and the RDD element as the value
    val rdd5 = sc.parallelize(List("dog","cat","pig","wolf","bee"),3)
    val rdd5_1: RDD[(Int, String)] = rdd5.keyBy(_.length)
    println(rdd5_1.collect.toList)
    // keys gets all the keys; values gets all the values
    println(rdd5_1.keys.collect.toList)
    println(rdd5_1.values.collect.toList)
    // collectAsMap converts an RDD of pairs into a Map
    val map: collection.Map[String, Int] = rdd2.collectAsMap()
    println(map)
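One caveat worth knowing: collectAsMap keeps only one value per key, so duplicate keys are silently collapsed. A quick sketch using rdd3 above, whose key "c" appears twice:

    println(rdd3.collectAsMap()) // "c" maps to a single value, e.g. Map(e -> 5, d -> 4, c -> 2, a -> 1)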

Origin www.cnblogs.com/gcghcxy/p/11040688.html