Big Data Learning Day 22 ------ Spark 05 ------ 1. Two more solutions to the most popular teacher problem 2. Custom sorting

1. Two more solutions to the most popular teacher in each subject

Implementation four from day 21 still has a problem: when two teachers have the same popularity, the comparison rule cannot handle the tie. Below is an optimized solution.

Implementation five

FavoriteTeacher5

package com._51doit.spark04

import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

import scala.collection.mutable

object FavoriteTeacher5 {
  def main(args: Array[String]): Unit = {
    val isLocal = args(0).toBoolean
    // create SparkConf, then create SparkContext
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    if (isLocal) {
      conf.setMaster("local[*]")
    }
    val sc = new SparkContext(conf)
    // specify where the RDD created later reads its data
    val lines: RDD[String] = sc.textFile(args(1))
    // split the data
    val subjectTeacherAndOne = lines.map(line => {
      val fields = line.split("/")
      val subject = fields(2).split("[.]")(0)
      val teacher = fields(3)
      ((subject, teacher), 1)
    })

    // compute all subjects and collect them to the Driver side
    val subjects: Array[String] = subjectTeacherAndOne.map(_._1._1).distinct().collect()
    // the partitioner is created (new-ed) on the Driver side, but its methods are called on the Executors
    val partitioner = new SubjectPartitioner3(subjects)
    // aggregate by key with the specified partitioner (a single shuffle)
    val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(partitioner, _ + _)
    val topN = args(2).toInt
    val result = reduced.mapPartitions(it => {
      // define a sorted set (TreeSet) keyed by OrderingBean
      val sorter = new mutable.TreeSet[OrderingBean]()
      // traverse the data in the iterator
      it.foreach(t => {
        sorter += new OrderingBean(t._1._1, t._1._2, t._2)
        if (sorter.size > topN) {
          val last = sorter.last
          // drop the last (lowest-ranked) element
          sorter -= last
        }
      })
      sorter.iterator
    })

    val r = result.collect()

    println(r.toBuffer)

    sc.stop()

  }

}

class SubjectPartitioner3(val subjects: Array[String]) extends Partitioner {

  // initialize the partitioning rules: one partition index per subject
  val rules = new mutable.HashMap[String, Int]()
  var index = 0
  for (sub <- subjects) {
    rules(sub) = index
    index += 1
  }

  override def numPartitions: Int = subjects.length

  // this method is called in Tasks on the Executors
  override def getPartition(key: Any): Int = {
    val tuple = key.asInstanceOf[(String, String)]
    val subject = tuple._1
    // look up the subject's partition number in the rules built at initialization
    rules(subject)
  }
}
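For a quick local test, the main method can be invoked with the three expected arguments; the input path below is hypothetical, so substitute your own test file:

// hypothetical local invocation: args = (isLocal, input path, topN)
FavoriteTeacher5.main(Array("true", "data/teacher.log", "2"))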

OrderingBean (the redefined comparison rule)

package com._51doit.spark04

import scala.collection.mutable.ArrayBuffer

class OrderingBean(val subject: String, val name: String, val count: Int) extends Ordered[OrderingBean] with Serializable {

  // buffer that collects every entry tied with this one on count
  val equiv = new ArrayBuffer[(String, String, Int)]()

  equiv += ((subject, name, count))

  override def compare(that: OrderingBean): Int = {
    if (this.count == that.count) {
      // same popularity: record the tied entry instead of losing it
      equiv += ((that.subject, that.name, that.count))
      0
    } else {
      // negate so a higher count sorts first (descending)
      -(this.count - that.count)
    }
  }


  override def toString: String =
    if (equiv.size > 1) {
      equiv.toString()
    } else
      s"($subject, $name, $count)"
}
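As a quick sanity check of the tie-collecting compare (the teacher names and counts below are made up for illustration):

// assumes the OrderingBean class above is on the classpath
object OrderingBeanCheck extends App {
  val a = new OrderingBean("bigdata", "laozhao", 15)
  val b = new OrderingBean("bigdata", "laoduan", 15)
  // equal counts: compare returns 0 and b's entry is appended to a's equiv buffer
  println(a.compare(b)) // prints 0
  // toString now prints the buffer holding both tied entries
  println(a)
}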

 

Implementation six (using repartitionAndSortWithinPartitions)

  repartitionAndSortWithinPartitions repartitions the data with the specified partitioner and sorts the records within each partition.
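Before the full solution, here is a minimal standalone sketch of the operator on toy pairs (local mode, made-up data; the HashPartitioner stands in for the subject partitioner used below):

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object RepartitionSortDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("demo").setMaster("local[*]"))
    val pairs = sc.parallelize(Seq(("b", 1), ("a", 3), ("c", 2), ("a", 5)))
    // each key goes to the partition chosen by the partitioner, and the records
    // are sorted within each partition using the implicit Ordering[String] on the key
    val sorted = pairs.repartitionAndSortWithinPartitions(new HashPartitioner(2))
    sorted.glom().collect().foreach(part => println(part.toSeq))
    sc.stop()
  }
}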

FavoriteTeacher06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object FavoriteTeacher06 {
  def main(args: Array[String]): Unit = {
    val isLocal = args(0).toBoolean
    // create SparkConf, then create SparkContext
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    if (isLocal) {
      conf.setMaster("local[1]")
    }
    val sc = new SparkContext(conf)
    // specify where the RDD created later reads its data
    val lines: RDD[String] = sc.textFile(args(1))
    // split the data
    val subjectTeacherAndOne = lines.map(line => {
      val fields = line.split("/")
      val subject = fields(2).split("[.]")(0)
      val teacher = fields(3)
      ((subject, teacher), 1)
    })
    // aggregate
    val reduced = subjectTeacherAndOne.reduceByKey(_ + _)
    // compute all subjects and collect them to the Driver side
    val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
    // the partitioner is created (new-ed) on the Driver side, but its methods are called on the Executors
    val partitioner = new SubjectPartitionerV2(subjects)
    // reshape the data so the whole record becomes the key
    val keyByRDD: RDD[((String, String, Int), Null)] = reduced
      .map(t => ((t._1._1, t._1._2, t._2), null))
    // implicit Ordering used when sorting within partitions
    implicit val orderRules = new Ordering[(String, String, Int)] {
      override def compare(x: (String, String, Int), y: (String, String, Int)): Int = {
        // sort by count, descending
        -(x._3 - y._3)
      }
    }
    val topN = args(2).toInt
    // repartitionAndSortWithinPartitions repartitions with the specified partitioner and sorts within each partition
    val result: RDD[((String, String, Int), Null)] = keyByRDD
      .repartitionAndSortWithinPartitions(partitioner)
    result.foreachPartition(it => {
      var index = 1
      while (it.hasNext && index <= topN) {
        val tuple = it.next()
        println(tuple)
        index += 1
      }
    })
    sc.stop()
  }
}

SubjectPartitionerV2

import org.apache.spark.Partitioner

import scala.collection.mutable

class SubjectPartitionerV2(val subjects: Array[String]) extends Partitioner {

  // initialize the partitioning rules: one partition index per subject
  val rules = new mutable.HashMap[String, Int]()
  var index = 0
  for (sub <- subjects) {
    rules(sub) = index
    index += 1
  }

  override def numPartitions: Int = subjects.length

  // this method is called in Tasks on the Executors
  override def getPartition(key: Any): Int = {
    val tuple = key.asInstanceOf[(String, String, Int)]
    val subject = tuple._1
    // look up the subject's partition number in the rules built at initialization
    rules(subject)
  }
}
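A quick local check of the partitioner (the subject names here are made up for illustration):

object PartitionerCheck extends App {
  val p = new SubjectPartitionerV2(Array("bigdata", "javaee", "php"))
  println(p.numPartitions)                          // 3: one partition per subject
  println(p.getPartition(("javaee", "laoyang", 9))) // 1: the index assigned to "javaee"
}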

 

2. Custom sorting

Data format: name, age, face value (fv)

Requirement: sort by face value first (a higher face value ranks first); when face values are equal, the younger person ranks first.

2.1 The first approach

Idea: define a case class Boy to hold these properties, then define the comparison rule with an implicit conversion, as follows

CustomSort1

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object CustomSort1 {
  def main(args: Array[String]): Unit = {
    import com._51doit.spark04.MyPredef

    val conf: SparkConf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[*]")

    // create SparkContext
    val sc: SparkContext = new SparkContext(conf)
    val lines: RDD[String] = sc.parallelize(List("Jack,30,99.99", "Sherry,18,9999.99", "Tom,29,99.99"))
    // process the data with mapPartitions, creating one Boy per record
    val boyRDD: RDD[Boy] = lines.mapPartitions(it => {
      it.map(t => {
        val split: Array[String] = t.split(",")
        Boy(split(0), split(1).toInt, split(2).toDouble)
      })
    })
    // bring the implicit collation into scope so sortBy can find it
    import MyPredef.Boy2OrderingBoy
    val res: RDD[Boy] = boyRDD.sortBy(t => t)
    print(res.collect().toBuffer)
  }
}

MyPredef

Either version works

Ordered form

object MyPredef {
  // implicit conversion from Boy to Ordered[Boy]: fv descending, then age ascending
  implicit val Boy2OrderingBoy: Boy => Ordered[Boy] = (boy: Boy) => new Ordered[Boy] {
    override def compare(that: Boy): Int = {
      if (boy.fv == that.fv) {
        boy.age - that.age
      } else {
        // compare the doubles directly; (x - y).toInt would truncate fractional differences to 0
        java.lang.Double.compare(that.fv, boy.fv)
      }
    }
  }
}

Ordering form

object MyPredef {
  // standalone Ordering[Boy]: fv descending, then age ascending
  implicit val Boy2OrderingBoy: Ordering[Boy] = new Ordering[Boy] {
    override def compare(x: Boy, y: Boy): Int = {
      if (x.fv == y.fv) {
        x.age - y.age
      } else {
        // compare the doubles directly; (x - y).toInt would truncate fractional differences to 0
        java.lang.Double.compare(y.fv, x.fv)
      }
    }
  }
}

Boy

case class Boy(name:String, age:Int, fv: Double)
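Either variant can be checked without Spark: List.sorted needs an implicit Ordering[Boy], which the compiler builds from the Ordered conversion (via Ordering.ordered) or uses directly in the Ordering form. A minimal sketch with the sample values from above:

object MyPredefCheck extends App {
  import MyPredef.Boy2OrderingBoy
  val boys = List(Boy("Jack", 30, 99.99), Boy("Sherry", 18, 9999.99), Boy("Tom", 29, 99.99))
  // Sherry (highest fv) first, then Tom (younger) before Jack
  println(boys.sorted)
}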

 

2.2 The second approach (using a tuple)

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object CustomSort2 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster("local[*]")

    // create SparkContext
    val sc: SparkContext = new SparkContext(conf)
    val lines: RDD[String] = sc.parallelize(List("Jack,30,99.99", "Sherry,18,9999.99", "Tom,29,99.99"))
    // process the data with mapPartitions, building one tuple per record
    val tpRDD: RDD[(String, Int, Double)] = lines.mapPartitions(it => {
      it.map(t => {
        val split: Array[String] = t.split(",")
        (split(0), split(1).toInt, split(2).toDouble)
      })
    })
    // sort with the default tuple Ordering: fv descending (negated), then age ascending
    val sorted: RDD[(String, Int, Double)] = tpRDD.sortBy(t => (-t._3, t._2))
    println(sorted.collect().toBuffer)
  }
}
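The same trick works in plain Scala, because the built-in tuple Ordering compares component by component; negating fv turns the ascending comparison on the first component into a descending one:

object TupleSortCheck extends App {
  val people = List(("Jack", 30, 99.99), ("Sherry", 18, 9999.99), ("Tom", 29, 99.99))
  // sorts by -fv first (i.e. fv descending), then by age ascending
  println(people.sortBy(t => (-t._3, t._2)))
}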

Origin www.cnblogs.com/jj1106/p/12014214.html