Spark data statistics

        See "Spark Advanced Data Analysis" for a good way to perform preliminary statistical analysis of data. After practice, it really works well. Record it here and share it with everyone to encourage yourself to keep learning new knowledge.


      

// Check whether a line is the header row
def isHeader(line: String) = line.contains("id_1")
        
// Convert a field to Double, mapping unparsable values (e.g. "?") to NaN
def toDouble(s: String) = {
    try {
        s.toDouble
    } catch {
        case e: Exception => Double.NaN
    }
}

case class MatchData(id1: Int, id2: Int, scores: Array[Double], matched: Boolean)

def parseData(line: String): MatchData = {
    val arr = line.split(",")
    val id1 = arr(0).toInt
    val id2 = arr(1).toInt
    val matched = arr(11).toBoolean
    // Column 0 is the first column; the nine comparison scores sit in columns 2 to 10
    val scores = arr.slice(2, 11).map(toDouble)
    MatchData(id1, id2, scores, matched)
}
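
To see what the parser expects, here is a small sketch with a made-up sample line in the layout assumed above (two integer ids, nine comparison scores where "?" marks a missing value, and a boolean match flag); the numbers are illustrative only.

// Hypothetical sample line; "?" fields become NaN via toDouble
val sampleLine = "37291,53113,0.833333333,?,1,?,1,1,1,1,0,TRUE"
val md = parseData(sampleLine)
println(md.id1)                    // 37291
println(md.scores.mkString(","))   // NaN appears where the field was "?"
println(md.matched)                // true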

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

def DataAnalysis(): RDD[MatchData] = {
    val conf = new SparkConf().setAppName("dataAnalysis").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // Read all files in the linkage directory
    val filePath = "linkage/*"
    val textContext = sc.textFile(filePath)
    // Remove the header line
    val noHeader = textContext.filter(x => !isHeader(x))
    // Parse the remaining lines and return them for the statistics below
    val mds = noHeader.map(x => parseData(x))
    mds
}

// Parse the data, then compare matched and non-matched records column by column
val mds = DataAnalysis()
val statsm = statsWithMissing(mds.filter(_.matched).map(_.scores))
val statsn = statsWithMissing(mds.filter(!_.matched).map(_.scores))
// For each score column: total missing count and difference of means between the two groups
statsm.zip(statsn).map { case (m, n) =>
    (m.m_lMissing + n.m_lMissing, m.stats.mean - n.stats.mean)
}.foreach(println)

Columns with few missing values and a large difference in means between the two groups make good features, which is what motivates the choice of columns 2, 5, 6, 7 and 8 in getScores below.

import org.apache.spark.util.StatCounter

// Per-column accumulator: summary statistics plus a count of missing (NaN) values
class VariableStats extends Serializable {
    val stats: StatCounter = new StatCounter()
    var m_lMissing = 0

    def add(x: Double): VariableStats = {
        if (x.isNaN) m_lMissing += 1
        else stats.merge(x)
        this
    }

    def merge(other: VariableStats): VariableStats = {
        stats.merge(other.stats)
        m_lMissing += other.m_lMissing
        this
    }

    override def toString = {
        "stats: " + stats.toString() + " NaN: " + m_lMissing
    }
}

object VariableStats extends Serializable {
    // Build a counter and fold in the first value right away
    def apply(x: Double) = new VariableStats().add(x)
}
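
A quick local sanity check of the counter, no Spark needed (a sketch with made-up values):

val a = VariableStats(1.0).add(Double.NaN).add(3.0)
val b = VariableStats(Double.NaN)
// Prints statistics over {1.0, 3.0} and a NaN count of 2
println(a.merge(b))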
import org.apache.spark.rdd.RDD

def statsWithMissing(rdd: RDD[Array[Double]]): Array[VariableStats] = {
    val nastats = rdd.mapPartitions((iter: Iterator[Array[Double]]) => {
        // Initialize one counter per column from the first row (assumes non-empty partitions)
        val nas: Array[VariableStats] = iter.next().map(d => VariableStats(d))
        // Fold the rest of the partition into the per-column counters
        iter.foreach(arr => nas.zip(arr).foreach { case (n, d) => n.add(d) })
        Iterator(nas)
    })
    // Merge the per-partition results column by column
    nastats.reduce((n1, n2) => {
        n1.zip(n2).map { case (a, b) => a.merge(b) }
    })
}
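
A minimal sketch of statsWithMissing on a tiny in-memory RDD, assuming a SparkContext named sc is available (for example in spark-shell). Since the function reads the first row of each partition, the example forces a single partition so none are empty; the values are made up.

val tiny = sc.parallelize(Seq(
    Array(1.0, Double.NaN),
    Array(3.0, 4.0)
), 1)
// Column 0: mean 2.0, 0 NaN; column 1: mean 4.0, 1 NaN
statsWithMissing(tiny).foreach(println)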
// Replace NaN with 0.0 so that missing scores do not poison the sum
def naz(d: Double) = if (java.lang.Double.isNaN(d)) 0.0 else d

case class Scored(md: MatchData, score: Double)

// Score each record by summing columns 2, 5, 6, 7 and 8, then count how many
// records above the threshold are true matches
def getScores(mds: RDD[MatchData]) = {
    val ct = mds.map(md => {
        val score = Array(2, 5, 6, 7, 8).map(i => naz(md.scores(i))).sum
        Scored(md, score)
    })
    ct.filter(s => s.score >= 4.0).map(s => s.md.matched).countByValue().foreach(println)
}

def main(args: Array[String]): Unit = {
    // mds and the per-column statistics are computed above; score the records here
    getScores(mds)
    println("OK")
}
