See "Advanced Analytics with Spark" for a good approach to preliminary statistical analysis of data. Having tried it in practice, I can confirm it works well. I am recording it here to share with everyone — and to encourage myself to keep learning.
import org.apache.spark.rdd.RDD
import org.apache.spark.util.StatCounter

/** Returns true when the line is the CSV header row (it contains "id_1"). */
def isHeader(line: String): Boolean = line.contains("id_1") // fixed: `contain` -> `contains`

/** Parses a field as Double; unparseable values (e.g. "?") become NaN. */
def toDouble(s: String): Double =
  try s.toDouble
  catch { case _: NumberFormatException => Double.NaN } // fixed: was catching all Exceptions

/** One record-linkage row: two record ids, the nine match scores, and the label. */
case class MatchData(id1: Int, id2: Int, scores: Array[Double], matched: Boolean)

/** Parses one CSV line into a MatchData.
  * Column layout: 0 and 1 are the ids, 2..10 are the nine score columns,
  * 11 is the matched/unmatched label.
  */
def parseData(line: String): MatchData = { // fixed: procedure syntax returned Unit
  val arr = line.split(",")
  val id1 = arr(0).toInt
  val id2 = arr(1).toInt
  val matched = arr(11).toBoolean
  val scores = arr.slice(2, 11).map(toDouble) // fixed: `scoses` typo left `scores` undefined
  MatchData(id1, id2, scores, matched)
}

/** Per-column statistics that tolerate NaN ("missing") values. */
class VariableStats extends Serializable {
  val stats: StatCounter = new StatCounter()
  var m_lMissing = 0 // fixed: was declared `m_Missing` but referenced as `m_lMissing`

  /** Folds one value in; NaN increments the missing counter instead. */
  def add(x: Double): VariableStats = {
    if (x.isNaN) m_lMissing += 1 else stats.merge(x) // idiomatic NaN test
    this
  }

  /** Combines two partial results (used by reduce across partitions). */
  def merge(other: VariableStats): VariableStats = {
    stats.merge(other.stats)
    m_lMissing += other.m_lMissing
    this
  }

  override def toString: String = "stats:" + stats.toString() + "NaN" + m_lMissing
}

object VariableStats extends Serializable {
  /** Builds a counter seeded with x. Fixed: the argument was silently ignored,
    * so every first value of every partition was dropped from the statistics.
    */
  def apply(x: Double): VariableStats = new VariableStats().add(x)
}

/** Computes one VariableStats per score column across the whole RDD. */
def statsWithMissing(rdd: RDD[Array[Double]]): Array[VariableStats] = {
  val nastats = rdd.mapPartitions { (iter: Iterator[Array[Double]]) =>
    if (iter.hasNext) { // fixed: iter.next() on an empty partition would throw
      val nas: Array[VariableStats] = iter.next().map(d => VariableStats(d))
      iter.foreach(arr => nas.zip(arr).foreach { case (n, d) => n.add(d) })
      Iterator(nas)
    } else Iterator.empty
  }
  nastats.reduce((n1, n2) => n1.zip(n2).map { case (a, b) => a.merge(b) })
}

/** NaN-to-zero: a missing score contributes nothing to the sum. */
def naz(d: Double): Double = if (d.isNaN) 0.0 else d // fixed: `isNan` from a bad `Java.lang` import

/** A record paired with its aggregate match score. */
case class Scored(md: MatchData, score: Double)

/** Scores each record on the five most informative columns and prints, for the
  * candidates with score >= 4, how many are true matches vs non-matches.
  */
def getScores(mds: RDD[MatchData]): Unit = {
  val ct = mds.map { md =>
    val score = Array(2, 5, 6, 7, 8).map(i => naz(md.scores(i))).sum // fixed: `md.score` -> `md.scores`
    Scored(md, score) // fixed: the map block never returned a Scored, so ct was RDD[Unit]
  }
  ct.filter(_.score >= 4).map(_.md.matched).countByValue().foreach(println)
}

/** Loads the linkage data, parses it, and prints per-column missing counts and
  * mean deltas (matched minus non-matched), then the score summary.
  */
def DataAnalysis(): Unit = {
  val conf = new SparkConf().setAppName("dataAnalysis").setMaster("local[*]")
  val sc = new SparkContext(conf)
  try {
    // Read every file in the linkage directory.
    val textContext = sc.textFile("linkage/*")
    // Drop the header line, then parse the remaining rows.
    val noHeader = textContext.filter(x => !isHeader(x))
    val mds = noHeader.map(parseData).cache() // reused by three actions below

    // fixed: these statements previously sat OUTSIDE DataAnalysis and referenced
    // its local `mds`, which is not in scope there; they now run inside.
    val statsm = statsWithMissing(mds.filter(_.matched).map(_.scores))
    val statsn = statsWithMissing(mds.filter(!_.matched).map(_.scores))
    statsm.zip(statsn).map { case (m, n) =>
      (m.m_lMissing + n.m_lMissing, m.stats.mean - n.stats.mean)
    }.foreach(println)

    getScores(mds)
  } finally {
    sc.stop() // fixed: the SparkContext was never stopped
  }
}

def main(args: Array[String]): Unit = {
  DataAnalysis()
  println("OK")
}