Scala implements word statistics
---------------------
import scala.io.Source

/**
  * Word count: reads a text file and prints how many times each word occurs.
  *
  * Created by Administrator on 2018/5/7.
  */
object WCApp {
  def main(args: Array[String]): Unit = {
    // 1. Load the file and read every line eagerly, so the handle can be
    //    closed immediately instead of leaking until the JVM exits.
    val src = Source.fromFile("d:/mr/word.txt")
    val lines = try src.getLines().toList finally src.close()

    // 2. Flatten the lines into words. Splitting on a single space yields
    //    empty tokens for repeated/leading spaces; those are not words.
    val words = lines.flatMap(_.split(" ")).filter(_.nonEmpty)

    // 3. Mark every word with a count of one: (word, 1)
    val pairs = words.map(w => (w, 1))

    // 4. Group by word: {hello -> [(hello,1),(hello,1),...]}
    val grouped = pairs.groupBy(_._1)

    // 5. The count of a word is the size of its group. A strict `map` is used
    //    rather than `mapValues`, which only produces a lazy view.
    val counts = grouped.map { case (word, occurrences) => (word, occurrences.size) }

    // Print the statistics (the computed counts, not the raw input lines).
    counts.foreach(println)
  }
}

Scala implements word statistics 2
---------------------
import scala.io.Source

/**
  * Word count, variant 2: the per-word total is obtained by reducing the
  * (word, 1) pairs of each group instead of taking the group size.
  *
  * Created by Administrator on 2018/5/7.
  */
object WCApp2 {
  def main(args: Array[String]): Unit = {
    // 1. Load the file; read all lines eagerly and always release the handle.
    val src = Source.fromFile("d:/mr/word.txt")
    val lines = try src.getLines().toList finally src.close()

    // 2. Flatten the lines into words, dropping the empty tokens produced by
    //    repeated or leading spaces.
    val words = lines.flatMap(_.split(" ")).filter(_.nonEmpty)

    // 3. Mark every word with a count of one: (word, 1)
    val pairs = words.map(w => (w, 1))

    // 4. Group by word: {hello -> [(hello,1),(hello,1),(hello,1)]}
    val grouped = pairs.groupBy(_._1)

    // 5. Collapse each group into a single pair: {hello -> (hello,3), ...}.
    //    Groups created by groupBy are never empty, so reduce cannot fail.
    //    The lambda is the inline form of:
    //      def op(a: (String, Int), b: (String, Int)) = (a._1, a._2 + b._2)
    val reduced = grouped.map { case (word, occurrences) =>
      (word, occurrences.reduce((a, b) => (a._1, a._2 + b._2)))
    }

    // 6. Keep only the (word, total) pair of every entry.
    val totals = reduced.map { case (_, wordTotal) => wordTotal }

    totals.foreach(println)
  }
}

Bitmap implements topn statistics
------------------------
import scala.io.Source

/**
  * Annual top-N temperature query, implemented with a bitmap.
  *
  * Each year's temperatures are recorded as set bits in a byte array; the
  * highest set bits are then the highest temperatures. Because a bit can only
  * be set once, duplicate readings collapse and the result lists the top
  * DISTINCT temperatures.
  */
object TempTopN2_Bitmap {

  /** Bitmap size in bytes; supports temperatures 0 until BitmapBytes * 8. */
  private val BitmapBytes = 128

  def main(args: Array[String]): Unit = {
    // 1. Load the temperature file; read all lines eagerly and close the handle.
    val f = Source.fromFile("d:/mr/temp.dat")
    val temps = try f.getLines().toList finally f.close()

    // 2. Turn every "year temp" line into a tuple: {(1900,28), ...}.
    //    A line without two integer fields raises an exception here.
    val yearTemps = temps.map { line =>
      val arr = line.split(" ")
      (arr(0).toInt, arr(1).toInt)
    }

    // 3. Group by year: {1920 -> [(1920,t1),(1920,t2),...], ...}
    val byYear = yearTemps.groupBy(_._1)

    // 4. Per year, fold the readings into a bitmap and extract the top 3.
    val top3ByYear = byYear.map { case (year, readings) =>
      val bitmap = readings.foldLeft(new Array[Byte](BitmapBytes)) { (bits, reading) =>
        val temp = reading._2
        // Only temperatures that map to a bit inside the array can be stored:
        // 0 is a valid reading, negatives and values past the last bit are
        // skipped instead of indexing out of bounds.
        if (temp >= 0 && temp < bits.length * 8) {
          val index = temp / 8
          val mod = temp % 8
          bits(index) = (bits(index) | (1 << mod)).toByte
        }
        bits
      }
      (year, topN(bitmap, 3))
    }

    // 5. Print the result ordered by year.
    val sorted = top3ByYear.toList.sortBy(_._1)
    sorted.foreach(println(_))
  }

  /**
    * Reads the highest set bits of a bitmap.
    *
    * @param bitmap byte array in which bit `y` of byte `x` stands for the value `8 * x + y`
    * @param n      maximum number of values to return
    * @return the `n` largest stored values in descending order, comma separated
    *         (no leading separator); empty when no bit is set
    */
  private def topN(bitmap: Array[Byte], n: Int): String = {
    // Lazily walk the bits from the most significant one downwards so that
    // scanning stops as soon as n values were found (no `return` needed).
    val descending = for {
      x <- (bitmap.length - 1 to 0 by -1).iterator
      y <- (7 to 0 by -1).iterator
      if ((bitmap(x) >> y) & 1) != 0
    } yield 8 * x + y
    descending.take(n).mkString(",")
  }
}

Scala implements product reviews
---------------------
1. TagUtil.java
package com.oldboy.scala.util;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

import java.util.ArrayList;
import java.util.List;

/**
 * Tag utility class.
 */
public class TagUtil {
    /**
     * Extracts the review tags from one JSON record.
     *
     * Reads the "values" array of the first element of "extInfoList" and
     * returns its entries as strings.
     *
     * @param json the JSON text of one record
     * @return the extracted tags; empty (never null) when the expected
     *         structure is missing
     */
    public static List<String> extractTags(String json) {
        // Result collection of tags
        List<String> tags = new ArrayList<String>();
        // Parse the text into a JSON object
        JSONObject obj = JSON.parseObject(json);
        // Guard in case the parser yields no object (e.g. for empty input)
        if (obj == null) {
            return tags;
        }
        // Get the array
        JSONArray array = obj.getJSONArray("extInfoList");
        // Check that the array is usable
        if (array == null || array.size() == 0) {
            return tags;
        }
        // Only the first element is inspected
        JSONObject obj2 = array.getJSONObject(0);
        if (obj2 == null) {
            return tags;
        }
        JSONArray arr2 = obj2.getJSONArray("values");
        if (arr2 != null) {
            for (int i = 0; i < arr2.size(); i++) {
                tags.add(arr2.getString(i));
            }
        }
        return tags;
    }
}

2. TaggingDemo
import javax.swing.text.html.HTML.Tag
import com.oldboy.scala.util.TagUtil
import scala.io.Source

/**
  * Tag statistics: for every business prints its five most frequent review
  * tags, with the businesses ordered by the count of their most frequent tag.
  */
object TaggingDemo {
  def main(args: Array[String]): Unit = {
    // Explicit Java-to-Scala conversion (.asScala) instead of the deprecated
    // implicit JavaConversions.
    import scala.collection.JavaConverters._

    // 1. Load the file; read all lines eagerly and always release the handle.
    val file = Source.fromFile("d:/mr/temptags.txt")
    val lines = try file.getLines().toList finally file.close()

    // 2. Flatten every "busid<TAB>json" line into (busid, tag) pairs.
    //    Lines without a tab-separated second field contribute nothing.
    val busTags = lines.flatMap { line =>
      line.split("\t") match {
        case Array(busid, json, _*) =>
          TagUtil.extractTags(json).asScala.toList.map(tag => (busid, tag))
        case _ =>
          Nil
      }
    }

    // 3. Group identical pairs: {(busid,tag) -> List((busid,tag),(busid,tag),...)}
    val grouped = busTags.groupBy(identity)

    // 4. Count the occurrences of every pair: {(busid,tag) -> 300}
    val tagCounts = grouped.map { case (key, hits) => (key, hits.size) }

    // 5. Re-shape to List((busid, (tag, cnt)), ...)
    val perTag = tagCounts.toList.map { case ((busid, tag), cnt) => (busid, (tag, cnt)) }

    // 6. Group by business: Map(busid -> List((busid, (tag, cnt)), ...))
    val byBus = perTag.groupBy(_._1)

    // 7. Per business keep the five most frequent tags, highest count first:
    //    Map(busid -> List((tag, 59), ...))
    val top5 = byBus.map { case (busid, entries) =>
      (busid, entries.map(_._2).sortBy(tagCnt => -tagCnt._2).take(5))
    }

    // 8. Order the businesses by the count of their most frequent tag,
    //    descending. Every list stems from a groupBy group and is non-empty,
    //    so `head` is safe.
    val ranked = top5.toList.sortBy { case (_, tags) => -tags.head._2 }

    ranked.foreach { case (busid, tags) =>
      val str = tags.mkString(";")
      println(busid + "==>" + str)
    }
  }
}