添加maven依赖:
<dependencies>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.60</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.3</version>
</dependency>
</dependencies>
数据地址如下(真实数据)
https://pan.baidu.com/s/1GYr_sR4CBXCmd1AIaBbvPg
1、JSON工具类,解析JSON数据,并且拿到我们想要的字段(有标签的字段)。注意:该类需要放在 com.fengrui.taggen 包下(文件开头加上 package com.fengrui.taggen;),与后面Scala代码中的 import 保持一致
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
/**
 * JSON helper that pulls the review tags out of one raw review record.
 *
 * Sample record (shop id, a tab, then the JSON payload):
 * 86913510 {"reviewPics":null,"extInfoList":null,"expenseList":null,"reviewIndexes":[2],"scoreList":[{"score":5,"title":"口味","desc":""},{"score":5,"title":"服务","desc":""},{"score":5,"title":"环境","desc":""}]}
 */
public class ReviewTags {

    /**
     * Extracts the tags stored under {@code extInfoList[*]} entries whose
     * {@code title} is {@code "contentTags"} and joins them with commas.
     *
     * @param jsonStr the JSON payload of one review; may be null or empty
     * @return the tags joined by ",", or "" when the payload parses to null,
     *         has no usable "extInfoList", or carries no "contentTags" values
     */
    public static String extractTags(String jsonStr) {
        JSONObject object = JSON.parseObject(jsonStr);
        if (object == null) {
            return "";
        }
        // A missing key and an explicit JSON null both come back as null here.
        JSONArray array = object.getJSONArray("extInfoList");
        if (array == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        // Tracked across ALL entries so that tags coming from a second
        // "contentTags" entry are still separated from the previous ones.
        boolean first = true;
        for (int i = 0; i < array.size(); i++) {
            JSONObject obj = array.getJSONObject(i);
            // Constant-first equals: a null or absent title must not throw.
            if (obj == null || !"contentTags".equals(obj.getString("title"))) {
                continue;
            }
            JSONArray values = obj.getJSONArray("values");
            if (values == null) {
                continue;
            }
            for (int j = 0; j < values.size(); j++) {
                if (first) {
                    first = false;
                } else {
                    sb.append(",");
                }
                sb.append(values.getString(j));
            }
        }
        return sb.toString();
    }

    /**
     * Manual smoke test: prints the tags extracted from a sample record, then
     * from an empty string and from null.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String s = "{\"reviewPics\":[{\"picId\":2405538806,\"url\":\"http://p0.where.net/shaitu/7c10019c62947d01ded80cc698c77c90217708.jpg\",\"status\":1},{\"picId\":2405442602,\"url\":\"http://p0.meituan.net/shaitu/d41ef06f5d16d5d3cbc871765ff93130270451.jpg\",\"status\":1}],\"extInfoList\":[{\"title\":\"contentTags\",\"values\":[\"回头客\",\"上菜快\",\"环境优雅\",\"性价比高\",\"菜品不错\"],\"desc\":\"\",\"defineType\":0},{\"title\":\"tagIds\",\"values\":[\"493\",\"232\",\"24\",\"300\",\"1\"],\"desc\":\"\",\"defineType\":0}],\"expenseList\":null,\"reviewIndexes\":[1,2],\"scoreList\":null}";
        // extractTags is static, so it can be called directly.
        System.out.println(extractTags(s));
        // NOTE(review): the next two calls presumably rely on fastjson
        // returning null for empty/null input - confirm for the version in use.
        System.out.println(extractTags(""));
        System.out.println(extractTags(null));
    }
}
2、标签统计:
import com.fengrui.taggen.ReviewTags
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
* 团购网站的标签生成应用
*/
object TagGenerator {

  // Defaults keep the original hard-coded locations so existing runs are unchanged.
  private val DefaultInputPath = "file:///F:\\ideaworkspace\\temptag\\src\\main\\resources\\temptags.txt"
  private val DefaultOutputPath = "file:///G:/res1.txt"

  // Number of tags kept per shop in the final output.
  private val TopN = 10

  /**
   * Formats the most frequent tags of one shop as "tag:count,tag:count,...".
   * Keeps the original ordering rule: ascending sort by count, then reversed.
   */
  private def formatTopTags(tagCounts: List[(String, Int)]): String =
    tagCounts
      .sortBy(_._2)
      .reverse
      .take(TopN)
      .map { case (tag, count) => tag + ":" + count.toString }
      .mkString(",")

  /**
   * Entry point of the tag-generation job.
   *
   * Reads lines of the form "shopId<TAB>reviewJson", extracts the review tags
   * through ReviewTags.extractTags, counts each (shop, tag) pair and writes one
   * line per shop: "shopId<TAB>tag:count,tag:count,...".
   *
   * @param args optional: args(0) = input path, args(1) = output path; the
   *             original hard-coded paths are used when they are omitted
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.lift(0).getOrElse(DefaultInputPath)
    val outputPath = args.lift(1).getOrElse(DefaultOutputPath)

    val conf: SparkConf = new SparkConf().setAppName("TagGenerator").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      // Sample line:
      // 86913510 {"reviewPics":null,"extInfoList":null,"expenseList":null,"reviewIndexes":[2],"scoreList":[...]}
      val lines: RDD[String] = sc.textFile(inputPath)

      // Split on the tab and keep only well-formed records:
      // field 0 is the shop id, field 1 is the review JSON.
      val records: RDD[Array[String]] = lines.map(_.split("\t")).filter(_.length == 2)

      // (shopId, "tag1,tag2,..."); reviews without tags yield "" and are dropped.
      val shopToTagString: RDD[(String, String)] =
        records
          .map(fields => fields(0) -> ReviewTags.extractTags(fields(1)))
          .filter(_._2.length > 0)

      // Flatten to one (shopId, tag) pair per tag.
      val shopTagPairs: RDD[(String, String)] =
        shopToTagString.flatMapValues(tags => tags.split(",").toSeq)

      // ((shopId, tag), occurrences)
      val pairCounts: RDD[((String, String), Int)] =
        shopTagPairs.map(pair => pair -> 1).reduceByKey(_ + _)

      // Regroup by shop: (shopId, List((tag, count), ...)); ::: concatenates the lists.
      val tagsPerShop: RDD[(String, List[(String, Int)])] =
        pairCounts
          .map { case ((shopId, tag), count) => shopId -> List((tag, count)) }
          .reduceByKey(_ ::: _)

      // (shopId, "tag:count,...") limited to the TopN most frequent tags.
      val res: RDD[(String, String)] = tagsPerShop.mapValues(formatTopTags)

      // Write the result through a single partition.
      res.map(x => x._1 + "\t" + x._2).coalesce(1).saveAsTextFile(outputPath)
      println("----success----")
    } finally {
      // Always stop the context, even when the job fails.
      sc.stop()
    }
  }
}
收集到的数据: