package com.snailteam.simple_project

import java.sql.DriverManager

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkContext
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import com.huaban.analysis.jieba.JiebaSegmenter
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode

/**
 * Counts the 100 most frequent tokens tagged as verbs, nouns, or adjectives
 * in column 8 (the 9th column) of the `book` MySQL table, using Jieba for
 * Chinese word segmentation on Spark.
 *
 * NOTE: when running on a Spark cluster, the Jieba classes must implement
 * Serializable so the segmenter captured in the closure can be shipped to
 * executors.
 *
 * Submit with:
 *   ./bin/spark-submit --class com.snailteam.simple_project.App \
 *     --jars lib/mysql-connector-java-5.1.18.jar
 */
object App {
  // TODO(review): placeholders — read real connection settings from
  // configuration instead of hard-coding them in source.
  val url = "xx"
  val username = "xx"
  val password = "xx"

  def main(args: Array[String]): Unit = {
    // Local mode for development; build a SparkConf with setMaster(...)
    // for cluster execution.
    val sc = new SparkContext("local", "mysql")
    try {
      val seg = new JiebaSegmenter

      // JdbcRDD binds the two `?` of `limit ?,?` per partition, splitting
      // rows 1..10000000 across 4 partitions; each partition opens its own
      // connection via the factory closure.
      val rdd = new JdbcRDD(
        sc,
        () => {
          // Register the driver; getDeclaredConstructor().newInstance()
          // avoids the deprecated Class.newInstance().
          Class.forName("com.mysql.jdbc.Driver").getDeclaredConstructor().newInstance()
          DriverManager.getConnection(url, username, password)
        },
        "select * from book limit ?,? ",
        1, 10000000, 4)

      // Segment the text in column 8 of each row, keep tokens whose POS tag
      // contains v (verb), n (noun), or a (adjective), then word-count and
      // take the top 100 by descending frequency.
      val result = rdd.flatMap { row =>
        val tokens = seg.process(row(8).toString, SegMode.SEARCH).asScala
        for {
          t <- tokens
          tag = t.word.getTokenType
          if tag.contains("v") || tag.contains("n") || tag.contains("a")
        } yield t.word.getToken
      }.map(word => (word, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(100)

      result.foreach(println)
    } finally {
      // Always release the SparkContext, even if the job fails.
      sc.stop()
    }
  }
}
Word-frequency counting over text using Spark with Jieba segmentation.
Adapted from xiaofancn.iteye.com/blog/2205024