Apache Spark is a fast and general-purpose cluster computing system. It provides high-level APIs in Java, Scala, Python and R, and an optimized engine that supports general execution graphs.
It also supports a rich set of higher-level tools including Spark SQL for SQL and structured data processing, MLlib for machine learning, GraphX for graph processing, and Spark Streaming.
package big.data.analyse.wordcount
import org.apache.spark.sql.SparkSession
/**
* Created by zhen on 2019/3/9.
*/
/**
 * Minimal Spark word-count example: reads a text file, strips commas,
 * splits on spaces, counts occurrences of each token, and prints the
 * resulting (word, count) pairs to stdout.
 */
object WordCount {
  def main(args: Array[String]): Unit = {
    // Local session with 2 worker threads; fine for this toy example.
    val spark = SparkSession.builder().appName("WordCount")
      .master("local[2]")
      .getOrCreate()
    try {
      // Load the input data.
      val textRDD = spark.sparkContext.textFile("src/big/data/analyse/wordcount/wordcount.txt")
      val result = textRDD.map(row => row.replace(",", "")) // strip commas so "R," and "R" count as one word
        .flatMap(row => row.split(" "))                     // tokenize each line on single spaces
        .map(row => (row, 1))                               // pair each token with an initial count of 1
        .reduceByKey(_ + _)                                 // sum counts per distinct token
      // Print the results. NOTE: foreach runs on executors; output order is nondeterministic.
      result.foreach(println)
    } finally {
      // Release the SparkContext and associated resources even if the job fails.
      spark.stop()
    }
  }
}
(Spark,3)
(GraphX,1)
(graphs.,1)
(learning,1)
(general-purpose,1)
(Python,1)
(APIs,1)
(provides,1)
(that,1)
(is,1)
(a,2)
(R,1)
(high-level,1)
(general,1)
(processing,2)
(fast,1)
(including,1)
(higher-level,1)
(optimized,1)
(Apache,1)
(in,1)
(SQL,2)
(system.,1)
(Java,1)
(of,1)
(data,1)
(tools,1)
(cluster,1)
(also,1)
(graph,1)
(structured,1)
(execution,1)
(It,2)
(MLlib,1)
(for,3)
(Scala,1)
(an,1)
(computing,1)
(machine,1)
(supports,2)
(and,5)
(engine,1)
(set,1)
(rich,1)
(Streaming.,1)