一、SparkStreaming读取kafka数据
package org.apache.spark.examples.streaming
import java.sql.{PreparedStatement, Connection, DriverManager}
import java.util.concurrent.atomic.AtomicInteger
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.storage.StorageLevel
object NetworkWordCountStateful {
def main(args: Array[String]) {
//定义状态更新函数
val updateFunc = (values: Seq[Int], state: Option[Int]) => {
val currentCount = values.foldLeft(0)(_ + _)
val previousCount = state.getOrElse(0)
Some(currentCount + previousCount)
}
StreamingExamples.setStreamingLogLevels() //设置log4j日志级别
val conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCountStateful")
val sc = new StreamingContext(conf, Seconds(5))
sc.checkpoint("file:///usr/local/spark/mycode/streaming/dstreamoutput/") //设置检查点,检查点具有容错机制
val lines = sc.socketTextStream("localhost", 9999)
val words = lines.flatMap(_.split(" "))
val wordDstream = words.map(x => (x, 1))
val stateDstream = wordDstream.updateStateByKey[Int](updateFunc)
stateDstream.print()
//下面是新增的语句,把DStream保存到MySQL数据库中
stateDstream.foreachRDD(rdd => {
//内部函数
def func(records: Iterator[(String,Int)]) {
var conn: Connection = null
var stmt: PreparedStatement = null
try {
val url = "jdbc:mysql://localhost:3306/spark"
val user = "root"
val password = "hadoop" //笔者设置的数据库密码是hadoop,请改成你自己的mysql数据库密码
conn = DriverManager.getConnection(url, user, password)
records.foreach(p => {
val sql = "insert into wordcount(word,count) values (?,?)"
stmt = conn.prepareStatement(sql);
stmt.setString(1, p._1.trim)
stmt.setInt(2,p._2.toInt)
stmt.executeUpdate()
})
} catch {
case e: Exception => e.printStackTrace()
} finally {
if (stmt != null) {
stmt.close()
}
if (conn != null) {
conn.close()
}
}
}
val repartitionedRDD = rdd.repartition(3)
repartitionedRDD.foreachPartition(func)
})
sc.start()
sc.awaitTermination()
}
}
二、使用sbt编译:
- cd /usr/local/spark/mycode/streaming/dstreamoutput
- rm simple.sbt
- vim simple.sbt
-
name := "Simple Project" version := "1.0" scalaVersion := "2.11.8" libraryDependencies += "org.apache.spark" %% "spark-core" % "2.3.1" libraryDependencies += "org.apache.spark" % "spark-streaming_2.11" % "2.3.1" libraryDependencies += "org.apache.spark" % "spark-sql_2.11" % "2.3.1"
我的spark是2.3.1 scala是2.11.8
-
使用sbt进行打包
- cd /usr/local/spark/mycode/streaming/dstreamoutput
- /usr/local/sbt/sbt package
6.开始运行
/usr/local/spark/bin/spark-submit --class "org.apache.spark.examples.streaming.NetworkWordCountStateful" --jars /usr/local/spark/jars/mysql-connector-java-5.1.42-bin.jar /usr/local/spark/mycode/streaming/dstreamoutput/target/scala-2.11/simple-project_2.11-1.0.jar
7.打开另一个终端:
nc -lk 9999
//现在你就可以在当前窗口内随意输入单词,输入一个单词就回车,比如输入下面单词
hello
hadoop
spark
hello
spark
这时可以去mysql数据库查看数据库里的数据是否填充进去了。
三、贴一下pom的配置文件:
<properties>
<spark.version>2.3.1</spark.version>
<scala.version>2.11</scala.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!--<dependency>-->
<!--<groupId>org.apache.redis</groupId>-->
<!--<artifactId>redis</artifactId>-->
<!--<version>1.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.1</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!--<dependency>-->
<!--<groupId>org.apache.spark</groupId>-->
<!--<artifactId>spark-mllib_${scala.version}</artifactId>-->
<!--<version>${spark.version}</version>-->
<!--</dependency>-->
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
在这里有对于权限需要注意:
chmod -R 777 * 是递归修改目录及其下面的权限
rwx 对应于421 第一个字符代表文件(-)、目录(d),链接(l)
第一个是目录或文件所有者的权限,第二个是用户所属的组的权限,与文件所有者同一组的用户的权限是读、写但不能执行
第三个是:不与文件所有者同组的其他用户的权限是读不能写和执行