saveAsTextFiles("")
将词频统计结果写入到 MySQL
service mysql start
mysql -u hadoop -p
use spark
create table wordcount(word char(20),count int(4));
#得安装python连接MySQL的模块PyMySQL
sudo apt-get update
sudo apt-get install python3-pip
pip3 -V
sudo pip3 install PyMySQL
代码和上一篇博客基本一样,只是增加了把结果保存到 MySQL 的部分
#!/usr/bin/env python3
from__future__import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
if__name__==“main”:
if len(sys.argv)!=3:
print(“Usage:NetworkWordCountStateful.py<hostname.,port>”,file=sys.stderror)
exit(-1)
sc = SparkContext(appName = “PythonStreamingStatefulNetworkWordCount”)
ssc = StreamingContext(sc,1)
ssc.checkpoint(“file:///usr/local/spark/mycode/streaming/stateful/”)
initialRDD = sc.parallelize([(u’hello’,1),(u’world’,1)])#初始化RDD,u是unicode编码
#nc端发过来(hadoop,1)(hadoop,1)(spark,1)(spark,1)到这个updateStateByKey,先变成(hadoop,(1,1))(spark,(1,1)),然后先处理第一个把(1,1)传到这个updateFunc,把last_sum也传进来
def updateFunc(new_values,last_sum):
return sum(new_values)+(last_sum or 0)
def func(rdd):
repartitionedRDD=rdd.repartition(3)#不用打开很多数据库连接
repartitionedRDD.foreachPartition(dbfunc)
def dbfunc(records):
db = pymysql.connect(“localhost”,“hadoop”,“hadoop”,“spark”)
cursor =db.cursor()#指针
def doinsert§:
sql = “insert into wordcount(word,count) values(’%s’,’%s’)”%(str(p[0]),str(p[1]))
try:
cursor.execute(sql)
db.commit()
except:
db.rollback()
for item in records:
doinsert(item)
lines=ssc.socketTextStream(sys.argv[1],int(sys.argv[2]))#socketTextStream(“localhost”, 9999)
running_counts=lines.flatMap(lambda line:line.split(" ")).map(lambda x:(x,1)).updateStateByKey(updateFunc,initialRDD=initialRDD)
running_counts.pprint()
running_counts.foreachRDD(func)
ssc.start()
ssc.awaitTermination()