一: Word count (word-frequency statistics)
from pyspark import SparkContext, SparkConf
import sys

if __name__ == '__main__':
    '''
    sys.argv: the list of arguments passed into the program from outside.
    sys.exit([arg]): exits the program early; arg = 0 means a normal exit.
    sys.argv[0] is the path of the script itself.
    sys.argv[1] is the input path, e.g. file:///Users/lotus/PycharmProjects/data
    sys.argv[2] is the output path.
    '''
    if len(sys.argv) != 3:
        print("Usage: wordcount <input> <output>", file=sys.stderr)
        sys.exit(-1)
    sc = SparkContext("local", "context")
    '''
    textFile: reads a text file from HDFS, a local file system (available on
    all nodes), or any Hadoop-supported file system URI, and returns it as an
    RDD of Strings.
    '''
    def printResult():
        counts = sc.textFile(sys.argv[1]) \
            .flatMap(lambda line: line.split(" ")) \
            .map(lambda x: (x, 1)) \
            .reduceByKey(lambda a, b: a + b)
        # collect() returns a list with all of the elements in this RDD;
        # it brings everything to the driver, so use it only on small results.
        output = counts.collect()
        for (word, count) in output:
            print("%s: %i" % (word, count))
    '''
    In production, the results should be written out to a file system
    rather than collected to the driver.
    '''
    def saveFile():
        sc.textFile(sys.argv[1]) \
            .flatMap(lambda line: line.split(" ")) \
            .map(lambda x: (x, 1)) \
            .reduceByKey(lambda a, b: a + b) \
            .saveAsTextFile(sys.argv[2])

    # Run the jobs before stopping the context.
    printResult()
    saveFile()
    sc.stop()
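The flatMap → map → reduceByKey pipeline above can be sanity-checked without any input file by building a small RDD in memory. A minimal sketch, assuming a local SparkContext and made-up sample lines:

from pyspark import SparkContext

sc = SparkContext("local", "wordcount-test")
rdd = sc.parallelize(["hello spark", "hello world"])
counts = rdd.flatMap(lambda line: line.split(" ")) \
    .map(lambda x: (x, 1)) \
    .reduceByKey(lambda a, b: a + b)
print(counts.collect())  # e.g. [('hello', 2), ('spark', 1), ('world', 1)]
sc.stop()

When run as a standalone script, the job would typically be launched with something like spark-submit wordcount.py <input> <output> (the file name wordcount.py is an assumption).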
二: TopK statistics
import sys
from pyspark import SparkConf, SparkContext
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: topn <input>", file=sys.stderr)
        sys.exit(-1)

    conf = SparkConf()
    sc = SparkContext(conf=conf)
    # Note: a comment after a backslash line continuation is a syntax error,
    # so the chain is parenthesized to allow inline comments.
    counts = (sc.textFile(sys.argv[1])
              .map(lambda x: x.split("\t"))
              .map(lambda x: (x[5], 1))         # take the 6th field, i.e. the user ID
              .reduceByKey(lambda a, b: a + b)
              .map(lambda x: (x[1], x[0]))      # swap key and value so the count is the key
              .sortByKey(False)                 # sort by count, descending
              .map(lambda x: (x[1], x[0]))      # swap back to (user ID, count)
              .take(5))

    for (user, count) in counts:
        print("%s: %i" % (user, count))

    sc.stop()
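The swap / sortByKey / swap-back sequence above can be avoided: RDD.sortBy accepts a key function directly. A minimal sketch of the same top-5 computation (it would run inside the same program, before sc.stop()), under the same assumption that the 6th tab-separated field is the user ID:

counts = sc.textFile(sys.argv[1]) \
    .map(lambda x: x.split("\t")) \
    .map(lambda x: (x[5], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda x: x[1], ascending=False) \
    .take(5)

Equivalently, .takeOrdered(5, key=lambda x: -x[1]) returns the five largest pairs without sorting the whole RDD.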