Spark Learning Notes (2): Comprehensive Operator Case Studies

1: word count (word frequency statistics)

from pyspark import SparkContext,SparkConf
import sys

if __name__ == '__main__':

    '''
    sys.argv: passes arguments from outside the program into it; it is a list.
    sys.exit([arg]): exits the program early; arg = 0 means a normal exit.

    sys.argv[0] is the path of the script itself
    sys.argv[1] is the input file path, e.g. file:///Users/lotus/PycharmProjects/data
    sys.argv[2] is the output file path
    '''
    if len(sys.argv) != 3:
        print("Usage: wordcount <input> <output>", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext("local", "context")


    '''
    textFile: Read a text file from HDFS, a local file system (available on all nodes),
    or any Hadoop-supported file system URI, and return it as an RDD of Strings.
    Note: it can read files from any of these sources.
    '''
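
    # Illustrative path forms that textFile accepts (the HDFS example is hypothetical):
    #   local file system: "file:///Users/lotus/PycharmProjects/data"
    #   HDFS:              "hdfs://namenode:8020/path/to/data"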

    def printResult():
        counts = sc.textFile(sys.argv[1]) \
            .flatMap(lambda line: line.split(" ")) \
            .map(lambda x: (x, 1)) \
            .reduceByKey(lambda a, b: a + b)

        output = counts.collect() #Return a list that contains all of the elements in this RDD

        for (word,count) in output:
            print("%s:%i" % (word,count))

    '''
    In real projects the result needs to be written out to a file system.
    '''
    def saveFile():
        sc.textFile(sys.argv[1]) \
            .flatMap(lambda line:line.split(" ")) \
            .map(lambda x:(x,1)) \
            .reduceByKey(lambda a,b:a+b) \
            .saveAsTextFile(sys.argv[2])
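        # Note: saveAsTextFile writes a directory of part-* files at sys.argv[2],
        # and Spark raises an error if that output path already exists.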

    # Run the two helpers defined above: print to the console and save to sys.argv[2]
    printResult()
    saveFile()

    sc.stop()
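
A minimal way to launch this job (assuming the script above is saved as wordcount.py; the input path is the example from the docstring and the output path is hypothetical):

spark-submit wordcount.py file:///Users/lotus/PycharmProjects/data file:///Users/lotus/PycharmProjects/output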

2: TopK statistics

import sys

from pyspark import SparkConf, SparkContext

if __name__ == '__main__':

    if len(sys.argv) != 2:
        print("Usage: topn <input>", file=sys.stderr)
        sys.exit(-1)

    conf = SparkConf()
    sc = SparkContext(conf=conf)

    counts = (sc.textFile(sys.argv[1])
              .map(lambda x: x.split("\t"))
              .map(lambda x: (x[5], 1))          # field 6 (index 5) is the user ID
              .reduceByKey(lambda a, b: a + b)
              .map(lambda x: (x[1], x[0]))       # swap key and value so the count becomes the key
              .sortByKey(False)                  # sort by count, descending
              .map(lambda x: (x[1], x[0]))       # swap back to (user ID, count)
              .take(5))

    for (word, count) in counts:
        print("%s: %i" % (word, count))


    sc.stop()
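
The swap / sortByKey / swap-back pattern above can also be written with the RDD action takeOrdered, which pulls only the top N elements to the driver instead of sorting the whole dataset by key. A minimal self-contained sketch, keeping the assumption from above that column 6 (index 5) holds the user ID:

import sys
from pyspark import SparkConf, SparkContext

conf = SparkConf()
sc = SparkContext(conf=conf)

# Count occurrences per user ID, then take the 5 largest counts directly
counts = sc.textFile(sys.argv[1]) \
    .map(lambda x: x.split("\t")) \
    .map(lambda x: (x[5], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .takeOrdered(5, key=lambda kv: -kv[1])  # top 5 user IDs by count, descending

for (user, count) in counts:
    print("%s: %i" % (user, count))

sc.stop()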


Reposted from blog.csdn.net/weixin_41993767/article/details/87641634