First, the data source
Each line holds a name and a score, separated by a single space:
xiaoliu 64
xiaoliu 69
xiaoliu 79
xiaoji 98
xiaoliu 100
xiaoji 99
xiaowang 27
xiaowang 69
xiaowang 64
xiaozhang 67
xiaozhang 38
xiaozhang 93
xiaozhang 29
xiaozhang 85
xiaoliu 19
xiaoliu 53
xiaoliu 93
xiaoji 90
xiaoji 85
xiaoji 73
xiaoji 64
xiaoji 39
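Before touching Spark, a plain-Python pass over the same file (assuming the G:/liu/topn.txt path used in the listings below) makes the expected per-name top 3 easy to verify by hand:

from collections import defaultdict

# Group scores by name, skipping malformed lines, just as the Spark jobs do.
scores = defaultdict(list)
with open("G:/liu/topn.txt") as fh:
    for line in fh:
        parts = line.split()
        if len(parts) == 2:
            scores[parts[0]].append(int(parts[1]))

# Expected top 3 per name, computed without Spark.
for name, vals in scores.items():
    print(name, sorted(vals, reverse=True)[:3])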
Second, programming it in PySpark
2.1 Method One (salted groupByKey)
The idea: prefix each key with a random salt, take a top 3 inside every (salt, key) group, then drop the salt and take the final top 3 per key. The extra round keeps a single hot key from being collected onto one task all at once.
from pyspark import SparkContext, SparkConf
import os
import random

# Point PySpark at a local Spark install and interpreter.
# Raw strings keep the Windows backslashes literal.
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = r"E:\ProgramFiles\spark-2.2.1-bin-2.6.0-cdh5.14.2"
    os.environ['PYSPARK_PYTHON'] = r"E:\ProgramFiles\Anaconda3\python.exe"

config = SparkConf() \
    .setMaster("local[*]") \
    .setAppName("TopNDemo")
sc = SparkContext.getOrCreate(config)

path = "G:/liu/topn.txt"
rdd = sc.textFile(path)

# Parse "name score" lines into (name, score) pairs, dropping malformed lines.
rdd1 = rdd \
    .map(lambda line: line.split(" ")) \
    .filter(lambda arr: len(arr) == 2) \
    .map(lambda arr: (arr[0], int(arr[1])))

def top3(key, values):
    # Pair the three largest scores with their key.
    return [(key, v) for v in sorted(values, reverse=True)[0:3]]

# Salt each key with a random 1-10 prefix, take a per-salt top 3,
# then drop the salt and take the overall top 3 per key.
result1 = rdd1 \
    .map(lambda t: ((random.randint(1, 10), t[0]), t[1])) \
    .groupByKey() \
    .flatMap(lambda t: top3(t[0][1], t[1])) \
    .groupByKey() \
    .flatMap(lambda t: top3(t[0], t[1]))
print(result1.collect())
Result:
[('xiaoliu', 100), ('xiaoliu', 93), ('xiaoliu', 79), ('xiaowang', 69), ('xiaowang', 64), ('xiaowang', 27), ('xiaozhang', 93), ('xiaozhang', 85), ('xiaozhang', 67), ('xiaoji', 99), ('xiaoji', 98), ('xiaoji', 90)]
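For data this small the salting round is optional. A minimal non-salted sketch, reusing the rdd1 and top3 defined above, produces the same pairs in a single grouping pass:

result_simple = rdd1 \
    .groupByKey() \
    .flatMap(lambda t: top3(t[0], t[1]))
print(result_simple.collect())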
2.2 Method Two (aggregateByKey)
2.2.1 Variant One
aggregateByKey keeps a per-key accumulator list that never grows past three elements: seqFunc folds one score into the list inside a partition, and combFunc merges the lists coming from different partitions.
from pyspark import SparkContext, SparkConf
import os

if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = r"E:\ProgramFiles\spark-2.2.1-bin-2.6.0-cdh5.14.2"
    os.environ['PYSPARK_PYTHON'] = r"E:\ProgramFiles\Anaconda3\python.exe"

config = SparkConf() \
    .setMaster("local[*]") \
    .setAppName("TopNDemo")
sc = SparkContext.getOrCreate(config)

path = "G:/liu/topn.txt"
rdd = sc.textFile(path)
rdd1 = rdd \
    .map(lambda line: line.split(" ")) \
    .filter(lambda arr: len(arr) == 2) \
    .map(lambda arr: (arr[0], int(arr[1])))

# Per-key accumulator: at most the three largest scores seen so far.
# PySpark copies zeroValue for every key, so mutating the list is safe.
zeroValue = []

def seqFunc(acc, score):
    # Fold one score into the accumulator within a partition.
    acc.append(score)
    return sorted(acc, reverse=True)[0:3]

def combFunc(acc1, acc2):
    # Merge two per-partition accumulators for the same key.
    acc1.extend(acc2)
    return sorted(acc1, reverse=True)[0:3]

result2 = rdd1.aggregateByKey(zeroValue, seqFunc, combFunc)
print(result2.collect())
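With the sample data above, each name should come back paired with a list of its three highest scores. The key order in the output is not deterministic, but the values are, so the expected result looks like:
[('xiaoliu', [100, 93, 79]), ('xiaowang', [69, 64, 27]), ('xiaozhang', [93, 85, 67]), ('xiaoji', [99, 98, 90])]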
2.2.2 Variant Two
Identical to Variant One, except that combFunc is written as a reduce: each element of one accumulator is folded into the other with the same seqFunc.
from pyspark import SparkContext, SparkConf
import os
from functools import reduce

if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = r"E:\ProgramFiles\spark-2.2.1-bin-2.6.0-cdh5.14.2"
    os.environ['PYSPARK_PYTHON'] = r"E:\ProgramFiles\Anaconda3\python.exe"

config = SparkConf() \
    .setMaster("local[*]") \
    .setAppName("TopNDemo")
sc = SparkContext.getOrCreate(config)

path = "G:/liu/topn.txt"
rdd = sc.textFile(path)
rdd1 = rdd \
    .map(lambda line: line.split(" ")) \
    .filter(lambda arr: len(arr) == 2) \
    .map(lambda arr: (arr[0], int(arr[1])))

zeroValue = []

def seqFunc(acc, score):
    # Fold one score into the accumulator, keeping only the top three.
    acc.append(score)
    return sorted(acc, reverse=True)[0:3]

# reduce(seqFunc, acc1, acc2) folds every element of acc1 into acc2,
# one score at a time, so no separate merge logic is needed.
combFunc = lambda acc1, acc2: reduce(seqFunc, acc1, acc2)

result3 = rdd1.aggregateByKey(zeroValue, seqFunc, combFunc)
print(result3.collect())
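A quick pure-Python check of the reduce-based merge, runnable without Spark and using the same seqFunc logic (the two sample accumulators here are made up for illustration):

from functools import reduce

def seqFunc(acc, score):
    acc.append(score)
    return sorted(acc, reverse=True)[0:3]

# Each step appends one score, re-sorts, and truncates to three.
print(reduce(seqFunc, [85, 73, 90], [99, 98, 64]))  # -> [99, 98, 90]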