版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u014281392/article/details/89032391
multiply
from pyspark import SparkConf, SparkContext

sc = SparkContext()

# Cumulative product of the RDD elements via fold (neutral element 1).
nums = sc.parallelize([1, 2, 3, 4, 5])
mult = nums.fold(1, lambda acc, n: acc * n)
print(mult)
120
# Cumulative sum via fold (neutral element 0).
accumulate = nums.fold(0, lambda acc, n: acc + n)
print(accumulate)
15
word count (flatMap / reduceByKey)
! cat ./data.txt
crazy crazy fox jumped
crazy for jumped
fox is fast
fox is smart
dog is smart
# Load data.txt as an RDD of lines (second argument: minPartitions=1).
lines = sc.textFile('data.txt', 1)
lines.collect()
['crazy crazy fox jumped',
'crazy for jumped',
'fox is fast ',
'fox is smart',
'dog is smart']
# Word count.
# flatMap flattens the per-line token lists into one RDD of words.
# FIX: use split() with no argument so runs of whitespace never yield
# empty-string tokens — with split(' ') the trailing space in
# 'fox is fast ' was counted as the "word" ''.
frequencies = lines.flatMap(lambda x: x.split()) \
                   .map(lambda x: (x, 1)) \
                   .reduceByKey(lambda x, y: x + y)
frequencies.collect()
[('crazy', 3),
('fox', 3),
('jumped', 2),
('for', 1),
('is', 3),
('fast', 1),
('', 1),
('smart', 2),
('dog', 1)]
# Number of distinct tokens in the word-count result.
frequencies.count()
9
# Flattened token list — note the empty string '' produced by split(' ')
# on the trailing space in 'fox is fast '.
lines.flatMap(lambda x : x.split(' ')).collect()
['crazy',
'crazy',
'fox',
'jumped',
'crazy',
'for',
'jumped',
'fox',
'is',
'fast',
'',
'fox',
'is',
'smart',
'dog',
'is',
'smart']
# Pair each token with an initial count of 1 (input to reduceByKey).
lines.flatMap(lambda x : x.split(' ')).map(lambda x : (x, 1)).collect()
[('crazy', 1),
('crazy', 1),
('fox', 1),
('jumped', 1),
('crazy', 1),
('for', 1),
('jumped', 1),
('fox', 1),
('is', 1),
('fast', 1),
('', 1),
('fox', 1),
('is', 1),
('smart', 1),
('dog', 1),
('is', 1),
('smart', 1)]
# The source RDD is unchanged — transformations build new RDDs.
lines.collect()
['crazy crazy fox jumped',
'crazy for jumped',
'fox is fast ',
'fox is smart',
'dog is smart']
# The complete word-count pipeline in one expression: tokenize, pair, reduce.
lines.flatMap(lambda x : x.split(' ')).map(lambda x : (x, 1)).reduceByKey(lambda x, y : x + y).collect()
[('crazy', 3),
('fox', 3),
('jumped', 2),
('for', 1),
('is', 3),
('fast', 1),
('', 1),
('smart', 2),
('dog', 1)]
sum
# Sum of the elements via fold (neutral element 0).
# FIX: the original used the non-PEP8 capitalized name `Sum`; `total`
# avoids shadowing the built-in `sum` without violating naming style.
nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8])
total = nums.fold(0, lambda x, y: x + y)
print(total)
36
union
# Concatenate two pair RDDs, then merge the values that share a key.
r1 = sc.parallelize([('k1', 1), ('k2', 2), ('k3', 3)])
r2 = sc.parallelize([('k1', 3), ('k2', 4), ('k4', 8)])

r3 = r1.union(r2)  # plain concatenation: duplicate keys are kept
print('r3 :', r3.collect())

r4 = r3.reduceByKey(lambda a, b: a + b)  # sum values per key
print('r4 :', r4.collect())
r3 : [('k1', 1), ('k2', 2), ('k3', 3), ('k1', 3), ('k2', 4), ('k4', 8)]
r4 : [('k1', 4), ('k3', 3), ('k4', 8), ('k2', 6)]
Word frequency
!cat './data.txt'
crazy crazy fox jumped over the fence
crazy fox jumped
the fence is high of fox
crazy fox is smart
fox jumped very high
# Reload data.txt — the file contents changed above (see the !cat output).
lines2 = sc.textFile('./data.txt')
print(lines2.collect())
['crazy crazy fox jumped over the fence', 'crazy fox jumped', 'the fence is high of fox', 'crazy fox is smart', 'fox jumped very high']
# Tokenize each line of the newly loaded file.
# BUG FIX: the original mapped over `lines` — the stale RDD of the OLD
# data.txt — instead of `lines2` loaded just above; the printed output
# corresponds to the new file, so `lines2` is clearly what was intended.
lines2 = lines2.map(lambda x : x.split(' '))
print('lines2 is :')
print(lines2.collect())
lines2 is :
[['crazy', 'crazy', 'fox', 'jumped', 'over', 'the', 'fence'], ['crazy', 'fox', 'jumped'], ['the', 'fence', 'is', 'high', 'of', 'fox']]
# Flatten the list-of-token-lists into one Python list via fold,
# then re-parallelize it as an RDD of individual words.
bigrams_list = lines2.fold([], lambda x, y: x + y)
# BUG FIX: the original passed `bigarms_list` (typo) to parallelize,
# which raises NameError — the variable defined above is `bigrams_list`.
bigrams_list = sc.parallelize(bigrams_list)
print('bigrams list :')
print(bigrams_list.collect())
bigrams list :
['crazy', 'crazy', 'fox', 'jumped', 'over', 'the', 'fence', 'crazy', 'fox', 'jumped', 'the', 'fence', 'is', 'high', 'of', 'fox']
# Count each word, then convert counts to relative frequencies.
word_counts = bigrams_list.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
# NOTE(review): count() here is the number of DISTINCT words, so the
# values below are frequencies relative to vocabulary size, not to the
# total token count — confirm that is the intended normalization.
n_words = word_counts.count()
# FIX: convert the numerator to float BEFORE dividing — the original
# wrapped float() around the division, which under Python 2 integer
# division would already have truncated every ratio to 0.
word_frequency = word_counts.map(lambda x: (x[0], float(x[1]) / n_words))
print('word frequency')
print(word_frequency.collect())
word frequency
[('crazy', 0.3333333333333333), ('of', 0.1111111111111111), ('jumped', 0.2222222222222222), ('high', 0.1111111111111111), ('fence', 0.2222222222222222), ('fox', 0.3333333333333333), ('over', 0.1111111111111111), ('is', 0.1111111111111111), ('the', 0.2222222222222222)]