版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u014281392/article/details/89032391
multiply
from pyspark import SparkConf, SparkContext

sc = SparkContext()

# Cumulative product of the RDD elements via fold (neutral element 1).
nums = sc.parallelize([1, 2, 3, 4, 5])
mult = nums.fold(1, lambda acc, n: acc * n)
print(mult)
120
# Cumulative sum via fold (neutral element 0).
accumulate = nums.fold(0, lambda acc, n: acc + n)
print(accumulate)
15
word count (flatMap / reduceByKey)
! cat ./data.txt
crazy crazy fox jumped
crazy for jumped
fox is fast
fox is smart
dog is smart
# Load data.txt as an RDD of lines (second argument: minPartitions=1).
lines = sc.textFile('data.txt', 1)
lines.collect()
['crazy crazy fox jumped',
'crazy for jumped',
'fox is fast ',
'fox is smart',
'dog is smart']
# Word count.
# flatMap flattens the per-line token lists into one RDD of words.
# FIX: use split() with no argument so runs of whitespace never yield
# empty-string tokens — with split(' ') the trailing space in
# 'fox is fast ' was counted as the "word" ''.
frequencies = lines.flatMap(lambda x: x.split()) \
                   .map(lambda x: (x, 1)) \
                   .reduceByKey(lambda x, y: x + y)
frequencies.collect()
[('crazy', 3),
('fox', 3),
('jumped', 2),
('for', 1),
('is', 3),
('fast', 1),
('', 1),
('smart', 2),
('dog', 1)]
# Number of distinct tokens in the word-count result.
frequencies.count()
9
# Flattened token list — note the empty string '' produced by split(' ')
# on the trailing space in 'fox is fast '.
lines.flatMap(lambda x : x.split(' ')).collect()
['crazy',
'crazy',
'fox',
'jumped',
'crazy',
'for',
'jumped',
'fox',
'is',
'fast',
'',
'fox',
'is',
'smart',
'dog',
'is',
'smart']
# Pair each token with an initial count of 1 (input to reduceByKey).
lines.flatMap(lambda x : x.split(' ')).map(lambda x : (x, 1)).collect()
[('crazy', 1),
('crazy', 1),
('fox', 1),
('jumped', 1),
('crazy', 1),
('for', 1),
('jumped', 1),
('fox', 1),
('is', 1),
('fast', 1),
('', 1),
('fox', 1),
('is', 1),
('smart', 1),
('dog', 1),
('is', 1),
('smart', 1)]
# The source RDD is unchanged — transformations build new RDDs.
lines.collect()
['crazy crazy fox jumped',
'crazy for jumped',
'fox is fast ',
'fox is smart',
'dog is smart']
# The complete word-count pipeline in one expression: tokenize, pair, reduce.
lines.flatMap(lambda x : x.split(' ')).map(lambda x : (x, 1)).reduceByKey(lambda x, y : x + y).collect()
[('crazy', 3),
('fox', 3),
('jumped', 2),
('for', 1),
('is', 3),
('fast', 1),
('', 1),
('smart', 2),
('dog', 1)]
sum
# Sum of the elements via fold (neutral element 0).
# FIX: the original used the non-PEP8 capitalized name `Sum`; `total`
# avoids shadowing the built-in `sum` without violating naming style.
nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8])
total = nums.fold(0, lambda x, y: x + y)
print(total)
36
union
# Concatenate two pair RDDs, then merge the values that share a key.
r1 = sc.parallelize([('k1', 1), ('k2', 2), ('k3', 3)])
r2 = sc.parallelize([('k1', 3), ('k2', 4), ('k4', 8)])

r3 = r1.union(r2)  # plain concatenation: duplicate keys are kept
print('r3 :', r3.collect())

r4 = r3.reduceByKey(lambda a, b: a + b)  # sum values per key
print('r4 :', r4.collect())
r3 : [('k1', 1), ('k2', 2), ('k3', 3), ('k1', 3), ('k2', 4), ('k4', 8)]
r4 : [('k1', 4), ('k3', 3), ('k4', 8), ('k2', 6)]
Word frequency
!cat './data.txt'
crazy crazy fox jumped over the fence
crazy fox jumped
the fence is high of fox
crazy fox is smart
fox jumped very high
# Reload data.txt — the file contents changed above (see the !cat output).
lines2 = sc.textFile('./data.txt')
print(lines2.collect())
['crazy crazy fox jumped over the fence', 'crazy fox jumped', 'the fence is high of fox', 'crazy fox is smart', 'fox jumped very high']
# Tokenize each line of the newly loaded file.
# BUG FIX: the original mapped over `lines` — the stale RDD of the OLD
# data.txt — instead of `lines2` loaded just above; the printed output
# corresponds to the new file, so `lines2` is clearly what was intended.
lines2 = lines2.map(lambda x : x.split(' '))
print('lines2 is :')
print(lines2.collect())
lines2 is :
[['crazy', 'crazy', 'fox', 'jumped', 'over', 'the', 'fence'], ['crazy', 'fox', 'jumped'], ['the', 'fence', 'is', 'high', 'of', 'fox']]
# Flatten the list-of-token-lists into one Python list via fold,
# then re-parallelize it as an RDD of individual words.
bigrams_list = lines2.fold([], lambda x, y: x + y)
# BUG FIX: the original passed `bigarms_list` (typo) to parallelize,
# which raises NameError — the variable defined above is `bigrams_list`.
bigrams_list = sc.parallelize(bigrams_list)
print('bigrams list :')
print(bigrams_list.collect())
bigrams list :
['crazy', 'crazy', 'fox', 'jumped', 'over', 'the', 'fence', 'crazy', 'fox', 'jumped', 'the', 'fence', 'is', 'high', 'of', 'fox']
# Count each word, then convert counts to relative frequencies.
word_counts = bigrams_list.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
# NOTE(review): count() here is the number of DISTINCT words, so the
# values below are frequencies relative to vocabulary size, not to the
# total token count — confirm that is the intended normalization.
n_words = word_counts.count()
# FIX: convert the numerator to float BEFORE dividing — the original
# wrapped float() around the division, which under Python 2 integer
# division would already have truncated every ratio to 0.
word_frequency = word_counts.map(lambda x: (x[0], float(x[1]) / n_words))
print('word frequency')
print(word_frequency.collect())
word frequency
[('crazy', 0.3333333333333333), ('of', 0.1111111111111111), ('jumped', 0.2222222222222222), ('high', 0.1111111111111111), ('fence', 0.2222222222222222), ('fox', 0.3333333333333333), ('over', 0.1111111111111111), ('is', 0.1111111111111111), ('the', 0.2222222222222222)]