Spark Study Notes (1): pySpark RDD Programming

from pyspark import SparkContext,SparkConf

# Spark context setup; note that real projects would not hardcode the master and app name
sc = SparkContext("local", "context")
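
# For reference, a minimal sketch of configuring through SparkConf rather than
# hardcoded strings (the app name and master below are illustrative values;
# only one SparkContext can be active at a time, so the constructor call is
# left commented out):
conf = SparkConf().setAppName("rdd-notes").setMaster("local[2]")
# sc = SparkContext(conf=conf)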


'''
RDD operations fall into two groups:
1. transformations: map, filter, groupByKey, ...
2. actions: reduce, count, collect, ...

Key properties:
	1) transformations are lazy: nothing actually happens until an action is called;
	2) an action triggers the computation;
	3) an action returns values to the driver or writes data to external storage;
'''
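
'''
A quick sketch to see the laziness in action: the map below does no work
until collect() is called.
'''
def laziness_demo():
    rdd = sc.parallelize([1, 2, 3])
    mapped = rdd.map(lambda x: x * 10)  # transformation: no job has run yet
    print(mapped.collect())             # action: triggers the actual computation
# Output: [10, 20, 30]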


'''
1. map(func): applies func to every element of the RDD and returns a new RDD, e.g. word => (word, 1)
'''
def word_map():
    a = sc.parallelize(["Hadoop","HDFS","Mapreduce","Spark","Mapreduce","Spark"])
    b = a.map(lambda x:(x,1))
    print(b.collect())
# Output: [('Hadoop', 1), ('HDFS', 1), ('Mapreduce', 1), ('Spark', 1), ('Mapreduce', 1), ('Spark', 1)]


'''
2. filter(func): returns a new RDD containing only the elements for which func returns true
'''
def my_filter():
    data = [1,2,3,4,5]
    rdd = sc.parallelize(data)
    # transformations can be chained
    new_rdd = rdd.map(lambda x:x**2).filter(lambda x:x>10)
    print(new_rdd.collect())
# Output: [16, 25]

'''
3. flatMap(func): similar to map, but each input item can be mapped to 0 or more output items (so in pySpark, func should return an iterable rather than a single item)
'''
def my_flatMap():
    data = ["hello spark", "hello world", "hello world"]
    rdd = sc.parallelize(data)
    mapRdd = rdd.map(lambda x:x.split(" "))
    flatMapRdd = rdd.flatMap(lambda line:line.split(" "))
    print(mapRdd.collect()) #[['hello', 'spark'], ['hello', 'world'], ['hello', 'world']]
    print(flatMapRdd.collect()) #['hello', 'spark', 'hello', 'world', 'hello', 'world']


'''
4. groupByKey: groups the values for each key, i.e. (K, V) pairs => (K, Iterable<V>) pairs
'''
def word_groupbyKey():
    data = ["hello spark", "hello world", "hello world"]
    rdd = sc.parallelize(data)
    mapRdd = rdd.flatMap(lambda line:line.split(" ")).map(lambda x:(x,1))
    groupbyRdd = mapRdd.groupByKey()
    print(groupbyRdd.collect())
    # Output (the object addresses will differ between runs):
    # [('hello', <pyspark.resultiterable.ResultIterable object at 0x1055b70b8>),
    # ('spark', <pyspark.resultiterable.ResultIterable object at 0x1055b70f0>),
    # ('world', <pyspark.resultiterable.ResultIterable object at 0x1055b7208>)]
    print(groupbyRdd.map(lambda x:{x[0]:list(x[1])}).collect())
    #[{'hello': [1, 1, 1]}, {'spark': [1]}, {'world': [1, 1]}]
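
    # A tidier sketch of the same idea: mapValues transforms only the grouped
    # values and leaves the keys alone, so the result stays as (key, list) pairs.
    print(groupbyRdd.mapValues(list).collect())
    # [('hello', [1, 1, 1]), ('spark', [1]), ('world', [1, 1])]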

'''
5. reduceByKey(func): merges the values for each key using the given function
(note: it aggregates per key, but does not guarantee any ordering of the output)
'''
def word_reducebyKey():
    data = ["hello spark", "hello world", "hello world"]
    rdd = sc.parallelize(data)
    mapRdd = rdd.flatMap(lambda line:line.split(" ")).map(lambda x:(x,1))
    reducebyRdd = mapRdd.reduceByKey(lambda a,b:a+b)
    print(reducebyRdd.collect())
    #[('hello', 3), ('spark', 1), ('world', 2)]

'''
6. sortByKey: sorts a (K, V) RDD by key, ascending by default
'''
def word_sortbyKey():
    data = ["hello spark", "hello world", "hello world"]
    rdd = sc.parallelize(data)
    mapRdd = rdd.flatMap(lambda line:line.split(" ")).map(lambda x:(x,1))
    reducebyRdd = mapRdd.reduceByKey(lambda a,b:a+b)
    print(reducebyRdd.sortByKey().collect())
    # [('hello', 3), ('spark', 1), ('world', 2)]

    # Top-K trick: swap key and value, sort descending by the new key, then swap back
    print(reducebyRdd.map(lambda x: (x[1], x[0])).sortByKey(False).map(lambda x: (x[1], x[0])).collect())
    #[('hello', 3), ('world', 2), ('spark', 1)]
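
    # Alternative sketch: sortBy sorts by an arbitrary key function, avoiding
    # the double swap, and takeOrdered ships only the top k rows to the driver
    # (k=2 below is an illustrative value).
    print(reducebyRdd.sortBy(lambda x: x[1], ascending=False).collect())
    # [('hello', 3), ('world', 2), ('spark', 1)]
    print(reducebyRdd.takeOrdered(2, key=lambda x: -x[1]))
    # [('hello', 3), ('world', 2)]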


'''
7. union: merges two RDDs into one (duplicates are kept)
'''
def my_union():
    a = sc.parallelize([1,2,3])
    b = sc.parallelize([3,4,5])
    print(a.union(b).collect())
    #[1, 2, 3, 3, 4, 5]

'''
8. distinct: returns a new dataset that contains the distinct elements of the source dataset
'''
def my_distinct():
    a = sc.parallelize([1,2,3])
    b = sc.parallelize([3,4,5])
    print(a.union(b).distinct().collect())
    # e.g. [1, 2, 3, 4, 5]; distinct does not guarantee element order

'''
9. join: when called on datasets of type (K, V) and (K, W),
returns a dataset of (K, (V, W)) pairs with all pairs of elements for each key. 
Outer joins are supported through leftOuterJoin, rightOuterJoin, and fullOuterJoin.
'''
def my_join():
    a = sc.parallelize([("A", "a1"), ("C", "c1"), ("D", "d1"), ("F", "f1"), ("F", "f2")])
    b = sc.parallelize([("A", "a2"), ("C", "c2"), ("C", "c3"), ("E", "e1")])

    print(a.fullOuterJoin(b).collect())
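
    # fullOuterJoin keeps keys from either side, padding the missing side with
    # None, e.g. ('D', ('d1', None)) and ('E', (None, 'e1')).
    # For contrast, a plain inner join keeps only keys present in both (sketch):
    print(a.join(b).collect())
    # e.g. [('A', ('a1', 'a2')), ('C', ('c1', 'c2')), ('C', ('c1', 'c3'))]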

'''
10. Actions: reduce, foreach
'''
def my_action():
    data = [1,2,3,4,5,6,7,8,9,10]
    rdd = sc.parallelize(data)
    print(rdd.collect())
    # reduce: folds the elements together with the given function
    print(rdd.reduce(lambda x,y:x+y))  # 55
    # foreach: runs on the executors and returns None, so there is nothing to print
    rdd.foreach(lambda x:print(x))
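
    # A few more common actions (sketch; expected outputs shown as comments):
    print(rdd.count())   # 10
    print(rdd.first())   # 1
    print(rdd.take(3))   # [1, 2, 3]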


Reposted from blog.csdn.net/weixin_41993767/article/details/87391287