Python Spark-RDD 常用用法

Spark RDD 学习

导入pyspark

import pyspark

初始化SparkContext

sc = pyspark.SparkContext(master="local[*]",appName="test1")

RDD Transform算子

将一个列表构建成一个rdd

rdd1 = sc.parallelize([1,2,3,4,5])

收集并显示rdd中的数据

rdd1.collect()
[1, 2, 3, 4, 5]

map方法

具名函数方式

def adder(x):
    """Return the value of *x* increased by one."""
    return 1 + x

rdd1.map(adder).collect()
[2, 3, 4, 5, 6]

匿名函数方式

rdd1.map(lambda x:x+1).collect()
[2, 3, 4, 5, 6]

filter用法

print("rdd1 rdd1中小于2的数:", rdd1.filter(lambda x:x<2).collect())
print("rdd1 中的偶数为:", rdd1.filter(lambda x:x%2==0).collect())
rdd1 rdd1中小于2的数: [1]
rdd1 中的偶数为: [2, 4]

distinct用法

rdd1.distinct().collect()
[1, 2, 3, 4, 5]

randomSplit用法

rdd_r1,rdd_r2 = rdd1.randomSplit([0.5,0.5])
print("随机划分集合1:",rdd_r1.collect())
print("随机划分集合2:",rdd_r2.collect())
随机划分集合1: [2, 4, 5]
随机划分集合2: [1, 3]

groupBy用法

group_rdd1 = rdd1.groupBy(lambda x:"偶数:" if x%2==0 else "奇数:").collect()
print("group_rdd1 type:", type(group_rdd1))
print("group_rdd1 content type:",type(group_rdd1[0]))
print("group name:",group_rdd1[0][0],"group member:",list(group_rdd1[0][1]))
print("group name:",group_rdd1[1][0],"group member:",list(group_rdd1[1][1]))
group_rdd1 type: <class 'list'>
group_rdd1 content type: <class 'tuple'>
group name: 奇数: group member: [1, 3, 5]
group name: 偶数: group member: [2, 4]

多个RDD的用法

intrdd1 = sc.parallelize((1,2,3,4))
intrdd2 = sc.parallelize((3,3,4))
intrdd3 = sc.parallelize((5,1))

并集计算

print(intrdd1.union(intrdd2).collect())
print(intrdd2.union(intrdd3).collect())
print(intrdd1.union(intrdd2).union(intrdd3).collect())
[1, 2, 3, 4, 3, 3, 4]
[3, 3, 4, 5, 1]
[1, 2, 3, 4, 3, 3, 4, 5, 1]

交集计算

print(intrdd1.intersection(intrdd2).collect())
print(intrdd1.intersection(intrdd3).collect())
[4, 3]
[1]

差集计算

print(intrdd1.subtract(intrdd2).collect())
print(intrdd2.subtract(intrdd1).collect())
[2, 1]
[]

笛卡尔积运算

print(intrdd1.cartesian(intrdd2).collect())
print(intrdd1.cartesian(intrdd2).cartesian(intrdd3).collect())
[(1, 3), (1, 3), (1, 4), (2, 3), (2, 3), (2, 4), (3, 3), (3, 3), (3, 4), (4, 3), (4, 3), (4, 4)]
[((1, 3), 5), ((1, 3), 1), ((1, 3), 5), ((1, 3), 1), ((1, 4), 5), ((1, 4), 1), ((2, 3), 5), ((2, 3), 1), ((2, 3), 5), ((2, 3), 1), ((2, 4), 5), ((2, 4), 1), ((3, 3), 5), ((3, 3), 1), ((3, 3), 5), ((3, 3), 1), ((3, 4), 5), ((3, 4), 1), ((4, 3), 5), ((4, 3), 1), ((4, 3), 5), ((4, 3), 1), ((4, 4), 5), ((4, 4), 1)]

RDD Action 算子

first用法

print(rdd1.first())
1

take用法

print(rdd1.take(4))
print(rdd1.take(2))
[1, 2, 3, 4]
[1, 2]

takeOrdered 用法

print(rdd1.takeOrdered(3))
print(rdd1.takeOrdered(num=2,key=lambda x:-x))
[1, 2, 3]
[5, 4]

(全统计) stats 用法

print(rdd1.stats())
# 总数,均值,标准差,最大值,最小值
(count: 5, mean: 3.0, stdev: 1.4142135623730951, max: 5.0, min: 1.0)

(总数) count 用法

print(rdd1.count())
5

(均值) mean 用法

print(rdd1.mean())
3.0

(标准差) stdev 用法

print(rdd1.stdev())
1.4142135623730951

(最大值) max 用法

print(rdd1.max())
5

(最小值) min 用法

print(rdd1.min())
1

(求和) sum 用法

print(rdd1.sum())
15

key-value RDD Transform 算子

创建key-value RDD

kvRdd1 = sc.parallelize([("one",1),("two",2),("three",3),("four",4),("five",5)])
kvRdd1.collect()
[('one', 1), ('two', 2), ('three', 3), ('four', 4), ('five', 5)]

keys 用法

print(kvRdd1.keys())
print(kvRdd1.keys().collect())
['one', 'two', 'three', 'four', 'five']

values 用法

print(kvRdd1.values())
print(kvRdd1.values().collect())
PythonRDD[301] at RDD at PythonRDD.scala:53
[1, 2, 3, 4, 5]

filter 用法

# 筛选key
print(kvRdd1.filter(lambda kv:kv[0] == "two").collect())
# 筛选value
print(kvRdd1.filter(lambda kv:kv[1] <= 3).collect())
[('two', 2)]
[('one', 1), ('two', 2), ('three', 3)]

mapValues 用法

# 每个元素加1
print(kvRdd1.mapValues(lambda x:x+1).collect())
# 每个元素取二次幂
print(kvRdd1.mapValues(lambda x:x**2).collect())
[('one', 2), ('two', 3), ('three', 4), ('four', 5), ('five', 6)]
[('one', 1), ('two', 4), ('three', 9), ('four', 16), ('five', 25)]

sortByKey 用法

# 键如果是字符串,是按照字典序来进行排列
# 正序: 小 -> 大
print(kvRdd1.sortByKey(ascending=True).collect())
# 逆序: 大 -> 小
print(kvRdd1.sortByKey(ascending=False).collect())
[('five', 5), ('four', 4), ('one', 1), ('three', 3), ('two', 2)]
[('two', 2), ('three', 3), ('one', 1), ('four', 4), ('five', 5)]

sortBy 用法

# 对其键进行排序
print(kvRdd1.sortBy(lambda kv:kv[0]).collect())
# 对其值进行排序 并且是逆序
print(kvRdd1.sortBy(lambda kv:kv[1],False).collect())
[('five', 5), ('four', 4), ('one', 1), ('three', 3), ('two', 2)]
[('five', 5), ('four', 4), ('three', 3), ('two', 2), ('one', 1)]

reduceByKey 用法

print(kvRdd1.union(kvRdd1).collect())
# 把相同的键聚合起来并相加
print(kvRdd1.union(kvRdd1).reduceByKey(lambda v1,v2:v1 + v2).collect())
[('one', 1), ('two', 2), ('three', 3), ('four', 4), ('five', 5), ('one', 1), ('two', 2), ('three', 3), ('four', 4), ('five', 5)]
[('two', 4), ('three', 6), ('four', 8), ('one', 2), ('five', 10)]

reduce 用法

# 将数据集中所有的值求和
kvRdd1.reduce(lambda kv1,kv2:("sum",kv1[1]+kv2[1]))
('sum', 15)

多个key-value RDD的运算

kv_student1_rdd = sc.parallelize([("name","nick"),("age",18,),("gender","F")])
kv_student2_rdd = sc.parallelize([("name","nick"),("age",18,)])
kv_student3_rdd = sc.parallelize([("name","nick"),("gender","F")])

join用法

# 当其中一个rdd中没有该键时,结果会抛弃该键对应的键值对
kv_student1_rdd.join(kv_student2_rdd).collect()
[('name', ('nick', 'nick')), ('age', (18, 18))]

leftOuterJoin 用法

# 以左边的rdd 为主,右边没有左边的键时会以None代替
print(kv_student1_rdd.leftOuterJoin(kv_student3_rdd).collect())
# 当左边没有右边的键时 会抛弃此键
print(kv_student3_rdd.leftOuterJoin(kv_student1_rdd).collect())
[('name', ('nick', 'nick')), ('gender', ('F', 'F')), ('age', (18, None))]
[('name', ('nick', 'nick')), ('gender', ('F', 'F'))]

rightOuterJoin 用法

# 以右边的rdd 为主,左边没有右边的键时会以None代替
print(kv_student2_rdd.rightOuterJoin(kv_student1_rdd).collect())
# 当右边没有左边的键时 会抛弃此键
print(kv_student1_rdd.rightOuterJoin(kv_student2_rdd).collect())
[('name', ('nick', 'nick')), ('gender', (None, 'F')), ('age', (18, 18))]
[('name', ('nick', 'nick')), ('age', (18, 18))]

subtractByKey 用法

# 删除两个数据集中相同键 的键值对
print(kv_student1_rdd.subtractByKey(kv_student2_rdd).collect())
[('gender', 'F')]

key-value RDD Action算子

first 用法

print(kvRdd1.first())
('one', 1)

take 用法

# 取头部 n 个
print(kvRdd1.take(1))
print(kvRdd1.take(2))
[('one', 1)]
[('one', 1), ('two', 2)]

统计key-value RDD 中每个key的个数

print(sc.parallelize([("int",1),("int",3),("int",4),("float",2.0),("float",3.0)]).countByKey())
defaultdict(<class 'int'>, {'int': 3, 'float': 2})

collectAsMap 用法

# 转换成python中的字典
print(kvRdd1.collectAsMap())
{'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5}

lookup 用法

# 根据传入键找值
print(kvRdd1.lookup("one"))
print(kvRdd1.lookup("four"))
print(kvRdd1.lookup("five"))
[1]
[4]
[5]

释放SparkContext

sc.stop()
发布了27 篇原创文章 · 获赞 62 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/qq_42359956/article/details/105315546