Learning Spark RDDs
Import pyspark
import pyspark
Initialize the SparkContext
# Create a local SparkContext using all available cores ("local[*]").
sc = pyspark.SparkContext(master="local[*]",appName="test1")
RDD transformation operators
Build an RDD from a list
rdd1 = sc.parallelize([1,2,3,4,5])
Collect and display the RDD's data
rdd1.collect()
[1, 2, 3, 4, 5]
The map method
Using a named function
def adder(x):
    """Return the given number incremented by one."""
    return x + 1
rdd1.map(adder).collect()
[2, 3, 4, 5, 6]
Using an anonymous (lambda) function
rdd1.map(lambda x:x+1).collect()
[2, 3, 4, 5, 6]
Using filter
# filter keeps only the elements for which the predicate returns True.
print("rdd1 rdd1中小于2的数:", rdd1.filter(lambda x:x<2).collect())
print("rdd1 中的偶数为:", rdd1.filter(lambda x:x%2==0).collect())
rdd1 rdd1中小于2的数: [1]
rdd1 中的偶数为: [2, 4]
Using distinct
rdd1.distinct().collect()
[1, 2, 3, 4, 5]
Using randomSplit
# randomSplit partitions rdd1 into two RDDs with roughly equal weights;
# the actual split is random, so the recorded output below may vary.
rdd_r1,rdd_r2 = rdd1.randomSplit([0.5,0.5])
print("随机划分集合1:",rdd_r1.collect())
print("随机划分集合2:",rdd_r2.collect())
随机划分集合1: [2, 4, 5]
随机划分集合2: [1, 3]
Using groupBy
# Group elements by parity; the lambda returns the group label (a string).
group_rdd1 = rdd1.groupBy(lambda x:"偶数:" if x%2==0 else "奇数:").collect()
print("group_rdd1 type:", type(group_rdd1))
print("group_rdd1 content type:",type(group_rdd1[0]))
# Each group is a (label, iterable-of-members) tuple; materialize with list().
print("group name:",group_rdd1[0][0],"group member:",list(group_rdd1[0][1]))
print("group name:",group_rdd1[1][0],"group member:",list(group_rdd1[1][1]))
group_rdd1 type: <class 'list'>
group_rdd1 content type: <class 'tuple'>
group name: 奇数: group member: [1, 3, 5]
group name: 偶数: group member: [2, 4]
Operations on multiple RDDs
# Three small integer RDDs used to demonstrate set-style operations below.
intrdd1 = sc.parallelize((1,2,3,4))
intrdd2 = sc.parallelize((3,3,4))
intrdd3 = sc.parallelize((5,1))
Computing the union
print(intrdd1.union(intrdd2).collect())
print(intrdd2.union(intrdd3).collect())
print(intrdd1.union(intrdd2).union(intrdd3).collect())
[1, 2, 3, 4, 3, 3, 4]
[3, 3, 4, 5, 1]
[1, 2, 3, 4, 3, 3, 4, 5, 1]
Computing the intersection
print(intrdd1.intersection(intrdd2).collect())
print(intrdd1.intersection(intrdd3).collect())
[4, 3]
[1]
Computing the set difference
print(intrdd1.subtract(intrdd2).collect())
print(intrdd2.subtract(intrdd1).collect())
[2, 1]
[]
Computing the Cartesian product
print(intrdd1.cartesian(intrdd2).collect())
print(intrdd1.cartesian(intrdd2).cartesian(intrdd3).collect())
[(1, 3), (1, 3), (1, 4), (2, 3), (2, 3), (2, 4), (3, 3), (3, 3), (3, 4), (4, 3), (4, 3), (4, 4)]
[((1, 3), 5), ((1, 3), 1), ((1, 3), 5), ((1, 3), 1), ((1, 4), 5), ((1, 4), 1), ((2, 3), 5), ((2, 3), 1), ((2, 3), 5), ((2, 3), 1), ((2, 4), 5), ((2, 4), 1), ((3, 3), 5), ((3, 3), 1), ((3, 3), 5), ((3, 3), 1), ((3, 4), 5), ((3, 4), 1), ((4, 3), 5), ((4, 3), 1), ((4, 3), 5), ((4, 3), 1), ((4, 4), 5), ((4, 4), 1)]
RDD action operators
Using first
print(rdd1.first())
1
Using take
print(rdd1.take(4))
print(rdd1.take(2))
[1, 2, 3, 4]
[1, 2]
Using takeOrdered
print(rdd1.takeOrdered(3))
print(rdd1.takeOrdered(num=2,key=lambda x:-x))
[1, 2, 3]
[5, 4]
Using stats (summary statistics)
print(rdd1.stats())
(count: 5, mean: 3.0, stdev: 1.4142135623730951, max: 5.0, min: 1.0)
Using count
print(rdd1.count())
5
Using mean
print(rdd1.mean())
3.0
Using stdev (standard deviation)
print(rdd1.stdev())
1.4142135623730951
Using max
print(rdd1.max())
5
Using min
print(rdd1.min())
1
Using sum
print(rdd1.sum())
15
Key-value RDD transformation operators
Creating a key-value RDD
kvRdd1 = sc.parallelize([("one",1),("two",2),("three",3),("four",4),("five",5)])
kvRdd1.collect()
[('one', 1), ('two', 2), ('three', 3), ('four', 4), ('five', 5)]
Using keys
print(kvRdd1.keys())
print(kvRdd1.keys().collect())
['one', 'two', 'three', 'four', 'five']
Using values
print(kvRdd1.values())
print(kvRdd1.values().collect())
PythonRDD[301] at RDD at PythonRDD.scala:53
[1, 2, 3, 4, 5]
Using filter
print(kvRdd1.filter(lambda kv:kv[0] == "two").collect())
print(kvRdd1.filter(lambda kv:kv[1] <= 3).collect())
[('two', 2)]
[('one', 1), ('two', 2), ('three', 3)]
Using mapValues
print(kvRdd1.mapValues(lambda x:x+1).collect())
print(kvRdd1.mapValues(lambda x:x**2).collect())
[('one', 2), ('two', 3), ('three', 4), ('four', 5), ('five', 6)]
[('one', 1), ('two', 4), ('three', 9), ('four', 16), ('five', 25)]
Using sortByKey
print(kvRdd1.sortByKey(ascending=True).collect())
print(kvRdd1.sortByKey(ascending=False).collect())
[('five', 5), ('four', 4), ('one', 1), ('three', 3), ('two', 2)]
[('two', 2), ('three', 3), ('one', 1), ('four', 4), ('five', 5)]
Using sortBy
print(kvRdd1.sortBy(lambda kv:kv[0]).collect())
print(kvRdd1.sortBy(lambda kv:kv[1],False).collect())
[('five', 5), ('four', 4), ('one', 1), ('three', 3), ('two', 2)]
[('five', 5), ('four', 4), ('three', 3), ('two', 2), ('one', 1)]
Using reduceByKey
print(kvRdd1.union(kvRdd1).collect())
print(kvRdd1.union(kvRdd1).reduceByKey(lambda v1,v2:v1 + v2).collect())
[('one', 1), ('two', 2), ('three', 3), ('four', 4), ('five', 5), ('one', 1), ('two', 2), ('three', 3), ('four', 4), ('five', 5)]
[('two', 4), ('three', 6), ('four', 8), ('one', 2), ('five', 10)]
Using reduce
kvRdd1.reduce(lambda kv1,kv2:("sum",kv1[1]+kv2[1]))
('sum', 15)
Operations on multiple key-value RDDs
# Key-value RDDs describing a student record; used below to demonstrate joins.
kv_student1_rdd = sc.parallelize([("name","nick"),("age",18,),("gender","F")])
kv_student2_rdd = sc.parallelize([("name","nick"),("age",18,)])
kv_student3_rdd = sc.parallelize([("name","nick"),("gender","F")])
Using join
kv_student1_rdd.join(kv_student2_rdd).collect()
[('name', ('nick', 'nick')), ('age', (18, 18))]
Using leftOuterJoin
print(kv_student1_rdd.leftOuterJoin(kv_student3_rdd).collect())
print(kv_student3_rdd.leftOuterJoin(kv_student1_rdd).collect())
[('name', ('nick', 'nick')), ('gender', ('F', 'F')), ('age', (18, None))]
[('name', ('nick', 'nick')), ('gender', ('F', 'F'))]
Using rightOuterJoin
print(kv_student2_rdd.rightOuterJoin(kv_student1_rdd).collect())
print(kv_student1_rdd.rightOuterJoin(kv_student2_rdd).collect())
[('name', ('nick', 'nick')), ('gender', (None, 'F')), ('age', (18, 18))]
[('name', ('nick', 'nick')), ('age', (18, 18))]
Using subtractByKey
print(kv_student1_rdd.subtractByKey(kv_student2_rdd).collect())
[('gender', 'F')]
Key-value RDD action operators
Using first
print(kvRdd1.first())
('one', 1)
Using take
print(kvRdd1.take(1))
print(kvRdd1.take(2))
[('one', 1)]
[('one', 1), ('two', 2)]
Counting the number of values per key with countByKey
print(sc.parallelize([("int",1),("int",3),("int",4),("float",2.0),("float",3.0)]).countByKey())
defaultdict(<class 'int'>, {'int': 3, 'float': 2})
Using collectAsMap
print(kvRdd1.collectAsMap())
{'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5}
Using lookup
print(kvRdd1.lookup("one"))
print(kvRdd1.lookup("four"))
print(kvRdd1.lookup("five"))
[1]
[4]
[5]
Releasing the SparkContext
sc.stop()