mapPartitions
mapPartitions 以分区为单位进行操作:传入的函数对每个分区只调用一次,参数是该分区所有行的迭代器。如果要实现像 map 一样的逐行处理,需要在函数内部遍历分区中的每一行,并返回一个可迭代对象。
def f(partitionData):
    """Template for a function passed to ``mapPartitions``.

    The function receives an iterable over the elements of one partition
    and must produce an iterable of results. Written as a generator, it
    emits each result as soon as it is computed.

    Args:
        partitionData: Iterable over the elements of a single partition.

    Yields:
        One output element per input element. In this template the
        element is passed through unchanged.
    """
    for element in partitionData:
        # Replace this line with the real per-element transformation.
        yield element
# Pass f to mapPartitions on df's underlying RDD, so f handles the data one
# partition at a time. The resulting RDD is not assigned to anything here.
df.rdd.mapPartitions(f)
+---------------+-----+
|           name|bonus|
+---------------+-----+
|    James,Smith|300.0|
|      Anna,Rose|410.0|
|Robert,Williams|620.0|
+---------------+-----+
def reformat(partitionData):
    """Build ``[name, bonus]`` records for every row of one partition.

    Implemented as a generator so rows are streamed rather than collected
    into an intermediate list first.

    Args:
        partitionData: Iterable of row objects exposing ``firstname``,
            ``lastname`` and ``salary`` attributes.

    Yields:
        A two-item list ``[name, bonus]`` where ``name`` is
        ``"<firstname>,<lastname>"`` and ``bonus`` is 10% of ``salary``.
    """
    for row in partitionData:
        full_name = ",".join((row.firstname, row.lastname))
        # Multiply before dividing, as the original did, so integer
        # salaries give the same float results (e.g. 3000 -> 300.0).
        bonus = row.salary * 10 / 100
        yield [full_name, bonus]
# Run reformat over each partition of df's RDD, then convert the result with
# toDF, naming the two columns "name" and "bonus".
df2 = (
    df.rdd
    .mapPartitions(reformat)
    .toDF(["name", "bonus"])
)