Solving PySpark data skew with repartition & mapPartitions

In a project that computes 180 days of browsing data for categories, stores, and SKUs, serious data skew appeared during the calculation, because user activity differs greatly across categories, stores, and SKUs.

For background on why data skew happens and how to tell whether a dataset is skewed, see the earlier post:

Spark handles data skew problem_Just Jump's blog-CSDN blog_spark data skew problem

 

Several approaches were considered and tested; in the end, repartition + mapPartitions solved the problem.

Examples of using PySpark mapPartitions():

PySpark mapPartitions() Examples - Spark By {Examples}

# Official code example from Spark By Examples
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [('James','Smith','M',3000),
  ('Anna','Rose','F',4100),
  ('Robert','Williams','M',6200), 
]

columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()

#Example 1 mapPartitions()
def reformat(partitionData):
    for row in partitionData:
        yield [row.firstname+","+row.lastname,row.salary*10/100]
df2=df.rdd.mapPartitions(reformat).toDF(["name","bonus"])
df2.show()

#Example 2 mapPartitions()
def reformat2(partitionData):
  updatedData = []
  for row in partitionData:
    name=row.firstname+","+row.lastname
    bonus=row.salary*10/100
    updatedData.append([name,bonus])
  return iter(updatedData)

df2=df.rdd.mapPartitions(reformat2).toDF(["name","bonus"])
df2.show()

A practical example from my own project:

The task is to compute deduplicated counts of the users who browsed each category level in the mall, over quarterly and half-year windows. Some categories are popular, with many visitors and high browsing frequency, while others serve relatively low demand and see few visitors. Grouping directly by category therefore makes the user-deduplication statistics prone to data skew, as described above. The skew was handled by combining repartition + mapPartitions (+ udf) as follows:

(1) Call repartition() on the skew-prone column together with the grouping columns that the subsequent calculation will use. At the same time, raise both of the following settings, the shuffle partition count and the default parallelism, so that the larger number of partitions actually takes effect:

--conf spark.sql.shuffle.partitions=10000 \
--conf spark.default.parallelism=10000 \
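
These two settings can also be applied when the SparkSession is built. A minimal sketch follows; the input path is illustrative, and the column names are borrowed from the project code further below:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Equivalent to the spark-submit flags above: more shuffle partitions and
# higher default parallelism let the repartition below fan out widely.
spark = (SparkSession.builder
         .appName("skew-repartition-demo")
         .config("spark.sql.shuffle.partitions", "10000")
         .config("spark.default.parallelism", "10000")
         .getOrCreate())

# Repartition on the grouping column (first_cate_cd) together with user_id,
# so the rows of one hot category are spread across many partitions.
action_info = spark.read.parquet("/path/to/view_logs")  # illustrative path
action_info = action_info.repartition(10000, col("user_id"), col("first_cate_cd"))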

(2) Use mapPartitions to pre-aggregate within each partition first, and then run the final aggregation on those intermediate results.
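
A minimal sketch of this two-stage pattern, continuing from the repartitioned action_info in the sketch above (since identical (user_id, first_cate_cd) pairs hash to the same partition, per-partition deduplication followed by a sum gives the exact global distinct-user count):

from pyspark.sql.functions import sum as spark_sum

def partial_distinct_count(rows):
    # Stage 1: within one partition, deduplicate (user_id, first_cate_cd)
    # pairs and count distinct users per first-level category.
    seen, counts = set(), {}
    for row in rows:
        key = (row.user_id, row.first_cate_cd)
        if key not in seen:
            seen.add(key)
            counts[row.first_cate_cd] = counts.get(row.first_cate_cd, 0) + 1
    for cate, cnt in counts.items():
        yield [cate, cnt]

# Stage 2: because the repartition sends identical pairs to the same
# partition, summing the partial counts gives the global distinct count.
partial = action_info.rdd.mapPartitions(partial_distinct_count) \
    .toDF(["first_cate_cd", "partial_user_cnt"])
result = partial.groupBy("first_cate_cd") \
    .agg(spark_sum("partial_user_cnt").alias("user_cnt_first_cate_view"))
result.show()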

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2022/12/22 4:41 PM
# @Author: TobyGao

import numpy as np
from utils import getNdays, getNMonth, filterUdf, getTaskSavePath,\
    keep_window, get_output_table
from pyspark.sql.functions import lit, col, expr, sum, count
from pyspark.sql.types import *
import math


class ViewUsersAnalysis:
    def __init__(self, spark, task_name, calc_date):
        self.spark = spark
        self.task_name = task_name
        self.calc_date = calc_date

        self.keep_window = keep_window
        self.__output_table = get_output_table(task_name)
        self.__view_data_schema = StructType([
            StructField("user_id", StringType(), False),
            StructField("first_cate_cd", StringType(), False),
            StructField("first_cate_name", StringType(), False),
            StructField("second_cate_cd", StringType(), False),
            StructField("second_cate_name", StringType(), False),
            StructField("third_cate_cd", StringType(), False),
            StructField("third_cate_name", StringType(), False),
            StructField("shop_id", StringType(), False),
            StructField("shop_name", StringType(), False),
            StructField("main_sku_id", StringType(), False),
            StructField("sku_name", StringType(), False),
            StructField("request_date", StringType(), False)])

        self.spark.udf.register("filterUdf", filterUdf)
    
    def __load_view_data(self, window_type):
        start_date = getNdays(self.calc_date, -window_type)
        action_info = self.spark.createDataFrame(
            self.spark.sparkContext.emptyRDD(), self.__view_data_schema)

        data_path_list = []
        for n in range(math.ceil(window_type / 30) + 1):
            data_path_list.append(getTaskSavePath(self.task_name,getNMonth(self.calc_date,
                                                                -n) + "-*"))

        for file_path in data_path_list:
            print("input file paths:", file_path)
            action_info = action_info.unionAll(self.spark.read.format("parquet").load(file_path))

        return action_info


    @staticmethod
    def __partition_agg_func_cate1(row_list):
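        """Deduplicate (user_id, first_cate_cd) pairs within one partition and
        emit a partial distinct-user count per first-level category."""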
        cate_code_dict = dict()
        res_cate_dict = dict()
        res_dict = dict()
        result = []
        for row in row_list:
            user_id = row.user_id
            first_cate_cd = row.first_cate_cd
            first_cate_name = row.first_cate_name

            if first_cate_cd not in cate_code_dict.keys():
                cate_code_dict[first_cate_cd] = first_cate_name

            cate_key = (user_id, first_cate_cd)
            if cate_key not in res_cate_dict.keys():
                res_cate_dict[cate_key] = 1

        for k in res_cate_dict.keys():
            agg_key = k[1]
            if agg_key in res_dict.keys():
                res_dict[agg_key] += 1
            else:
                res_dict[agg_key] = 1
        res_cate_dict.clear()

        for k in res_dict.keys():
            result.append([k, cate_code_dict[k], res_dict[k]])
        res_dict.clear()
        cate_code_dict.clear()
        return iter(result)

    
    def get_count_results_in_long_window(self, action_info, window_type):
        action_info.cache()
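        # Repartition on (user_id, first_cate_cd) so identical pairs land in
        # the same partition, pre-aggregate there, then sum the partial counts.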
        view_cate1 = action_info.repartition(10000, col("user_id"), col("first_cate_cd")) \
            .rdd.mapPartitions(self.__partition_agg_func_cate1) \
            .toDF(["first_cate_cd", "first_cate_name", "user_cnt_first_cate_view"]) \
            .groupBy("first_cate_cd", "first_cate_name") \
            .agg(sum("user_cnt_first_cate_view").alias("user_cnt_first_cate_view"))

        view_cate1.cache()
        view_cate1.show(20)
        

        view_shop = action_info.repartition(10000, col("user_id"),
                                                col("shop_id")) \
            .dropDuplicates().groupBy("first_cate_cd",
                                      "first_cate_name",
                                      "second_cate_cd",
                                      "second_cate_name",
                                      "third_cate_cd",
                                      "third_cate_name",
                                      "shop_id",
                                      "shop_name") \
            .agg(count("user_id").alias("user_cnt_shop_view")) 

        view_shop.show()
            

    def get_count_results_in_windows(self):
        for window_type in [60, 90]:
            # self.spark.sql(
            #     """alter table {output_table} drop partition (dt='{calc_date}', window_type='{window_type}')""".format(
            #         output_table=self.__output_table, calc_date=self.calc_date, window_type=window_type))
            action_info = self.__load_view_data(window_type).drop("request_date").withColumn(
                "window_type", lit(window_type))
            self.get_count_results_in_long_window(action_info, window_type)
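
For completeness, a hypothetical driver snippet showing how this class might be invoked; the appName, task_name, and calc_date values are illustrative placeholders, not from the original project:

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = (SparkSession.builder
             .appName("view_users_analysis")  # illustrative app name
             .config("spark.sql.shuffle.partitions", "10000")
             .config("spark.default.parallelism", "10000")
             .getOrCreate())

    # task_name and calc_date are placeholders for the real job parameters.
    job = ViewUsersAnalysis(spark, task_name="view_users", calc_date="2022-12-22")
    job.get_count_results_in_windows()
    spark.stop()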

The limitation of this approach is memory consumption: each partition temporarily holds its intermediate keys, so it is best to keep the number of keys cached per partition from getting too large. Within that constraint, it handles the data skew problem very well.

Original post: blog.csdn.net/eylier/article/details/128600477