Adding a New Column to a PySpark DataFrame

Several ways to add a new column to a Spark DataFrame.

from pyspark.sql import SparkSession
from pyspark.sql import Row
spark = SparkSession.builder.getOrCreate()
  • Preparing test data
test_data = [
            Row(name='China', Population=1439323776, area=960.1),
            Row(name='India', Population=1380004385, area=328.8),
            Row(name='United States', Population=331002651, area=936.4)]
df = spark.createDataFrame(test_data)
df.show()
'''
+----------+-----+-------------+
|Population| area|         name|
+----------+-----+-------------+
|1439323776|960.1|        China|
|1380004385|328.8|        India|
| 331002651|936.4|United States|
+----------+-----+-------------+
'''
  1. Using Spark built-in functions
    import pyspark.sql.functions as F
    # F.log(base, col): base-10 logarithm of the Population column
    df_log = df.withColumn("PopulationLog", F.log(10.0, "Population"))
    df_log.show()
    '''
    +----------+-----+-------------+-----------------+
    |Population| area|         name|    PopulationLog|
    +----------+-----+-------------+-----------------+
    |1439323776|960.1|        China|9.158158499505339|
    |1380004385|328.8|        India|9.139880466385495|
    | 331002651|936.4|United States|8.519831472053848|
    +----------+-----+-------------+-----------------+
    '''

    # sanity check with Python's math module; matches the first row above
    import math
    math.log10(1439323776)  # 9.158158499505339
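
    Built-in functions also compose with ordinary column arithmetic, so many derived columns need no UDF at all. A minimal sketch (the PopulationInBillions column is my own illustrative example, not from the original post):

    # plain column arithmetic yields a new column, no UDF required
    df_billions = df.withColumn("PopulationInBillions", F.col("Population") / 1e9)
    df_billions.show()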

  2. Using Spark UDFs
    a. UDFs
    from pyspark.sql.types import StringType, StructType, StructField, LongType, DoubleType
    def get_level(value):
        if value > 1400000000:
            return 'high'
        elif value > 1300000000:
            return 'medium'
        else:
            return 'low'
    udf_level_func = F.udf(get_level, StringType())
    
    df_level = df.withColumn("PopulationLevel", udf_level_func("Population"))
    df_level.show()
    '''
    +----------+-----+-------------+---------------+
    |Population| area|         name|PopulationLevel|
    +----------+-----+-------------+---------------+
    |1439323776|960.1|        China|           high|
    |1380004385|328.8|        India|         medium|
    | 331002651|936.4|United States|            low|
    +----------+-----+-------------+---------------+
    '''
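
    F.udf also works as a decorator, which skips the separate wrapping step. A minimal equivalent sketch (level_udf is a hypothetical name of my own):

    # decorator form of the same UDF
    @F.udf(returnType=StringType())
    def level_udf(value):
        return get_level(value)

    df.withColumn("PopulationLevel", level_udf("Population")).show()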

    b. Pandas UDFs

    out_schema = StructType([
                        StructField('Population', LongType(), True),
                        StructField('area', DoubleType(), True),
                        StructField('name', StringType(), True),
                        StructField('density', DoubleType(), True)
                            ])
    
    @F.pandas_udf(out_schema, F.PandasUDFType.GROUPED_MAP)
    def population_density(pdf):
        # pdf is a pandas DataFrame holding one group
        pdf['density'] = pdf.Population / pdf.area
        return pdf
    
    df_density = df.groupby("name").apply(population_density)
    df_density.printSchema()
    '''
    root
     |-- Population: long (nullable = true)
     |-- area: double (nullable = true)
     |-- name: string (nullable = true)
     |-- density: double (nullable = true)
    '''
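
    Note that the GROUPED_MAP pandas_udf style above is deprecated since Spark 3.0; groupby(...).applyInPandas is its replacement. A minimal sketch of the same computation, assuming Spark 3.0+ (population_density_fn is a hypothetical name):

    # Spark 3.0+ replacement: a plain function plus an output schema
    def population_density_fn(pdf):
        pdf['density'] = pdf.Population / pdf.area
        return pdf

    df_density2 = df.groupby("name").applyInPandas(population_density_fn, schema=out_schema)
    df_density2.printSchema()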
  3. Using Spark SQL
    df.createOrReplaceTempView('t_population_table')  # registerTempTable is deprecated
    newDF = spark.sql('select *, 2*Population as DoublePopulation from t_population_table')
    newDF.show()
    '''
    +----------+-----+-------------+----------------+
    |Population| area|         name|DoublePopulation|
    +----------+-----+-------------+----------------+
    |1439323776|960.1|        China|      2878647552|
    |1380004385|328.8|        India|      2760008770|
    | 331002651|936.4|United States|       662005302|
    +----------+-----+-------------+----------------+
    '''
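
    The same expression also works without registering a view, via selectExpr or F.expr. A minimal equivalent sketch:

    # SQL expressions without a temp view
    newDF2 = df.selectExpr('*', '2*Population as DoublePopulation')
    # or equivalently:
    newDF2 = df.withColumn('DoublePopulation', F.expr('2*Population'))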
  4. Using Spark RDDs
    def rowwise_function(row):
        # convert the Row to a dict
        row_dict = row.asDict()
        # set the new column's value
        row_dict['NameReverse'] = row_dict['name'][::-1]
        # convert the dict back to a Row
        newrow = Row(**row_dict)
        return newrow
    
    # convert the DataFrame to an RDD
    df_rdd = df.rdd
    # apply the function to every Row
    df_name = df_rdd.map(rowwise_function)
    # convert the RDD back to a DataFrame
    df_name_reverse = spark.createDataFrame(df_name)
    df_name_reverse.show()
    """
    +-------------+----------+-------------+
    |  NameReverse|Population|         name|
    +-------------+----------+-------------+
    |        anihC|1439323776|        China|
    |        aidnI|1380004385|        India|
    |setatS detinU| 331002651|United States|
    +-------------+----------+-------------+
    """

Reposted from www.cnblogs.com/similarface/p/12974756.html