data processing using pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, DoubleType, IntegerType
from pyspark.sql.functions import pandas_udf, PandasUDFType

read.csv()

# Create the SparkSession object.
# The application name 'data_processing' is shown in the Spark web UI.
spark = SparkSession.builder.appName('data_processing').getOrCreate()
# Read the CSV file.
# inferSchema: infer each column's data type automatically from the input data
# header: use the first line as the column names
df = spark.read.csv('./Data/sample_data.csv', inferSchema=True, header=True)

columns

# Column names of the DataFrame, as a Python list
df.columns

['ratings', 'age', 'experience', 'family', 'mobile']

len(df.columns)  # number of columns

df.count()      # number of rows

# (row count, column count) -- the equivalent of a pandas DataFrame's shape
print((df.count(), len(df.columns)))

(33, 5)

printSchema()

# Print each column's data type, similar to pandas df.info()
df.printSchema()

root
 |-- ratings: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: double (nullable = true)
 |-- family: integer (nullable = true)
 |-- mobile: string (nullable = true)

show()

# Display the first 5 rows, similar to pandas df.head(5)
df.show(5)

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      3| 32|       9.0|     3|   Vivo|
|      3| 27|      13.0|     3|  Apple|
|      4| 22|       2.5|     0|Samsung|
|      4| 37|      16.5|     4|  Apple|
|      5| 27|       9.0|     1|     MI|
+-------+---+----------+------+-------+
only showing top 5 rows

select()

# Show only the 'age' and 'mobile' columns (first 5 rows)
df.select('age', 'mobile').show(5)

+---+-------+
|age| mobile|
+---+-------+
| 32|   Vivo|
| 27|  Apple|
| 22|Samsung|
| 37|  Apple|
| 27|     MI|
+---+-------+
only showing top 5 rows

describe()

# describe(): per-column summary statistics (count, mean, stddev, min, max)
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------+
|summary|           ratings|               age|        experience|            family|mobile|
+-------+------------------+------------------+------------------+------------------+------+
|  count|                33|                33|                33|                33|    33|
|   mean|3.5757575757575757|30.484848484848484|10.303030303030303|1.8181818181818181|  null|
| stddev|1.1188806636071336|  6.18527087180309| 6.770731351213326|1.8448330794164254|  null|
|    min|                 1|                22|               2.5|                 0| Apple|
|    max|                 5|                42|              23.0|                 5|  Vivo|
+-------+------------------+------------------+------------------+------------------+------+

withColumn() : add new column or change type

# Add a new column; withColumn returns a new DataFrame and does not modify df itself
df.withColumn('age_add_10', (df['age']+10)).show(10, False)    # second argument False is `truncate`: values are printed in full, left-aligned

+-------+---+----------+------+-------+----------+
|ratings|age|experience|family|mobile |age_add_10|
+-------+---+----------+------+-------+----------+
|3      |32 |9.0       |3     |Vivo   |42        |
|3      |27 |13.0      |3     |Apple  |37        |
|4      |22 |2.5       |0     |Samsung|32        |
|4      |37 |16.5      |4     |Apple  |47        |
|5      |27 |9.0       |1     |MI     |37        |
|4      |27 |9.0       |0     |Oppo   |37        |
|5      |37 |23.0      |5     |Vivo   |47        |
|5      |37 |23.0      |5     |Samsung|47        |
|3      |22 |2.5       |0     |Apple  |32        |
|3      |27 |6.0       |0     |MI     |37        |
+-------+---+----------+------+-------+----------+
only showing top 10 rows

# Cast a column to a different data type (here 'age' to double) in a new column
df.withColumn('age_double', df['age'].cast(DoubleType())).show(10)

+-------+---+----------+------+-------+----------+
|ratings|age|experience|family| mobile|age_double|
+-------+---+----------+------+-------+----------+
|      3| 32|       9.0|     3|   Vivo|      32.0|
|      3| 27|      13.0|     3|  Apple|      27.0|
|      4| 22|       2.5|     0|Samsung|      22.0|
|      4| 37|      16.5|     4|  Apple|      37.0|
|      5| 27|       9.0|     1|     MI|      27.0|
|      4| 27|       9.0|     0|   Oppo|      27.0|
|      5| 37|      23.0|     5|   Vivo|      37.0|
|      5| 37|      23.0|     5|Samsung|      37.0|
|      3| 22|       2.5|     0|  Apple|      22.0|
|      3| 27|       6.0|     0|     MI|      27.0|
+-------+---+----------+------+-------+----------+
only showing top 10 rows

filter()

# Keep only the users whose phone is a Vivo
df.filter(df['mobile'] == 'Vivo').show()

+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
|      3| 32|       9.0|     3|  Vivo|
|      5| 37|      23.0|     5|  Vivo|
|      4| 37|       6.0|     0|  Vivo|
|      5| 37|      13.0|     1|  Vivo|
|      4| 37|       6.0|     0|  Vivo|
+-------+---+----------+------+------+

# Filter to Vivo users, then show only their 'age' and 'ratings' columns
df.filter(df['mobile'] == 'Vivo').select('age', 'ratings').show()

+---+-------+
|age|ratings|
+---+-------+
| 32|      3|
| 37|      5|
| 37|      4|
| 37|      5|
| 37|      4|
+---+-------+

# Chained filters: a row must satisfy both conditions to be kept
df.filter(df['mobile'] == 'Vivo').filter(df['age'] <= 32).show()

+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
|      3| 32|       9.0|     3|  Vivo|
+-------+---+----------+------+------+

# Both conditions in a single filter, combined with & (each condition wrapped in parentheses)
df.filter((df['mobile'] =='Vivo')&(df['age'] <= 32)).show()

+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
|      3| 32|       9.0|     3|  Vivo|
+-------+---+----------+------+------+

distinct()

# Distinct values of the 'mobile' column
df.select('mobile').distinct().show()

+-------+
| mobile|
+-------+
|     MI|
|   Oppo|
|Samsung|
|   Vivo|
|  Apple|
+-------+

# Number of distinct values in the 'mobile' column
df.select('mobile').distinct().count()

groupBy()

max()
min()
mean()
sum()
count()
agg()

# Number of users per phone brand, largest count first
df.groupBy('mobile').count().orderBy('count', ascending=False).show(5, False)

+-------+-----+
|mobile |count|
+-------+-----+
|MI     |8    |
|Oppo   |7    |
|Apple  |7    |
|Samsung|6    |
|Vivo   |5    |
+-------+-----+

# Mean of every numeric column, per phone brand
df.groupBy('mobile').mean().show()

+-------+------------------+------------------+------------------+------------------+
| mobile|      avg(ratings)|          avg(age)|   avg(experience)|       avg(family)|
+-------+------------------+------------------+------------------+------------------+
|     MI|               3.5|            30.125|           10.1875|             1.375|
|   Oppo| 2.857142857142857|28.428571428571427|10.357142857142858|1.4285714285714286|
|Samsung| 4.166666666666667|28.666666666666668| 8.666666666666666|1.8333333333333333|
|   Vivo|               4.2|              36.0|              11.4|               1.8|
|  Apple|3.4285714285714284|30.571428571428573|              11.0|2.7142857142857144|
+-------+------------------+------------------+------------------+------------------+

agg()

# Mean age of the users of each phone brand
df.groupBy('mobile').agg({'age': 'mean'}).show()

+-------+------------------+
| mobile|          avg(age)|
+-------+------------------+
|     MI|            30.125|
|   Oppo|28.428571428571427|
|Samsung|28.666666666666668|
|   Vivo|              36.0|
|  Apple|30.571428571428573|
+-------+------------------+

UDF : convert a python function to udf

def price_label(brand):
    """Return the price-tier label for a phone brand.

    Tiers:
        'Samsung', 'Apple' -> 'High Price'
        'MI'               -> 'Mid Price'
        anything else      -> 'Low Price'

    Args:
        brand: Brand name; compared case-sensitively.

    Returns:
        One of 'High Price', 'Mid Price' or 'Low Price'.
    """
    # Fixed: the brand was misspelled 'Samsumg', so Samsung fell through
    # to 'Low Price' instead of the documented 'High Price'.
    if brand in ('Samsung', 'Apple'):
        return 'High Price'
    if brand == 'MI':
        return 'Mid Price'
    return 'Low Price'

# Add a price_label column
# Convert the Python function to a UDF
brand_udf = udf(price_label)
df.withColumn('price_label', brand_udf(df['mobile'])).show(10)

+-------+---+----------+------+-------+-----------+
|ratings|age|experience|family| mobile|price_label|
+-------+---+----------+------+-------+-----------+
|      3| 32|       9.0|     3|   Vivo|  Low Price|
|      3| 27|      13.0|     3|  Apple| High Price|
|      4| 22|       2.5|     0|Samsung|  Low Price|
|      4| 37|      16.5|     4|  Apple| High Price|
|      5| 27|       9.0|     1|     MI|  Mid Price|
|      4| 27|       9.0|     0|   Oppo|  Low Price|
|      5| 37|      23.0|     5|   Vivo|  Low Price|
|      5| 37|      23.0|     5|Samsung|  Low Price|
|      3| 22|       2.5|     0|  Apple| High Price|
|      3| 27|       6.0|     0|     MI|  Mid Price|
+-------+---+----------+------+-------+-----------+
only showing top 10 rows

匿名函数 lambda

convert a lambda function to udf

# Wrap a lambda as a string-returning UDF: 'young' when age < 30, otherwise 'old'
age_udf = udf(lambda age: 'young' if age < 30 else 'old', StringType())
df.withColumn('age_label', age_udf(df['age'])).show(10)

+-------+---+----------+------+-------+---------+
|ratings|age|experience|family| mobile|age_label|
+-------+---+----------+------+-------+---------+
|      3| 32|       9.0|     3|   Vivo|      old|
|      3| 27|      13.0|     3|  Apple|    young|
|      4| 22|       2.5|     0|Samsung|    young|
|      4| 37|      16.5|     4|  Apple|      old|
|      5| 27|       9.0|     1|     MI|    young|
|      4| 27|       9.0|     0|   Oppo|    young|
|      5| 37|      23.0|     5|   Vivo|      old|
|      5| 37|      23.0|     5|Samsung|      old|
|      3| 22|       2.5|     0|  Apple|    young|
|      3| 27|       6.0|     0|     MI|    young|
+-------+---+----------+------+-------+---------+
only showing top 10 rows

pandas_udf, PandasUDFType

Pandas UDFs are much more powerful in terms of speed and processing time.

def remain_years(age):
    """Return the number of years left until age 100, i.e. ``100 - age``."""
    return 100 - age

left_years_udf = pandas_udf(remain_years, IntegerType())     # requires PyArrow >= 0.8; equivalent to udf(remain_years, IntegerType())
df.withColumn('left_years', left_years_udf(df['age'])).show()

+-------+---+----------+------+-------+----------+
|ratings|age|experience|family| mobile|left_years|
+-------+---+----------+------+-------+----------+
|      3| 32|       9.0|     3|   Vivo|        68|
|      3| 27|      13.0|     3|  Apple|        73|
|      4| 22|       2.5|     0|Samsung|        78|
|      4| 37|      16.5|     4|  Apple|        63|
|      5| 27|       9.0|     1|     MI|        73|
|      4| 27|       9.0|     0|   Oppo|        73|
|      5| 37|      23.0|     5|   Vivo|        63|
|      5| 37|      23.0|     5|Samsung|        63|
|      3| 22|       2.5|     0|  Apple|        78|
|      3| 27|       6.0|     0|     MI|        73|
+-------+---+----------+------+-------+----------+
only showing top 10 rows

def prod(rating, exp):
    """Return the product of ``rating`` and ``exp``."""
    return rating * exp

# Wrap prod as a pandas UDF returning doubles
prod_udf = pandas_udf(prod, DoubleType())
# Plain-UDF alternative:
#prod_udf = udf(prod, DoubleType())
df.withColumn("product", prod_udf(df['ratings'],df['experience'])).show(10,False)

+-------+---+----------+------+-------+-------+
|ratings|age|experience|family|mobile |product|
+-------+---+----------+------+-------+-------+
|3      |32 |9.0       |3     |Vivo   |27.0   |
|3      |27 |13.0      |3     |Apple  |39.0   |
|4      |22 |2.5       |0     |Samsung|10.0   |
|4      |37 |16.5      |4     |Apple  |66.0   |
|5      |27 |9.0       |1     |MI     |45.0   |
|4      |27 |9.0       |0     |Oppo   |36.0   |
|5      |37 |23.0      |5     |Vivo   |115.0  |
|5      |37 |23.0      |5     |Samsung|115.0  |
|3      |22 |2.5       |0     |Apple  |7.5    |
|3      |27 |6.0       |0     |MI     |18.0   |
+-------+---+----------+------+-------+-------+
only showing top 10 rows

dropDuplicates()

去掉重复的样本

# Drop duplicate rows; the result is bound to new_df
new_df = df.dropDuplicates()
new_df.count()  # row count after de-duplication

new_df.show(5)

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      3| 32|       9.0|     3|   Vivo|
|      4| 22|       2.5|     0|Samsung|
|      5| 27|       6.0|     0|     MI|
|      4| 22|       6.0|     1|   Oppo|
|      3| 27|       6.0|     0|     MI|
+-------+---+----------+------+-------+
only showing top 5 rows

drop()

# Show the first 5 rows without the 'mobile' column
new_df.drop('mobile').show(5)

+-------+---+----------+------+
|ratings|age|experience|family|
+-------+---+----------+------+
|      3| 32|       9.0|     3|
|      4| 22|       2.5|     0|
|      5| 27|       6.0|     0|
|      4| 22|       6.0|     1|
|      3| 27|       6.0|     0|
+-------+---+----------+------+
only showing top 5 rows

Saving data

如果当前目录不存在，会创建目录
parquet格式，如果目录已存在，可能会报错

new_df.coalesce(1).show(5)     # coalesce(numPartitions) returns a DataFrame with the specified number of partitions

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      3| 32|       9.0|     3|   Vivo|
|      4| 22|       2.5|     0|Samsung|
|      5| 27|       6.0|     0|     MI|
|      4| 22|       6.0|     1|   Oppo|
|      3| 27|       6.0|     0|     MI|
+-------+---+----------+------+-------+
only showing top 5 rows

# CSV with a header row, coalesced to a single partition first
new_df.coalesce(1).write.format('csv').option('header', 'true').save('./Data/New_sample_data')

# CSV with a header row, without coalescing
# NOTE(review): this path is './Date/...' while the other outputs use './Data/...' -- possibly a typo; confirm before changing
new_df.write.format('csv').option('header', 'true').save('./Date/New_sample_data')

# Parquet, coalesced to a single partition first
new_df.coalesce(1).write.format('parquet').save('./Data/New_data')

# Display new_df (with no arguments, show() prints at most 20 rows)
new_df.show()

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      3| 32|       9.0|     3|   Vivo|
|      4| 22|       2.5|     0|Samsung|
|      5| 27|       6.0|     0|     MI|
|      4| 22|       6.0|     1|   Oppo|
|      3| 27|       6.0|     0|     MI|
|      2| 32|      16.5|     2|   Oppo|
|      4| 27|       9.0|     0|   Oppo|
|      2| 27|       9.0|     2|Samsung|
|      3| 37|      16.5|     5|  Apple|
|      4| 27|       6.0|     1|  Apple|
|      5| 37|      23.0|     5|   Vivo|
|      2| 27|       6.0|     2|   Oppo|
|      4| 37|       6.0|     0|   Vivo|
|      5| 37|      23.0|     5|Samsung|
|      4| 37|       9.0|     2|Samsung|
|      5| 37|      13.0|     1|   Vivo|
|      5| 27|       2.5|     0|     MI|
|      3| 42|      23.0|     5|     MI|
|      5| 22|       2.5|     0|Samsung|
|      1| 37|      23.0|     5|     MI|
+-------+---+----------+------+-------+
only showing top 20 rows

PySpark: DataProcessing(csv file)

data processing using pyspark

read.csv()

columns

printSchema()

show()

select()

describe()

withColumn() : add new column or change type

filter()

distinct()

groupBy()

agg()

UDF : convert a python function to udf

匿名函数 lambda

pandas_udf, PandasUDFType

dropDuplicates()

drop()

Saving data

猜你喜欢