版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u014281392/article/details/89408033
data processing using pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, DoubleType, IntegerType
from pyspark.sql.functions import pandas_udf, PandasUDFType
read.csv()
# Create the SparkSession object.
# The application name 'data_processing' is what shows up in the Spark web UI.
spark = SparkSession.builder.appName('data_processing').getOrCreate()
# Read the CSV file.
# inferSchema: infer each column's data type from the input data
# header: whether to use the first line as the column names
df = spark.read.csv('./Data/sample_data.csv', inferSchema=True, header=True)
columns
df.columns
['ratings', 'age', 'experience', 'family', 'mobile']
len(df.columns)
5
df.count() # 行数
33
# pyspark's dataframe's shape
print((df.count(), len(df.columns)))
(33, 5)
printSchema()
# 数据类型类似于pandas df.info()
df.printSchema()
root
|-- ratings: integer (nullable = true)
|-- age: integer (nullable = true)
|-- experience: double (nullable = true)
|-- family: integer (nullable = true)
|-- mobile: string (nullable = true)
show()
# 输出前5行,类似于pandas ,df.head(5)
df.show(5)
+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
| 3| 32| 9.0| 3| Vivo|
| 3| 27| 13.0| 3| Apple|
| 4| 22| 2.5| 0|Samsung|
| 4| 37| 16.5| 4| Apple|
| 5| 27| 9.0| 1| MI|
+-------+---+----------+------+-------+
only showing top 5 rows
select()
# only show age and mobile
df.select('age', 'mobile').show(5)
+---+-------+
|age| mobile|
+---+-------+
| 32| Vivo|
| 27| Apple|
| 22|Samsung|
| 37| Apple|
| 27| MI|
+---+-------+
only showing top 5 rows
describe()
# describe(): summary statistics (count, mean, stddev, min, max) for each column
df.describe().show()
+-------+------------------+------------------+------------------+------------------+------+
|summary| ratings| age| experience| family|mobile|
+-------+------------------+------------------+------------------+------------------+------+
| count| 33| 33| 33| 33| 33|
| mean|3.5757575757575757|30.484848484848484|10.303030303030303|1.8181818181818181| null|
| stddev|1.1188806636071336| 6.18527087180309| 6.770731351213326|1.8448330794164254| null|
| min| 1| 22| 2.5| 0| Apple|
| max| 5| 42| 23.0| 5| Vivo|
+-------+------------------+------------------+------------------+------------------+------+
withColumn() : add new column or change type
# Add a new column; withColumn returns a new DataFrame and does not modify df itself.
df.withColumn('age_add_10', (df['age']+10)).show(10, False) # second argument is truncate=False: cell values are not truncated (and are printed left-aligned)
+-------+---+----------+------+-------+----------+
|ratings|age|experience|family|mobile |age_add_10|
+-------+---+----------+------+-------+----------+
|3 |32 |9.0 |3 |Vivo |42 |
|3 |27 |13.0 |3 |Apple |37 |
|4 |22 |2.5 |0 |Samsung|32 |
|4 |37 |16.5 |4 |Apple |47 |
|5 |27 |9.0 |1 |MI |37 |
|4 |27 |9.0 |0 |Oppo |37 |
|5 |37 |23.0 |5 |Vivo |47 |
|5 |37 |23.0 |5 |Samsung|47 |
|3 |22 |2.5 |0 |Apple |32 |
|3 |27 |6.0 |0 |MI |37 |
+-------+---+----------+------+-------+----------+
only showing top 10 rows
# dataframe 类型转换
df.withColumn('age_double', df['age'].cast(DoubleType())).show(10)
+-------+---+----------+------+-------+----------+
|ratings|age|experience|family| mobile|age_double|
+-------+---+----------+------+-------+----------+
| 3| 32| 9.0| 3| Vivo| 32.0|
| 3| 27| 13.0| 3| Apple| 27.0|
| 4| 22| 2.5| 0|Samsung| 22.0|
| 4| 37| 16.5| 4| Apple| 37.0|
| 5| 27| 9.0| 1| MI| 27.0|
| 4| 27| 9.0| 0| Oppo| 27.0|
| 5| 37| 23.0| 5| Vivo| 37.0|
| 5| 37| 23.0| 5|Samsung| 37.0|
| 3| 22| 2.5| 0| Apple| 22.0|
| 3| 27| 6.0| 0| MI| 27.0|
+-------+---+----------+------+-------+----------+
only showing top 10 rows
filter()
# 过滤是vivo手机的用户
df.filter(df['mobile'] == 'Vivo').show()
+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
| 3| 32| 9.0| 3| Vivo|
| 5| 37| 23.0| 5| Vivo|
| 4| 37| 6.0| 0| Vivo|
| 5| 37| 13.0| 1| Vivo|
| 4| 37| 6.0| 0| Vivo|
+-------+---+----------+------+------+
df.filter(df['mobile'] == 'Vivo').select('age', 'ratings').show()
+---+-------+
|age|ratings|
+---+-------+
| 32| 3|
| 37| 5|
| 37| 4|
| 37| 5|
| 37| 4|
+---+-------+
# 嵌套filter
df.filter(df['mobile'] == 'Vivo').filter(df['age'] <= 32).show()
+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
| 3| 32| 9.0| 3| Vivo|
+-------+---+----------+------+------+
df.filter((df['mobile'] =='Vivo')&(df['age'] <= 32)).show()
+-------+---+----------+------+------+
|ratings|age|experience|family|mobile|
+-------+---+----------+------+------+
| 3| 32| 9.0| 3| Vivo|
+-------+---+----------+------+------+
distinct()
df.select('mobile').distinct().show()
+-------+
| mobile|
+-------+
| MI|
| Oppo|
|Samsung|
| Vivo|
| Apple|
+-------+
df.select('mobile').distinct().count()
5
groupBy()
- max()
- min()
- mean()
- sum()
- count()
- agg()
# 使用不同手机的用户数量
df.groupBy('mobile').count().orderBy('count', ascending=False).show(5, False)
+-------+-----+
|mobile |count|
+-------+-----+
|MI |8 |
|Oppo |7 |
|Apple |7 |
|Samsung|6 |
|Vivo |5 |
+-------+-----+
df.groupBy('mobile').mean().show()
+-------+------------------+------------------+------------------+------------------+
| mobile| avg(ratings)| avg(age)| avg(experience)| avg(family)|
+-------+------------------+------------------+------------------+------------------+
| MI| 3.5| 30.125| 10.1875| 1.375|
| Oppo| 2.857142857142857|28.428571428571427|10.357142857142858|1.4285714285714286|
|Samsung| 4.166666666666667|28.666666666666668| 8.666666666666666|1.8333333333333333|
| Vivo| 4.2| 36.0| 11.4| 1.8|
| Apple|3.4285714285714284|30.571428571428573| 11.0|2.7142857142857144|
+-------+------------------+------------------+------------------+------------------+
agg()
# 使用不同品牌手机用户年龄均值
df.groupBy('mobile').agg({'age': 'mean'}).show()
+-------+------------------+
| mobile| avg(age)|
+-------+------------------+
| MI| 30.125|
| Oppo|28.428571428571427|
|Samsung|28.666666666666668|
| Vivo| 36.0|
| Apple|30.571428571428573|
+-------+------------------+
UDF : convert a python function to udf
# 价格标签
# (Samsung Apple) : High Price
# MI : Mid Price
# other : Low Price
def price_label(brand):
    """Map a phone brand name to a price-tier label.

    Args:
        brand: Brand name, e.g. a value from the ``mobile`` column.

    Returns:
        'High Price' for Samsung and Apple, 'Mid Price' for MI, and
        'Low Price' for any other value.
    """
    # NOTE: the brand used to be misspelled 'Samsumg', which made every
    # Samsung row fall through to 'Low Price' (any sample output captured
    # with the old spelling still shows that). With the correct spelling
    # Samsung is labelled 'High Price' as the rule above intends.
    if brand in ('Samsung', 'Apple'):
        return 'High Price'
    if brand == 'MI':
        return 'Mid Price'
    return 'Low Price'
# 新增price_label 列
# convert python function to a udf
brand_udf = udf(price_label)
df.withColumn('price_label', brand_udf(df['mobile'])).show(10)
+-------+---+----------+------+-------+-----------+
|ratings|age|experience|family| mobile|price_label|
+-------+---+----------+------+-------+-----------+
| 3| 32| 9.0| 3| Vivo| Low Price|
| 3| 27| 13.0| 3| Apple| High Price|
| 4| 22| 2.5| 0|Samsung| Low Price|
| 4| 37| 16.5| 4| Apple| High Price|
| 5| 27| 9.0| 1| MI| Mid Price|
| 4| 27| 9.0| 0| Oppo| Low Price|
| 5| 37| 23.0| 5| Vivo| Low Price|
| 5| 37| 23.0| 5|Samsung| Low Price|
| 3| 22| 2.5| 0| Apple| High Price|
| 3| 27| 6.0| 0| MI| Mid Price|
+-------+---+----------+------+-------+-----------+
only showing top 10 rows
匿名函数 lambda
- convert a lambda function to udf
age_udf = udf(lambda age: 'young' if age < 30 else 'old', StringType())
df.withColumn('age_label', age_udf(df['age'])).show(10)
+-------+---+----------+------+-------+---------+
|ratings|age|experience|family| mobile|age_label|
+-------+---+----------+------+-------+---------+
| 3| 32| 9.0| 3| Vivo| old|
| 3| 27| 13.0| 3| Apple| young|
| 4| 22| 2.5| 0|Samsung| young|
| 4| 37| 16.5| 4| Apple| old|
| 5| 27| 9.0| 1| MI| young|
| 4| 27| 9.0| 0| Oppo| young|
| 5| 37| 23.0| 5| Vivo| old|
| 5| 37| 23.0| 5|Samsung| old|
| 3| 22| 2.5| 0| Apple| young|
| 3| 27| 6.0| 0| MI| young|
+-------+---+----------+------+-------+---------+
only showing top 10 rows
pandas_udf, PandasUDFType
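- Pandas UDFs (vectorized UDFs) are much more powerful than regular Python UDFs in terms of speed and processing time, because they operate on whole batches of values as pandas Series instead of one row at a time.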
def remain_years(age):
    """Return the number of years remaining until age 100.

    Args:
        age: A number, or any object that supports ``100 - age``
            element-wise (the file wraps this function with pandas_udf).

    Returns:
        ``100 - age``.
    """
    return 100 - age
left_years_udf = pandas_udf(remain_years, IntegerType()) # requires PyArrow >= 0.8; per the author's note, gives the same result as udf(remain_years, IntegerType())
df.withColumn('left_years', left_years_udf(df['age'])).show()
+-------+---+----------+------+-------+----------+
|ratings|age|experience|family| mobile|left_years|
+-------+---+----------+------+-------+----------+
| 3| 32| 9.0| 3| Vivo| 68|
| 3| 27| 13.0| 3| Apple| 73|
| 4| 22| 2.5| 0|Samsung| 78|
| 4| 37| 16.5| 4| Apple| 63|
| 5| 27| 9.0| 1| MI| 73|
| 4| 27| 9.0| 0| Oppo| 73|
| 5| 37| 23.0| 5| Vivo| 63|
| 5| 37| 23.0| 5|Samsung| 63|
| 3| 22| 2.5| 0| Apple| 78|
| 3| 27| 6.0| 0| MI| 73|
+-------+---+----------+------+-------+----------+
only showing top 10 rows
def prod(rating, exp):
    """Return the product of a rating and an experience value.

    Args:
        rating: Left operand (the file passes the ``ratings`` column).
        exp: Right operand (the file passes the ``experience`` column).

    Returns:
        ``rating * exp``.
    """
    return rating * exp
prod_udf = pandas_udf(prod, DoubleType())
#prod_udf = udf(prod, DoubleType())
df.withColumn("product", prod_udf(df['ratings'],df['experience'])).show(10,False)
+-------+---+----------+------+-------+-------+
|ratings|age|experience|family|mobile |product|
+-------+---+----------+------+-------+-------+
|3 |32 |9.0 |3 |Vivo |27.0 |
|3 |27 |13.0 |3 |Apple |39.0 |
|4 |22 |2.5 |0 |Samsung|10.0 |
|4 |37 |16.5 |4 |Apple |66.0 |
|5 |27 |9.0 |1 |MI |45.0 |
|4 |27 |9.0 |0 |Oppo |36.0 |
|5 |37 |23.0 |5 |Vivo |115.0 |
|5 |37 |23.0 |5 |Samsung|115.0 |
|3 |22 |2.5 |0 |Apple |7.5 |
|3 |27 |6.0 |0 |MI |18.0 |
+-------+---+----------+------+-------+-------+
only showing top 10 rows
dropDuplicates()
去掉重复的样本
new_df = df.dropDuplicates()
new_df.count()
26
new_df.show(5)
+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
| 3| 32| 9.0| 3| Vivo|
| 4| 22| 2.5| 0|Samsung|
| 5| 27| 6.0| 0| MI|
| 4| 22| 6.0| 1| Oppo|
| 3| 27| 6.0| 0| MI|
+-------+---+----------+------+-------+
only showing top 5 rows
drop()
new_df.drop('mobile').show(5)
+-------+---+----------+------+
|ratings|age|experience|family|
+-------+---+----------+------+
| 3| 32| 9.0| 3|
| 4| 22| 2.5| 0|
| 5| 27| 6.0| 0|
| 4| 22| 6.0| 1|
| 3| 27| 6.0| 0|
+-------+---+----------+------+
only showing top 5 rows
Saving data
- 如果当前目录不存在,会创建目录
- parquet格式,如果目录已存在,可能会报错
new_df.coalesce(1).show(5) # coalesce(numPartitions) 返回一个dataframe对象,并指定分区的数量
+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
| 3| 32| 9.0| 3| Vivo|
| 4| 22| 2.5| 0|Samsung|
| 5| 27| 6.0| 0| MI|
| 4| 22| 6.0| 1| Oppo|
| 3| 27| 6.0| 0| MI|
+-------+---+----------+------+-------+
only showing top 5 rows
# CSV: coalesce to a single partition first, then write with a header row
new_df.coalesce(1).write.format('csv').option('header', 'true').save('./Data/New_sample_data')
# CSV: same write without coalescing the partitions
# NOTE(review): this path starts with './Date/' while the other writes use './Data/' —
# possibly a typo; confirm before changing, since './Data/New_sample_data' is
# already the target of the write above.
new_df.write.format('csv').option('header', 'true').save('./Date/New_sample_data')
# Parquet: coalesce to a single partition, then write
new_df.coalesce(1).write.format('parquet').save('./Data/New_data')
new_df.show()
+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
| 3| 32| 9.0| 3| Vivo|
| 4| 22| 2.5| 0|Samsung|
| 5| 27| 6.0| 0| MI|
| 4| 22| 6.0| 1| Oppo|
| 3| 27| 6.0| 0| MI|
| 2| 32| 16.5| 2| Oppo|
| 4| 27| 9.0| 0| Oppo|
| 2| 27| 9.0| 2|Samsung|
| 3| 37| 16.5| 5| Apple|
| 4| 27| 6.0| 1| Apple|
| 5| 37| 23.0| 5| Vivo|
| 2| 27| 6.0| 2| Oppo|
| 4| 37| 6.0| 0| Vivo|
| 5| 37| 23.0| 5|Samsung|
| 4| 37| 9.0| 2|Samsung|
| 5| 37| 13.0| 1| Vivo|
| 5| 27| 2.5| 0| MI|
| 3| 42| 23.0| 5| MI|
| 5| 22| 2.5| 0|Samsung|
| 1| 37| 23.0| 5| MI|
+-------+---+----------+------+-------+
only showing top 20 rows