"""PySpark data-cleaning tutorial: duplicates and missing values.

Reconstructed from a scraped REPL transcript: the original text mixed
Python statements with console prompts and printed output tables and was
not valid Python. This script performs the same operations, in the same
order, as the transcript. Expected results from the transcript are kept
as comments.
"""
from pyspark import SparkConf, SparkContext  # kept from original; SparkConf unused there too
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

spark = SparkSession.builder.appName("lz").getOrCreate()
sc = SparkContext.getOrCreate()

# --- Duplicates ------------------------------------------------------
# Rows 4 and 6 below are exact duplicates of earlier rows; rows with
# id=5 share an id but differ in the other columns.
df = spark.createDataFrame([
    (1, 144.5, 5.9, 33, 'M'),
    (2, 167.2, 5.4, 45, 'M'),
    (3, 124.1, 5.2, 23, 'F'),
    (4, 144.5, 5.9, 33, 'M'),
    (5, 133.2, 5.7, 54, 'F'),
    (3, 124.1, 5.2, 23, 'F'),
    (5, 129.2, 5.3, 42, 'M'),
], ['id', 'weight', 'height', 'age', 'gender'])

print('count of rows: {0}'.format(df.count()))                       # 7
# .distinct() reveals how many rows are unique across ALL columns.
print('count of distinct rows: {0}'.format(df.distinct().count()))   # 6

# Remove rows that are complete duplicates (every column equal).
df = df.dropDuplicates()
df.show()                                                            # 6 rows remain

print('count of ids: {0}'.format(df.count()))                        # 6
# Compare against distinctness when the 'id' column is ignored.
non_id_cols = [c for c in df.columns if c != 'id']
print('count of distinct ids: {0}'.format(
    df.select(non_id_cols).distinct().count()))                      # 5

# Deduplicate again, this time ignoring 'id': drops rows identical in
# every column except their id.
df = df.dropDuplicates(subset=non_id_cols)
df.show()                                                            # 5 rows remain

# Total vs distinct id counts in a single aggregation pass.
df.agg(
    fn.count('id').alias('count'),
    fn.countDistinct('id').alias('distinct'),
).show()                                                             # count=5, distinct=4

# ids are not unique (two rows share id=5): assign fresh unique ids.
# monotonically_increasing_id() is unique per row but NOT consecutive.
df.withColumn('new_id', fn.monotonically_increasing_id()).show()

# --- Missing values --------------------------------------------------
df_miss = spark.createDataFrame([
    (1, 143.5, 5.6, 28, 'M', 100000),
    (2, 167.2, 5.4, 45, 'M', None),
    (3, None, 5.2, None, None, None),
    (4, 144.5, 5.9, 33, 'M', None),
    (5, 133.2, 5.7, 54, 'F', None),
    (6, 124.1, 5.2, None, 'F', None),
], ['id', 'weight', 'height', 'age', 'gender', 'income'])

# 'income' is missing in 5 of 6 rows — drop the column entirely.
df_miss_no_income = df_miss.select(
    [c for c in df_miss.columns if c != 'income'])

# dropna(thresh=k) keeps a row only if it has at least k non-null
# values.  The transcript probed every threshold (several repeatedly);
# a loop covers the same ground once.  With 5 columns:
#   thresh<=2 keeps all 6 rows; thresh=3..4 drops row id=3 (2 non-null);
#   thresh=5 also drops id=6 (4 non-null); thresh=6 drops everything.
for k in range(7):
    print('df_miss_no_income.dropna(thresh={})'.format(k))
    df_miss_no_income.dropna(thresh=k).show()

# The transcript also ran `dropna(thresh>3)`, which raises
# NameError: name 'thresh' is not defined — thresh must be passed as a
# keyword argument, not compared as a bare name.

# Same sweep on the 6-column frame that still has 'income'.
for k in range(7):
    print('df_miss.dropna(thresh={})'.format(k))
    df_miss.dropna(thresh=k).show()

# dropna() with no arguments keeps only fully populated rows:
# here just row id=1, the only one with a non-null income.
df_miss.dropna().show()
# pyspark learning series (4): data cleaning
# Reposted from blog.csdn.net/qq_36603091/article/details/79855578