"""PySpark data-cleaning tutorial: duplicates and missing values.

Reconstructed from a scraped REPL transcript: the original text mixed
Python statements with console prompts and printed output tables and was
not valid Python. This script performs the same operations, in the same
order, as the transcript. Expected results from the transcript are kept
as comments.
"""
from pyspark import SparkConf, SparkContext  # kept from original; SparkConf unused there too
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

spark = SparkSession.builder.appName("lz").getOrCreate()
sc = SparkContext.getOrCreate()

# --- Duplicates ------------------------------------------------------
# Rows 4 and 6 below are exact duplicates of earlier rows; rows with
# id=5 share an id but differ in the other columns.
df = spark.createDataFrame([
    (1, 144.5, 5.9, 33, 'M'),
    (2, 167.2, 5.4, 45, 'M'),
    (3, 124.1, 5.2, 23, 'F'),
    (4, 144.5, 5.9, 33, 'M'),
    (5, 133.2, 5.7, 54, 'F'),
    (3, 124.1, 5.2, 23, 'F'),
    (5, 129.2, 5.3, 42, 'M'),
], ['id', 'weight', 'height', 'age', 'gender'])

print('count of rows: {0}'.format(df.count()))                       # 7
# .distinct() reveals how many rows are unique across ALL columns.
print('count of distinct rows: {0}'.format(df.distinct().count()))   # 6

# Remove rows that are complete duplicates (every column equal).
df = df.dropDuplicates()
df.show()                                                            # 6 rows remain

print('count of ids: {0}'.format(df.count()))                        # 6
# Compare against distinctness when the 'id' column is ignored.
non_id_cols = [c for c in df.columns if c != 'id']
print('count of distinct ids: {0}'.format(
    df.select(non_id_cols).distinct().count()))                      # 5

# Deduplicate again, this time ignoring 'id': drops rows identical in
# every column except their id.
df = df.dropDuplicates(subset=non_id_cols)
df.show()                                                            # 5 rows remain

# Total vs distinct id counts in a single aggregation pass.
df.agg(
    fn.count('id').alias('count'),
    fn.countDistinct('id').alias('distinct'),
).show()                                                             # count=5, distinct=4

# ids are not unique (two rows share id=5): assign fresh unique ids.
# monotonically_increasing_id() is unique per row but NOT consecutive.
df.withColumn('new_id', fn.monotonically_increasing_id()).show()

# --- Missing values --------------------------------------------------
df_miss = spark.createDataFrame([
    (1, 143.5, 5.6, 28, 'M', 100000),
    (2, 167.2, 5.4, 45, 'M', None),
    (3, None, 5.2, None, None, None),
    (4, 144.5, 5.9, 33, 'M', None),
    (5, 133.2, 5.7, 54, 'F', None),
    (6, 124.1, 5.2, None, 'F', None),
], ['id', 'weight', 'height', 'age', 'gender', 'income'])

# 'income' is missing in 5 of 6 rows — drop the column entirely.
df_miss_no_income = df_miss.select(
    [c for c in df_miss.columns if c != 'income'])

# dropna(thresh=k) keeps a row only if it has at least k non-null
# values.  The transcript probed every threshold (several repeatedly);
# a loop covers the same ground once.  With 5 columns:
#   thresh<=2 keeps all 6 rows; thresh=3..4 drops row id=3 (2 non-null);
#   thresh=5 also drops id=6 (4 non-null); thresh=6 drops everything.
for k in range(7):
    print('df_miss_no_income.dropna(thresh={})'.format(k))
    df_miss_no_income.dropna(thresh=k).show()

# The transcript also ran `dropna(thresh>3)`, which raises
# NameError: name 'thresh' is not defined — thresh must be passed as a
# keyword argument, not compared as a bare name.

# Same sweep on the 6-column frame that still has 'income'.
for k in range(7):
    print('df_miss.dropna(thresh={})'.format(k))
    df_miss.dropna(thresh=k).show()

# dropna() with no arguments keeps only fully populated rows:
# here just row id=1, the only one with a non-null income.
df_miss.dropna().show()
# pyspark learning series (4): data cleaning
# Reposted from blog.csdn.net/qq_36603091/article/details/79855578