数据清洗 - 缺失值,异常值,重复值处理

数据清洗
数据清洗的对象就是缺失值,重复值,异常值
一 重复值的处理

一般用删除法,但是在业务上有意义的重复值不能删除

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import os
import missingno as msno
# Switch to the project directory so the relative CSV path below resolves.
# NOTE(review): hard-coded local Windows path — adjust for your machine.
os.chdir(r"D:/jupyter_notebook/eg/eg3-零售行业数据分析")
# Load the raw table, decoding the file as ISO-8859-1.
data = pd.read_csv("data.csv",encoding="ISO-8859-1")
# Preview the first five rows.
data.head()
InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	12/1/2010 8:26	2.55	17850.0	United Kingdom
1	536365	71053	WHITE METAL LANTERN	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	12/1/2010 8:26	2.75	17850.0	United Kingdom
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
# 1. Flag fully duplicated rows (True marks a repeat of an earlier row).
data.duplicated()
# 2. Flag rows whose StockCode value repeats (a per-column check, not a
#    per-row one); keep='last' leaves only the final occurrence of each
#    code unflagged.
data.duplicated(subset = ['StockCode'],keep='last')
0          True
1          True
2          True
3          True
4          True
5          True
6          True
7          True
8          True
9          True
10         True
11         True
12         True
13         True
14         True
15         True
16         True
17         True
18         True
19         True
20         True
21         True
22         True
23         True
24         True
25         True
26         True
27         True
28         True
29         True
          ...  
541879     True
541880     True
541881    False
541882    False
541883    False
541884    False
541885    False
541886    False
541887    False
541888    False
541889    False
541890    False
541891    False
541892    False
541893    False
541894    False
541895    False
541896    False
541897    False
541898    False
541899    False
541900    False
541901    False
541902    False
541903    False
541904    False
541905    False
541906    False
541907    False
541908    False
Length: 541909, dtype: bool
# 3. Count the fully duplicated rows.
data.duplicated().sum()
# 4. Count the rows whose StockCode already appeared in an earlier row.
data.duplicated(subset=['StockCode']).sum()
537839
# 5. Drop fully duplicated rows, modifying data in place.
data.drop_duplicates(inplace=True)
# 6. Drop rows with a repeated StockCode, keeping the first occurrence of
#    each code; this returns a new DataFrame and leaves data unchanged.
data.drop_duplicates(subset=['StockCode'],keep='first')
InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	12/1/2010 8:26	2.55	17850.0	United Kingdom
1	536365	71053	WHITE METAL LANTERN	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	12/1/2010 8:26	2.75	17850.0	United Kingdom
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
5	536365	22752	SET 7 BABUSHKA NESTING BOXES	2	12/1/2010 8:26	7.65	17850.0	United Kingdom
6	536365	21730	GLASS STAR FROSTED T-LIGHT HOLDER	6	12/1/2010 8:26	4.25	17850.0	United Kingdom
7	536366	22633	HAND WARMER UNION JACK	6	12/1/2010 8:28	1.85	17850.0	United Kingdom
8	536366	22632	HAND WARMER RED POLKA DOT	6	12/1/2010 8:28	1.85	17850.0	United Kingdom
9	536367	84879	ASSORTED COLOUR BIRD ORNAMENT	32	12/1/2010 8:34	1.69	13047.0	United Kingdom
10	536367	22745	POPPY'S PLAYHOUSE BEDROOM	6	12/1/2010 8:34	2.10	13047.0	United Kingdom
11	536367	22748	POPPY'S PLAYHOUSE KITCHEN	6	12/1/2010 8:34	2.10	13047.0	United Kingdom
12	536367	22749	FELTCRAFT PRINCESS CHARLOTTE DOLL	8	12/1/2010 8:34	3.75	13047.0	United Kingdom
13	536367	22310	IVORY KNITTED MUG COSY	6	12/1/2010 8:34	1.65	13047.0	United Kingdom
14	536367	84969	BOX OF 6 ASSORTED COLOUR TEASPOONS	6	12/1/2010 8:34	4.25	13047.0	United Kingdom
15	536367	22623	BOX OF VINTAGE JIGSAW BLOCKS	3	12/1/2010 8:34	4.95	13047.0	United Kingdom
16	536367	22622	BOX OF VINTAGE ALPHABET BLOCKS	2	12/1/2010 8:34	9.95	13047.0	United Kingdom
17	536367	21754	HOME BUILDING BLOCK WORD	3	12/1/2010 8:34	5.95	13047.0	United Kingdom
18	536367	21755	LOVE BUILDING BLOCK WORD	3	12/1/2010 8:34	5.95	13047.0	United Kingdom
19	536367	21777	RECIPE BOX WITH METAL HEART	4	12/1/2010 8:34	7.95	13047.0	United Kingdom
20	536367	48187	DOORMAT NEW ENGLAND	4	12/1/2010 8:34	7.95	13047.0	United Kingdom
21	536368	22960	JAM MAKING SET WITH JARS	6	12/1/2010 8:34	4.25	13047.0	United Kingdom
22	536368	22913	RED COAT RACK PARIS FASHION	3	12/1/2010 8:34	4.95	13047.0	United Kingdom
23	536368	22912	YELLOW COAT RACK PARIS FASHION	3	12/1/2010 8:34	4.95	13047.0	United Kingdom
24	536368	22914	BLUE COAT RACK PARIS FASHION	3	12/1/2010 8:34	4.95	13047.0	United Kingdom
25	536369	21756	BATH BUILDING BLOCK WORD	3	12/1/2010 8:35	5.95	13047.0	United Kingdom
26	536370	22728	ALARM CLOCK BAKELIKE PINK	24	12/1/2010 8:45	3.75	12583.0	France
27	536370	22727	ALARM CLOCK BAKELIKE RED	24	12/1/2010 8:45	3.75	12583.0	France
28	536370	22726	ALARM CLOCK BAKELIKE GREEN	12	12/1/2010 8:45	3.75	12583.0	France
29	536370	21724	PANDA AND BUNNIES STICKER SHEET	12	12/1/2010 8:45	0.85	12583.0	France
...	...	...	...	...	...	...	...	...
471168	576624	23449	SET OF 6 EASTER RAINBOW CHICKS	12	11/15/2011 17:14	1.25	16156.0	United Kingdom
475731	576911	23428	IVORY RETRO KITCHEN WALL CLOCK	1	11/17/2011 10:01	8.15	17461.0	United Kingdom
478073	577078	23652	SET 10 CARD CHRISTMAS STAMPS 16963	2	11/17/2011 15:17	2.91	NaN	United Kingdom
483388	577503	84206B	CAT WITH SUNGLASSES BLANK CARD	5	11/20/2011 12:34	0.19	18110.0	United Kingdom
485126	577554	23430	BLUE RETRO KITCHEN WALL CLOCK	1	11/20/2011 15:35	8.15	15533.0	United Kingdom
485759	577633	23429	RED RETRO KITCHEN WALL CLOCK	2	11/21/2011 10:35	8.15	14194.0	United Kingdom
491468	578067	90179B	PURPLE FINE BEAD NECKLACE W TASSEL	1	11/22/2011 15:43	5.82	NaN	United Kingdom
492577	578129	23441	HAND PAINTED HANGING EASTER EGG	10	11/23/2011 10:39	0.42	13502.0	United Kingdom
492988	578149	72802c	VANILLA SCENT CANDLE JEWELLED BOX	1	11/23/2011 11:11	8.29	NaN	United Kingdom
493011	578149	85049c	ROMANTIC PINKS RIBBONS	1	11/23/2011 11:11	2.46	NaN	United Kingdom
497301	578360	84971L	NaN	2	11/24/2011 10:36	0.00	NaN	United Kingdom
498914	578544	23440	PAINT YOUR OWN EGGS IN CRATE	6	11/24/2011 15:01	2.08	17096.0	United Kingdom
499913	578702	23552	BICYCLE PUNCTURE REPAIR KIT	6	11/25/2011 10:58	2.08	15382.0	United Kingdom
499914	578702	23498	CLASSIC BICYCLE CLIPS	12	11/25/2011 10:58	1.45	15382.0	United Kingdom
501156	578824	84550	CROCHET LILAC/RED BEAR KEYRING	1	11/25/2011 14:02	1.65	17883.0	United Kingdom
502046	578833	84971l	LARGE HEART FLOWERS HOOK	2	11/25/2011 15:23	2.46	NaN	United Kingdom
502047	578833	85034b	3 WHITE CHOC MORRIS BOXED CANDLES	1	11/25/2011 15:23	8.29	NaN	United Kingdom
502971	578856	23578	SNACK TRAY RED GINGHAM	3	11/27/2011 11:18	1.95	17769.0	United Kingdom
503150	578921	23580	SNACK TRAY HAPPY FOREST	2	11/27/2011 11:57	1.95	13596.0	United Kingdom
503195	578921	23575	SNACK TRAY PAISLEY PARK	3	11/27/2011 11:57	1.95	13596.0	United Kingdom
503198	578921	23560	SET OF 6 RIBBONS COUNTRY STYLE	1	11/27/2011 11:57	2.89	13596.0	United Kingdom
503375	578925	23576	SNACK TRAY RED VINTAGE DOILY	1	11/27/2011 12:13	1.95	17365.0	United Kingdom
503602	578928	23562	SET OF 6 RIBBONS PERFECTLY PRETTY	3	11/27/2011 12:29	2.89	15443.0	United Kingdom
504104	578942	23561	SET OF 6 RIBBONS PARTY	1	11/27/2011 13:29	2.89	17288.0	United Kingdom
507867	579187	23609	SET 10 CARDS SNOWY ROBIN 17099	1	11/28/2011 15:31	2.91	NaN	United Kingdom
509369	579297	85179a	GREEN BITTY LIGHT CHAIN	1	11/29/2011 11:23	2.46	NaN	United Kingdom
512588	579512	23617	SET 10 CARDS SWIRLY XMAS TREE 17104	2	11/29/2011 16:47	2.91	NaN	United Kingdom
527065	580691	90214U	LETTER "U" BLING KEY RING	12	12/5/2011 15:48	0.29	13790.0	United Kingdom
537224	581238	47591b	SCOTTIES CHILDRENS APRON	1	12/8/2011 10:53	4.13	NaN	United Kingdom
540421	581483	23843	PAPER CRAFT , LITTLE BIRDIE	80995	12/9/2011 9:15	2.08	16446.0	United Kingdom
4070 rows × 8 columns

二 缺失值的处理

常用的处理方法有删除法、替换法和插值法。什么样的值算作缺失值,需要根据实际业务情况来定义。

# 1. Flag missing cells (True where the value is missing).
data.isnull()
# 2. Count missing values per column.
data.isnull().sum()
# 3. Share of missing values per column. The mean of a boolean mask is the
#    fraction of True entries, so no Python-level loop over the rows is needed.
data.isnull().mean()
InvoiceNo      0.000000
StockCode      0.000000
Description    0.002709
Quantity       0.000000
InvoiceDate    0.000000
UnitPrice      0.000000
CustomerID     0.251634
Country        0.000000
dtype: float64
# 4. Drop missing values.
# Drop every row that contains at least one missing value. The result is a
# new DataFrame; data itself is not modified.
data.dropna()
# Same operation with the rule spelled out: how='any' drops a row if any
# value is missing, how='all' only if every value is missing.
data.dropna(how='any')
data.head()
InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	12/1/2010 8:26	2.55	17850.0	United Kingdom
1	536365	71053	WHITE METAL LANTERN	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	12/1/2010 8:26	2.75	17850.0	United Kingdom
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
# 5. Fill missing values. Every call below returns a new object; data is not
#    modified unless the result is assigned back.
# 5.1 Categorical column: fill with the mode.
data.Description.fillna(data.Description.mode()[0])
# 5.2 Numeric column: fill with the mean, the median, or a constant.
data.CustomerID.fillna(data.CustomerID.mean())
data.CustomerID.fillna(data.CustomerID.median())
data.CustomerID.fillna(1000)
# 5.3 Different fill value per column, passed as a dict.
data.fillna(value={'CustomerID': data.CustomerID.mean(),
                   'Description': data.Description.mode()[0]})
# 5.4 Forward and backward fill. ffill()/bfill() replace the original
#     fillna(method='ffil') call, whose misspelled method name raises
#     ValueError, and avoid the deprecated fillna(method=...) argument.
data.ffill()   # a missing value in the first row has nothing before it
data.bfill()   # a missing value in the last row has nothing after it
# 5.5 Interpolation (rarely used): order-1 polynomial, i.e. linear.
# NOTE(review): method='polynomial' needs SciPy, and recent pandas versions
# may reject the non-numeric columns — confirm against the installed version.
data.interpolate(method='polynomial',order=1)
InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	12/1/2010 8:26	2.55	17850.0	United Kingdom
1	536365	71053	WHITE METAL LANTERN	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	12/1/2010 8:26	2.75	17850.0	United Kingdom
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
5	536365	22752	SET 7 BABUSHKA NESTING BOXES	2	12/1/2010 8:26	7.65	17850.0	United Kingdom
6	536365	21730	GLASS STAR FROSTED T-LIGHT HOLDER	6	12/1/2010 8:26	4.25	17850.0	United Kingdom
7	536366	22633	HAND WARMER UNION JACK	6	12/1/2010 8:28	1.85	17850.0	United Kingdom
8	536366	22632	HAND WARMER RED POLKA DOT	6	12/1/2010 8:28	1.85	17850.0	United Kingdom
9	536367	84879	ASSORTED COLOUR BIRD ORNAMENT	32	12/1/2010 8:34	1.69	13047.0	United Kingdom
10	536367	22745	POPPY'S PLAYHOUSE BEDROOM	6	12/1/2010 8:34	2.10	13047.0	United Kingdom
11	536367	22748	POPPY'S PLAYHOUSE KITCHEN	6	12/1/2010 8:34	2.10	13047.0	United Kingdom
12	536367	22749	FELTCRAFT PRINCESS CHARLOTTE DOLL	8	12/1/2010 8:34	3.75	13047.0	United Kingdom
13	536367	22310	IVORY KNITTED MUG COSY	6	12/1/2010 8:34	1.65	13047.0	United Kingdom
14	536367	84969	BOX OF 6 ASSORTED COLOUR TEASPOONS	6	12/1/2010 8:34	4.25	13047.0	United Kingdom
15	536367	22623	BOX OF VINTAGE JIGSAW BLOCKS	3	12/1/2010 8:34	4.95	13047.0	United Kingdom
16	536367	22622	BOX OF VINTAGE ALPHABET BLOCKS	2	12/1/2010 8:34	9.95	13047.0	United Kingdom
17	536367	21754	HOME BUILDING BLOCK WORD	3	12/1/2010 8:34	5.95	13047.0	United Kingdom
18	536367	21755	LOVE BUILDING BLOCK WORD	3	12/1/2010 8:34	5.95	13047.0	United Kingdom
19	536367	21777	RECIPE BOX WITH METAL HEART	4	12/1/2010 8:34	7.95	13047.0	United Kingdom
20	536367	48187	DOORMAT NEW ENGLAND	4	12/1/2010 8:34	7.95	13047.0	United Kingdom
21	536368	22960	JAM MAKING SET WITH JARS	6	12/1/2010 8:34	4.25	13047.0	United Kingdom
22	536368	22913	RED COAT RACK PARIS FASHION	3	12/1/2010 8:34	4.95	13047.0	United Kingdom
23	536368	22912	YELLOW COAT RACK PARIS FASHION	3	12/1/2010 8:34	4.95	13047.0	United Kingdom
24	536368	22914	BLUE COAT RACK PARIS FASHION	3	12/1/2010 8:34	4.95	13047.0	United Kingdom
25	536369	21756	BATH BUILDING BLOCK WORD	3	12/1/2010 8:35	5.95	13047.0	United Kingdom
26	536370	22728	ALARM CLOCK BAKELIKE PINK	24	12/1/2010 8:45	3.75	12583.0	France
27	536370	22727	ALARM CLOCK BAKELIKE RED	24	12/1/2010 8:45	3.75	12583.0	France
28	536370	22726	ALARM CLOCK BAKELIKE GREEN	12	12/1/2010 8:45	3.75	12583.0	France
29	536370	21724	PANDA AND BUNNIES STICKER SHEET	12	12/1/2010 8:45	0.85	12583.0	France
...	...	...	...	...	...	...	...	...
541879	581585	22726	ALARM CLOCK BAKELIKE GREEN	8	12/9/2011 12:31	3.75	15804.0	United Kingdom
541880	581585	22727	ALARM CLOCK BAKELIKE RED	4	12/9/2011 12:31	3.75	15804.0	United Kingdom
541881	581585	16016	LARGE CHINESE STYLE SCISSOR	10	12/9/2011 12:31	0.85	15804.0	United Kingdom
541882	581585	21916	SET 12 RETRO WHITE CHALK STICKS	24	12/9/2011 12:31	0.42	15804.0	United Kingdom
541883	581585	84692	BOX OF 24 COCKTAIL PARASOLS	25	12/9/2011 12:31	0.42	15804.0	United Kingdom
541884	581585	84946	ANTIQUE SILVER T-LIGHT GLASS	12	12/9/2011 12:31	1.25	15804.0	United Kingdom
541885	581585	21684	SMALL MEDINA STAMPED METAL BOWL	12	12/9/2011 12:31	0.85	15804.0	United Kingdom
541886	581585	22398	MAGNETS PACK OF 4 SWALLOWS	12	12/9/2011 12:31	0.39	15804.0	United Kingdom
541887	581585	23328	SET 6 SCHOOL MILK BOTTLES IN CRATE	4	12/9/2011 12:31	3.75	15804.0	United Kingdom
541888	581585	23145	ZINC T-LIGHT HOLDER STAR LARGE	12	12/9/2011 12:31	0.95	15804.0	United Kingdom
541889	581585	22466	FAIRY TALE COTTAGE NIGHT LIGHT	12	12/9/2011 12:31	1.95	15804.0	United Kingdom
541890	581586	22061	LARGE CAKE STAND HANGING STRAWBERY	8	12/9/2011 12:49	2.95	13113.0	United Kingdom
541891	581586	23275	SET OF 3 HANGING OWLS OLLIE BEAK	24	12/9/2011 12:49	1.25	13113.0	United Kingdom
541892	581586	21217	RED RETROSPOT ROUND CAKE TINS	24	12/9/2011 12:49	8.95	13113.0	United Kingdom
541893	581586	20685	DOORMAT RED RETROSPOT	10	12/9/2011 12:49	7.08	13113.0	United Kingdom
541894	581587	22631	CIRCUS PARADE LUNCH BOX	12	12/9/2011 12:50	1.95	12680.0	France
541895	581587	22556	PLASTERS IN TIN CIRCUS PARADE	12	12/9/2011 12:50	1.65	12680.0	France
541896	581587	22555	PLASTERS IN TIN STRONGMAN	12	12/9/2011 12:50	1.65	12680.0	France
541897	581587	22728	ALARM CLOCK BAKELIKE PINK	4	12/9/2011 12:50	3.75	12680.0	France
541898	581587	22727	ALARM CLOCK BAKELIKE RED	4	12/9/2011 12:50	3.75	12680.0	France
541899	581587	22726	ALARM CLOCK BAKELIKE GREEN	4	12/9/2011 12:50	3.75	12680.0	France
541900	581587	22730	ALARM CLOCK BAKELIKE IVORY	4	12/9/2011 12:50	3.75	12680.0	France
541901	581587	22367	CHILDRENS APRON SPACEBOY DESIGN	8	12/9/2011 12:50	1.95	12680.0	France
541902	581587	22629	SPACEBOY LUNCH BOX	12	12/9/2011 12:50	1.95	12680.0	France
541903	581587	23256	CHILDRENS CUTLERY SPACEBOY	4	12/9/2011 12:50	4.15	12680.0	France
541904	581587	22613	PACK OF 20 SPACEBOY NAPKINS	12	12/9/2011 12:50	0.85	12680.0	France
541905	581587	22899	CHILDREN'S APRON DOLLY GIRL	6	12/9/2011 12:50	2.10	12680.0	France
541906	581587	23254	CHILDRENS CUTLERY DOLLY GIRL	4	12/9/2011 12:50	4.15	12680.0	France
541907	581587	23255	CHILDRENS CUTLERY CIRCUS PARADE	4	12/9/2011 12:50	4.15	12680.0	France
541908	581587	22138	BAKING SET 9 PIECE RETROSPOT	3	12/9/2011 12:50	4.95	12680.0	France
536641 rows × 8 columns

三 异常值的处理

异常值:偏离正常范围的值,不是错误值,出现频率较低

异常值判断:(1)均值±两个标准差以内为正常值,范围之外则为异常值;(2)分位数法(箱线图判断):小于 Q1 − 1.5×IQR 或大于 Q3 + 1.5×IQR 的值为异常值。

# 1. Mean ± 2 standard deviations rule.
# Line total = quantity × unit price.
data['price'] = data['Quantity'] * data['UnitPrice']
q_mean = data['price'].mean()
q_std = data['price'].std()
# The original ran the next two checks onto the q_std line, which is a
# syntax error; each statement now sits on its own line.
# Any value above the upper bound?
any(data['price'] > q_mean + 2 * q_std)
# Any value below the lower bound?
any(data['price'] < q_mean - 2 * q_std)
True
# 2. Quantile (IQR) rule.
# 2.1 / 2.2 Lower and upper quartiles, computed in a single call.
Q1, Q3 = data['price'].quantile([0.25, 0.75])
# 2.3 Interquartile range.
IQR = Q3 - Q1
# Any value above the upper fence Q3 + 1.5 * IQR?
any(data['price'].gt(Q3 + 1.5 * IQR))
True
# Any value below the lower fence Q1 - 1.5 * IQR? The original compared with
# '>', which asks whether any value lies above the lower fence and therefore
# does not test for low outliers.
any(data['price'] < Q1 - 1.5 * IQR)
True
# 2.4 Box plot of price, drawn on a fresh 6.4 x 4.8 inch figure.
plt.figure(figsize=(6.4, 4.8))
data['price'].plot.box()
<matplotlib.axes._subplots.AxesSubplot at 0x28636ab1400>

# 3. Handle outliers by replacing (capping) them.
UL = Q3 + 1.5 * IQR  # upper fence from the IQR rule
# 3.1 Largest value that still lies inside the normal range.
replace_value = data['price'][data['price'] < UL].max()
print(replace_value)
# 3.2 Cap the outliers: overwrite every value ABOVE the fence. The original
#     used '<', which overwrote the normal values and left the outliers.
data.loc[data['price'] > UL, 'price'] = replace_value
23.400000000000002
# Summary statistics of the price column.
# NOTE(review): the output pasted below (min 23.4, max 896310) matches a run
# in which values below the upper fence were overwritten, not the outliers.
data['price'].describe()
count    541909.000000
mean        114.159394
std        2186.736084
min          23.400000
25%          28.750000
50%          48.300000
75%          95.450000
max      896310.000000
Name: price, dtype: float64
# 3.3 Percentile capping: values below the 1st percentile are raised to it
#     and values above the 99th percentile are lowered to it.
#     (The original left only a bare quantile() placeholder, which is not a
#     defined name and would raise NameError.)
P1 = data['price'].quantile(0.01)
P99 = data['price'].quantile(0.99)
data['price'] = data['price'].clip(lower=P1, upper=P99)
发布了5 篇原创文章 · 获赞 0 · 访问量 32

猜你喜欢

转载自blog.csdn.net/qq_27924553/article/details/105195545
今日推荐