Cleaning Data in R

R 中清洗数据

为了更好地使用数据,
找数据和处理数据都是数据挖掘中比较重要的步骤。

常见三种查看数据的函数

# View the first 6 rows of data
head(weather)

# View the last 6 rows of data
tail(weather)

# View a condensed summary of the data
str(weather)

Exploring raw data

> # Check the class of bmi
> class(bmi)
[1] "data.frame"
> 
> # Check the dimensions of bmi
> dim(bmi)
[1] 199  30
> 
> # View the column names of bmi
> names(bmi)
 [1] "Country" "Y1980"   "Y1981"   "Y1982"   "Y1983"   "Y1984"   "Y1985"  
 [8] "Y1986"   "Y1987"   "Y1988"   "Y1989"   "Y1990"   "Y1991"   "Y1992"  
[15] "Y1993"   "Y1994"   "Y1995"   "Y1996"   "Y1997"   "Y1998"   "Y1999"  
[22] "Y2000"   "Y2001"   "Y2002"   "Y2003"   "Y2004"   "Y2005"   "Y2006"  
[29] "Y2007"   "Y2008"

使用dplyr包里面的glimpse函数查看数据结构

> # Load dplyr
> library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
> 
> # Check the structure of bmi, the dplyr way
> 
> glimpse(bmi)
Observations: 199
Variables: 30
$ Country <chr> "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "...
$ Y1980   <dbl> 21.48678, 25.22533, 22.25703, 25.66652, 20.94876, 23.31424,...
$ Y1981   <dbl> 21.46552, 25.23981, 22.34745, 25.70868, 20.94371, 23.39054,...
$ Y1982   <dbl> 21.45145, 25.25636, 22.43647, 25.74681, 20.93754, 23.45883,...
$ Y1983   <dbl> 21.43822, 25.27176, 22.52105, 25.78250, 20.93187, 23.53735,...
$ Y1984   <dbl> 21.42734, 25.27901, 22.60633, 25.81874, 20.93569, 23.63584,...
$ Y1985   <dbl> 21.41222, 25.28669, 22.69501, 25.85236, 20.94857, 23.73109,...
$ Y1986   <dbl> 21.40132, 25.29451, 22.76979, 25.89089, 20.96030, 23.83449,...
$ Y1987   <dbl> 21.37679, 25.30217, 22.84096, 25.93414, 20.98025, 23.93649,...
$ Y1988   <dbl> 21.34018, 25.30450, 22.90644, 25.98477, 21.01375, 24.05364,...
$ Y1989   <dbl> 21.29845, 25.31944, 22.97931, 26.04450, 21.05269, 24.16347,...
$ Y1990   <dbl> 21.24818, 25.32357, 23.04600, 26.10936, 21.09007, 24.26782,...
$ Y1991   <dbl> 21.20269, 25.28452, 23.11333, 26.17912, 21.12136, 24.36568,...
$ Y1992   <dbl> 21.14238, 25.23077, 23.18776, 26.24017, 21.14987, 24.45644,...
$ Y1993   <dbl> 21.06376, 25.21192, 23.25764, 26.30356, 21.13938, 24.54096,...
$ Y1994   <dbl> 20.97987, 25.22115, 23.32273, 26.36793, 21.14186, 24.60945,...
$ Y1995   <dbl> 20.91132, 25.25874, 23.39526, 26.43569, 21.16022, 24.66461,...
$ Y1996   <dbl> 20.85155, 25.31097, 23.46811, 26.50769, 21.19076, 24.72544,...
$ Y1997   <dbl> 20.81307, 25.33988, 23.54160, 26.58255, 21.22621, 24.78714,...
$ Y1998   <dbl> 20.78591, 25.39116, 23.61592, 26.66337, 21.27082, 24.84936,...
$ Y1999   <dbl> 20.75469, 25.46555, 23.69486, 26.75078, 21.31954, 24.91721,...
$ Y2000   <dbl> 20.69521, 25.55835, 23.77659, 26.83179, 21.37480, 24.99158,...
$ Y2001   <dbl> 20.62643, 25.66701, 23.86256, 26.92373, 21.43664, 25.05857,...
$ Y2002   <dbl> 20.59848, 25.77167, 23.95294, 27.02525, 21.51765, 25.13039,...
$ Y2003   <dbl> 20.58706, 25.87274, 24.05243, 27.12481, 21.59924, 25.20713,...
$ Y2004   <dbl> 20.57759, 25.98136, 24.15957, 27.23107, 21.69218, 25.29898,...
$ Y2005   <dbl> 20.58084, 26.08939, 24.27001, 27.32827, 21.80564, 25.39965,...
$ Y2006   <dbl> 20.58749, 26.20867, 24.38270, 27.43588, 21.93881, 25.51382,...
$ Y2007   <dbl> 20.60246, 26.32753, 24.48846, 27.53363, 22.08962, 25.64247,...
$ Y2008   <dbl> 20.62058, 26.44657, 24.59620, 27.63048, 22.25083, 25.76602,...
> # View a summary of bmi
> summary(bmi)
   Country              Y1980           Y1981           Y1982      
 Length:199         Min.   :19.01   Min.   :19.04   Min.   :19.07  
 Class :character   1st Qu.:21.27   1st Qu.:21.31   1st Qu.:21.36  
 Mode  :character   Median :23.31   Median :23.39   Median :23.46  
                    Mean   :23.15   Mean   :23.21   Mean   :23.26  
                    3rd Qu.:24.82   3rd Qu.:24.89   3rd Qu.:24.94  
                    Max.   :28.12   Max.   :28.36   Max.   :28.58  
     Y1983           Y1984           Y1985           Y1986      
 Min.   :19.10   Min.   :19.13   Min.   :19.16   Min.   :19.20  
 1st Qu.:21.42   1st Qu.:21.45   1st Qu.:21.47   1st Qu.:21.49  
 Median :23.57   Median :23.64   Median :23.73   Median :23.82  
 Mean   :23.32   Mean   :23.37   Mean   :23.42   Mean   :23.48  
 3rd Qu.:25.02   3rd Qu.:25.06   3rd Qu.:25.11   3rd Qu.:25.20  
 Max.   :28.82   Max.   :29.05   Max.   :29.28   Max.   :29.52  
     Y1987           Y1988           Y1989           Y1990      
 Min.   :19.23   Min.   :19.27   Min.   :19.31   Min.   :19.35  
 1st Qu.:21.50   1st Qu.:21.52   1st Qu.:21.55   1st Qu.:21.57  
 Median :23.87   Median :23.93   Median :24.03   Median :24.14  
 Mean   :23.53   Mean   :23.59   Mean   :23.65   Mean   :23.71  
 3rd Qu.:25.27   3rd Qu.:25.34   3rd Qu.:25.37   3rd Qu.:25.39  
 Max.   :29.75   Max.   :29.98   Max.   :30.20   Max.   :30.42  
     Y1991           Y1992           Y1993           Y1994      
 Min.   :19.40   Min.   :19.45   Min.   :19.51   Min.   :19.59  
 1st Qu.:21.60   1st Qu.:21.65   1st Qu.:21.74   1st Qu.:21.76  
 Median :24.20   Median :24.19   Median :24.27   Median :24.36  
 Mean   :23.76   Mean   :23.82   Mean   :23.88   Mean   :23.94  
 3rd Qu.:25.42   3rd Qu.:25.48   3rd Qu.:25.54   3rd Qu.:25.62  
 Max.   :30.64   Max.   :30.85   Max.   :31.04   Max.   :31.23  
     Y1995           Y1996           Y1997           Y1998      
 Min.   :19.67   Min.   :19.71   Min.   :19.74   Min.   :19.77  
 1st Qu.:21.83   1st Qu.:21.89   1st Qu.:21.94   1st Qu.:22.00  
 Median :24.41   Median :24.42   Median :24.50   Median :24.49  
 Mean   :24.00   Mean   :24.07   Mean   :24.14   Mean   :24.21  
 3rd Qu.:25.70   3rd Qu.:25.78   3rd Qu.:25.85   3rd Qu.:25.94  
 Max.   :31.41   Max.   :31.59   Max.   :31.77   Max.   :31.95  
     Y1999           Y2000           Y2001           Y2002      
 Min.   :19.80   Min.   :19.83   Min.   :19.86   Min.   :19.84  
 1st Qu.:22.04   1st Qu.:22.12   1st Qu.:22.22   1st Qu.:22.29  
 Median :24.61   Median :24.66   Median :24.73   Median :24.81  
 Mean   :24.29   Mean   :24.36   Mean   :24.44   Mean   :24.52  
 3rd Qu.:26.01   3rd Qu.:26.09   3rd Qu.:26.19   3rd Qu.:26.30  
 Max.   :32.13   Max.   :32.32   Max.   :32.51   Max.   :32.70  
     Y2003           Y2004           Y2005           Y2006      
 Min.   :19.81   Min.   :19.79   Min.   :19.79   Min.   :19.80  
 1st Qu.:22.37   1st Qu.:22.45   1st Qu.:22.54   1st Qu.:22.63  
 Median :24.89   Median :25.00   Median :25.11   Median :25.24  
 Mean   :24.61   Mean   :24.70   Mean   :24.79   Mean   :24.89  
 3rd Qu.:26.38   3rd Qu.:26.47   3rd Qu.:26.53   3rd Qu.:26.59  
 Max.   :32.90   Max.   :33.10   Max.   :33.30   Max.   :33.49  
     Y2007           Y2008      
 Min.   :19.83   Min.   :19.87  
 1st Qu.:22.73   1st Qu.:22.83  
 Median :25.36   Median :25.50  
 Mean   :24.99   Mean   :25.10  
 3rd Qu.:26.66   3rd Qu.:26.82  
 Max.   :33.69   Max.   :33.90

`$` 提取指定元素

```{r}
# Histogram of BMIs from 2008
hist(bmi$Y2008)

# Scatter plot comparing BMIs from 1980 to those from 2008
plot(bmi$Y1980, bmi$Y2008)
```

Introduction to tidyr

gather()[gather][1]

gather 函数类似于 Excel(2016 起)中的逆透视功能,能把一个列名中含有变量取值的二维表(宽表)转换成一个规范的二维表(长表,类似数据库中关系的那种表,具体看例子)。
参数说明

第一个参数放的是原数据,数据类型要是一个数据框;

第二、三个参数是一对键/值列名(key 和 value),名字是自己起的,这两个值将作为新转换成的二维表的表头,即两个变量名;

第四个是选中要转置的列,这个参数不写的话就默认全部转置;

stu<-data.frame(grade=c("A","B","C","D","E"), female=c(5, 4, 1, 2, 3), male=c(1, 2, 3, 4, 5))

gather(stu, gender, count,-grade)

spread()

spread用来扩展表,把某一列的值(键值对)分开拆成多列。

# Apply spread() to bmi_long
bmi_wide <- spread(bmi_long, year, bmi_val)

# View the head of bmi_wide
head(bmi_wide)
              Country    Y1980    Y1981    Y1982    Y1983    Y1984    Y1985
1         Afghanistan 21.48678 21.46552 21.45145 21.43822 21.42734 21.41222
2             Albania 25.22533 25.23981 25.25636 25.27176 25.27901 25.28669
3             Algeria 22.25703 22.34745 22.43647 22.52105 22.60633 22.69501
4             Andorra 25.66652 25.70868 25.74681 25.78250 25.81874 25.85236
5              Angola 20.94876 20.94371 20.93754 20.93187 20.93569 20.94857
6 Antigua and Barbuda 23.31424 23.39054 23.45883 23.53735 23.63584 23.73109
     Y1986    Y1987    Y1988    Y1989    Y1990    Y1991    Y1992    Y1993
1 21.40132 21.37679 21.34018 21.29845 21.24818 21.20269 21.14238 21.06376
2 25.29451 25.30217 25.30450 25.31944 25.32357 25.28452 25.23077 25.21192
3 22.76979 22.84096 22.90644 22.97931 23.04600 23.11333 23.18776 23.25764
4 25.89089 25.93414 25.98477 26.04450 26.10936 26.17912 26.24017 26.30356
5 20.96030 20.98025 21.01375 21.05269 21.09007 21.12136 21.14987 21.13938
6 23.83449 23.93649 24.05364 24.16347 24.26782 24.36568 24.45644 24.54096
     Y1994    Y1995    Y1996    Y1997    Y1998    Y1999    Y2000    Y2001
1 20.97987 20.91132 20.85155 20.81307 20.78591 20.75469 20.69521 20.62643
2 25.22115 25.25874 25.31097 25.33988 25.39116 25.46555 25.55835 25.66701
3 23.32273 23.39526 23.46811 23.54160 23.61592 23.69486 23.77659 23.86256
4 26.36793 26.43569 26.50769 26.58255 26.66337 26.75078 26.83179 26.92373
5 21.14186 21.16022 21.19076 21.22621 21.27082 21.31954 21.37480 21.43664
6 24.60945 24.66461 24.72544 24.78714 24.84936 24.91721 24.99158 25.05857
     Y2002    Y2003    Y2004    Y2005    Y2006    Y2007    Y2008
1 20.59848 20.58706 20.57759 20.58084 20.58749 20.60246 20.62058
2 25.77167 25.87274 25.98136 26.08939 26.20867 26.32753 26.44657
3 23.95294 24.05243 24.15957 24.27001 24.38270 24.48846 24.59620
4 27.02525 27.12481 27.23107 27.32827 27.43588 27.53363 27.63048
5 21.51765 21.59924 21.69218 21.80564 21.93881 22.08962 22.25083
6 25.13039 25.20713 25.29898 25.39965 25.51382 25.64247 25.76602

separate()

The separate() function allows you to separate one column into multiple columns. Unless you tell it otherwise, it will attempt to separate on any character that is not a letter or number. You can also specify a specific separator using the sep argument.


> # Apply separate() to bmi_cc
> bmi_cc_clean <- separate(bmi_cc, col = Country_ISO, into = c("Country", "ISO"), sep = "/")
> 
> # Print the head of the result
> head(bmi_cc_clean)
              Country ISO  year  bmi_val
1         Afghanistan  AF Y1980 21.48678
2             Albania  AL Y1980 25.22533
3             Algeria  DZ Y1980 22.25703
4             Andorra  AD Y1980 25.66652
5              Angola  AO Y1980 20.94876
6 Antigua and Barbuda  AG Y1980 23.31424

unite()

unite() 是 separate() 的逆操作,用于把多列合并为一列;默认使用下划线 `_` 作为分隔符,也可以通过 sep 参数指定。

reference

[1] https://blog.csdn.net/six66667/article/details/84888644 "dplyr 包常见函"

猜你喜欢

转载自www.cnblogs.com/gaowenxingxing/p/12045714.html