数据清洗整理基本操作(R:dplyr、tidyr、lubridate)

把以前在swirl课程学的基础数据清洗操作重新整理一遍,主要包括:

  1. dplyr包中的select、filter、arrange、mutate、group_by、summarize函数,以及%>% 管道操算符(pip operation),“then”的意思。
  2. tidyr包中的gather、seperate、spread函数。
  3. lubridate包中date型变量记录更新等操作。

1、dplyr包(select列选择、filter行选择、arrange排序、mutate新增、group_by分组统计、summarize汇总)

(1)首先读入数据并查看数据基本信息:

1 library(dplyr)
2 
3 ### Intro
4 path2csv <- file.path('2014-07-08.csv')
5 df <- read.csv(path2csv, as.is = TRUE)
6 dim(df)
7 head(df)
8 cran <- tbl_df(df)
9 cran

 (2)利用几个常用函数进行数据提取,筛选,排序等操作:

 1 ##基本函数
 2 select(cran,ip_id,package,country)  #cran为数据集名称,ip_id,package,country为列名
 3 select(cran,r_arch:country)  #r_arch:country表示从r_arch列取到country列
 4 select(cran,country:r_arch)  #反过来排序
 5 select(cran,-time)  #不取time列
 6 select(cran,-(X:size))  #不取X到size的列
 7 
 8 filter(cran,package=="swirl")  #cran为数据集,筛选package为”swirl“的所有行
 9 filter(cran,r_version=="3.1.1",country=="US")  #两个筛选条件
10 filter(cran,r_version<="3.0.2",country=="IN")
11 filter(cran,country=="US"|country=="IN")  #添加正则表达式
12 filter(cran,!is.na(r_version))  #筛选所有r_version非空的行
13 
14 arrange(cran2,ip_id)  #按ip_id排升序(默认排序方式)
15 arrange(cran2,desc(ip_id))  #按ip_id排降序
16 arrange(cran2,package,ip_id)  #先按package列排序,再对ip_id排序
17 
18 mutate(cran3,size_mb=size/2^20)  #利用已有列新建一列size_mb
19 mutate(cran3,size_mb=size/2^20,size_gb=size_mb/2^10)  #新建俩列
20 
21 summarise(cran3,avg_bytes=mean(size))  #汇总,查看整体统计数据

(3)分组统计并排序:

 1 by_package<-group_by(cran,package)  #按package列进行分组统计
 2 summarise(by_package,mean(size))  #计算分组后每组记录的平均size
 3 pack_sum <- summarize(by_package,  #对分组数据by_package进行汇总统计
 4                       count =n(),  #n()统计每组频数  
 5                       unique = n_distinct(ip_id),  #统计每组有多少不同的ip_id
 6                       countries = n_distinct(country),
 7                       avg_bytes = mean(size))
 8 
 9 quantile(pack_sum$unique, probs = 0.99)  #计算unique的99%分位数
10 top_unique<-filter(pack_sum,unique>465)  #筛选频数大于465的记录
11 top_unique_sorted<-arrange(top_unique,desc(unique))
12 top_countries <- filter(pack_sum, countries > 60)
13 result1 <- arrange(top_countries, desc(countries), avg_bytes)
14 print(result1)

(4)利用嵌套精简(3)的过程:

 1 result2 <-
 2   arrange(
 3     filter(
 4       summarize(  #分组数据用summarize进行汇总
 5         group_by(cran,
 6                  package
 7         ),
 8         count = n(),
 9         unique = n_distinct(ip_id),
10         countries = n_distinct(country),
11         avg_bytes = mean(size)
12       ),
13       countries > 60
14     ),
15     desc(countries),
16     avg_bytes
17   )
18 
19 print(result2)

(5)利用%>%符号操作(3)中的过程:

 1 # you read it, you can pronounce the %>% operator as
 2 # the word 'then'.
 3 result3 <-
 4   cran %>%
 5   group_by(package) %>%
 6   summarize(count = n(),
 7             unique = n_distinct(ip_id),
 8             countries = n_distinct(country),
 9             avg_bytes = mean(size)
10   ) %>%
11   filter(countries > 60) %>%
12   arrange(desc(countries), avg_bytes)
13 
14 # Print result to console
15 print(result3)

(6)几个函数及管道符号的一起使用:

1 cran %>%
2   select(ip_id, country, package, size) %>%
3   mutate(size_mb = size / 2^20) %>%
4   filter(size_mb <= 0.5) %>%
5   arrange(desc(size_mb)) %>%
6   print

2、tidyr包

(1)gather() 和 seperate():

1 library(tidyr)
2 library(readr)
3 library(dplyr)
4 
5 students2 %>%
6   gather( sex_class,count ,-grade ) %>%
7   separate( sex_class, c("sex", "class")) %>%
8   print

       

下面的gather( students2,sex_class,count ,-grade ) 
#gather Gather columns into key-value pairs;students2为数据集,grade为第一列,不参与gather;剩下的列名及数据作为键值对放入sex_class(key)和count(value)下。

接下来seperate将sex_class列分成俩列。separate :Separate one column into multiple columns.

(2)spread():   # spread:Spread a key-value pair across multiple columns.

1 students3 %>%
2   gather(class, grade, class1:class5, na.rm = TRUE) %>%
3   #name未参与gather,因此照原格式输出
4   spread(test, grade) %>%   
5   # spread:Spread a key-value pair across multiple columns.
6   mutate(class=parse_number(class)) %>%
7   # parse_numeric    Extract numeric component of variable.
8   print

(3)行合并以及列合并:

 1 student_info <- students4 %>%
 2   select(id, name, sex) %>%
 3   unique() %>%  #去除重复记录
 4   print
 5 
 6 gradebook <- students4 %>%
 7   select(id,class,midterm,final) %>%
 8   print
 9 
10 merge.data.frame(gradebook,student_info,by="id")
11 #列合并不同行数的数据框
12 
13 passed<-mutate(passed,status="passed")
14 failed<-mutate(failed,status="failed")
15 bind_rows(passed,failed)
16 #行合并俩个数据框

  

 

(4)几个函数一起使用:

1 sat %>%
2   select(-contains("total")) %>%
3   gather(part_sex, count, -score_range) %>%
4   separate(part_sex, c("part", "sex")) %>%
5   group_by(part,sex) %>%
6   mutate( total = sum(count),  #统计每组频数
7           prop = count / total) %>%  #统计每组里面各分数段比例
8   print

3、lubradate包

(1)获取当前日期时间

1 library(lubridate)

wday(today()) #get the day of the week,such that 1 = Sunday, 2 = Monday

(2)时间数据的录入

1 ymd("1989-05-17")
2 ymd("1989 May 17")
3 mdy("March 12,1975")
4 dmy(25081985)
5 ymd("1920/1/2")
6 
7 hms("03:22:14")
8 ymd_hms(c("2010-04-14-04-35-59", "2010-01-01 12:00:00"))
9 ymd_hms(now())

(3)更新时间数据,计算时间间隔

猜你喜欢

转载自www.cnblogs.com/lhjc/p/11295323.html