Pandas+groupby用法讲解

import pandas as pd
import numpy as np
#导入数据
type_specified={"trip_id":"object","bikeid":"object","from_station_id":"object","to_station_id":"object",
               "hour":"object","weekday":"object"}
ridership=pd.read_csv(r"E:\chicago_bikes_data\txt&csv\ridership_2016.txt",
                      nrows=10000,dtype=type_specified,usecols=range(1,15))
ridership.head()
trip_id starttime stoptime bikeid tripduration from_station_id from_station_name to_station_id to_station_name usertype gender birthyear hour weekday
0 9080551 3/31/2016 23:53 4/1/2016 0:07 155 841 344 Ravenswood Ave & Lawrence Ave 458 Broadway & Thorndale Ave Subscriber Male 1986.0 23 4
1 9080550 3/31/2016 23:46 3/31/2016 23:57 4831 649 128 Damen Ave & Chicago Ave 213 Leavitt St & North Ave Subscriber Male 1980.0 23 4
2 9080549 3/31/2016 23:42 3/31/2016 23:46 4232 210 350 Ashland Ave & Chicago Ave 210 Ashland Ave & Division St Subscriber Male 1979.0 23 4
3 9080548 3/31/2016 23:37 3/31/2016 23:55 3464 1045 303 Broadway & Cornelia Ave 458 Broadway & Thorndale Ave Subscriber Male 1980.0 23 4
4 9080547 3/31/2016 23:33 3/31/2016 23:37 1750 202 334 Lake Shore Dr & Belmont Ave 329 Lake Shore Dr & Diversey Pkwy Subscriber Male 1969.0 23 4
#groupby可以通过传入需要分组的参数实现对数据的分组
ridership_user=ridership.groupby("usertype")
ridership_user_week=ridership.groupby(["usertype","weekday"])
#groupby之后的数据并不是DataFrame格式的数据,而是特殊的groupby类型,此时,可以通过size()方法返回分组后的记录数目统计结果,该结果
#是Series类型
ridership_user.size()
usertype
Customer      1023
Subscriber    8977
dtype: int64
ridership_user_week.size()
usertype    weekday
Customer    3           286
            4           737
Subscriber  3          2714
            4          6263
dtype: int64
#可以通过索引的方式实现对分组后各类别记录数的提取
ridership_user_week.size().loc["Customer","3"]
286
#通过groupby完成对数据的分组后,可以通过get_group方法来获取某一制定分组的结果
ridership_user.get_group("Customer").head()
trip_id starttime stoptime bikeid tripduration from_station_id from_station_name to_station_id to_station_name usertype gender birthyear hour weekday
23 9080526 3/31/2016 23:04 3/31/2016 23:32 1638 1648 505 Winchester Ave & Elston Ave 505 Winchester Ave & Elston Ave Customer NaN NaN 23 4
65 9080481 3/31/2016 22:24 3/31/2016 22:45 3438 1220 152 Lincoln Ave & Diversey Pkwy 316 Damen Ave & Sunnyside Ave Customer NaN NaN 22 4
77 9080463 3/31/2016 22:15 3/31/2016 22:17 3464 97 47 State St & Kinzie St 47 State St & Kinzie St Customer NaN NaN 22 4
113 9080426 3/31/2016 22:02 3/31/2016 22:29 2716 1582 340 Clark St & Wrightwood Ave 161 Rush St & Superior St Customer NaN NaN 22 4
115 9080424 3/31/2016 22:02 3/31/2016 22:29 1052 1611 340 Clark St & Wrightwood Ave 161 Rush St & Superior St Customer NaN NaN 22 4
#对于多层分组后的数据,如果需要使用多重索引,此处需要传入一个元组
ridership_user_week.get_group(("Customer","3")).head()
trip_id starttime stoptime bikeid tripduration from_station_id from_station_name to_station_id to_station_name usertype gender birthyear hour weekday
7002 9071238 3/30/2016 23:57 3/31/2016 0:03 4018 317 305 Western Ave & Division St 374 Western Ave & Walton St Customer NaN NaN 23 3
7007 9071233 3/30/2016 23:51 3/30/2016 23:56 4018 316 374 Western Ave & Walton St 305 Western Ave & Division St Customer NaN NaN 23 3
7014 9071226 3/30/2016 23:28 3/30/2016 23:45 1142 993 71 Morgan St & Lake St 15 Racine Ave & 18th St Customer NaN NaN 23 3
7117 9071031 3/30/2016 20:13 3/30/2016 20:34 1919 1302 284 Michigan Ave & Jackson Blvd 173 Mies van der Rohe Way & Chicago Ave Customer NaN NaN 20 3
7118 9071030 3/30/2016 20:13 3/30/2016 20:34 2909 1312 284 Michigan Ave & Jackson Blvd 173 Mies van der Rohe Way & Chicago Ave Customer NaN NaN 20 3
#对于分组后的数据,如果还需要进行一系列查询,可以使用apply方法,传入一个处理的函数
#假设我们需要查找上述分组后每组中用时最长的一组记录的起始和终止站点,可执行如下命令
max_time_from_id=ridership_user_week.apply(lambda record:record["from_station_id"][record["tripduration"].idxmax()])
max_time_to_id=ridership_user_week.apply(lambda record:record["to_station_id"][record["tripduration"].idxmax()])
max_time_time=ridership_user_week.apply(lambda record:record["tripduration"][record["tripduration"].idxmax()])
max_time_ridership=pd.DataFrame({"from_station_id":max_time_from_id,"to_station_id":max_time_to_id,
                                "tripduratio":max_time_time})
max_time_ridership
from_station_id to_station_id tripduratio
usertype weekday
Customer 3 35 277 81702
4 340 229 68571
Subscriber 3 287 254 71758
4 133 49 70977

猜你喜欢

转载自blog.csdn.net/zhangxiaojiakele/article/details/78198568