【数据分析可视化】数据分箱技术Binning

分箱(Binning):把连续的数值按区间划分成若干组,可以形象地理解为把苹果按大小分级后装进不同的箱子。
在这里插入图片描述

import random
import string

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
# Simulate 20 exam scores to be binned.
# `high` is exclusive, so the generated scores lie in the range 35..99.
score_list = np.random.randint(low=35, high=100, size=20)
score_list
array([93, 35, 83, 44, 56, 62, 37, 86, 44, 82, 49, 91, 49, 82, 53, 89, 47,
       56, 38, 86])
# Grade boundaries. pd.cut builds right-closed intervals by default:
# (0, 59], (59, 70], (70, 80], (80, 100].
bins = [0, 59, 70, 80, 100]
# Assign every score to its interval; for array input pd.cut returns a
# pandas Categorical whose categories are the intervals above.
score_cut = pd.cut(score_list, bins=bins)
score_cut
[(80, 100], (0, 59], (80, 100], (0, 59], (0, 59], ..., (80, 100], (0, 59], (0, 59], (0, 59], (80, 100]]
Length: 20
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
# Count how many scores fall into each bin, most populated bin first.
# The top-level pd.value_counts() function is deprecated since pandas 2.1;
# wrapping the Categorical in a Series and calling its value_counts() method
# is the supported spelling and still lists empty bins with a count of 0.
Series(score_cut).value_counts()
(0, 59]      11
(80, 100]     8
(59, 70]      1
(70, 80]      0
dtype: int64
# Put the simulated scores into a DataFrame with a single 'score' column.
df = DataFrame({'score': score_list})
df
score
0 93
1 35
2 83
3 44
4 56
5 62
6 37
7 86
8 44
9 82
10 49
11 91
12 49
13 82
14 53
15 89
16 47
17 56
18 38
19 86
# Give every row a random 3-character student ID (letters and digits).
# pd.util.testing, which provided rands(), was deprecated in pandas 1.0 and
# removed in pandas 2.0, so the IDs are generated with the standard library.
id_chars = string.ascii_letters + string.digits
# One ID per existing row: len(df) keeps the column length in sync with the
# frame instead of hard-coding the sample size.
df['student'] = [''.join(random.choices(id_chars, k=3)) for _ in range(len(df))]
df
score student
0 93 8c1
1 35 cHy
2 83 6xy
3 44 6gY
4 56 tc5
5 62 r5T
6 37 3z3
7 86 vsy
8 44 F6h
9 82 hgC
10 49 xA9
11 91 iLZ
12 49 BVK
13 82 E9C
14 53 rbE
15 89 hSL
16 47 AIt
17 56 Gdk
18 38 AFX
19 86 JhU
# Bin the DataFrame's 'score' column with the same boundaries; for Series
# input pd.cut returns a Series of category dtype aligned to df's index.
pd.cut(x=df['score'], bins=bins)
0     (80, 100]
1       (0, 59]
2     (80, 100]
3       (0, 59]
4       (0, 59]
5      (59, 70]
6       (0, 59]
7     (80, 100]
8       (0, 59]
9     (80, 100]
10      (0, 59]
11    (80, 100]
12      (0, 59]
13    (80, 100]
14      (0, 59]
15    (80, 100]
16      (0, 59]
17      (0, 59]
18      (0, 59]
19    (80, 100]
Name: score, dtype: category
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
# Replace the interval notation with readable grade labels, one label per
# bin in ascending order, and store the result as a new 'Categories' column.
df['Categories'] = pd.cut(
    x=df['score'],
    bins=bins,
    labels=['low', 'ok', 'good', 'great'],
)
df
score student Categories
0 93 8c1 great
1 35 cHy low
2 83 6xy great
3 44 6gY low
4 56 tc5 low
5 62 r5T ok
6 37 3z3 low
7 86 vsy great
8 44 F6h low
9 82 hgC great
10 49 xA9 low
11 91 iLZ great
12 49 BVK low
13 82 E9C great
14 53 rbE low
15 89 hSL great
16 47 AIt low
17 56 Gdk low
18 38 AFX low
19 86 JhU great
原创文章 257 获赞 187 访问量 16万+

猜你喜欢

转载自blog.csdn.net/weixin_43469680/article/details/105644436