❝本文分享最常用的「11个分布(Distribution)关系图」。
❞
目录
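21、连续变量堆积直方图(Stacked Histogram for Continuous Variable)
22、类别变量堆积直方图(Stacked Histogram for Categorical Variable)
23、密度图(Density Plot)
24、带直方图的密度图(Density Curves with Histogram)
25、山峰叠峦图(Joy Plot)
26、分布点图(Distributed Dot Plot)
27、箱图(boxplot)
28、箱图结合点图(Dot + Box Plot)
29、小提琴图(Violin Plot)
30、金字塔图(Population Pyramid)
31、分类图(Categorical Plots)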
四、分布(Distribution)关系图
21、连续变量堆积直方图(Stacked Histogram for Continuous Variable)
该图展示给定连续变量的频率分布。
# Stacked histogram of a continuous variable, one colour per group.

# Import Data
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Prepare data: one list of `x_var` values per `groupby_var` group.
x_var = 'displ'
groupby_var = 'class'
df_agg = df.loc[:, [x_var, groupby_var]].groupby(groupby_var)
# Take the labels from the same groupby iteration as the values so the
# legend order is guaranteed to match the order of the stacked datasets.
group_names = [name for name, _ in df_agg]
vals = [group[x_var].values.tolist() for _, group in df_agg]

# Draw
plt.figure(figsize=(10, 6), dpi=80)
# Spread the groups evenly over the Set1 colormap; max(..., 1) avoids a
# division by zero when there is only a single group.
color_span = max(len(vals) - 1, 1)
colors = [plt.cm.Set1(i / float(color_span)) for i in range(len(vals))]
n, bins, patches = plt.hist(vals,
                            30,
                            stacked=True,
                            density=False,
                            color=colors)

# Decoration
plt.legend(group_names)
plt.title(f"Stacked Histogram of ${x_var}$ colored by ${groupby_var}$",
          fontsize=22)
plt.xlabel(x_var)
plt.ylabel("Frequency")
# Label every third bin edge to keep the axis readable.
plt.xticks(ticks=bins[::3], labels=[round(b, 1) for b in bins[::3]])
plt.show()
22、类别变量堆积直方图(Stacked Histogram for Categorical Variable)
该图展示给定类别变量的频率分布。
# Stacked histogram of a categorical variable, one colour per group.

# Import Data
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Prepare data
x_var = 'manufacturer'
groupby_var = 'class'
# Encode every category as its position in the sorted category list, so the
# bar positions and the tick labels below are derived from the same ordering.
categories = np.unique(df[x_var]).tolist()
code_of = {category: pos for pos, category in enumerate(categories)}
df_agg = df.loc[:, [x_var, groupby_var]].groupby(groupby_var)
group_names = [name for name, _ in df_agg]
vals = [group[x_var].map(code_of).tolist() for _, group in df_agg]

# Draw
plt.figure(figsize=(10, 6), dpi=80)
# max(..., 1) avoids a division by zero when there is only a single group.
color_span = max(len(vals) - 1, 1)
colors = [plt.cm.Set1(i / float(color_span)) for i in range(len(vals))]
# One unit-wide bin centred on each category code.
bin_edges = np.arange(len(categories) + 1) - 0.5
n, bins, patches = plt.hist(vals,
                            bin_edges,
                            stacked=True,
                            density=False,
                            color=colors)

# Decoration
plt.legend(group_names)
plt.title(f"Stacked Histogram of ${x_var}$ colored by ${groupby_var}$",
          fontsize=22)
plt.xlabel(x_var)
plt.ylabel("Frequency")
plt.ylim(0, 40)
# Exactly one tick per category (tick and label counts must match), placed
# at the bin centre.
plt.xticks(ticks=np.arange(len(categories)),
           labels=categories,
           rotation=90,
           horizontalalignment='center')
plt.show()
了解更多直方图:
23、密度图(Density Plot)
该图展示连续变量的分布情况。
# Kernel density estimate of city mileage, one curve per cylinder count.

# Import Data
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Apply the theme before the figure exists: style settings are picked up
# when the axes are created, so setting them after plotting does not restyle
# this figure.
sns.set(style="whitegrid", font_scale=1.1)

# Draw Plot
plt.figure(figsize=(10, 8), dpi=80)
# Cylinder count -> fill colour of its density curve.
kde_colors = {4: "#01a2d9", 5: "#dc2624", 6: "#C89F91", 8: "#649E7D"}
for n_cyl, curve_color in kde_colors.items():
    sns.kdeplot(df.loc[df['cyl'] == n_cyl, "cty"],
                shade=True,
                color=curve_color,
                label=f"Cyl={n_cyl}",
                alpha=.7)

# Decoration
plt.title('Density Plot of City Mileage by n_Cylinders', fontsize=18)
plt.legend()
plt.show()
24、带直方图的密度图(Density Curves with Histogram)
# Histogram with an overlaid density curve of city mileage for three
# vehicle classes.

# Import Data
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Apply the theme before the figure exists so it takes effect on this plot.
sns.set(style="whitegrid", font_scale=1.1)

# Draw Plot
plt.figure(figsize=(10, 8), dpi=80)
# (value in the 'class' column, legend label, colour).
# The minivan entry previously had the hex colour in `label` and "g" as the
# colour, so the legend displayed "#C89F91" instead of the class name.
series_specs = [
    ('compact', "Compact", "#01a2d9"),
    ('suv', "SUV", "#dc2624"),
    ('minivan', "Minivan", "#C89F91"),
]
for class_name, legend_label, series_color in series_specs:
    sns.distplot(df.loc[df['class'] == class_name, "cty"],
                 color=series_color,
                 label=legend_label,
                 hist_kws={'alpha': .7},
                 kde_kws={'linewidth': 3})
plt.ylim(0, 0.35)

# Decoration
plt.title('Density Plot of City Mileage by Vehicle Type', fontsize=18)
plt.legend()
plt.show()
更多核密度图:
25、山峰叠峦图(Joy Plot)
该图展示大量分组之间的关系,比heatmap形象。
# Dependency: joypy must be installed first, e.g. run "pip install joypy" in
# a shell (or "!pip install joypy" in a notebook cell). Keep any trailing
# comment separated from the package name by a space, otherwise it becomes
# part of the requirement string.
# joypy draws one kernel density curve per group (the R counterpart is ggjoy).
import joypy

# Import Data
mpg = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Draw Plot
# joyplot builds its own figure (sized by `figsize`), so no separate
# plt.figure() call is made here; a prior one would only leave an empty
# extra figure behind.
fig, axes = joypy.joyplot(mpg,
                          column=['hwy', 'cty'],
                          by="class",
                          ylim='own',
                          colormap=plt.cm.Set1,
                          figsize=(10, 6))

# Decoration
plt.title('Joy Plot of City and Highway Mileage by Class', fontsize=18)
plt.show()
26、分布点图(Distributed Dot Plot)
分布点图显示了按组划分的点的单变量分布。点是半透明的,重叠越多颜色越深,因此点色越深,该区域中数据点的集中度越高。通过对中位数进行不同的着色,各组的实际位置会立即变得明显。
# Distributed dot plot: every car's city mileage as a translucent dot on its
# manufacturer's row, with the per-manufacturer median highlighted in red.
import matplotlib.patches as mpatches

# Prepare Data
df_raw = pd.read_csv("./datasets/mpg_ggplot2.csv")
cyl_colors = {4: 'tab:red', 5: 'tab:green', 6: 'tab:blue', 8: 'tab:orange'}
df_raw['cyl_color'] = df_raw.cyl.map(cyl_colors)

# Mean and median city mileage by make.
# Select the numeric column before aggregating: averaging a frame that still
# contains the string 'manufacturer' column fails on current pandas.
df = df_raw.groupby('manufacturer')[['cty']].mean()
df.sort_values('cty', ascending=False, inplace=True)
# After the reset the positional index (0..n-1) is the row position on the
# y axis, ordered by descending mean.
df.reset_index(inplace=True)
df_median = df_raw.groupby('manufacturer')[['cty']].median()

# Draw horizontal lines
fig, ax = plt.subplots(figsize=(11, 7), dpi=80)
ax.hlines(y=df.index,
          xmin=0,
          xmax=40,
          color='#01a2d9',
          alpha=0.5,
          linewidth=.5,
          linestyles='dashdot')

# Draw the Dots
for i, make in enumerate(df.manufacturer):
    df_make = df_raw.loc[df_raw.manufacturer == make, :]
    # All observations of this make; overlapping translucent dots look darker.
    ax.scatter(y=np.repeat(i, df_make.shape[0]),
               x='cty',
               data=df_make,
               s=75,
               edgecolors='#01a2d9',
               c='w',
               alpha=0.5)
    # Median of this make, drawn on top in red.
    ax.scatter(y=i,
               x='cty',
               data=df_median.loc[df_median.index == make, :],
               s=75,
               c='#dc2624')

# Annotate
# Raw string: "\;" and "\:" are mathtext spacing commands, not Python escapes.
ax.text(33,
        13,
        r"$red \; dots \; are \; the \: median$",
        fontdict={'size': 12},
        color='#dc2624')

# Decorations
# Empty plot used only to obtain a legend handle for the median marker.
red_patch = plt.plot([], [],
                     marker="o",
                     ms=10,
                     ls="",
                     mec=None,
                     color='#dc2624',
                     label="Median")
plt.legend(handles=red_patch)
ax.set_title('Distribution of City Mileage by Make', fontdict={'size': 18})
ax.set_xlabel('Miles Per Gallon (City)')
ax.set_yticks(df.index)
ax.set_yticklabels(df.manufacturer.str.title(),
                   fontdict={'horizontalalignment': 'right'})
ax.set_xlim(1, 40)
for side in ("top", "bottom", "right", "left"):
    ax.spines[side].set_visible(False)
plt.grid(axis='both', alpha=.4, linewidth=.1)
plt.show()
27、箱图(boxplot)
箱图能很好地展示数据的分布情况。
# Box plot of highway mileage per vehicle class, annotated with the number
# of observations behind each box.

# Import Data
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Apply the theme before the figure exists so it takes effect on this plot.
sns.set(style="whitegrid", font_scale=1.1)

# Draw Plot
plt.figure(figsize=(10, 6), dpi=80)
sns.boxplot(
    x='class',
    y='hwy',
    data=df,
    notch=False,
    palette="Set1",
)


# Add N Obs inside boxplot (optional)
def add_n_obs(df, group_col, y):
    """Write "#obs : N" just above the median of each box on the current axes.

    Args:
        df: Data frame that was used to draw the box plot.
        group_col: Column that defines the boxes (the x variable).
        y: Column whose distribution is plotted.

    Both the median and the count are looked up by tick label. Group
    statistics come back in sorted key order while the boxes follow the
    plot's own order, so pairing the two positionally can attach a count to
    the wrong box.
    """
    grouped = df.groupby(group_col)[y]
    # Key by str(...) so the lookup matches the tick label text.
    medians = {str(key): value for key, value in grouped.median().items()}
    counts = {str(key): value for key, value in grouped.size().items()}
    tick_labels = [tick.get_text() for tick in plt.gca().get_xticklabels()]
    for x_pos, tick_label in enumerate(tick_labels):
        plt.text(x_pos,
                 medians[tick_label] * 1.01,
                 "#obs : " + str(counts[tick_label]),
                 horizontalalignment='center',
                 fontdict={'size': 12},
                 color='black')


add_n_obs(df, group_col='class', y='hwy')

# Decoration
plt.title('Box Plot of Highway Mileage by Vehicle Class', fontsize=16)
plt.ylim(10, 40)
plt.show()
28、箱图结合点图(Dot + Box Plot)
该图展示箱图及箱图绘制所用的详细点。
# Box plot per vehicle class (split by cylinder count) with the individual
# observations overlaid as jittered dots.

# Import Data
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Draw Plot
plt.figure(figsize=(13, 10), dpi=80)
sns.boxplot(x='class', y='hwy', data=df, hue='cyl', palette="Set1")
plt.legend(loc=9)
sns.stripplot(x='class', y='hwy', data=df, color='#dc2624', size=5, jitter=1)

# Faint vertical separators halfway between neighbouring classes.
n_classes = len(df['class'].unique())
for left_class in range(n_classes - 1):
    plt.vlines(left_class + .5,
               10,
               45,
               linestyles='solid',
               colors='gray',
               alpha=0.2)

# Decoration
plt.title('Box Plot of Highway Mileage by Vehicle Class', fontsize=18)
plt.show()
更多关于箱图:
29、小提琴图(Violin Plot)
比箱图更好看,但不常用。小提琴在某一位置的宽度由该位置数据出现的频次(密度)决定。
# Violin plot of highway mileage per vehicle class.

# Import Data
df = pd.read_csv("./datasets/mpg_ggplot2.csv")

# Draw Plot
plt.figure(figsize=(13, 10), dpi=80)
# Keyword arguments collected in one place; passed unchanged to seaborn.
violin_options = {
    'x': 'class',
    'y': 'hwy',
    'data': df,
    'scale': 'width',
    'palette': 'Set1',
    'inner': 'quartile',
}
sns.violinplot(**violin_options)

# Decoration
plt.title('Violin Plot of Highway Mileage by Vehicle Class', fontsize=18)
plt.show()
30、金字塔图(Population Pyramid)
可以理解为一种排过序的分组水平柱状图(barplot),能很好地展示不同分组之间的差异,也可用于可视化逐级过滤或漏斗的每个阶段。
# Population pyramid: horizontal bars of users per funnel stage, one colour
# per group.

# Read data
df = pd.read_csv("./datasets/email_campaign_funnel.csv")

# Draw Plot
plt.figure(figsize=(12, 8), dpi=80)
group_col = 'Gender'
# Stages in reverse order of first appearance in the data.
order_of_bars = df.Stage.unique()[::-1]
# Compute the distinct groups once instead of on every use.
groups = df[group_col].unique()
# max(..., 1) avoids a division by zero when there is only a single group.
color_span = max(len(groups) - 1, 1)
colors = [plt.cm.Set1(i / float(color_span)) for i in range(len(groups))]

for c, group in zip(colors, groups):
    sns.barplot(x='Users',
                y='Stage',
                data=df.loc[df[group_col] == group, :],
                order=order_of_bars,
                color=c,
                label=group)

# Decorations
plt.xlabel("$Users$")
plt.ylabel("Stage of Purchase")
plt.yticks(fontsize=12)
plt.title("Population Pyramid of the Marketing Funnel", fontsize=18)
plt.legend()
plt.show()
31、分类图(Categorical Plots)
展示彼此相关多个(>=2个)分类变量的计数分布,其实就是seaborn的分面图。
# Faceted categorical plots of the titanic data set.

# Load Dataset
titanic = pd.read_csv('./datasets/titanic.csv')

# Plot 1: count of 'alive' values, one facet per deck (rows without a deck
# are dropped).
# `x` is passed by keyword: a bare first positional argument is not treated
# as `x` by newer seaborn releases.
g = sns.catplot(x="alive",
                col="deck",
                col_wrap=4,
                data=titanic[titanic.deck.notnull()],
                kind="count",
                height=3.5,
                aspect=.8,
                palette='Set1')
plt.show()

# Plot 2: horizontal violins of age per embarkation town, split by sex, one
# facet per passenger class.
sns.catplot(x="age",
            y="embark_town",
            hue="sex",
            col="class",
            data=titanic[titanic.embark_town.notnull()],
            orient="h",
            height=5,
            aspect=1,
            palette="Set1",
            kind="violin",
            dodge=True,
            cut=0,
            bw=.2)
# Show the second figure as well, as every other example here does.
plt.show()
更多关于分面图:
有用请“点赞”“在看”“分享”