关系数据可视化

本文的数据可视化实现基于 Python 3.9.4，需安装 matplotlib、numpy、pyecharts、pandas、plotly 等依赖库，可通过下述命令完成。

pip install matplotlib
pip install numpy
pip install -v pyecharts==1.1.0
pip install plotly
pip install pandas

散点图

又称XY散点图，将数据以点的形式展现，显示变量间的相互关系或者影响程度，点的位置由变量的数值决定。

场景：显示若干数据系列中各数值之间的关系，类似XY轴，判断两变量之间是否存在某种关联，或者发现数据的分布或聚合情况

优点：

可以展示数据的分布和聚合情况
适合展示较大的数据集

缺点：

散点图看上去比较乱，基本只能看相关、分布和聚合，其他信息均不能很好展现

类似图表： 气泡图

示例

import numpy as np
import matplotlib.pyplot as plt

# Basic scatter plot: every point gets a random position, colour value and
# marker size.
point_count = 50
xs = np.random.rand(point_count)
ys = np.random.rand(point_count)
hues = np.random.rand(point_count)
# Marker sizes are the squares of random values drawn from [0, 30).
sizes = (30 * np.random.rand(point_count)) ** 2

plt.scatter(xs, ys, s=sizes, c=hues, alpha=0.5)
plt.title('Scatter plot')
plt.show()

# Scatter plot on a polar axis: marker size grows with the squared radius and
# the colour value is the angle itself.
sample_count = 150
radius = 2 * np.random.rand(sample_count)
angle = 2 * np.pi * np.random.rand(sample_count)

polar_fig = plt.figure()
polar_ax = polar_fig.add_subplot(111, projection='polar')
polar_ax.scatter(
    angle,
    radius,
    c=angle,
    s=200 * radius**2,
    cmap='hsv',
    alpha=0.75,
)
plt.title('Scatter plot on polar axis')
plt.show()

from matplotlib.ticker import NullFormatter
# Scatter plot with marginal histograms: a central scatter panel plus one
# histogram above it (x values) and one to its right (y values).

# Random data: N standard-normal samples per coordinate.
N = 1000
x = np.random.randn(N)
y = np.random.randn(N)

# Formatter used to blank out the tick labels on the shared edges.
nullfmt = NullFormatter()

# Axes rectangles, each given as [left, bottom, width, height].
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
bottom_h = left_h = left + width + 0.02

rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom_h, width, 0.2]
rect_histy = [left_h, bottom, 0.2, height]

# Start with a square figure and keep a handle on it for the title below.
fig = plt.figure(1, figsize=(8, 8))

axScatter = plt.axes(rect_scatter)
axHistx = plt.axes(rect_histx)
axHisty = plt.axes(rect_histy)

# Hide the histogram tick labels that would duplicate the scatter panel's.
axHistx.xaxis.set_major_formatter(nullfmt)
axHisty.yaxis.set_major_formatter(nullfmt)

# The scatter plot itself.
axScatter.scatter(x, y)

# Symmetric limits: round the largest absolute value up to the next multiple
# of the bin width so the bins line up with the axis limits.
binwidth = 0.25
xymax = max(np.max(np.abs(x)), np.max(np.abs(y)))
lim = (int(xymax/binwidth) + 1) * binwidth

axScatter.set_xlim((-lim, lim))
axScatter.set_ylim((-lim, lim))

bins = np.arange(-lim, lim + binwidth, binwidth)
axHistx.hist(x, bins=bins)
axHisty.hist(y, bins=bins, orientation='horizontal')

# Keep the histograms aligned with the scatter panel.
axHistx.set_xlim(axScatter.get_xlim())
axHisty.set_ylim(axScatter.get_ylim())

# Title the whole figure. plt.title() would attach the text to the current
# axes -- the last one created, i.e. the right-hand histogram -- instead of
# the figure. The top histogram reaches 0.97 of the figure height, so the
# title is placed right at the top edge.
fig.suptitle('Scatter plot with histograms', y=0.995)
plt.show()

# Masked scatter plot: points are split by their distance from the origin and
# each side of the boundary is drawn with its own marker shape.
point_count = 100
boundary_radius = 0.6
px = 0.9 * np.random.rand(point_count)
py = 0.9 * np.random.rand(point_count)
sizes = (20 * np.random.rand(point_count)) ** 2
shade = np.sqrt(sizes)
distance = np.sqrt(px * px + py * py)

# Sizes are masked on one side of the boundary for each call, so the triangle
# series covers distance >= boundary_radius and the circle series the rest.
outer_sizes = np.ma.masked_where(distance < boundary_radius, sizes)
inner_sizes = np.ma.masked_where(distance >= boundary_radius, sizes)
plt.scatter(px, py, s=outer_sizes, marker='^', c=shade)
plt.scatter(px, py, s=inner_sizes, marker='o', c=shade)

# Quarter-circle arc marking the boundary between the two regions.
arc = np.arange(0, np.pi / 2, 0.01)
plt.plot(boundary_radius * np.cos(arc), boundary_radius * np.sin(arc))

plt.title('Scatter Masked')
plt.show()

# 3D scatter plot
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import

def randrange(n, vmin, vmax):
    """Return ``n`` random numbers scaled from [0, 1) into [vmin, vmax).

    The samples come from ``np.random.rand`` and are returned as a
    one-dimensional array of length ``n``.
    """
    span = vmax - vmin
    unit_samples = np.random.rand(n)
    return span * unit_samples + vmin

# 3D scatter plot of two random point clouds that differ in colour, marker
# and z range.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

points_per_series = 100

# (colour, marker, z lower bound, z upper bound) for each series; x is drawn
# from [23, 32) and y from [0, 100) for both.
series_styles = [
    ('r', 'o', -50, -25),
    ('b', '^', -30, -5),
]
for colour, marker, z_low, z_high in series_styles:
    ax.scatter(
        randrange(points_per_series, 23, 32),
        randrange(points_per_series, 0, 100),
        randrange(points_per_series, z_low, z_high),
        c=colour,
        marker=marker,
    )

for set_label, text in (
    (ax.set_xlabel, 'X Label'),
    (ax.set_ylabel, 'Y Label'),
    (ax.set_zlabel, 'Z Label'),
):
    set_label(text)
plt.title('3D scatter plot')
plt.show()

散点图
极坐标散点图
直方图散点图
分割线散点图

漏斗图

由多个梯形从上到下叠加而成。从上到下的项有逻辑上的顺序关系，梯形面积表示某个环节上业务量与上一个环节的差异。

场景： 适用于业务流程比较规范、周期长、环节多的单流程单项分析，通过漏斗各环节业务数据的比较能够直观地发现和说明问题所在的环节，进而做出决策。

说明：

漏斗图总是开始于一个100%的数量，结束于一个较小的数量；
在开始和结束之间由N个流程环节组成，每个环节用一个梯形来表示；
梯形的上底宽度表示当前环节的输入情况，下底表示当前环节的输出，上底与下底之间的差表现了在当前环节业务量的减小量，当前梯形边的斜率表现了当前环节的减小率；
漏斗图的所有环节的流量都应该使用同一个度量。

类似图表： 金字塔图、对称漏斗图（旋风）、对比漏斗图

示例

from plotly import graph_objects as go

# Single funnel chart with per-stage fill colours, per-stage outline styles
# and dotted connectors between stages.
fig = go.Figure(go.Funnel(
    y=["喜欢音乐", "喜欢中国音乐", "喜欢国风音乐", "喜欢近现代国风", "喜欢安九的歌曲"],
    x=[98367, 83164, 73532, 58172, 32829],
    textposition="inside",
    textinfo="value+percent initial",
    opacity=0.65,
    marker={
        "color": ["deepskyblue", "lightsalmon", "tan", "teal", "silver"],
        "line": {
            # One outline width per stage: the original list carried a sixth
            # entry with no matching stage or outline colour.
            "width": [4, 2, 2, 3, 1],
            "color": ["wheat", "wheat", "blue", "wheat", "wheat"],
        },
    },
    connector={
        "line": {
            "color": "royalblue",
            "dash": "dot",
            "width": 3,
        },
    },
))
fig.show()

# Multi-trace funnel: one trace per region over the same five stages.
stages = ["喜欢音乐", "喜欢中国音乐", "喜欢国风音乐", "喜欢近现代国风", "喜欢安九的歌曲"]

# (trace name, stage values, remaining Funnel keyword arguments) per region.
regions = [
    (
        '广东',
        [82732, 72635, 62735, 47272, 45262],
        {"textinfo": "value+percent initial"},
    ),
    (
        '台湾',
        [63738, 42826, 41272, 40832, 35262],
        {
            "orientation": "h",
            "textposition": "inside",
            "textinfo": "value+percent previous",
        },
    ),
    (
        '云南',
        [98273, 83736, 76353, 69354, 54362],
        {
            "orientation": "h",
            "textposition": "outside",
            "textinfo": "value+percent total",
        },
    ),
]

fig = go.Figure()
for region_name, stage_values, trace_options in regions:
    fig.add_trace(go.Funnel(
        name=region_name,
        y=list(stages),
        x=stage_values,
        **trace_options,
    ))

fig.show()

漏斗图
多层漏斗图

树图

通过树形结构来展现层级数据的组织关系，以父子层次结构来组织对象，是枚举法的一种表达方式

场景：适用于与组织结构相关的分析，即有明确的层次关系数据

优点：

展现层次关系
可以看到各层级指标间的关系，可进行简单的上卷、下钻等操作

缺点：

数据层级不宜过多
每层的成员不宜过多
无法展现各部分的占比关系

类似图表：矩阵树图

示例

from pyecharts import options as opts
from pyecharts.charts import Tree

def _leaf_nodes(names):
    """Return one leaf dict per name, valued 1000, 2000, ... in list order."""
    return [
        {"name": name, "value": 1000 * rank}
        for rank, name in enumerate(names, start=1)
    ]


# Poetry taxonomy as a single-rooted tree of nested dicts: branch nodes carry
# "name" and "children", leaf nodes carry "name" and "value".
data = [{
    "name": "诗歌分类",
    "children": [
        {
            "name": "古诗分类",
            "children": [
                {
                    "name": "按音律分类",
                    "children": _leaf_nodes(["古体诗", "近体诗", "词", "曲"]),
                },
                {
                    "name": "按内容分类",
                    "children": _leaf_nodes([
                        "叙事诗",
                        "抒情诗",
                        "送别诗",
                        "边塞诗",
                        "山水田园诗",
                        "咏史诗",
                        "咏物诗",
                        "悼亡诗",
                        "讽喻诗",
                    ]),
                },
            ],
        },
        {
            "name": "新诗分类",
            "children": _leaf_nodes([
                "叙事诗",
                "抒情诗",
                "格律诗",
                "自由诗",
                "散文诗",
                "韵脚诗",
                "现代诗",
            ]),
        },
    ],
}]

# Build the tree chart from `data` and write it to an HTML file.
c = Tree()
c.add(
    "诗歌分类",
    data,
    collapse_interval=10,
    # Orientation codes:
    #   LR: left to right    RL: right to left
    #   TB: top to bottom    BT: bottom to top
    orient="TB",
    # Divergent tree graph
    # layout="radial",
)
c.set_global_opts(title_opts=opts.TitleOpts(title="诗歌分类"))
c.render("诗歌分类.html")

矩阵树图

采用矩形表示层次结构的节点，父子层次关系用矩形间相互嵌套来表达。从根节点开始，空间根据相应的子节点数目被分成多个矩形，矩形的面积大小对应节点的属性，每个矩形又按照相应节点的子节点递归地进行分割，直到叶子节点为止。

场景：适合展现具有层级关系的数据，能够直观体现同级之间的比较

优点：

图形更紧凑，同样大小的画布可以展现更多信息
可以展现成员间的权重

缺点：

不够直观、明确，不像树图那么清晰
分类占比太小时不容易排布

类似图表：树图、马赛克图、热力图

from pyecharts import options as opts
from pyecharts.charts import TreeMap

def _treemap_group(name, value, members):
    """Return a parent entry whose children are built from (name, value) pairs.

    Keys are inserted in the order "value", "name" (then "children" for the
    parent).
    """
    return {
        "value": value,
        "name": name,
        "children": [
            {"value": member_value, "name": member_name}
            for member_name, member_value in members
        ],
    }


# Three top-level groups, each with a weight and a list of weighted members.
data = [
    _treemap_group(
        "四书",
        120,
        [("大学", 30), ("中庸", 40), ("论语", 40), ("孟子", 10)],
    ),
    _treemap_group(
        "五经",
        180,
        [("礼记", 30), ("尚书", 40), ("诗经", 40), ("周易", 25), ("春秋", 45)],
    ),
    _treemap_group(
        "六艺",
        240,
        [("礼", 30), ("乐", 40), ("射", 50), ("御", 25), ("书", 45), ("数", 50)],
    ),
]

# Build the treemap from `data` and write it to an HTML file.
c = TreeMap()
c.add("四书五经六艺", data)
c.set_global_opts(title_opts=opts.TitleOpts(title="四书五经六艺"))
c.render("四书五经六艺.html")

矩阵树图

桑基图

通过页面访问量PV和访客数量UV推算出网页的转化率，进而可以了解网站的整体运营效果和某一类商品的最终成交量

场景： 适用于电商或与营销相关的分析，比如分析购物网站中，哪些商品最畅销或者哪一个时间段是访问高峰

优点：

特别适合分析展现网站流量的运营数据
显示结果直观，可以清晰地看到各个维度指标变化的情况
支持以某个节点查看该节点所在的流程情况

缺点：

应用面窄，只能显示三级维度的流程数据
对显示的度量要求很严格

示例

import pandas as pd
from pyecharts.charts import Sankey
from pyecharts import options as opts

# Raw records: three label columns followed by a numeric count column.
df = pd.DataFrame({
    '舞蹈类别': ['古典舞', '民族舞', '惊鸿舞', '民族舞', '惊鸿舞', '古典舞', '惊鸿舞'],
    '舞蹈名称': ['唐宫夜宴', '喜上眉梢', '一舞倾城', '碎梦忧人', '飞雪玉花', '鸿音', '一舞倾城'],
    '推荐结果': ['未观看', '观看', '观看', '未观看', '未观看', '观看', "未观看"],
    '观看次数': [937, 732, 837, 635, 1823, 1298, 397],
})

# One node per distinct label, taken column by column from the first three
# columns.
nodes = []
for column_index in range(3):
    distinct_labels = df.iloc[:, column_index].unique()
    nodes.extend({'name': label} for label in distinct_labels)

# First-level links: counts summed per (first column, second column) pair.
first_level = df.groupby(['舞蹈类别', '舞蹈名称'])['观看次数'].sum().reset_index()
# Second-level links: the last three columns, row by row (not aggregated).
second_level = df.iloc[:, 1:]
first_level.columns = ['source', 'target', 'value']
second_level.columns = ['source', 'target', 'value']
result = pd.concat([first_level, second_level])

# One link dict per row of the combined table.
linkes = [
    {'source': source, 'target': target, 'value': value}
    for source, target, value in result.values
]

# Build the Sankey chart from `nodes` and `linkes` and write it to an HTML
# file.
link_style = opts.LineStyleOpts(opacity=0.3, curve=0.5, color='source')
node_labels = opts.LabelOpts(position='right')

pic = Sankey()
pic.add(
    '舞蹈推荐分析',
    nodes,
    linkes,
    linestyle_opt=link_style,
    label_opts=node_labels,
    node_gap=30,
    # orient="vertical",
)
pic.set_global_opts(title_opts=opts.TitleOpts(title='舞蹈推荐分析'))
pic.render('舞蹈推荐分析.html')

桑基图