测试minpy 调用gpu 加速矩阵相乘. accelerate matrix multiplication

测试minpy 调用gpu加速矩阵相乘,已经写了几篇文章.前几篇文章得到的结果不太好,主要原因是跟想象中的结果并不是很相同.
主要有两点:一是前几篇测试的加速效果并不是很好,矩阵要很大的时候才能看到明显的加速;二是我的先验经验认为float32的加速效果应该明显好于float64的加速效果. 但是这两点在前面的测试中都没有得到验证.
从这里就能感受到理论跟实验之间的差距.如果你相信理论,相信自己的先验经验知识,那就说明你的实验存在局限性.但关键点是我们不知道实验的局限性在哪里,我们手里也没有准确的、可以对比的实验结果数据,也就是说我们不知道实验的哪个地方出了问题.我们尝试去找问题出现在哪里,但如果我们找不到呢? 是把自己的实验结果搁置,还是从心里去怀疑这个先验知识呢? 我们一直得不到与我们认为的理论相近的结果.
前几篇文章关于这个gpu加速主题,基本上就是围绕这个过程.但是依然没有得到我们想要的结果.
这篇文章的测试可能依然有局限性,也就是说结果也可能不太准确.
这里我们只是把这个过程记录下来.

main.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#####################################
# File name : main.py
# Create date : 2019-01-10 16:39
# Modified date : 2019-01-11 14:52
# Author : DARREN
# Describe : not set
# Email : [email protected]
#####################################
from __future__ import division
from __future__ import print_function

import os
import minpy.numpy as np
import minpy.numpy.random as random
from minpy.context import cpu, gpu
import time

import matplotlib.pyplot as plt

def create_path(path):
    """Ensure that the directory ``path`` exists.

    Args:
        path: Directory to create. Missing parent directories are created
            as well. Nothing happens when the directory is already there.
    """
    already_there = os.path.isdir(path)
    if already_there:
        return
    os.makedirs(path)

def get_file_full_name(path, name):
    """Return ``name`` appended to directory ``path``, creating the directory first.

    Args:
        path: Directory that will hold the file; created via ``create_path``.
        name: File name to append.

    Returns:
        ``path`` and ``name`` joined by a single "/". No extra separator is
        inserted when ``path`` already ends with "/".
    """
    create_path(path)
    separator = "" if path[-1] == "/" else "/"
    return path + separator + name

def create_file(path, name, open_type='w'):
    """Open the file ``name`` inside directory ``path`` and return the file object.

    Args:
        path: Directory for the file (created by ``get_file_full_name``).
        name: File name inside ``path``.
        open_type: Mode string handed to ``open``; defaults to ``'w'``.

    Returns:
        The opened file object. The caller is responsible for closing it.
    """
    return open(get_file_full_name(path, name), open_type)

def _plot_record(record,full_path):
    """Draw every chart for a finished run.

    Args:
        record: Per-size results, passed unchanged to each plotting helper.
        full_path: Output directory, passed unchanged to each plotting helper.
    """
    # Timing chart first, then the speed-up chart.
    for draw in (_plot_cpu_gpu_time, _plot_acceleration):
        draw(record, full_path)

def _get_full_path(repeats, size_begin, size_end):
    """Build the output directory name for one benchmark run.

    Makes sure ``./output`` exists, then returns
    ``./output/<repeats>_<size_begin>_<size_end>``. The returned directory
    itself is not created here.

    Args:
        repeats: Repeat count, used as the first name component.
        size_begin: First matrix size, used as the second name component.
        size_end: End matrix size, used as the third name component.

    Returns:
        The run directory path as a string.
    """
    output_root = "./output"
    root_missing = not os.path.exists(output_root)
    if root_missing:
        os.makedirs(output_root)
    return "./output/%s_%s_%s" % (repeats, size_begin, size_end)

def _plot_cpu_gpu_time(record, full_path):
    """Plot the four timing series and save them as ``cpu_gpu.jpg``.

    Args:
        record: Mapping whose keys are sortable and whose values are dicts
            with the entries "float32_cpu", "float64_cpu", "float32_gpu"
            and "float64_gpu". Keys are visited in ascending order.
        full_path: Existing directory that receives ``cpu_gpu.jpg``.
    """
    ordered_keys = sorted(record)

    def _series(field):
        # Go through the module's ``np`` array type and back via asnumpy(),
        # as the plotting call is handed the asnumpy() result.
        values = [record[key][field] for key in ordered_keys]
        return np.array(values).asnumpy()

    float32_cpu = _series("float32_cpu")
    float64_cpu = _series("float64_cpu")
    float32_gpu = _series("float32_gpu")
    float64_gpu = _series("float64_gpu")

    # X axis is key * key, kept as a column (one key per row).
    key_column = np.array([[key] for key in ordered_keys])
    x_values = (key_column * key_column).asnumpy()

    # Plot order matches the legend order below: GPU curves first.
    handles = []
    for series in (float32_gpu, float64_gpu, float32_cpu, float64_cpu):
        line, = plt.plot(x_values, series)
        handles.append(line)

    legend_names = (
        "float32 gpu",
        "float64 gpu",
        "float32 cpu",
        "float64 cpu",
    )
    plt.legend(handles=handles, labels=legend_names, loc='best')
    plt.savefig("%s/cpu_gpu.jpg" % (full_path))
    plt.close()

def _plot_acceleration(record, full_path):
    """Plot the two speed-up series and save them as ``acceleration.jpg``.

    Args:
        record: Mapping whose keys are sortable and whose values are dicts
            with the entries "float64_acceleration" and
            "float32_acceleration". Keys are visited in ascending order.
        full_path: Existing directory that receives ``acceleration.jpg``.
    """
    ordered_keys = sorted(record)

    def _series(field):
        # Go through the module's ``np`` array type and back via asnumpy().
        values = [record[key][field] for key in ordered_keys]
        return np.array(values).asnumpy()

    float64_speedup = _series("float64_acceleration")
    float32_speedup = _series("float32_acceleration")

    # X axis is key * key, kept as a column (one key per row).
    key_column = np.array([[key] for key in ordered_keys])
    x_values = (key_column * key_column).asnumpy()

    # float32 is drawn before float64, matching the legend order.
    handles = []
    for series in (float32_speedup, float64_speedup):
        line, = plt.plot(x_values, series)
        handles.append(line)

    legend_names = (
        'float32 acceleration',
        'float64 acceleration',
    )
    plt.legend(handles=handles, labels=legend_names, loc='best')
    plt.savefig("%s/acceleration.jpg" % (full_path))
    plt.close()

def _write_status(file_obj, i, time_lt):
    """Report one measurement row to ``file_obj`` and to standard output.

    Args:
        file_obj: Writable text object; receives one line per value.
        i: Label printed after "i:" on the four timing lines.
        time_lt: Indexable timings in the order
            [float64 cpu, float32 cpu, float64 gpu, float32 gpu].

    The speed-up figures are the cpu time divided by the gpu time for the
    same dtype. All six lines are written to the file first, then printed.
    """
    speedup_32 = time_lt[1] / time_lt[3]
    speedup_64 = time_lt[0] / time_lt[2]

    report = [
        "i:%s float64 cpu:%s" % (i, time_lt[0]),
        "i:%s float32 cpu:%s" % (i, time_lt[1]),
        "i:%s float64 gpu:%s" % (i, time_lt[2]),
        "i:%s float32 gpu:%s" % (i, time_lt[3]),
        "float32 acceleration:%s" % speedup_32,
        "float64 acceleration:%s" % speedup_64,
    ]

    for text in report:
        file_obj.write("%s\n" % text)

    for text in report:
        print(text)

def _record_status(record, i,time_lt):
    """Store one measurement row in ``record`` under key ``i``.

    Args:
        record: Dict updated in place.
        i: Key for the new entry (an existing entry is overwritten).
        time_lt: Indexable timings in the order
            [float64 cpu, float32 cpu, float64 gpu, float32 gpu].

    The stored dict holds the four timings plus the cpu/gpu ratio for each
    dtype under "float64_acceleration" and "float32_acceleration".
    """
    record[i] = {
        "float64_cpu": time_lt[0],
        "float32_cpu": time_lt[1],
        "float64_gpu": time_lt[2],
        "float32_gpu": time_lt[3],
        "float64_acceleration": time_lt[0] / time_lt[2],
        "float32_acceleration": time_lt[1] / time_lt[3],
    }

def _randn(l,c):
    """Return the result of ``random.randn`` called with dimensions ``l`` and ``c``.

    Args:
        l: First dimension passed to ``random.randn``.
        c: Second dimension passed to ``random.randn``.
    """
    dims = (l, c)
    return random.randn(*dims)

def _get_take_time(s, repeats, data_type):
    """Measure the average wall-clock time of one ``np.dot`` on random matrices.

    Two random ``s`` x ``s`` matrices are created and converted to
    ``data_type`` before the clock starts, so only the multiplications and
    the final ``asnumpy()`` call are timed.

    Args:
        s: Side length of the square operands.
        repeats: Number of multiplications to time; must be at least 1.
        data_type: dtype passed to ``np.array`` for both operands.

    Returns:
        Elapsed seconds divided by ``repeats``.

    Raises:
        ValueError: If ``repeats`` is smaller than 1. Without this check the
            loop body never runs, the result variable is never bound, and
            the function fails with an unrelated-looking error.
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1, got %s" % (repeats,))

    x = _randn(s, s)
    y = _randn(s, s)
    x = np.array(x, dtype=data_type)
    y = np.array(y, dtype=data_type)

    t0 = time.time()
    for _ in range(repeats):
        z = np.dot(x, y)
    # NOTE(review): the asnumpy() call sits inside the timed region,
    # presumably to force the pending computation to finish before the clock
    # stops -- confirm against minpy's execution model.
    z.asnumpy()
    t1 = time.time()

    return (t1 - t0) / repeats

def test_cpu_gpu(repeats,size_begin, size_end, step=1):
    """Benchmark float64/float32 matrix products on CPU and GPU over a size range.

    For every size in ``range(size_begin, size_end, step)`` the four timings
    are measured, logged to the file ``output`` inside the run directory,
    and collected. Once all sizes are done the charts are drawn into the
    same directory.

    Args:
        repeats: Multiplications per measurement.
        size_begin: First matrix side length (inclusive).
        size_end: End of the size range (exclusive).
        step: Increment between sizes; defaults to 1.
    """
    record = {}
    full_path = _get_full_path(repeats, size_begin, size_end)
    file_obj = create_file(full_path, "output")
    # Close the log even when a measurement raises, so the lines written so
    # far are not lost and the file handle is not leaked.
    try:
        for s in range(size_begin, size_end, step):
            with cpu():
                float64_cpu_time = _get_take_time(s, repeats, np.float64)
                float32_cpu_time = _get_take_time(s, repeats, np.float32)

            with gpu(0):
                float64_gpu_time = _get_take_time(s, repeats, np.float64)
                float32_gpu_time = _get_take_time(s, repeats, np.float32)

            # Order expected by _write_status and _record_status:
            # [float64 cpu, float32 cpu, float64 gpu, float32 gpu].
            time_lt = [
                float64_cpu_time,
                float32_cpu_time,
                float64_gpu_time,
                float32_gpu_time,
            ]

            _write_status(file_obj, s, time_lt)
            _record_status(record, s, time_lt)
    finally:
        file_obj.close()

    _plot_record(record,full_path)

def test_matmul(repeats, max_size, step):
    """Run the CPU/GPU benchmark chunk by chunk, then once over the whole range.

    Args:
        repeats: Multiplications per measurement.
        max_size: End of the overall size range.
        step: Width of each chunk.

    Chunk ``k`` is run as ``test_cpu_gpu(repeats, 1 + k*step, (k+1)*step)``,
    followed by a final run from 1 to ``max_size``.
    """
    # NOTE(review): the end value is passed on as-is; if test_cpu_gpu treats
    # it as exclusive, exact multiples of ``step`` fall between chunks --
    # confirm whether that gap is intended.
    chunk_count = int(max_size / step)
    spans = [(1 + k*step, (k+1)*step) for k in range(chunk_count)]
    spans.append((1, max_size))

    for size_begin, size_end in spans:
        test_cpu_gpu(repeats, size_begin, size_end)

def test():
    """Run the two configured benchmark sweeps, one after the other.

    Earlier experiments that are currently disabled:
    ``test_matmul(500, 1000, 100)`` and ``test_cpu_gpu(5, 1, 3000)``.
    """
    # Each row: (repeats, size_begin, size_end, step).
    sweeps = (
        (1, 1, 10000, 50),
        (1, 10000, 20000, 100),
    )
    for repeats, size_begin, size_end, step in sweeps:
        test_cpu_gpu(repeats, size_begin, size_end, step)

# Module-level entry point: the benchmark sweeps start as soon as this file
# is executed (there is no __main__ guard, so importing it runs them too).
test()

下面是我机器中的cpu和gpu型号

31.4 GiB
Intel® Core™ i7-8700K CPU @ 3.70GHz × 12 
GeForce GTX 1080 Ti/PCIe/SSE2
64-bit

先看下整体的输出效果
运行500次方阵大小1-1000,也就是元素数1-100万
500_1_100
cpu 与gpu 运行时间对比图
在这里插入图片描述
这个结果跟我预想的还算比较接近:首先,在矩阵不是很大的时候就能看到加速效果;其次,float32与float64的加速效果明显不同.
下面是cpu与gpu的加速效果对比图

能够看到在矩阵比较小的时候,float32就有加速效果,float64的加速效果并不是很明显.