基于python与CUDA的N卡GPU并行程序——taichi语言笔记

如果要做并行程序，就要使用CUDA，这个原本是要用C语言来写的，但是C语言的开发稍微有点麻烦，经常出现内存报错之类的bug，如果可以使用语法更加简单的python语言来开发，就会更加快捷方便，这时可以有一个选择，就是使用taichi语言，这里记录一些零散的笔记。详细资料可以参考taichi语言的官方英文文档。

简单Julia集示例

首先来看一个简单的例子，生成一个分形Julia集。

# fractal.py
#导入taichi语言
import taichi as ti
#在gpu上运行，自动选择后端，也可以是arch=ti.cuda,ti.opengl,cpu,metal
ti.init(arch=ti.gpu)
n = 320
#创建了一个二维变量数组，size为640*320，元素为浮点类型
pixels = ti.field(dtype=float, shape=(n * 2, n))
#ti.func这个修饰符表示taichi函数，只能被taichi语言域的代码调用
@ti.func
def complex_sqr(z):
    return ti.Vector([z[0]**2 - z[1]**2, z[1] * z[0] * 2])
#ti.kernel这个修饰符表示taichi核函数，只能被python语言域的代码调用
#ti.kernel和ti.func类似于CUDA中的__device__和__global__
@ti.kernel
def paint(t: float):
    #最外层作用域的循环，是并行执行的，内部的循环则不是，如果了解CUDA并行编程，那么可以自然而然理解这点，在遍历稀疏数组时注意这种并行循环
    for i, j in pixels:  # Parallized over all pixels
        c = ti.Vector([-0.8, ti.cos(t) * 0.2])
        z = ti.Vector([i / n - 1, j / n - 0.5]) * 2
        iterations = 0
        while z.norm() < 20 and iterations < 50:
            z = complex_sqr(z) + c
            iterations += 1
        pixels[i, j] = 1 - iterations * 0.02
gui = ti.GUI("Julia Set", res=(n * 2, n))
for i in range(1000000):
    paint(i * 0.03)
    gui.set_image(pixels)
    gui.show()

@ti.kernel
def fill():
    for i in range(10): # Parallelized
        x[i] += i

        s = 0
        for j in range(5): # Serialized in each parallel thread
            s += j

        y[i] = s

@ti.kernel
def fill_3d():
    # Parallelized for all 3 <= i < 8, 1 <= j < 6, 0 <= k < 9
    for i, j, k in ti.ndrange((3, 8), (1, 6), 9):
        x[i, j, k] = i + j + k

可以很方便的给taichi数组赋值

import taichi as ti
pixels = ti.field(ti.f32, (1024, 512))
pixels[42, 11] = 0.7  # store data into pixels
print(pixels[42, 11]) # prints 0.7

还可以很方便的转换成numpy数组格式

import taichi as ti
pixels = ti.field(ti.f32, (1024, 512))

import numpy as np
arr = np.random.rand(1024, 512)
pixels.from_numpy(arr)   # load numpy data into taichi fields

import matplotlib.pyplot as plt
arr = pixels.to_numpy()  # store taichi data into numpy arrays
plt.imshow(arr)
plt.show()

import matplotlib.cm as cm
cmap = cm.get_cmap('magma')
gui = ti.GUI('Color map')
while gui.running:
    render_pixels()
    arr = pixels.to_numpy()
    gui.set_image(cmap(arr))
    gui.show()

最外层作用域的 for 循环是被自动并行执行的。Taichi 的 for 循环具有两种形式，区间 for 循环，和结构 for 循环。区间 for 循环和普通的 Python for 循环没多大区别，只是 Taichi 最外层的 for 会并行执行而已。区间 for 循环可以嵌套。

@ti.kernel
def fill():
  for i in range(10): # 并行执行
    x[i] += i

    s = 0
    for j in range(5): # 在每个并行的线程中顺序执行
      s += j

    y[i] = s

@ti.kernel
def fill_3d():
  # 在区间 3 <= i < 8, 1 <= j < 6, 0 <= k < 9 上展开并行
  for i, j, k in ti.ndrange((3, 8), (1, 6), 9):
    x[i, j, k] = i + j + k

核函数以及类型

taichi是一个伪装成python语言的高性能语言，为了更高的执行速度，所以跟C++一样是一个强类型语言，ti.kernel核函数的输入输出参数要写上类型，不过一般来说ti.kernel核函数不会有返回值，大量的并行数据都是通过内存引用进行赋值和传递的，在CUDA的global核函数中一般也是如此。所以实际上，taichi语言的ti.kernel核函数只能支持返回一个标量值，不支持返回并行的数组，因为这样会影响性能，也没必要。需要注意的是，taichi函数不支持递归，在做一些程序比如渲染器时，光线的多次重复反射一般用递归，这时要修改成循环。

@ti.kernel
def add_xy(x: ti.f32, y: ti.f32) -> ti.i32:
    return x + y  # 等价于： ti.cast(x + y, ti.i32)
res = add_xy(2.3, 1.1)
print(res)  # 3，因为返回值类型是 ti.i32

也可以使用模板ti.template()来指定类型

@ti.func
def my_func(x: ti.template()):
    x = x + 1  # will change the original value of x

@ti.kernel
def my_kernel():
    ...
    x = 233
    my_func(x)
    print(x)  # 234
    ...

kernel核函数不支持向量和矩阵作为参数，但是func函数可以支持

@ti.func
def sdf(u):  # functions support matrices and vectors as arguments. No type-hints needed.
    return u.norm() - 1

@ti.kernel
def render(d_x: ti.f32, d_y: ti.f32):  # kernels do not support vector/matrix arguments yet. We have to use a workaround.
    d = ti.Vector([d_x, d_y])
    p = ti.Vector([0.0, 0.0])
    t = sdf(p)
    p += d * t
    ...

func函数不支持多个返回值return，所以这里需要稍微转换一下

# Bad function - two return statements
@ti.func
def safe_sqrt(x):
  if x >= 0:
    return ti.sqrt(x)
  else:
    return 0.0

# Good function - single return statement
@ti.func
def safe_sqrt(x):
  ret = 0.0
  if x >= 0:
    ret = ti.sqrt(x)
  else:
    ret = 0.0
  return ret

有几个简单方便的标量算术函数，当这些标量函数被作用在矩阵或向量上时，会被逐个作用到所有元素

B = ti.Matrix([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
C = ti.Matrix([[3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])

A = ti.sin(B)
# is equivalent to
for i in ti.static(range(2)):
    for j in ti.static(range(3)):
        A[i, j] = ti.sin(B[i, j])

A = ti.pow(B, 2)
# is equivalent to
for i in ti.static(range(2)):
    for j in ti.static(range(3)):
        A[i, j] = ti.pow(B[i, j], 2)

A = ti.pow(B, C)
# is equivalent to
for i in ti.static(range(2)):
    for j in ti.static(range(3)):
        A[i, j] = ti.pow(B[i, j], C[i, j])

A += 2
# is equivalent to
for i in ti.static(range(2)):
    for j in ti.static(range(3)):
        A[i, j] += 2

A += B
# is equivalent to
for i in ti.static(range(2)):
    for j in ti.static(range(3)):
        A[i, j] += B[i, j]

数据位数可以是8,16,32,64位，一般常用的是32位，类型可以是

i 用于有符号整数，例如233，-666
u 用于无符号整数，例如233，666
f 用于浮点数，例如2.33, 1e-4

创建field张量场，大小为12864，每个元素是32的小矩阵

A = ti.Matrix.field(3, 2, dtype=ti.f32, shape=(128, 64))
#获取小矩阵
mat = A[i, j]
#获取小矩阵元素
a0 = mat[0, 1]
a0 = A[i, j][0, 1]

在taichi中，增量赋值自动作为原子操作

@ti.kernel
def sum():
    for i in x:
        # 方式 1: 正确
        total[None] += x[i]
        # 方式 2: 正确
        ti.atomic_add(total[None], x[i])
        # 方式 3: 非原子操作因而会得到错误结果
        total[None] = total[None] + x[i]

也可以使用命令进行显式原子操作

#这些操作会返回一个参数的旧值
ti.atomic_add(x, y)
ti.atomic_sub(x, y)
ti.atomic_and(x, y)
ti.atomic_or(x, y)
ti.atomic_xor(x, y)

将taichi数据转换成numpy矩阵的形式

@ti.kernel
def my_kernel():
   for i in x:
      x[i] = i * 2

x = ti.field(ti.f32, 4)
my_kernel()
x_np = x.to_numpy()
print(x_np)  # np.array([0, 2, 4, 6])

x = ti.field(ti.f32, 4)
x_np = np.array([1, 7, 3, 5])
x.from_numpy(x_np)

field.to_numpy()
参数：field可以是ti.field，ti.Vector.field或ti.Matrix.field
返回值：np.array数组
field.from_numpy(array)
参数：field可以是ti.field，ti.Vector.field或ti.Matrix.field
array为np.array数组

转换成torch的过程和numpy的命令是类似的。在转换时需要注意到尺寸大小的对应关系

field = ti.field(ti.i32, shape=(233, 666))
field.shape  # (233, 666)

array = field.to_numpy()
array.shape  # (233, 666)

field.from_numpy(array)  # the input array must be of shape (233, 666)

field = ti.Vector.field(3, ti.i32, shape=(233, 666))
field.shape  # (233, 666)
field.n      # 3

array = field.to_numpy()
array.shape  # (233, 666, 3)

field.from_numpy(array)  # the input array must be of shape (233, 666, 3)

field = ti.Matrix.field(3, 4, ti.i32, shape=(233, 666))
field.shape  # (233, 666)
field.n      # 3
field.m      # 4

array = field.to_numpy()
array.shape  # (233, 666, 3, 4)

field.from_numpy(array)  # the input array must be of shape (233, 666, 3, 4)

# Matrix
arr = np.ones(shape=(n, m, 3, 4), dtype=np.int32)
mat.from_numpy(arr)
arr = mat.to_numpy()
assert arr.shape == (n, m, 3, 4)

标量场

taichi场的元素可以是一个标量、向量或矩阵，首先介绍标量场，field场的维度最高为8，可以是稠密或者是稀疏的，首先看稠密场的定义.
ti.field(dtype, shape = None, offset = None)
参数:
dtype – (DataType) type of the field element
shape – (optional, scalar or tuple) the shape of field
offset – (optional, scalar or tuple) see Coordinate offsets

For example, this creates a dense field with four int32 as elements:

x = ti.field(ti.i32, shape=4)
This creates a 4x3 dense field with float32 elements:

x = ti.field(ti.f32, shape=(4, 3))
If shape is () (empty tuple), then a 0-D field (scalar) is created:

x = ti.field(ti.f32, shape=())
Then access it by passing None as index:
x[None] = 2

If shape is not provided or None, the user must manually place it afterwards:

x = ti.field(ti.f32)
ti.root.dense(ti.ij, (4, 3)).place(x)
# equivalent to: x = ti.field(ti.f32, shape=(4, 3))

如果没有设置shape参数，后面必须要手动指定，这时可以按照自己的场景，自定义更加灵活的field场的布局。需要注意的是，在核函数计算之前，要定义好所有的数据变量。

x = ti.field(ti.i32, (6, 5))
x.shape  # (6, 5)

x = ti.field(ti.i32, (2, 3))
x.dtype  # ti.i32

# 设置field场的布局
x = ti.field(ti.i32)
y = ti.field(ti.i32)
blk1 = ti.root.dense(ti.ij, (6, 5))
blk2 = blk1.dense(ti.ij, (3, 2))
blk1.place(x)
blk2.place(y)
# 查询布局节点的层次
x.parent()   # blk1
y.parent()   # blk2
y.parent(2)  # blk1

向量场

定义一个长度为3的全局的向量场

a = ti.Vector.field(3, dtype=ti.f32, shape=(5, 4))
x = a[6, 3][0]
# or
vec = a[6, 3]
x = vec[0]

定义一个局部向量场

a = ti.Vector([2, 3, 4])

向量场支持xyzw四轴表示

a.x
Same as a[0].
a.y
Same as a[1].
a.z
Same as a[2].
a.w
Same as a[3].

使用taichi_glsl进行表示

import taichi_glsl as tl

v = tl.vec(2, 3, 4)
print(v.xy)  # [2 3]
print(v._xYzX_z)  # [0 2 -3 4 -2 0 4]

有几个常用的方法函数

# 模长
a = ti.Vector([3, 4])
a.norm() # sqrt(3*3 + 4*4 + 0) = 5
# 模长平方
a.norm_sqr() # 3*3 + 4*4 = 25
# 归一化
a.normalized() # [3 / 5, 4 / 5]
# 点积
b = ti.Vector([2, 4])
a.dot(b) # 1*2 + 3*4 = 14
# 叉积
c = ti.cross(a, b)
# 外积
c = ti.outer_product(a, b) # NOTE: c[i, j] = a[i] * b[j]
# 转换类型
a = ti.Vector([1.6, 2.3])
a.cast(ti.i32) # [2, 3]
# 长度
a = ti.Vector([1, 2, 3])
a.n  # 3
# 对于全局向量场，获取大小和类型
a = ti.Vector.field(3, dtype=ti.f32, shape=(4, 5))
a.shape  # (4, 5)
a.dtype  # ti.f32