Deep Learning Compilation Middleware: NNVM (Part 12), Reading the NNVM Source Code (1)

When reading the NNVM source code, it is best to start from the outermost user-facing function, nnvm.compiler.build, and work inward step by step.

Let us first look at the simplest usage of the NNVM compiler:

import mxnet as mx
import nnvm
import nnvm.compiler
import nnvm.frontend
import tvm

# Load the MXNet model from a local checkpoint
mx_sym, args, auxs = mx.model.load_checkpoint('mobilenet', 0)
nnvm_sym, nnvm_params = nnvm.frontend.from_mxnet(mx_sym, args, auxs)

# Set the shape of the input data
batch_size = 1
image_shape = (3, 224, 224)
data_shape = (batch_size,) + image_shape

# Run the NNVM compilation
with nnvm.compiler.build_config(opt_level=3):
    graph, lib, params = nnvm.compiler.build(
        nnvm_sym, tvm.target.rasp(), shape={"data": data_shape}, params=nnvm_params)

# Export the generated .so library
lib.export_library("mobilenet_deploy.so")
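To make the example complete end to end, here is a hedged sketch of persisting the other two build artifacts and reloading everything with the graph runtime. The file names are illustrative, and since the target above is tvm.target.rasp(), the loading half would have to run on the ARM device itself:

from tvm.contrib import graph_runtime

# Persist the graph definition and the parameters next to the .so
with open("mobilenet_deploy.json", "w") as f:
    f.write(graph.json())
with open("mobilenet_deploy.params", "wb") as f:
    f.write(nnvm.compiler.save_param_dict(params))

# On the target device: reload the artifacts and create a runtime module
loaded_graph = open("mobilenet_deploy.json").read()
loaded_lib = tvm.module.load("mobilenet_deploy.so")
module = graph_runtime.create(loaded_graph, loaded_lib, tvm.cpu(0))
module.load_params(open("mobilenet_deploy.params", "rb").read())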

The execution of nnvm.compiler.build can be summarized as the following steps:

  1. Correct the layout
  2. Initial passes (apply the given input shapes)
  3. Initialize all variables (_all_var_init)
  4. Apply optimizations
  5. Precompute pruning
  6. Fuse adjacent operations and compile the final .so
  7. Save the variables' initial values into the params file

Before walking through these steps, let us first look at the graph.apply function.

Excerpt from python/nnvm/graph.py:

class Graph(object):
    def apply(self, passes):
        """Apply passes to the graph

        Parameters
        ----------
        passes : str or list of str
            The passes to be applied

        Returns
        -------
        g : Graph
            The transformed graph.
        """
        if isinstance(passes, string_types):
            passes = [passes]
        cpass = c_array(ctypes.c_char_p, [c_str(key) for key in passes])
        ghandle = GraphHandle()
        npass = nn_uint(len(passes))
        check_call(_LIB.NNGraphApplyPasses(self.handle, npass, cpass, ctypes.byref(ghandle)))
        return Graph(ghandle)

As the code above shows, graph.apply invokes backend passes and returns the transformed graph; the actual dispatch goes through the NNGraphApplyPasses C API.
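As a quick illustration, here is a minimal sketch of driving graph.apply directly from Python. The symbol and shapes are made up for illustration, and the graph_attr import path may differ across nnvm versions:

import nnvm.symbol as sym
import nnvm.graph as graph
from nnvm.compiler import graph_attr  # module location varies across versions

x = sym.Variable("x")
y = sym.dense(x, units=10, name="fc")
g = graph.create(y)                                  # Symbol -> Graph
g = graph_attr.set_shape_inputs(g, {"x": (1, 16)})   # attach input shapes
g = g.apply("InferShape")                            # run the InferShape pass
print(g.json_attr("shape"))                          # inferred per-entry shapes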

Fusing adjacent operations and generating the final .so

Excerpt from python/nnvm/build_module.py:

# Abridged: only the core code is shown
graph = graph_attr.set_shape_inputs(graph, shape)
graph = graph.apply("InferShape")

graph = graph_attr.set_dtype_inputs(graph, dtype)

graph._set_json_attr("target", str(target), "str")
graph._set_json_attr("target_host", str(target_host), "str")
graph._set_json_attr("opt_level", 1, "int")

graph = graph.apply("InferShape").apply("InferType")
graph = graph.apply("GraphFusePartition").apply("GraphFuseCompile")

libmod = graph_attr._move_out_module(graph, "module")

The graph.apply calls above are the heart of this code: they drive the backend passes (shape/type inference, fusion partitioning, and fused compilation).

graph_attr._move_out_module is defined in python/nnvm/graph_attr.py:

_move_out_module = tvm.get_global_func("nnvm.graph._move_module")

nnvm.graph._move_module is defined in src/compiler/packed_func_ext.cc:

TVM_REGISTER_GLOBAL("nnvm.graph._move_module")
    .set_body([](TVMArgs args, TVMRetValue *rv) {
        const nnvm::Graph& g = args[0].AsExtension<Graph>();
        *rv = const_cast<nnvm::Graph*>(&g)->MoveCopyAttr<tvm::runtime::Module>(args[1]);
    });

Graph::MoveCopyAttr is defined in include/nnvm/graph.h:

template<typename T>
inline T Graph::MoveCopyAttr(const std::string& attr_name) {
    auto it = attrs.find(attr_name);
    CHECK(it != attrs.end())
        << "Cannot find attribute " << attr_name << " in the graph";
    std::shared_ptr<any> sptr = it->second;
    attrs.erase(it);
    if (sptr.unique()) {
        return std::move(nnvm::get<T>(*sptr));
    } else {
        return nnvm::get<T>(*sptr);
    }
}

From the code above we can see that graph_attr._move_out_module(graph, "module") extracts a tvm::runtime::Module object from the graph's attributes. It is not yet clear where this Module object comes from, so we keep digging.

Searching the NNVM codebase for attrs["module"] turns up the following code:

src/compiler/graph_fuse.cc

// This snippet is inside the GraphFuseCompile function

static const PackedFunc& fbuild = GetPackedFunc("nnvm.compiler.build_target");
tvm::runtime::Module module = fbuild(func_list, target, target_host);
ret.attrs["module"] = std::make_shared<any>(std::move(module));

The fbuild function above is obtained through GetPackedFunc. As described in Part 4 of this series (TVM design philosophy and developer guide), this is the mechanism by which C++ calls back into a Python function.
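As a minimal sketch of this bridge (the name demo.add is made up for illustration): a Python function registered through tvm.register_func becomes reachable from C++ through GetPackedFunc under the same name, and the shared registry is visible from Python as well:

import tvm

@tvm.register_func("demo.add")
def add(a, b):
    return a + b

# On the C++ side this function would be reached as:
#   const PackedFunc& f = GetPackedFunc("demo.add");
#   int r = f(1, 2);  // r == 3
# The same registry can be queried from Python:
fadd = tvm.get_global_func("demo.add")
assert fadd(1, 2) == 3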

A global search shows that nnvm.compiler.build_target is defined in python/nnvm/build_module.py:

@tvm.register_func("nnvm.compiler.build_target")
def _build(funcs, target, target_host):
    if target_host == "":
        target_host = None
    return tvm.build(funcs, target=target, target_host=target_host)

So nnvm.compiler.build_target simply calls tvm.build.

tvm.build is defined in tvm/python/tvm/build_module.py. Reaching this point means the NNVM graph-optimization phase of compilation is finished, and we enter TVM's code-generation phase.

Before looking at TVM's concrete code-generation process, note the data structure that NNVM hands to TVM for code generation:

Array<tvm::LoweredFunc> func_list;
// an array of tvm::LoweredFunc

This data structure holds the lowered TVM functions. It is the final data structure (IR representation) before code generation, so we first look at how this IR is produced.
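For intuition, here is a standalone sketch (independent of NNVM; the names n, A, B, and add_one are illustrative) of how a LoweredFunc comes into being on the Python side, using the TVM 0.x API that NNVM builds on:

import tvm

n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] + 1.0, name="B")
s = tvm.create_schedule(B.op)
f = tvm.lower(s, [A, B], name="add_one")  # -> LoweredFunc, the pre-codegen IR
mod = tvm.build(f, target="llvm")         # -> runtime::Module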

Here is the lowering-related portion of the GraphFuseCompile function (which returns an nnvm::Graph):

src/compiler/graph_fuse.cc

fe.compiled_func = GraphLower(fe.subgraph, inputs, target, sub_master_idx);
for (LoweredFunc f : fe.compiled_func->funcs) {
    if (!func_set.count(f.get())) {
        func_set.insert(f.get());
        func_list.push_back(f);
    }
}

src/compiler/compile_engine.cc

GraphFunc GraphLower(Graph graph,
                     const Array<tvm::Tensor>& inputs,
                     const std::string& target,
                     int master_idx) {
    return CompileEngine::Global()->Lower(graph, inputs, target, master_idx);
}

// CompileEngine::Global()->Lower eventually calls CompileEngine::DoLower
// run the actual lowering process
GraphFunc DoLower(Graph graph,
                  const Array<tvm::Tensor>& inputs,
                  const std::string& target,
                  int master_idx) {
    std::string readable_name;
    Array<tvm::Tensor> all_args;
    Array<tvm::Tensor> outputs;
    Schedule sch;

    std::tie(sch, all_args, graph) = GetScheduleArgs(
        graph, inputs, target, master_idx,
        &readable_name, &outputs);

    std::shared_ptr<GraphFuncNode> gf = std::make_shared<GraphFuncNode>();
    gf->target = target;
    gf->func_name = GetUniqeName(readable_name);
    gf->inputs = inputs;
    gf->outputs = outputs;
    static const PackedFunc& flower = GetPackedFunc("nnvm.compiler.lower");
    gf->funcs = flower(sch, all_args, gf->func_name, graph);
    return GraphFunc(gf);
}

Two things in DoLower deserve attention:

  1. GetScheduleArgs generates the Schedule and its arguments
  2. GetPackedFunc("nnvm.compiler.lower") again calls back into TVM's Python interface

The GetScheduleArgs function is defined in src/compiler/compile_engine.cc:

    // get schedule and its args
    std::tuple<Schedule, Array<tvm::Tensor>, Graph>
    GetScheduleArgs(Graph graph,
                    const Array<tvm::Tensor> &inputs,
                    const std::string &target,
                    int master_idx,
                    std::string *readable_name,
                    Array<tvm::Tensor> *outputs) {
        // shape, type
        // look up the registered TVM compute and schedule functions
        static auto& fcompute =
            nnvm::Op::GetAttr<FTVMCompute>("FTVMCompute");
        static auto& fschedule =
            nnvm::Op::GetAttr<FTVMSchedule>("FTVMSchedule");

        // collect the input shapes and dtypes
        std::vector<TShape> ishape;
        std::vector<int> idtype;

        for (const tvm::Tensor t : inputs) {
            std::vector<dim_t> shape;
            for (Expr v : t->shape) {
                CHECK(v.as<tvm::ir::IntImm>());
                shape.push_back(v.as<tvm::ir::IntImm>()->value);
            }
            ishape.emplace_back(TShape(shape.begin(), shape.end()));
            idtype.emplace_back(GetTypeFlag(t->dtype));
        }
        graph = pass::InferShape(graph, ishape);
        graph = pass::InferType(graph, idtype);

        const ShapeVector& shape_vec = graph.GetAttr<ShapeVector>("shape");
        const DTypeVector& dtype_vec = graph.GetAttr<DTypeVector>("dtype");
        const IndexedGraph& idx = graph.indexed_graph();
        CHECK_EQ(inputs.size(), idx.input_nodes().size());

        // bind the input tensors to the graph's input nodes
        std::vector<tvm::Tensor> tensor_vec(idx.num_node_entries());
        for (size_t i = 0; i < idx.input_nodes().size(); ++i) {
            uint32_t nid = idx.input_nodes()[i];
            tensor_vec[idx.entry_id(nid, 0)] = inputs[i];
        }

        std::ostringstream readable_name_os;
        readable_name_os << "fuse";
        for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
            const auto& inode = idx[nid];
            if (inode.source->is_variable()) continue;
            Array<Tensor> op_inputs, out_info;
            readable_name_os << "_" << inode.source->op()->name;
            // input array
            for (const IndexedGraph::NodeEntry& e : inode.inputs) {
                const tvm::Tensor& t = tensor_vec[idx.entry_id(e)];
                CHECK(t.defined());
                op_inputs.push_back(t);
            }
            // output hint
            for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
                Array<Expr> shape;
                for (int64_t x : shape_vec[idx.entry_id(nid, i)]) {
                    CHECK_LE(x, static_cast<int64_t>(std::numeric_limits<int>::max()));
                    shape.push_back(make_const(Int(32), x));
                }
                out_info.push_back(
                    placeholder(shape,
                        GetTVMType(dtype_vec[idx.entry_id(nid, i)])));
            }
            // invoke the op's FTVMCompute to declare its output tensor expressions
            Array<Tensor> out = fcompute[inode.source->op()](
                inode.source->attrs, op_inputs, out_info);
            CHECK_EQ(out.size(), inode.source->num_outputs());
            // schedule on root node, and use master's schedule
            for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
                uint32_t eid = idx.entry_id(nid, index);
                tensor_vec[eid] = out[index];
            }
        }

        // Schedule on final output.
        Array<Tensor> all_args = inputs;
        Array<Tensor> outs;
        for (const IndexedGraph::NodeEntry& e : idx.outputs()) {
            const tvm::Tensor& t = tensor_vec[idx.entry_id(e)];
            CHECK(t.defined());
            outs.push_back(t);
            all_args.push_back(t);
        }

        Schedule sch = fschedule[idx[master_idx].source->op()](
            idx[master_idx].source->attrs, outs, target);

        // store extra return values
        if (readable_name != nullptr) {
            *readable_name = readable_name_os.str();
        }
        if (outputs != nullptr) {
            *outputs = outs;
        }

        return std::make_tuple(sch, all_args, graph);
    }
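The fcompute and fschedule looked up above are the per-operator FTVMCompute and FTVMSchedule attributes. For many operators these are registered from the Python side; the sketch below is modeled on the registration of clip in python/nnvm/top/tensor.py (treat the exact bodies as an approximation of the real source, shown here for reading rather than re-running):

import tvm
import topi
from nnvm.top import registry as reg

@reg.register_compute("clip")
def compute_clip(attrs, inputs, out_info):
    # declare the output tensor expression with topi
    return topi.clip(inputs[0], attrs.get_float("a_min"), attrs.get_float("a_max"))

@reg.register_schedule("clip")
def schedule_clip(attrs, outs, target):
    # elementwise ops reuse the generic injective schedule
    with tvm.target.create(target):
        return topi.generic.schedule_injective(outs)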

The nnvm.compiler.lower callback is registered in python/nnvm/build_module.py; it forwards to tvm.lower, which is defined in tvm/python/tvm/build_module.py:

def lower(sch,
          args,
          name="default_function",
          binds=None,
          simple_mode=False):
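For completeness, here is a simplified sketch of that registered wrapper (the real code in python/nnvm/build_module.py wraps this in error reporting; treat the body as an approximation):

import tvm

@tvm.register_func("nnvm.compiler.lower")
def _lower(sch, inputs, func_name, graph):
    f = tvm.lower(sch, inputs, name=func_name)
    # the C++ caller stores the result into gf->funcs, an array of LoweredFunc
    return f if isinstance(f, (tvm.container.Array, tuple, list)) else [f]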

The TVM code-generation process

Here is the relevant part of tvm.build:

if fdevice:
    mdev = codegen.build_module(fdevice, str(target_device))
    mhost.import_module(mdev)
return mhost
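In other words, for a device target (such as CUDA) the device functions are compiled into a separate module that gets imported into the host module; for a plain llvm target there is no fdevice and mhost is returned alone. A small sketch, reusing s, A, and B from the add_one example above:

mod = tvm.build(s, [A, B], target="llvm")
print(mod.type_key)           # "llvm"
print(mod.imported_modules)   # empty for a pure-CPU build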

tvm.build calls the codegen.build_module method, located in tvm/python/tvm/codegen.py:

from ._ffi.function import _init_api

def build_module(lowered_func, target):
    """Build lowered_func into Module.

    Parameters
    ----------
    lowered_func : LoweredFunc
        The lowered function

    target : str
        The target module type.

    Returns
    -------
    module : Module
        The corresponding module.
    """
    return _Build(lowered_func, target)

# _Build is brought in from the C++ function registry:
_init_api("tvm.codegen")
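As a usage sketch, the LoweredFunc f from the earlier add_one example could be fed to this entry point directly, mirroring what tvm.build does internally:

from tvm import codegen

mod = codegen.build_module(f, "llvm")  # f: the LoweredFunc from tvm.lower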

codegen._Build is defined in tvm/src/api/api_codegen.cc:

TVM_REGISTER_API("codegen._Build")
.set_body([](TVMArgs args, TVMRetValue *ret) {
    if (args[0].IsNodeType<LoweredFunc>()) {
        *ret = Build({args[0]}, args[1]);
    } else {
        *ret = Build(args[0], args[1]);
    }
  });

The Build function it dispatches to (returning a runtime::Module) is defined in tvm/src/codegen/codegen.cc:

runtime::Module Build(const Array<LoweredFunc>& funcs,
                      const std::string& target) {
    std::string mode = target;
    size_t pos = mode.find(' ');
    if (pos != std::string::npos) {
        mode = mode.substr(0, pos);
    }
    std::string build_f_name = "codegen.build_" + mode;
    // the build function.
    const PackedFunc* bf = runtime::Registry::Get(build_f_name);
    CHECK(bf != nullptr)
        << "Target " << target << " is not enabled";
    runtime::Module m = (*bf)(funcs, target);
    return m;
}
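The dispatch above is purely name-based: the first token of the target string selects a codegen.build_<mode> entry in the global function registry. The same lookup can be reproduced from Python (the ARM target triple below is an illustrative assumption):

import tvm

target = "llvm -target=armv7l-none-linux-gnueabihf"
mode = target.split(' ')[0]                        # -> "llvm"
bf = tvm.get_global_func("codegen.build_" + mode)  # raises if LLVM codegen was not compiled in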

Since the target being verified here is an ARM processor and tvm.target.rasp() produces a target string beginning with llvm, mode resolves to llvm.

codegen.build_llvm is defined in tvm/src/codegen/llvm/llvm_module.cc:

TVM_REGISTER_API("codegen.build_llvm")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    std::shared_ptr<LLVMModuleNode> n = std::make_shared<LLVMModuleNode>();
    n->Init(args[0], args[1]);
    *rv = runtime::Module(n);
});

LLVMModuleNode::Init is defined in the same file, tvm/src/codegen/llvm/llvm_module.cc:

void Init(const Array<LoweredFunc>& funcs, std::string target) {
    InitializeLLVM();
    tm_ = GetLLVMTargetMachine(target);
    bool system_lib = (target.find("-system-lib") != std::string::npos);
    CHECK_NE(funcs.size(), 0U);
    ctx_ = std::make_shared<llvm::LLVMContext>();
    std::unique_ptr<CodeGenLLVM> cg = CodeGenLLVM::Create(tm_);
    entry_func_ = funcs[0]->name;
    cg->Init(funcs[0]->name, tm_, ctx_.get(), system_lib, system_lib);
    for (LoweredFunc f :  funcs) {
      cg->AddFunction(f);
    }
    cg->AddMainFunction(funcs[0]->name);
    module_ = cg->Finish();
    module_->addModuleFlag(
        llvm::Module::Warning, "tvm_target",
        llvm::MDString::get(*ctx_, target));
    target_ = target;
    mptr_ = module_.get();
}

The main code-generation interfaces invoked from LLVMModuleNode::Init are:

CodeGenLLVM::Create
CodeGenLLVM::Init
CodeGenLLVM::AddFunction
CodeGenLLVM::AddMainFunction
CodeGenLLVM::Finish

/*!
 * \brief Compile and add function f to the current module.
 * \param f The function to be added.
 */
virtual void AddFunction(const LoweredFunc& f);

CodeGenLLVM::AddFunction is the routine that compiles each function and adds it to the current module.

CodeGenLLVM::AddFunction is defined in tvm/src/codegen/llvm/codegen_llvm.cc:

void CodeGenLLVM::AddFunction(const LoweredFunc& f) {
    this->AddFunctionInternal(f, false);
}

void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) {
    this->InitFuncState();
    std::vector<llvm::Type*> arg_types;
    is_restricted_ = f->is_restricted;
    for (Var arg : f->args) {
        Type t = arg.type();
        if (t.is_handle()) {
            auto it = f->handle_data_type.find(arg);
            if (it != f->handle_data_type.end()) {
                arg_types.push_back(LLVMType((*it).second.type())
                    ->getPointerTo(GetGlobalAddressSpace()));
            } else {
                arg_types.push_back(t_int8_->getPointerTo(GetGlobalAddressSpace()));
            }
            if (!is_restricted_) {
                alias_var_set_.insert(arg.get());
            }
        } else {
            arg_types.push_back(LLVMType(arg.type()));
        }
    }
    llvm::FunctionType* ftype = llvm::FunctionType::get(
        ret_void ? t_void_ : t_int_, arg_types, false);
    CHECK(module_->getFunction(f->name) == nullptr)
        << "Function " << f->name << " already exist in module";
    function_ = llvm::Function::Create(
        ftype, llvm::Function::ExternalLinkage,
        f->name, module_.get());
    function_->setCallingConv(llvm::CallingConv::C);
    function_->setDLLStorageClass(llvm::GlobalValue::DLLStorageClassTypes::DLLExportStorageClass);
    // set var map and align information
    auto arg_it = function_->arg_begin();
    for (size_t i = 0; i < f->args.size(); ++i, ++arg_it) {
        llvm::Argument* v = &(*arg_it);
        const Var& var = f->args[i];
        var_map_[var.get()] = v;
        if (is_restricted_) {
            if (var.type().is_handle() && !alias_var_set_.count(var.get())) {
                // set non alias.
#if TVM_LLVM_VERSION >= 50
                function_->addParamAttr(i, llvm::Attribute::NoAlias);
#else
                function_->setDoesNotAlias(i + 1);
#endif
            }
        }
    }
    llvm::BasicBlock* entry = llvm::BasicBlock::Create(*ctx_, "entry", function_);
    builder_->SetInsertPoint(entry);
    this->VisitStmt(f->body);
    if (ret_void) {
        builder_->CreateRetVoid();
    } else {
        builder_->CreateRet(ConstInt32(0));
    }
}

The main steps inside CodeGenLLVM::AddFunctionInternal are:

  1. Determine the argument and return types, and from them the LLVM function type
  2. Create the function (llvm::Function::Create)
  3. Set function options (calling convention, DLL storage class)
  4. Bind the function arguments into var_map_ and mark non-aliasing pointer arguments
  5. Create the entry basic block, visit the function body (VisitStmt) to emit LLVM IR, and emit the return


Reposted from blog.csdn.net/sanallen/article/details/80168513