【gcc, cmake, eigen, opencv,ubuntu】三.eigen和mkl安装和使用

文章目录

- eigen 和 mkl介绍

eigen 和 mkl介绍

1.eigen和mkl安装

eigen的安装很简单：两种方法，一种直接命令行安装，另一种通过源文件安装。无论哪种都比较简单。
eigen安装参考：https://zhuanlan.zhihu.com/p/462494086
eigen官方网站：http://eigen.tuxfamily.org/index.php?title=Main_Page

查看安装位置

locate eigen3

安装后，头文件位于 /usr/local/include/eigen3/，而系统默认的头文件搜索路径一般包含 /usr/local/include/。
所以如果包含头文件时想写成 #include <Eigen/Dense>，而不是 #include <eigen3/Eigen/Dense>，
可以使用下面的命令把头文件复制一份：

sudo cp -r /usr/local/include/eigen3/Eigen /usr/local/include

或者设置软链接

cd   /usr/local/include/
sudo  ln  -s   eigen3/Eigen   Eigen

2.eigen使用

Eigen是一个C++语言中的开源模板库，支持线性代数运算，包括向量运算、矩阵运算、数值分析等相关算法。因为eigen只包含头文件，所以使用时不需要单独编译和链接，只需要在cpp文件开头包含所需的头文件，例如 #include <Eigen/Dense>（未做上面的复制或软链接时写 #include <eigen3/Eigen/Dense>）。

直接 g++编译即可。
比如g++ src.cpp -o out

3.mkl安装

我是参考以下第一个链接下载和安装的。
Linux下MKL库的安装部署与使用

下面链接也可以参考：
cpp, mkl 加速 eigen 实例
 Linux 版的 Intel MKL 的安装使用

4.mkl使用

编译指令：

gcc -I/opt/mkl/mkl/include test_mkl.c /opt/mkl/mkl/lib/intel64/libmkl_rt.so -L/opt/mkl/mkl/lib/intel64 -L/opt/mkl/lib/intel64

能够顺利编译通过和运行，说明安装成功。
测试文件test_mkl.c 代码：

/* Smaller of x and y. This is a function-like macro, so the selected argument
 * is evaluated twice; do not pass expressions with side effects. */
#define min(x,y) (((x) < (y)) ? (x) : (y))
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"
 
/*
 * Smoke test for the MKL installation: multiplies two dense row-major
 * matrices with cblas_dgemm (C = alpha*A*B + beta*C) and prints the top-left
 * corner of A, B and C. Returns 0 on success and 1 when a buffer cannot be
 * allocated.
 */
int main()
{
    const int m = 2000;          /* rows of A and C */
    const int p = 200;           /* columns of A, rows of B */
    const int n = 1000;          /* columns of B and C */
    const double alpha = 1.0;
    const double beta = 0.0;
    int row, col, k;

    printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
            " Intel(R) MKL function dgemm, where A, B, and  C are matrices and \n"
            " alpha and beta are double precision scalars\n\n");

    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    double *A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
    double *B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
    double *C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
    if (!A || !B || !C) {
        /* Release whatever was obtained before bailing out. */
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    /* A holds 1, 2, 3, ...; B holds -1, -2, -3, ...; C starts at zero. */
    for (k = 0; k < m*p; ++k)
        A[k] = (double)(k+1);
    for (k = 0; k < p*n; ++k)
        B[k] = (double)(-k-1);
    for (k = 0; k < m*n; ++k)
        C[k] = 0.0;

    printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
    /* Leading dimensions are the row lengths: p for A, n for B and C. */
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, p, alpha, A, p, B, n, beta, C, n);
    printf ("\n Computations completed.\n\n");

    /* At most a 6x6 corner of each matrix is shown. */
    const int shown_m = min(m,6);
    const int shown_p = min(p,6);
    const int shown_n = min(n,6);

    printf (" Top left corner of matrix A: \n");
    for (row = 0; row < shown_m; ++row) {
        for (col = 0; col < shown_p; ++col)
            printf ("%12.0f", A[col+row*p]);
        printf ("\n");
    }

    printf ("\n Top left corner of matrix B: \n");
    for (row = 0; row < shown_p; ++row) {
        for (col = 0; col < shown_n; ++col)
            printf ("%12.0f", B[col+row*n]);
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C: \n");
    for (row = 0; row < shown_m; ++row) {
        for (col = 0; col < shown_n; ++col)
            printf ("%12.5G", C[col+row*n]);
        printf ("\n");
    }

    printf ("\n Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    printf (" Example completed. \n\n");
    return 0;
}

5.eigen中使用mkl

eigen是一个矩阵库，使用的时候只需要包含头文件即可。
mkl是intel的一个数学计算相关库，速度比较快。

一般使用eigen可以不使用mkl, 如果使用eigen的时候使能mkl可能会有助于提升程序的运行速度。

eigen中使用mkl，只需要在程序开头：

#define EIGEN_USE_MKL_ALL
#define EIGEN_VECTORIZE_SSE4_2

下面利用eigen完成一个简单的全连接层：
eigen2.cpp 是一个7层的全连接网络。

不使用mkl的时候 9微秒，g++ -march=native -O2 eigen2.cpp -o eigen2
使用mkl后也是9微秒左右，g++ -march=native -O2 eigen2.cpp -o eigen2 /opt/mkl/mkl/lib/intel64/libmkl_rt.so -I/opt/mkl/mkl/include -L/opt/mkl/mkl/lib/intel64
这个程序使用和不使用mkl差异不大。

#define EIGEN_USE_MKL_ALL
#define EIGEN_VECTORIZE_SSE4_2

#include <eigen3/Eigen/Dense>
#include <eigen3/Eigen/Core>
#include <iostream>
#include <chrono>

using namespace Eigen;
using namespace std;
// Short aliases for the single-precision dynamic matrix / column vector.
using mat = MatrixXf;
using vec = VectorXf;

// Network dimensions and benchmark length.
constexpr int d_in{20};         // input width
constexpr int d_inter{100};     // width of every hidden layer
constexpr int d_out{30};        // output width
constexpr int num_round{1000};  // number of timed forward passes

// PReLU activation: negative entries of `in` are scaled by the matching entry
// of `a`, all other entries pass through. Works on a by-value copy and
// returns it.
vec prelu(vec in, const vec &a){
    const int count = static_cast<int>(in.size());
    for (int k = 0; k < count; ++k) {
        float &x = in[k];
        if (x < 0) {
            x *= a[k];
        }
    }
    return in;
}

double one_run(){
    
    
    // eigen has no normal distributed initialization. approximate it with uniform distribution
    // weights
    mat w1 = mat::Random(d_inter, d_in);
    mat w2 = mat::Random(d_inter, d_inter);
    mat w3 = mat::Random(d_inter, d_inter);
    mat w4 = mat::Random(d_inter, d_inter);
    mat w5 = mat::Random(d_inter, d_inter);
    mat w6 = mat::Random(d_inter, d_inter);
    mat w7 = mat::Random(d_out, d_inter);

    // bias
    vec b1 = vec::Random(d_inter);
    vec b2 = vec::Random(d_inter);
    vec b3 = vec::Random(d_inter);
    vec b4 = vec::Random(d_inter);
    vec b5 = vec::Random(d_inter);
    vec b6 = vec::Random(d_inter);
    vec b7 = vec::Random(d_out);

    // param for prelu
    vec a1 = vec::Random(d_inter);
    vec a2 = vec::Random(d_inter);
    vec a3 = vec::Random(d_inter);
    vec a4 = vec::Random(d_inter);
    vec a5 = vec::Random(d_inter);
    vec a6 = vec::Random(d_inter);

    auto t_start = std::chrono::high_resolution_clock::now();

    // random input
    vec input = vec::Random(d_in);

    // forward
    vec result;

    result = prelu(w1 * input + b1, a1);
    result = prelu(w2 * result + b2, a2);
    result = prelu(w3 * result + b3, a3);
    result = prelu(w4 * result + b4, a4);
    result = prelu(w5 * result + b5, a5);
    result = prelu(w6 * result + b6, a6);
    result = (w7 * result + b7).eval();         // force evaluation. just in case. 
    
    auto t_end = std::chrono::high_resolution_clock::now();
    double elapsed_time_us = std::chrono::duration<double, std::micro>(t_end-t_start).count();

    return elapsed_time_us;
}


// Runs one_run() num_round times and prints mean / max / min / standard
// deviation of the measured times.
int main(){
    // One slot per run; the random initial fill is overwritten below.
    VectorXd all = VectorXd::Random(num_round);

    int run = 0;
    while (run < num_round) {
        all[run] = one_run();
        ++run;
    }

    const double mean_us = all.mean();
    cout << "time in micro second" << endl;
    cout << "mean: " << mean_us << endl;
    cout << "max: " << all.maxCoeff() << endl;
    cout << "min: " << all.minCoeff() << endl;

    // Population standard deviation: sqrt(mean((x - mean)^2)), kept in float.
    VectorXd err = all - VectorXd::Constant(num_round, mean_us);
    err = err.cwiseProduct(err);
    float sigma = err.mean();
    sigma = sqrt(sigma);
    cout << "std: " << sigma << endl;

    return 0;
}

6.利用eigen实现三线性插值方法，以及一些小示例

每个函数内部有注释

#define EIGEN_USE_MKL_ALL
#define EIGEN_VECTORIZE_SSE4_2
#include <sys/time.h>
#include <eigen3/Eigen/Dense>
#include <eigen3/Eigen/Core>
#include <iostream>
#include <chrono>
#include <vector>
#include "lut3d.h"
using namespace Eigen;
using namespace std;

#include "opencv2/opencv.hpp"
using namespace cv;

// Short aliases for the dynamic-size Eigen matrix / vector types used below.
using matd = MatrixXd;
using vecd = VectorXd;
using matf = MatrixXf;
using vecf = VectorXf;
using mati = MatrixXi;
using veci = VectorXi;

// Print an Eigen expression preceded by its source text and followed by a
// separator line. Wrapped in do { } while (0) so that `printMat(x);` is a
// single statement (safe under an unbraced if/else); the argument is
// parenthesised so operator precedence inside it cannot interact with <<.
#define printMat(M) do { \
    cout << #M << "= " << endl; \
    cout << (M) << endl; \
    cout << "========================================" << endl; \
} while (0)


// Demonstrates two Eigen idioms:
//   1. picking rows/columns of a matrix with index lists, and
//   2. conditionally replacing coefficients with select().
void test_eigen_lookuptable(){
    // 1. select row and col
    matd m1 = matd::Random(5, 5);
    printMat(m1);

    vector<int> keep_rows = {0, 1, 3, 4};
    // Every column, in order. LinSpaced's upper bound is inclusive, so pass the
    // last valid index explicitly rather than cols(), which is one past the end
    // and only stays in range through integer LinSpaced's range adjustment.
    veci keep_cols = veci::LinSpaced(m1.cols(), 0, m1.cols() - 1);

    matd m1_sel = m1(keep_rows, keep_cols);
    printMat(m1_sel);

    // 2. condition and setting new value
    Eigen::MatrixXi m(1, 5);
    m << 1, 2, 3, 4, 5;
    m = (m.array() == 3).select(5666, m);   // coefficients equal to 3 become 5666
    std::cout << m << std::endl; // 1,2,5666,4,5
}


void run_eigen_test2()
{
    
    
 
	MatrixXf M1 = MatrixXf::Random(3, 8);
 
	cout << "Column major input:" << endl << M1 << "\n";
    // 1. eigen is col priority
    //innerStride既表示沿着矩阵的数据存储方向移动一个元素的位置，在内存中需要移动的宽度。
    //outerStride的含义就是不沿着数据存储方向移动一个位置
	cout << "M1.outerStride() = " << M1.outerStride() << endl;
	cout << "M1.innerStride() = " << M1.innerStride() << endl;
 

 
	//2. 最经常用的就是取出一行或者一列的操作
	cout << "Column major input:" << endl << M1 << "\n";
	cout << "The first column is:" << endl << M1.col(0) << "\n";
	cout << "The last column is: " << endl << M1.rightCols(1) << "\n";
	cout << "The first row is: " << endl << M1.topRows<1>() << endl;
	cout << "The last row is: " << endl << M1.bottomRows<1>() << endl;
 
    //3. read matrix file and select index row. notice transpose
    Map<MatrixXf> lut(lut3d, 3, DIM*DIM*DIM);
    printMat(lut.transpose());
    vector<int> index={
    
    0,0,0,1,3,4,17*17*17-1,17*17*17-2};
    matf lut_sel = lut.transpose()(index, Eigen::all);
    printMat(lut_sel);
    cout<< lut_sel.rows()<<"  "<<lut_sel.cols()<<endl;
}
void test_convert()
{
    
    

    // 1. opencv to eigen, rgb data
    Mat mat_opencv = Mat::zeros(4, 4, CV_32FC3);
    float* mat_opencv_p = mat_opencv.ptr<float>(0);
    for(int i=0;i<16*3;i++){
    
    
        mat_opencv_p[i] = 0.1+i;
    }
    cout<<"opencv mat: "<<mat_opencv<<endl;

    Map<MatrixXf> mat_eigen2(mat_opencv_p, 12, 4);
    printMat(mat_eigen2.transpose());

    // 2. opencv to eigen, splited r,g, b data
    vector<Mat> rgb;
    split(mat_opencv, rgb);

    Map<MatrixXf> r_eigen((float*)rgb[0].data, rgb[0].cols, rgb[0].rows);
    Map<MatrixXf> g_eigen((float*)rgb[1].data, rgb[1].cols, rgb[1].rows);
    Map<MatrixXf> b_eigen((float*)rgb[2].data, rgb[2].cols, rgb[2].rows);
    printMat(r_eigen.transpose());
    printMat(g_eigen.transpose());

    int output_size = rgb[0].rows * rgb[0].cols;

    // 3. eigen 2 opencv
    Mat rr(rgb[0].rows, rgb[0].cols, CV_32FC1, r_eigen.data());
    cout<<rr.rows<<"   dd "<<rr.cols<<endl;
    cout<<rr<<endl;

    // 4. RowVectorXf init
    // Map<RowVectorXf> r_eigen1((float*)rgb[0].data, output_size);
    // Map<RowVectorXf> g_eigen1((float*)rgb[1].data, output_size);
    // Map<RowVectorXf> b_eigen1((float*)rgb[2].data, output_size);

    // 5. ArrayXf , one dim， Array和Matrix不同，Array的一般算术运算是element-wise.
    Map<ArrayXf> r_eigen1((float*)rgb[0].data, output_size);
    Map<ArrayXf> g_eigen1((float*)rgb[1].data, output_size);
    Map<ArrayXf> b_eigen1((float*)rgb[2].data, output_size);
    cout<<r_eigen1.size()<<endl;
    ArrayXf c = r_eigen1*0.2 + 45 ;
    cout << " c = "<< c<<endl;
    cout << " c = "<< c.cast<int>()<<endl;
    // 6. ArrayXXf , two dim
    ArrayXXf a(2,2);
    ArrayXXf b(2,2);
    a << 1,2,
        3,4;
    b << 5,6,
        7,8;
    cout << "a * b = " << endl << a * b << endl;
    cout << "a / b = " << endl << a / b << endl;

}

//三线性插值：apply 3dlut
void TriLinearForwardCpu_eigen(float* lut3d, Mat& image, Mat& output, const int dim, const float binsize, const int width, const int height)
{
    
    
    vector<Mat> rgb;
    split(image, rgb);

    int output_size = height * width;
    
    Map<ArrayXf> r_eigen((float*)rgb[0].data, output_size);
    Map<ArrayXf> g_eigen((float*)rgb[1].data, output_size);
    Map<ArrayXf> b_eigen((float*)rgb[2].data, output_size);

    ArrayXf r = r_eigen / binsize;
    ArrayXf g = g_eigen / binsize;
    ArrayXf b = b_eigen / binsize;
 
    ArrayXi r_id, g_id, b_id;
    r_id = r.cast<int>();
    g_id = g.cast<int>();
    b_id = b.cast<int>();
  
    // ArrayXf r_d(output_size), g_d(output_size), b_d(output_size);
    ArrayXf r_d = r - r_id.cast<float>();
    ArrayXf g_d = g - g_id.cast<float>();
    ArrayXf b_d = b - b_id.cast<float>();

    ArrayXi id000, id100, id010, id110, id001, id101, id011, id111;
    id000 = r_id + g_id * dim + b_id * dim * dim;
    id100 = r_id + 1 + g_id * dim + b_id * dim * dim;
    id010 = r_id + (g_id + 1) * dim + b_id * dim * dim;
    id110 = r_id + 1 + (g_id + 1) * dim + b_id * dim * dim;
    id001 = r_id + g_id * dim + (b_id + 1) * dim * dim;
    id101 = r_id + 1 + g_id * dim + (b_id + 1) * dim * dim;
    id011 = r_id + (g_id + 1) * dim + (b_id + 1) * dim * dim;
    id111 = r_id + 1 + (g_id + 1) * dim + (b_id + 1) * dim * dim;

    ArrayXf w000, w100, w010, w110, w001, w101, w011, w111;
    w000 = (1 - r_d) * (1 - g_d) * (1 - b_d);
    w100 = r_d * (1 - g_d) * (1 - b_d);
    w010 = (1 - r_d) * g_d * (1 - b_d);
    w110 = r_d * g_d * (1 - b_d);
    w001 = (1 - r_d) * (1 - g_d) * b_d;
    w101 = r_d * (1 - g_d) * b_d;
    w011 = (1 - r_d) * g_d * b_d;
    w111 = r_d * g_d * b_d;

    Map<ArrayXXf> lutt(lut3d, 3, DIM*DIM*DIM);
    ArrayXXf lut = lutt.transpose();
 

    r = w000 * lut(id000, 0) + w100 * lut(id100, 0) +
        w010 * lut(id010, 0) + w110 * lut(id110, 0) +
        w001 * lut(id001, 0) + w101 * lut(id101, 0) +
        w011 * lut(id011, 0) + w111 * lut(id111, 0);

    g = w000 * lut(id000, 1) + w100 * lut(id100, 1) +
            w010 * lut(id010, 1) + w110 * lut(id110, 1) +
            w001 * lut(id001, 1) + w101 * lut(id101, 1) +
            w011 * lut(id011, 1) + w111 * lut(id111, 1);

    b = w000 * lut(id000, 2) + w100 * lut(id100, 2) +
            w010 * lut(id010, 2) + w110 * lut(id110, 2) +
            w001 * lut(id001, 2) + w101 * lut(id101, 2) +
            w011 * lut(id011, 2) + w111 * lut( id111, 2);
     Mat r1(height, width, CV_32FC1, r.data());
    Mat g1(height, width, CV_32FC1, g.data());
    Mat b1(height, width, CV_32FC1, b.data());
    rgb[0] = r1;
    rgb[1] = g1;
    rgb[2] = b1;
    merge(rgb, output);
   
}


int test_trilinear_eigen()
{
    
    
    std::string file = "image/IMG_0002.tif";
    cv::Mat img0 = cv::imread(file);
    cv::imshow("window", img0);
    //cv::waitKey(0);// 按任意键在0秒后退出窗口，不写这句话是不会显示出窗口的
    cv::Mat img1;
    cv::cvtColor(img0, img1, cv::COLOR_BGR2RGB);
    cv::Mat img;
    img1.convertTo(img, CV_32F, 1.0 / 255);



    int height = img.rows;
    int width = img.cols;
    int channels = img.channels();
    printf("hello  height, width: %d,%d\n", height, width);
    cv::Vec3f color_value = img.ptr<cv::Vec3f>(0)[0];
    cout << img.ptr<float>(0)[0] << "  " << img.ptr<float>(0)[1] << "" << img.ptr<float>(0)[2] << endl;
    cout << color_value << endl;

    int N = 10;
    vector<double> all(N, 0);

    float* lut = lut3d;
    float* image = (float*)img.data;

    cv::Mat output_img = Mat::zeros(height, width, CV_32FC3); 
    float* output = (float*)output_img.data;

    const int shift = 1;
    const float binsize = 1.00001 / (DIM - 1);

    for (int i = 0; i < N; i++) {
    
    
        auto t_start = std::chrono::high_resolution_clock::now();

        struct timeval t1,t2;
        double timeuse;
        gettimeofday(&t1,NULL);

        TriLinearForwardCpu_eigen(lut, img, output_img, 17, binsize, width, height);
        gettimeofday(&t2,NULL);
        timeuse = (t2.tv_sec - t1.tv_sec) + (double)(t2.tv_usec - t1.tv_usec)/1000000.0;

        printf("timeuse:%lf\n", timeuse);

        auto t_end = std::chrono::high_resolution_clock::now();
        double elapsed_time_us = std::chrono::duration<double, std::micro>(t_end - t_start).count();
        cout << "time in micro second " << elapsed_time_us / 1000 << endl;
        all[i] = elapsed_time_us;
    }
    output_img.convertTo(output_img, CV_8UC3, 255);
    cv::cvtColor(output_img, output_img, cv::COLOR_RGB2BGR);
    cv::imshow("window3", output_img);
    cv::waitKey(0);// 按任意键在0秒后退出窗口，不写这句话是不会显示出窗口的
    return 0;
}
// Entry point: runs the trilinear LUT demo. The other demos are kept
// commented out and can be enabled by hand.
int main()
{
    //test_subset();
    //run_eigen_test2();
    //test_eigen_lookuptable();
    (void)test_trilinear_eigen();   // status deliberately ignored
    return 0;
}