SIMD-Accelerated Computing Matrix (Principles of Computer Organization, Experiment 5)

Introduction

Document content of experiment 5: https://shimo.im/docs/4iV7Rw1nxLgeMsBe/

The following only implements the SIMD functions of the first part.

  • a) Introduction
    Parallelization is a major trend in computer hardware. However, gone are the days when programmers could make programs run faster without making any changes and relying solely on the work of computer architecture experts, compiler designers, and chip engineers. Therefore, if you want to make the program run faster, software designers should master the basic ideas of parallel programming.
    In this part of the lab, you will write parallel code to solve problems using SIMD intrinsics.
  • b) Getting familiar with the experimental code
    You will get two source files randomized.cpp and common.h, you can compile randomized.cpp on your familiar development platform and observe the running effect. The function sum() you need to optimize is placed in the common.h header file. The core function of this code is to conditionally sum an integer array and time the program running time. Since the SIMD versions of the core functions are not yet implemented, you should be able to see inconsistencies between the two versions.
  • c) Use the SIMD intrinsic function to optimize the sum function
    Find the function sum_simd() in the common.h header file, you need to optimize it according to the following code:

(Note: You only need to optimize the inner loop body.)
During the optimization process, you may use the following intrinsic functions:
insert image description here

  • d) some helpful hints

    • i. __m128i is the data type used by Intel to declare 128-bit vectors. We will use a variable of type __m128i to store four 32-bit integers;
    • ii. The code provides a variable called _127, which contains four copies of the number 127, and you can use it for comparison;
    • iii. Do not use the store function (_mm_storeu_si128) until you have finished computing the inner loop body. This function is computationally expensive, and if you call it at the end of every loop iteration, you'll find your code doesn't perform well;
    • iv. When accessing a vector of type __m128i, it is not recommended that you directly access each element in it. A better way is to store the vector of type __m128i into an ordinary array with the storeu method, and then access the elements of that array individually;
    • e) Experimental results and experimental reports
      When you finish the code, you should be able to observe the following effects:
  • i. The calculation results of the SIMD version of the code are consistent with those of the unoptimized code;

  • ii. The SIMD version of the code should be faster than the unoptimized code (by how much, record the performance speedup, and analyze why you think it is the correct result)

    Provide your code, running results, and your thinking process in your experiment report, and upload a short demonstration video with explanation (no points are awarded if only screenshots of the code and experimental results are given without the analysis process).

  • The source code given by the experiment common.h:

#ifndef COMMON_H
#define COMMON_H

#include <x86intrin.h>

#define NUM_ELEMS ((1 << 15) + 10)
#define OUTER_ITERATIONS (1 << 15)

/* Do not modify this function */
/*
 * Scalar reference implementation used as the benchmark baseline.
 *
 * Scans the whole array OUTER_ITERATIONS times, adding every element that is
 * >= 128 to a running total, and prints the time spent as measured by clock().
 *
 * vals:    array of NUM_ELEMS values to scan.
 * returns: the total accumulated over all passes.
 */
long long int sum(unsigned int vals[NUM_ELEMS]) {
    
    
	clock_t start = clock();

	long long int sum = 0;
	// The outer loop only repeats the same pass so that the run is long
	// enough to time.
	for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
    
    
		for(unsigned int i = 0; i < NUM_ELEMS; i++) {
    
    
			// Conditional accumulate: only elements >= 128 contribute.
			if(vals[i] >= 128) {
    
    
				sum += vals[i];
			}
		}
	}
	clock_t end = clock();
	// Convert clock ticks to seconds before printing.
	printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
	return sum;
}

/* Do not modify this function */
/*
 * Same computation as sum(), with the inner loop manually unrolled four
 * elements at a time.
 *
 * Scans the whole array OUTER_ITERATIONS times, adding every element that is
 * >= 128 to a running total, and prints the time spent as measured by clock().
 *
 * vals:    array of NUM_ELEMS values to scan.
 * returns: the total accumulated over all passes.
 */
long long int sum_unrolled(unsigned int vals[NUM_ELEMS]) {
    
    
	clock_t start = clock();
	long long int sum = 0;

	for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
    
     
		// Main loop: handles the elements in groups of four.
		for(unsigned int i = 0; i < NUM_ELEMS / 4 * 4; i += 4) {
    
    
			if(vals[i] >= 128) sum += vals[i];
			if(vals[i + 1] >= 128) sum += vals[i + 1];
			if(vals[i + 2] >= 128) sum += vals[i + 2];
			if(vals[i + 3] >= 128) sum += vals[i + 3];
		}

		//This is what we call the TAIL CASE
		//For when NUM_ELEMS isn't a multiple of 4
		//NONTRIVIAL FACT: NUM_ELEMS / 4 * 4 is the largest multiple of 4 that does not exceed NUM_ELEMS
		for(unsigned int i = NUM_ELEMS / 4 * 4; i < NUM_ELEMS; i++) {
    
    
			if (vals[i] >= 128) {
    
    
				sum += vals[i];
			}
		}
	}
	clock_t end = clock();
	// Convert clock ticks to seconds before printing.
	printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
	return sum;
}

/*
 * Exercise skeleton for the SIMD version of sum().
 *
 * As given, the body of the outer loop is empty, so the function returns 0
 * and only prints the elapsed time measured by clock().
 *
 * vals:    array of NUM_ELEMS values to scan.
 * returns: the value left in result (0 until the loop body is filled in).
 */
long long int sum_simd(unsigned int vals[NUM_ELEMS]) {
    
    
	clock_t start = clock();
	
	// This statement builds a vector holding several copies of 127.
	// Question to think about: why do you need it?
	__m128i _127 = _mm_set1_epi32(127);		
	
	long long int result = 0;// store the final computed result here 
	
	// Do not modify any code above this line!!! 
	
	for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
    
    
		/* Your code starts here */

		/* Your code ends here */

	}
	clock_t end = clock();
	// Convert clock ticks to seconds before printing.
	printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
	return result;
}

/* Do not modify this function */
/*
 * Three-way comparison of the unsigned int values that a and b point to.
 *
 * returns: 0 when the values are equal, -1 when *a is smaller than *b,
 *          and 1 when *a is larger than *b.
 */
int int_comparator(const void* a, const void* b) {
	const unsigned int lhs = *(const unsigned int *)a;
	const unsigned int rhs = *(const unsigned int *)b;

	/* Each relational test yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}

#endif

  • Source code of randomized.cpp:
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include "common.h"

/* *** Do not modify this file! Only the contents of common.h may be changed! *** */

/*
 * Benchmark driver: fills an array with pseudo-random values in [0, 255],
 * runs the scalar, unrolled and SIMD summations in that order, prints each
 * result, and reports a mismatch between the SIMD result and the scalar one.
 */
int main(int argc, char* argv[]) {
	(void)argc;
	(void)argv;

	unsigned int vals[NUM_ELEMS];

	printf("Let's generate a randomized array.\n");
	unsigned int *const vals_end = vals + NUM_ELEMS;
	for (unsigned int *slot = vals; slot != vals_end; ++slot) {
		*slot = rand() % 256;
	}

	/* Each summation prints its own timing line before its result is shown. */
	printf("Starting randomized sum.\n");
	const long long int reference = sum(vals);
	printf("Sum: %lld\n", reference);

	printf("Starting randomized unrolled sum.\n");
	const long long int unrolled = sum_unrolled(vals);
	printf("Sum: %lld\n", unrolled);

	printf("Starting randomized SIMD sum.\n");
	const long long int simd = sum_simd(vals);
	printf("Sum: %lld\n", simd);

	if (simd == reference) {
		return 0;
	}

	printf("OH NO! SIMD sum %lld doesn't match reference sum %lld!\n", simd, reference);
	return 0;
}

The assignment only allows modifying common.h.

The code I implemented is as follows:

#ifndef COMMON_H
#define COMMON_H

#include <x86intrin.h>
#include <stdint.h>
#define NUM_ELEMS ((1 << 15) + 10)
#define OUTER_ITERATIONS (1 << 15)

/* Do not modify this function */
/*
 * Scalar reference implementation used as the benchmark baseline.
 *
 * Scans the whole array OUTER_ITERATIONS times, adding every element that is
 * >= 128 to a running total, and prints the time spent as measured by clock().
 *
 * vals:    array of NUM_ELEMS values to scan.
 * returns: the total accumulated over all passes.
 */
long long int sum(unsigned int vals[NUM_ELEMS]) {
    
    
	clock_t start = clock();

	long long int sum = 0;
	// The outer loop only repeats the same pass so that the run is long
	// enough to time.
	for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
    
    
		for(unsigned int i = 0; i < NUM_ELEMS; i++) {
    
    
			// Conditional accumulate: only elements >= 128 contribute.
			if(vals[i] >= 128) {
    
    
				sum += vals[i];
			}
		}
	}
	clock_t end = clock();
	// Convert clock ticks to seconds before printing.
	printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
	return sum;
}

/* Do not modify this function */
/*
 * Same computation as sum(), with the inner loop manually unrolled four
 * elements at a time.
 *
 * Scans the whole array OUTER_ITERATIONS times, adding every element that is
 * >= 128 to a running total, and prints the time spent as measured by clock().
 *
 * vals:    array of NUM_ELEMS values to scan.
 * returns: the total accumulated over all passes.
 */
long long int sum_unrolled(unsigned int vals[NUM_ELEMS]) {
    
    
	clock_t start = clock();
	long long int sum = 0;

	for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
    
     
		// Main loop: handles the elements in groups of four.
		for(unsigned int i = 0; i < NUM_ELEMS / 4 * 4; i += 4) {
    
    
			if(vals[i] >= 128) sum += vals[i];
			if(vals[i + 1] >= 128) sum += vals[i + 1];
			if(vals[i + 2] >= 128) sum += vals[i + 2];
			if(vals[i + 3] >= 128) sum += vals[i + 3];
		}

		//This is what we call the TAIL CASE
		//For when NUM_ELEMS isn't a multiple of 4
		//NONTRIVIAL FACT: NUM_ELEMS / 4 * 4 is the
		// largest multiple of 4 that does not exceed NUM_ELEMS
		for(unsigned int i = NUM_ELEMS / 4 * 4; i < NUM_ELEMS; i++) {
    
    
			if (vals[i] >= 128) {
    
    
				sum += vals[i];
			}
		}
	}
	clock_t end = clock();
	// Convert clock ticks to seconds before printing.
	printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
	return sum;
}

/*
 * SIMD version of sum(): processes four 32-bit elements per step with SSE2
 * intrinsics.
 *
 * Scans the whole array OUTER_ITERATIONS times, adding every element that is
 * >= 128 to a running total, and prints the time spent as measured by clock().
 * The comparison is unsigned and the partial sums are kept in 64-bit lanes, so
 * the result agrees with the scalar loop `if (vals[i] >= 128) sum += vals[i]`
 * for any unsigned input, not only for small values.
 *
 * vals:    array of NUM_ELEMS values to scan.
 * returns: the total accumulated over all passes.
 */
long long int sum_simd(unsigned int vals[NUM_ELEMS]) {
	clock_t start = clock();

	// This statement builds a vector holding four copies of 127.
	// Question to think about: why do you need it?
	// It implements the threshold test: (x > 127) is the same as (x >= 128).
	__m128i _127 = _mm_set1_epi32(127);
	// Used to test whether a number is >= 128.
	long long int result = 0; // the final result is accumulated here

	// Do not modify any code above this line!!!

	// SSE2 only provides a signed 32-bit "greater than". Flipping the sign bit
	// of both operands maps unsigned ordering onto signed ordering, so elements
	// with the top bit set are still recognised as larger than 127.
	const __m128i sign_bit = _mm_set1_epi32(INT32_MIN);
	const __m128i threshold = _mm_xor_si128(_127, sign_bit);
	const __m128i zero = _mm_setzero_si128();

	for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
		/* Your code starts here */

		// Two accumulators of two 64-bit lanes each: acc_lo collects elements
		// 0 and 1 of every group of four, acc_hi collects elements 2 and 3.
		__m128i acc_lo = zero;
		__m128i acc_hi = zero;

		for(unsigned int i = 0; i < NUM_ELEMS / 4 * 4; i += 4) {
			// Unaligned load of vals[i .. i+3].
			const __m128i group = _mm_loadu_si128((const __m128i *)(vals + i));

			// Each lane of the mask is all ones where the element is > 127
			// and all zeros otherwise.
			const __m128i mask =
				_mm_cmpgt_epi32(_mm_xor_si128(group, sign_bit), threshold);

			// AND keeps qualifying elements unchanged and turns the others
			// into 0, which adds nothing to the total.
			const __m128i kept = _mm_and_si128(group, mask);

			// Interleaving with zero widens each 32-bit element to 64 bits so
			// the lane sums cannot wrap around at 32 bits.
			acc_lo = _mm_add_epi64(acc_lo, _mm_unpacklo_epi32(kept, zero));
			acc_hi = _mm_add_epi64(acc_hi, _mm_unpackhi_epi32(kept, zero));
		}

		// Store once, after the inner loop, into an ordinary array and read
		// the lanes from there instead of casting the vector's address.
		uint64_t lanes[2];
		_mm_storeu_si128((__m128i *)lanes, _mm_add_epi64(acc_lo, acc_hi));
		result += (long long int)(lanes[0] + lanes[1]);

		// Tail case: elements left over when NUM_ELEMS is not a multiple of 4.
		for(unsigned int i = NUM_ELEMS / 4 * 4; i < NUM_ELEMS; i++) {
			if (vals[i] >= 128) {
				result += vals[i];
			}
		}
		/* Your code ends here */
	}
	clock_t end = clock();
	// Convert clock ticks to seconds before printing.
	printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
	return result;
}

/* Do not modify this function */
/*
 * Three-way comparison of the unsigned int values that a and b point to.
 *
 * returns: 0 when the values are equal, -1 when *a is smaller than *b,
 *          and 1 when *a is larger than *b.
 */
int int_comparator(const void* a, const void* b) {
	const unsigned int lhs = *(const unsigned int *)a;
	const unsigned int rhs = *(const unsigned int *)b;

	/* Each relational test yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}

#endif

Running result

insert image description here

Nice

insert image description here

Guess you like

Origin blog.csdn.net/horizon08/article/details/110678605