HLS第二十一课（linebuffer和window buffer）

A standard convolution function applied to an image is used here
to demonstrate how the C code can negatively impact the performance which is possible from an FPGA.

/*
 * Outline of the baseline (non-streaming) separable convolution.
 *
 * Only the loop structure is shown; the loop bodies are intentionally empty.
 * border_width and the MAX_IMG_* constants are not declared in this outline.
 *
 * Note on naming: `col` is bounded by `height` and `row` by `width`, so `col`
 * selects an image line and `row` selects a sample within that line.
 *
 *   width, height  - image dimensions in samples
 *   src            - input image
 *   dst            - output image
 *   hcoeff, vcoeff - horizontal and vertical filter coefficients
 */
template<typename T, int K>
static void convolution_orig(
	int width,
	int height,
	const T *src,
	T *dst,
	const T *hcoeff,
	const T *vcoeff
	)
{
	// Intermediate image holding the result of the horizontal pass
	T local[MAX_IMG_ROWS*MAX_IMG_COLS];
	// Horizontal convolution
	HconvH:for(int col = 0; col < height; col++){
		// Label fixed: the original read "HconvWfor(", which does not compile
		HconvW:for(int row = border_width; row < width - border_width; row++){
			Hconv:for(int i = - border_width; i <= border_width; i++){
			}
		}
	}
	
	// Vertical convolution
	VconvH:for(int col = border_width; col < height - border_width; col++){
		VconvW:for(int row = 0; row < width; row++){
			Vconv:for(int i = - border_width; i <= border_width; i++){
			}
		}
	}

	// Border pixels
	Top_Border:for(int col = 0; col < border_width; col++){
	}
	Side_Border:for(int col = border_width; col < height - border_width; col++){
	}
	Bottom_Border:for(int col = height - border_width; col < height; col++){
	}
}

The C code for performing this operation is shown below.

	const int conv_size = K;
	// Half the kernel size, rounded down: the number of samples the kernel
	// reaches on either side of the centre sample
	const int border_width = int(conv_size / 2);
#ifndef __SYNTHESIS__
	// NOTE(review): heap allocation for C simulation; the matching delete[]
	// is not visible in this excerpt - confirm it exists at the end of the function
	T * const local = new T[MAX_IMG_ROWS*MAX_IMG_COLS];
#else // Static storage allocation for HLS, dynamic otherwise
	T local[MAX_IMG_ROWS*MAX_IMG_COLS];
#endif

	// local is accumulated into with += below, so it must start at zero
	Clear_Local:for(int i = 0; i < height * width; i++){
		local[i]=0;
	}
	
	// Horizontal convolution
	HconvH:for(int col = 0; col < height; col++){
		// Label fixed: the original read "HconvWfor(", which does not compile.
		// The first and last border_width samples of each line are skipped so
		// that src[pixel + i] never leaves the current line.
		HconvW:for(int row = border_width; row < width - border_width; row++){
			int pixel = col * width + row; // linear index of (line col, sample row)
			Hconv:for(int i = - border_width; i <= border_width; i++){
				local[pixel] += src[pixel + i] * hcoeff[i + border_width];
			}
		}
	}

After the vertical convolution, the image is now smaller than the source image src due to both
the horizontal and vertical border effects.

	// dst is accumulated into with += below, so it must start at zero
	Clear_Dst:for(int i = 0; i < height * width; i++){
		dst[i]=0;
	}
	// Vertical convolution
	// Only lines border_width .. height - border_width - 1 are computed, so
	// local[pixel + offset] always addresses a line inside the image.
	VconvH:for(int col = border_width; col < height - border_width; col++){
		VconvW:for(int row = 0; row < width; row++){
			// Linear index of (line col, sample row)
			int pixel = col * width + row;
			Vconv:for(int i = - border_width; i <= border_width; i++){
				// Move i whole lines up (negative) or down (positive)
				int offset = i * width;
				dst[pixel] += local[pixel + offset] * vcoeff[i + border_width];
			}
		}
	}

The border region is populated with the nearest valid value.

	// Linear index of the start of the first line processed by the vertical pass
	int border_width_offset = border_width * width;
	// Linear index of the start of the last line processed by the vertical pass
	int border_height_offset = (height - border_width - 1) * width;
	// Border pixels
	// Top border lines: copy values from line border_width
	Top_Border:for(int col = 0; col < border_width; col++){
		int offset = col * width;
		// Top-left corner: replicate the sample at (border_width, border_width)
		for(int row = 0; row < border_width; row++){
			int pixel = offset + row;
			dst[pixel] = dst[border_width_offset + border_width];
		}
		
		// Top edge: copy the sample in the same position from line border_width
		for(int row = border_width; row < width - border_width; row++){
			int pixel = offset + row;
			dst[pixel] = dst[border_width_offset + row];
		}
		
		// Top-right corner: replicate the sample at position width - border_width - 1 of that line
		for(int row = width - border_width; row < width; row++){
			int pixel = offset + row;
			dst[pixel] = dst[border_width_offset + width - border_width - 1];
		}
	}
	
	
	// Middle lines: only the left and right edges need filling
	Side_Border:for(int col = border_width; col < height - border_width; col++){
		int offset = col * width;
		
		// Left edge: replicate the sample at position border_width of the same line
		for(int row = 0; row < border_width; row++){
			int pixel = offset + row;
			dst[pixel] = dst[offset + border_width];
		}
		
		// Right edge: replicate the sample at position width - border_width - 1 of the same line
		for(int row = width - border_width; row < width; row++){
			int pixel = offset + row;
			dst[pixel] = dst[offset + width - border_width - 1];
		}
	}
	
	// Bottom border lines: copy values from line height - border_width - 1
	Bottom_Border:for(int col = height - border_width; col < height; col++){
		int offset = col * width;
		
		// Bottom-left corner
		for(int row = 0; row < border_width; row++){
			int pixel = offset + row;
			dst[pixel] = dst[border_height_offset + border_width];
		}
		
		// Bottom edge: copy the sample in the same position from that line
		for(int row = border_width; row < width - border_width; row++){
			int pixel = offset + row;
			dst[pixel] = dst[border_height_offset + row];
		}
		
		// Bottom-right corner
		for(int row = width - border_width; row < width; row++){
			int pixel = offset + row;
			dst[pixel] = dst[border_height_offset + width - border_width - 1];
		}
	}

The code suffers from the same repeated access for data.

+++++++++++++++++++++++++++++++++++++++++++++
The key to implementing the convolution example reviewed in the previous section as a high-performance design with minimal resources is to consider how the FPGA implementation will be used in the overall system.
Maximize the flow of data through the system.

Maximize the reuse of data. Use local caches to ensure there are no requirements to re-read
data and the incoming data can keep flowing.

One of the first enhancements which can be made to the earlier code is to use the HLS stream
construct, typically referred to as an hls::stream.
In the C code, the hls::stream behaves like a FIFO of infinite depth.

If the data from an hls::stream is required again, it must be cached. This is another good practice
when writing code to be synthesized on an FPGA.

++++++++++++++++++++++++++++++++++++++++++++++
With an hls::stream construct the outline of the new optimized code is as follows:

/*
 * Outline of the streaming version of the convolution.
 *
 * Only the loop structure is shown; the loop bodies are empty and vconv_xlim
 * is not declared in this outline.
 *
 *   width, height  - image dimensions in samples
 *   src            - input sample stream
 *   dst            - output sample stream
 *   hcoeff, vcoeff - horizontal and vertical filter coefficients
 */
template<typename T, int K>
static void convolution_strm(
	int width,
	int height,
	hls::stream<T> &src,
	hls::stream<T> &dst,
	const T *hcoeff,
	const T *vcoeff
	)
{
	// Internal streams: hconv is declared for the horizontal pass output,
	// vconv for the vertical pass output
	hls::stream<T> hconv("hconv");
	hls::stream<T> vconv("vconv");
	
	// These assertions let HLS know the upper bounds of loops
	assert(height < MAX_IMG_ROWS);
	assert(width < MAX_IMG_COLS);
	assert(vconv_xlim < MAX_IMG_COLS - (K - 1));
	// Horizontal convolution
	HConvH:for(int col = 0; col < height; col++) {
		HConvW:for(int row = 0; row < width; row++) {
			HConv:for(int i = 0; i < K; i++) {
			}
		}
	}
	
	// Vertical convolution
	VConvH:for(int col = 0; col < height; col++) {
		VConvW:for(int row = 0; row < vconv_xlim; row++) {
			VConv:for(int i = 0; i < K; i++) {
			}
		}
	}
	
	// Border handling: iterates over the full height x width output image
	Border:for (int i = 0; i < height; i++) {
		for (int j = 0; j < width; j++) {
		}
	}
}

The input and output data is now modelled as hls::streams.
Instead of a single local array of size HEIGHT*WIDTH there are two internal hls::streams used
to save the output of the horizontal and vertical convolutions.

The algorithm must use the K previous samples to compute the convolution result; it therefore copies each sample into a temporary cache, hwin.
The algorithm keeps reading input samples and caching them into hwin.

Each time it reads a new sample, it pushes an unneeded sample out of hwin.
Only the last K samples are stored in hwin.

The code to perform these operations is shown below

	// Horizontal convolution
	HConvW:for(int row = 0; row < width; row++) {
		HconvW:for(int row = border_width; row < width - border_width; row++){
			T in_val = src.read();
			T out_val = 0;
			
			HConv:for(int i = 0; i < K; i++) {
				hwin[i] = i < K - 1 ? hwin[i + 1] : in_val;
				out_val += hwin[i] * hcoeff[i];
			}
			
			if (row >= K - 1)
				hconv << out_val;
		}
	}

Note the use of the temporary variable out_val to perform the convolution calculation. This variable is set to zero before the calculation is performed, negating the need to spend 2 million clock cycles to reset the values, as in the previous example.

In a CPU architecture, conditional or branch operations are often avoided. When the program
needs to branch it loses any instructions stored in the CPU fetch pipeline.

In an FPGA architecture, a separate path already exists in the hardware for each conditional branch and there is no performance penalty associated with branching inside a pipelined task.
It is simply a case of selecting which branch to use.

The outputs are stored in the hls::stream hconv for use by the vertical convolution loop.

The vertical convolution represents a challenge to the streaming data model preferred by an
FPGA. The data must be accessed by column but you do not wish to store the entire image. The
solution is to use line buffers.

Once again the samples are read in a streaming manner, but this time from the hls::stream hconv.
The algorithm requires at least K-1 lines of data before it can process the first sample.

A line buffer allows K-1 lines of data to be stored.
Each time a new sample is read, another sample is pushed out the line buffer.

An interesting point to note here is that the newest sample is used in the calculation first, and only then is it stored into the line buffer and the oldest sample ejected.
This ensures only K-1 lines are required to be cached, rather than K lines.

	// Vertical convolution
	VConvH:for(int col = 0; col < height; col++) {
		VConvW:for(int row = 0; row < vconv_xlim; row++) {
		// Directive asserting that accesses to linebuf carry no dependence between loop iterations
		#pragma HLS DEPENDENCE variable=linebuf inter false
		#pragma HLS PIPELINE
			T in_val = hconv.read();
			T out_val = 0;
			
			VConv:for(int i = 0; i < K; i++) {
				// The first K-1 taps come from the line buffer at this position;
				// the last tap is the sample just read
				T vwin_val = i < K - 1 ? linebuf[i][row] : in_val;
				out_val += vwin_val * vcoeff[i];
				// Move each value up one buffer line; the value in linebuf[0][row]
				// is overwritten and the new sample ends up in linebuf[K-2][row]
				if (i > 0)
					linebuf[i - 1][row] = vwin_val;
			}
			
			// Nothing is written for the first K-1 lines
			if (col >= K - 1)
				vconv << out_val;
		}
	}

The final step creates the border region; the code below shows how the border samples are aligned into the image.
Each sample is read from the vconv output from the vertical convolution.
The sample is then cached as one of 4 possible pixel types.
The sample is then written to the output stream.

	Border:for (int i = 0; i < height; i++) {
		for (int j = 0; j < width; j++) {
			T pix_in, l_edge_pix, r_edge_pix, pix_out;
			#pragma HLS PIPELINE
		
			if (i == 0 || (i > border_width && i < height - border_width)) {
				if (j < width - (K - 1)) {
					pix_in = vconv.read();
					borderbuf[j] = pix_in;
				}
				
				if (j == 0) {
					l_edge_pix = pix_in;
				}
				
				if (j == width - K) {
					r_edge_pix = pix_in;
				}
			}
			
			if (j <= border_width) {
				pix_out = l_edge_pix;
			} 
			else if (j >= width - border_width - 1) {
				pix_out = r_edge_pix;
			}
			else {
				pix_out = borderbuf[j - border_width];
			}
			
			dst << pix_out;
		}
	}

A notable difference with this new code is the extensive use of conditionals inside the tasks. This
allows the task, once it is pipelined, to continuously process data and the result of the
conditionals does not impact the execution of the pipeline:
the result will impact the output values, but the pipeline will keep processing so long as input samples are available.

++++++++++++++++++++++++++++++++++++++++++++
The final code for this FPGA-friendly algorithm has the following optimization directives used.

/*
 * Outline of the streaming convolution with its optimization directives.
 *
 * Only the loop structure and the pragmas are shown; the loop bodies are
 * empty, and linebuf and vconv_xlim are not declared in this outline.
 *
 *   width, height  - image dimensions in samples
 *   src            - input sample stream
 *   dst            - output sample stream
 *   hcoeff, vcoeff - horizontal and vertical filter coefficients
 */
template<typename T, int K>
static void convolution_strm(
	int width,
	int height,
	hls::stream<T> &src,
	hls::stream<T> &dst,
	const T *hcoeff,
	const T *vcoeff
	)
{
// Function-level directives: DATAFLOW on the function body, and complete
// partitioning of linebuf along its first dimension
#pragma HLS DATAFLOW
#pragma HLS ARRAY_PARTITION variable=linebuf dim=1 complete
	hls::stream<T> hconv("hconv");
	hls::stream<T> vconv("vconv");
	
	// These assertions let HLS know the upper bounds of loops
	assert(height < MAX_IMG_ROWS);
	assert(width < MAX_IMG_COLS);
	assert(vconv_xlim < MAX_IMG_COLS - (K - 1));
	
	// Horizontal convolution
	HConvH:for(int col = 0; col < height; col++) {
		HConvW:for(int row = 0; row < width; row++) {
		// Pipelined at the per-sample loop level
		#pragma HLS PIPELINE
			HConv:for(int i = 0; i < K; i++) {
			}
		}
	}
	
	// Vertical convolution
	VConvH:for(int col = 0; col < height; col++) {
		VConvW:for(int row = 0; row < vconv_xlim; row++) {
		#pragma HLS PIPELINE
		// Directive asserting that accesses to linebuf carry no dependence between loop iterations
		#pragma HLS DEPENDENCE variable=linebuf inter false
			VConv:for(int i = 0; i < K; i++) {
			}
		}
	}
	
	// Border handling
	Border:for (int i = 0; i < height; i++) {
		for (int j = 0; j < width; j++) {
		#pragma HLS PIPELINE
		}
	}
}

Each of the tasks is pipelined at the sample level.
The line buffer is fully partitioned into registers
to ensure there are no read or write limitations due to insufficient block RAM ports.
The line buffer also requires a dependence directive.
The hls::streams are automatically implemented as FIFOs with 1 element.

HLS第二十一课（linebuffer和window buffer）

猜你喜欢