RTL design (3)-ping-pong operation

Ping pong

The ping-pong operation is a data-buffering optimization technique in FPGA development, and it can be regarded as another form of pipelining. The input data stream passes through an "input data stream selection unit", which distributes it alternately in time to two data buffer modules. A data buffer module can be any storage block in the FPGA, such as a dual-port RAM, a single-port RAM or a FIFO.

The flow of ping-pong operation:
In the first buffering cycle, the input data stream is buffered into "data buffer module 1";
In the second buffering cycle, the "input data stream selection unit" is switched so that the input data stream is buffered into "data buffer module 2"; at the same time, the data buffered in "data buffer module 1" during the first cycle is routed through the "output data stream selection unit" to the "data stream operation processing module" for processing;
In the third buffering cycle, the "input data stream selection unit" is switched again so that the input data stream is buffered into "data buffer module 1"; at the same time, the data buffered in "data buffer module 2" during the second cycle is routed through the "output data stream selection unit" to the "data stream operation processing module" for processing, and so on.

The characteristics of ping-pong operation:
(1) The input data stream and the output data stream are both continuous, without any pause, so the method is especially suitable for pipelined processing of a data stream. For this reason, ping-pong operation is often used in pipelined algorithms to achieve seamless data buffering and processing.
(2) Need to use double memory resources.
(3) It is suitable for situations where the data of each transfer cannot be processed in time as it arrives and therefore has to be buffered first.

Ping-pong operation example

Module introduction

This module uses the dual-port RAM from the earlier article "RTL design (2) - dual-port RAM".

Module function: using AXI-Stream interfaces, receive a packet of 1024 32-bit words and output it as a stream of 8-bit data.

Sequence logic:
1st trigger == 1 pulse: write data to ram_ping
2nd trigger == 1 pulse: switch select, write data to ram_pong, and output the data of ram_ping at the same time
3rd trigger == 1 pulse: switch select, write data to ram_ping, and output the data of ram_pong at the same time
...
8th trigger == 1 pulse: switch select, write data to ram_pong, and output the data of ram_ping at the same time
9th trigger == 1 pulse (the last one in the simulation): switch select, output the data of ram_pong only

Note: when the addressing needed to slice out pieces of the data becomes too complicated, a multi-dimensional reg array can be used to simplify the addressing logic.
Such as: reg [7:0] data[15:0][8:0];

program

dataconvert.v

`timescale 1ns / 1ps

// Company: 
// Engineer: 
// 
// Create Date: 2020/12/09
// Author Name: Sniper
// Module Name: dataconvert
// Project Name: 
// Target Devices: 
// Tool Versions: 
// Description: 
// 
// Dependencies: 
// 
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
// 


// dataconvert
//
// Ping-pong buffered AXI-Stream width converter: packets of BUS_DATA_WIDTH-bit
// words are stored into one bank of byte-wide RAMs while the previously stored
// packet is read back from the other bank and emitted one byte at a time
// (least-significant byte of each word first).
//
// Parameters
//   BUS_DATA_WIDTH : width of s_axis_tdata in bits. Must be a multiple of 8;
//                    one byte-wide RAM pair is instantiated per byte lane.
//
// Ports
//   clk, rst_n     : clock and asynchronous active-low reset.
//   trigger        : one-cycle pulse. When the module is idle and a stored
//                    packet has not been output yet, it starts the output of
//                    that packet and rewinds the read address.
//   busy           : high while a packet is being stored or output.
//   s_axis_*       : AXI-Stream input. s_axis_tready is tied high, so the
//                    module never back-pressures the source.
//   m_axis_*       : AXI-Stream byte output. m_axis_tlast marks byte number
//                    RAM_DEPTH*RAM_NUM of every output packet, independent of
//                    the length of the stored input packet.
//
// NOTE(review): the read path timing assumes dualram returns rd_data one clock
// after rd_en is sampled - confirm against dualram.v.
module dataconvert
#(
    parameter BUS_DATA_WIDTH = 32
)
(
    input clk,
    input rst_n,

    input trigger,//pulse signal
    output busy,

    //AXI-Stream input
    input s_axis_tvalid,
    output s_axis_tready,
    input [BUS_DATA_WIDTH-1:0] s_axis_tdata,
    input s_axis_tlast,

    //AXI-Stream output
    output reg m_axis_tvalid,
    input m_axis_tready,
    output [7:0] m_axis_tdata,
    output reg m_axis_tlast
);


localparam RAM_NUM = BUS_DATA_WIDTH/8;      //byte lanes per input word
localparam RAM_DEPTH = 1024;                //words per buffer bank
localparam TOTAL_BYTES = RAM_DEPTH*RAM_NUM; //bytes per output packet

//Serializer states: 0 = issue read, 1 = read in flight, 2 = capture word,
//3 .. ST_LAST_BYTE = one state per byte lane being presented on m_axis.
localparam ST_LAST_BYTE = RAM_NUM + 2;


//The input side is always ready: there is no back-pressure on the source.
assign s_axis_tready = 1;

//handshake shorthands
wire s_beat      = s_axis_tvalid & s_axis_tready;
wire s_last_beat = s_beat & s_axis_tlast;
wire m_beat      = m_axis_tvalid & m_axis_tready;
wire m_last_beat = m_beat & m_axis_tlast;


//count control: number of packets completely stored / completely output
reg [7:0] store_cnt;
reg [7:0] output_cnt;

always@(posedge clk or negedge rst_n)
    if(!rst_n)
        store_cnt <= 0;
    else if(s_last_beat)
        store_cnt <= store_cnt + 1;

always@(posedge clk or negedge rst_n)
    if(!rst_n)
        output_cnt <= 0;
    else if(m_last_beat)
        output_cnt <= output_cnt + 1;

//An output run starts on a trigger pulse while idle, and only if at least one
//stored packet has not been output yet.
wire start_output = trigger && (output_cnt != store_cnt) && !busy;


//output_flag control: high for the whole duration of an output run
reg output_flag;

always@(posedge clk or negedge rst_n)
    if(!rst_n)
        output_flag <= 0;
    else if(start_output)
        output_flag <= 1;
    else if(m_last_beat)
        output_flag <= 0;


//busy control
reg store_busy;
wire output_busy;

//store_busy rises after the first accepted input beat and falls after the
//beat carrying tlast (the tlast branch has priority).
always@(posedge clk or negedge rst_n)
    if(!rst_n)
        store_busy <= 0;
    else if(s_last_beat)
        store_busy <= 0;
    else if(s_beat)
        store_busy <= 1;

assign output_busy = output_flag;
assign busy = store_busy | output_busy;


//ping-pong control: swap the banks on every falling edge of busy
reg select;
reg busy_buff;

always@(posedge clk or negedge rst_n)
    if(!rst_n)
    begin
        select <= 0;
        //busy_buff must be reset too: an unknown power-up value of 1 would be
        //seen as a falling edge of busy and flip select right after reset.
        busy_buff <= 0;
    end
    else
    begin
        busy_buff <= busy;
        if(busy_buff & ~busy)//remove busy
            select <= ~select;
    end




reg [$clog2(RAM_DEPTH)-1:0] ram_waddr;
reg [$clog2(RAM_DEPTH)-1:0] ram_raddr;

wire [7:0] ram_wdata[RAM_NUM-1:0];
wire [7:0] ram_ping_rdata[RAM_NUM-1:0];
wire [7:0] ram_pong_rdata[RAM_NUM-1:0];
reg ram_ren;

//ram_waddr control: advance per accepted beat, rewind after the tlast beat
always@(posedge clk or negedge rst_n)
    if(!rst_n)
        ram_waddr <= 0;
    else if(s_last_beat)
        ram_waddr <= 0;
    else if(s_beat)
        ram_waddr <= ram_waddr + 1;


//data locate
reg [7:0] ram_rdata_buff[RAM_NUM-1:0];
reg ram_ren_buff;
integer k;

//ram_ren delayed by one clock: marks the cycle in which the RAM read data is
//captured into ram_rdata_buff.
always@(posedge clk or negedge rst_n)
    if(!rst_n)
        ram_ren_buff <= 0;
    else
        ram_ren_buff <= ram_ren;

//Word buffer. Deliberately not reset (pure data path) and therefore kept in
//its own block instead of sharing the asynchronous-reset block above. Both
//enables (ram_ren_buff, m_axis_tvalid) are held low by reset, so the buffer
//holds its value while rst_n is asserted.
always@(posedge clk)
    if(ram_ren_buff)
    begin
        //capture the word from the bank currently selected for reading
        for(k=0;k<RAM_NUM;k=k+1)
            ram_rdata_buff[k] <= select ? ram_pong_rdata[k] : ram_ping_rdata[k];
    end
    else if(m_beat)
    begin
        //shift right round: lane k takes lane k+1, the top lane takes lane 0
        for(k=0;k<RAM_NUM;k=k+1)
            ram_rdata_buff[k] <= ram_rdata_buff[(k == RAM_NUM-1) ? 0 : k+1];
    end

//lane 0 is always the byte presented on the output
assign m_axis_tdata = ram_rdata_buff[0];


//m_axis_tvalid control
reg [7:0] state;

always@(posedge clk or negedge rst_n)
    if(!rst_n)
    begin
        m_axis_tvalid <= 0;
        ram_ren <= 0;
        state <= 0;
    end
    else if(!output_flag)
    begin
        //no output run in progress: keep the serializer parked
        m_axis_tvalid <= 0;
        ram_ren <= 0;
        state <= 0;
    end
    else
        case(state)
        0://issue the first read of the run
        begin
            state <= state + 1;
            m_axis_tvalid <= 0;
            ram_ren <= 1;
        end
        1://read request is being sampled by the RAM
        begin
            state <= state + 1;
            ram_ren <= 0;
        end
        2://read data is captured into ram_rdata_buff at the end of this cycle
        begin
            state <= state + 1;
            m_axis_tvalid <= 1;
        end
        default:
        begin
            if(state > ST_LAST_BYTE)
                state <= 0;//not a valid state: recover
            else if(m_beat)
            begin
                //states 3..ST_LAST_BYTE: one accepted byte per state
                if(state == ST_LAST_BYTE)
                begin
                    //last byte of the word accepted: fetch the next word.
                    //After the final byte of a packet this issues one extra
                    //read before output_flag parks the FSM; its data is never
                    //presented because m_axis_tvalid is already low.
                    state <= 1;
                    m_axis_tvalid <= 0;
                    ram_ren <= 1;
                end
                else
                    state <= state + 1;
            end
        end
        endcase


//m_axis_tlast control: cnt counts accepted output bytes within a packet
reg [$clog2(TOTAL_BYTES)-1:0] cnt;

always@(posedge clk or negedge rst_n)
    if(!rst_n)
    begin
        m_axis_tlast <= 0;
        cnt <= 0;
    end
    else if(m_beat)
    begin
        //tlast is registered, so it is raised when the second-to-last byte is
        //accepted and is therefore high together with the last byte.
        if(cnt == TOTAL_BYTES-2)
            m_axis_tlast <= 1;
        else
            m_axis_tlast <= 0;

        if(cnt == TOTAL_BYTES-1)
            cnt <= 0;
        else
            cnt <= cnt + 1;
    end


//ram_rd control: rewind at the start of a run, advance per issued read
always@(posedge clk or negedge rst_n)
    if(!rst_n)
        ram_raddr <= 0;
    else if(start_output)
        ram_raddr <= 0;
    else if(ram_ren)
        ram_raddr <= ram_raddr + 1;




//One ping RAM and one pong RAM per byte lane.
//select = 1: u_ram_ping is written, u_ram_pong is read.
//select = 0: u_ram_pong is written, u_ram_ping is read.
genvar i;
generate
    for(i=0;i<RAM_NUM;i=i+1)
    begin
        assign ram_wdata[i] = s_axis_tdata[8*(i+1)-1 -:8];

        dualram
        #(
            .WIDTH(8),
            .DEPTH(RAM_DEPTH)
        )
        u_ram_ping
        (
            .wr_clk(clk),
            .wr_addr(ram_waddr),
            .wr_data(ram_wdata[i]),
            .wr_en(s_beat & select),

            .rd_clk(clk),
            .rd_addr(ram_raddr),
            .rd_data(ram_ping_rdata[i]),
            .rd_en(ram_ren & ~select)
        );

        dualram
        #(
            .WIDTH(8),
            .DEPTH(RAM_DEPTH)
        )
        u_ram_pong
        (
            .wr_clk(clk),
            .wr_addr(ram_waddr),
            .wr_data(ram_wdata[i]),
            .wr_en(s_beat & ~select),

            .rd_clk(clk),
            .rd_addr(ram_raddr),
            .rd_data(ram_pong_rdata[i]),
            .rd_en(ram_ren & select)
        );
    end
endgenerate



endmodule

tb_dataconvert.sv

`timescale 1ns / 1ps

// Company:
// Engineer:
//
// Create Date: 2020/12/10
// Author Name: Sniper
// Module Name: tb_dataconvert
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//


// tb_dataconvert
//
// Testbench for dataconvert. It sends PKT_NUM packets of PKT_WORDS words each
// on the AXI-Stream input (one packet per trigger pulse), randomly throttles
// m_axis_tready to exercise output flow control, and checks every accepted
// output byte against the expected byte stream.
module tb_dataconvert;

//parameter
parameter BUS_DATA_WIDTH = 32;

localparam PKT_WORDS = 1024;                        //input beats per packet
localparam PKT_NUM = 8;                             //packets written by the bench
localparam BYTES_PER_WORD = BUS_DATA_WIDTH/8;
localparam PKT_BYTES = PKT_WORDS*BYTES_PER_WORD;    //output bytes per packet
localparam [31:0] FIRST_WORD = {8'd3,8'd2,8'd1,8'd0};//first word sent after reset


//input
reg clk;
reg rst_n;
reg trigger;
reg s_axis_tvalid;
reg [BUS_DATA_WIDTH-1:0] s_axis_tdata;
reg s_axis_tlast;
reg m_axis_tready;


//output
wire busy;
wire s_axis_tready;
wire m_axis_tvalid;
wire [7:0] m_axis_tdata;
wire m_axis_tlast;


reg [7:0] rand_n;
//m_axis_tready is driven low on roughly `percent` out of 100 clocks.
//Set percent = 0 to keep m_axis_tready at 1 permanently.
int percent = 20;

initial
begin
    clk = 0;
    rst_n = 0;
    //trigger is intentionally not assigned here: it is owned by the trigger
    //process below, which resets it while rst_n is low.
    m_axis_tready = 1;

    #100;
    rst_n = 1;

    $srandom(100);//random seed
    forever @(posedge clk)
    begin
        rand_n = $urandom_range(99);
        m_axis_tready <= rand_n >= percent;
    end
end

//clock
always #5 clk = ~clk;


//trigger: emit a one-clock pulse whenever the DUT is idle, with at least
//three clocks between the end of one pulse and the next busy check
reg [3:0] trig_state;
always@(posedge clk or negedge rst_n)
begin
    if(!rst_n)
    begin
        trigger <= 0;
        trig_state <= 0;
    end
    else
    begin
        case(trig_state)
        0:
        begin
            trigger <= 0;
            trig_state <= trig_state + 1;
        end
        3:
        begin
            //wait here until the DUT reports idle
            if(!busy)
            begin
                trigger <= 1;
                trig_state <= 0;
            end
        end
        default: trig_state <= trig_state + 1;
        endcase
    end
end


//axis_write: after each of the first PKT_NUM trigger pulses, send one packet
//of PKT_WORDS words. s_axis_tdata keeps incrementing across packets.
reg [7:0] sys_state;
reg [15:0] cnt;
reg [7:0] trigger_cnt;
always@(posedge clk or negedge rst_n)
begin
    if(!rst_n)
    begin
        s_axis_tvalid <= 0;
        s_axis_tdata <= FIRST_WORD;
        s_axis_tlast <= 0;
        sys_state <= 0;
        cnt <= 0;
        trigger_cnt <= 0;
    end
    else
    begin
        case(sys_state)
        0://idle / clean up after a packet
        begin
            s_axis_tvalid <= 0;
            s_axis_tlast <= 0;
            cnt <= 0;
            sys_state <= sys_state + 1;
        end
        1://wait for a trigger pulse
        begin
            if(trigger)
            begin
                trigger_cnt <= trigger_cnt + 1;
                if(trigger_cnt < PKT_NUM)
                begin
                    s_axis_tvalid <= 1;
                    sys_state <= sys_state + 1;
                end
                else
                    trigger_cnt <= trigger_cnt;//all packets sent: stop counting
            end
        end
        2://stream one packet
        begin
            if(s_axis_tvalid & s_axis_tready)
            begin
                s_axis_tdata <= s_axis_tdata + 1;
                cnt <= cnt + 1;

                //raise tlast one beat early so it accompanies the last word
                if(cnt == PKT_WORDS-2)
                    s_axis_tlast <= 1;

                if(cnt == PKT_WORDS-1)
                begin
                    cnt <= 0;
                    sys_state <= 0;
                    s_axis_tvalid <= 0;
                    s_axis_tlast <= 0;
                end
            end
        end
        default: sys_state <= 0;
        endcase
    end
end


//DUT
dataconvert 
#(
    .BUS_DATA_WIDTH(BUS_DATA_WIDTH)
)
DUT
(
    .clk(clk),
    .rst_n(rst_n),
    .trigger(trigger),
    .busy(busy),
    .s_axis_tvalid(s_axis_tvalid),
    .s_axis_tready(s_axis_tready),
    .s_axis_tdata(s_axis_tdata),
    .s_axis_tlast(s_axis_tlast),
    .m_axis_tvalid(m_axis_tvalid),
    .m_axis_tready(m_axis_tready),
    .m_axis_tdata(m_axis_tdata),
    .m_axis_tlast(m_axis_tlast)
);


//scoreboard
//The stimulus sends the words FIRST_WORD, FIRST_WORD+1, ... without a gap in
//the numbering, and the DUT emits each word least-significant byte first, so
//output byte n must be byte (n % BYTES_PER_WORD) of word
//FIRST_WORD + n / BYTES_PER_WORD. m_axis_tlast must accompany the last byte
//of every PKT_BYTES-byte packet and no other byte.
int unsigned chk_bytes = 0;
int unsigned chk_errors = 0;
logic [BUS_DATA_WIDTH-1:0] exp_word;
logic [7:0] exp_byte;
logic exp_last;

always@(posedge clk)
begin
    if(rst_n && m_axis_tvalid && m_axis_tready)
    begin
        exp_word = FIRST_WORD + chk_bytes/BYTES_PER_WORD;
        exp_byte = exp_word[8*(chk_bytes%BYTES_PER_WORD) +: 8];
        exp_last = (chk_bytes % PKT_BYTES) == PKT_BYTES-1;

        if(m_axis_tdata !== exp_byte || m_axis_tlast !== exp_last)
        begin
            chk_errors = chk_errors + 1;
            $error("output byte %0d: got tdata=%h tlast=%b, expected tdata=%h tlast=%b",
                   chk_bytes, m_axis_tdata, m_axis_tlast, exp_byte, exp_last);
        end

        chk_bytes = chk_bytes + 1;
    end
end


initial
begin
    $dumpfile("tb_dataconvert.vcd");
    $dumpvars(0,tb_dataconvert);
end

//end of simulation: report the scoreboard result, then stop
initial
begin
    #1_000_000;
    if(chk_errors == 0 && chk_bytes == PKT_NUM*PKT_BYTES)
        $display("tb_dataconvert: PASS (%0d bytes checked)", chk_bytes);
    else
        $display("tb_dataconvert: FAIL (%0d of %0d bytes checked, %0d mismatches)",
                 chk_bytes, PKT_NUM*PKT_BYTES, chk_errors);
    $finish;
end

endmodule

operation result

vcs -R -sverilog dualram.v dataconvert.v tb_dataconvert.sv

The output remains ready=1:
Insert picture description here
Insert picture description here

The output ready is random, used to verify the flow control function of AXI-Stream:
Insert picture description here

Guess you like

Origin blog.csdn.net/meng1506789/article/details/110955251