Ping pong operation
Ping pong
Ping-pong operation is a data-buffering optimization technique used in FPGA design; it can be regarded as a form of pipelining. As the input data stream passes through the "input data stream selection unit", it is distributed, alternating in time, between two data buffer modules. A data buffer module can be any storage element available in the FPGA, such as a dual-port RAM, a single-port RAM or a FIFO.
The flow of ping-pong operation:
In the first buffering cycle, the input data stream is buffered into "data buffer module 1".
In the second buffering cycle, the "input data stream selection unit" is switched so that the input data stream is buffered into "data buffer module 2"; at the same time, the data that "data buffer module 1" stored during the first cycle is routed through the "output data stream selection unit" to the "data stream operation processing module" for processing.
In the third buffering cycle, the "input data stream selection unit" is switched again so that the input data stream is buffered into "data buffer module 1"; at the same time, the data that "data buffer module 2" stored during the second cycle is routed through the "output data stream selection unit" to the "data stream operation processing module" for processing. The two buffers keep alternating in this way.
The characteristics of ping-pong operation:
(1) The input data stream and the output data stream are both continuous, without any pause, so the technique is especially suitable for pipelined processing of a data stream. Ping-pong operation is therefore often used in pipelined algorithms to achieve seamless data buffering and processing.
(2) It requires twice the memory resources.
(3) It is suitable for situations where the data of one transfer cannot be processed as fast as it arrives and therefore has to be buffered.
Ping-pong operation example
Module introduction
The module uses the dual-port RAM from "RTL design (2) - dual-port RAM".
Module function: using AXI-Stream interfaces, convert 1024 32-bit data words into an 8-bit output data stream.
Sequence logic:
1st trigger == 1 pulse: write data to ram_ping
2nd trigger == 1 pulse: switch select, write data to ram_pong, and output the ram_ping data at the same time
3rd trigger == 1 pulse: switch select, write data to ram_ping, and output the ram_pong data at the same time
...
8th trigger == 1 pulse: switch select, write data to ram_pong, and output the ram_ping data at the same time
9th trigger == 1 pulse (the last one in the simulation): switch select, output the ram_pong data
Note: when the addressing needed to slice bytes out of the data becomes too complicated, a multi-dimensional reg array can be used to simplify the addressing logic.
For example: reg [7:0] data[15:0][8:0];
program
dataconvert.v
`timescale 1ns / 1ps
// Company:
// Engineer:
//
// Create Date: 2020/12/09
// Author Name: Sniper
// Module Name: dataconvert
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
// dataconvert
//
// Ping-pong width converter between two AXI-Stream interfaces.
//
// Input words of BUS_DATA_WIDTH bits are split into RAM_NUM = BUS_DATA_WIDTH/8
// byte lanes and written into one of two RAM banks while the other bank is
// read back and streamed out one byte per output beat, lowest byte first.
//
// Parameters
//   BUS_DATA_WIDTH : width of s_axis_tdata in bits (expected to be a
//                    multiple of 8; only BUS_DATA_WIDTH/8 whole bytes are
//                    stored).
//
// Ports
//   clk, rst_n     : clock and active-low asynchronous reset.
//   trigger        : pulse; when it arrives while the module is idle and a
//                    stored packet has not been output yet, the output of
//                    that packet is started.
//   busy           : high while a packet is being stored or output.
//   s_axis_*       : AXI-Stream input. s_axis_tready is tied high, so every
//                    cycle with s_axis_tvalid = 1 stores one word.
//   m_axis_*       : AXI-Stream byte output. m_axis_tlast is raised on beat
//                    number RAM_DEPTH*RAM_NUM of an output packet,
//                    independent of the length of the stored input packet.
//
// Notes
//   * With select = 0 the write enable goes to the instance named
//     u_ram_pong and the read enable to u_ram_ping; with select = 1 the
//     roles are swapped. select toggles on every falling edge of busy.
//   * The read timing (read enable -> one wait cycle -> data captured)
//     assumes that dualram has a registered read port with one cycle of
//     latency -- TODO confirm against dualram.v.
module dataconvert
#(
    parameter BUS_DATA_WIDTH = 32
)
(
    input                       clk,
    input                       rst_n,
    input                       trigger,//pulse signal
    output                      busy,
    //AXI-Stream input
    input                       s_axis_tvalid,
    output                      s_axis_tready,
    input  [BUS_DATA_WIDTH-1:0] s_axis_tdata,
    input                       s_axis_tlast,
    //AXI-Stream output
    output reg                  m_axis_tvalid,
    input                       m_axis_tready,
    output [7:0]                m_axis_tdata,
    output reg                  m_axis_tlast
);

    // Number of byte lanes per input word, depth of each bank, and number
    // of bytes in one output packet.
    localparam RAM_NUM   = BUS_DATA_WIDTH/8;
    localparam RAM_DEPTH = 1024;
    localparam OUT_BYTES = RAM_DEPTH*RAM_NUM;

    // Output state machine encoding.
    localparam [1:0] ST_READ  = 2'd0; // issue the first RAM read
    localparam [1:0] ST_WAIT  = 2'd1; // RAM read in flight
    localparam [1:0] ST_VALID = 2'd2; // word is being captured, raise tvalid
    localparam [1:0] ST_SEND  = 2'd3; // stream the RAM_NUM bytes of the word

    // The input side never applies back-pressure.
    assign s_axis_tready = 1'b1;

    // Handshake strobes shared by the blocks below.
    wire in_fire  = s_axis_tvalid & s_axis_tready;
    wire in_last  = in_fire & s_axis_tlast;
    wire out_fire = m_axis_tvalid & m_axis_tready;
    wire out_last = out_fire & m_axis_tlast;

    //count control
    // store_cnt counts completely stored packets, output_cnt completely
    // output packets; they differ while a stored packet is still pending.
    reg [7:0] store_cnt;
    reg [7:0] output_cnt;

    always @(posedge clk or negedge rst_n)
        if (!rst_n)
            store_cnt <= 0;
        else if (in_last)
            store_cnt <= store_cnt + 1;

    always @(posedge clk or negedge rst_n)
        if (!rst_n)
            output_cnt <= 0;
        else if (out_last)
            output_cnt <= output_cnt + 1;

    //output_flag control
    // An output run starts on a trigger pulse that arrives while idle with a
    // pending packet, and ends with the last output beat.
    wire start_output = trigger && (output_cnt != store_cnt) && !busy;

    reg output_flag;
    always @(posedge clk or negedge rst_n)
        if (!rst_n)
            output_flag <= 0;
        else if (start_output)
            output_flag <= 1;
        else if (out_last)
            output_flag <= 0;

    //busy control
    // store_busy is high from the first accepted input beat of a packet
    // until its tlast beat has been accepted.
    reg  store_busy;
    wire output_busy;

    always @(posedge clk or negedge rst_n)
        if (!rst_n)
            store_busy <= 0;
        else if (in_last)
            store_busy <= 0;
        else if (in_fire)
            store_busy <= 1;

    assign output_busy = output_flag;
    assign busy        = store_busy | output_busy;

    //ping-pong control
    // Swap the banks on the falling edge of busy. busy_buff is reset
    // together with select so that the edge detector starts from a known
    // value instead of X.
    reg select;
    reg busy_buff;
    always @(posedge clk or negedge rst_n)
        if (!rst_n)
        begin
            select    <= 0;
            busy_buff <= 0;
        end
        else
        begin
            busy_buff <= busy;
            if (busy_buff & ~busy)//remove busy
                select <= ~select;
        end

    // RAM interface signals, one byte lane per array element.
    reg  [$clog2(RAM_DEPTH)-1:0] ram_waddr;
    reg  [$clog2(RAM_DEPTH)-1:0] ram_raddr;
    wire [7:0] ram_wdata     [RAM_NUM-1:0];
    wire [7:0] ram_ping_rdata[RAM_NUM-1:0];
    wire [7:0] ram_pong_rdata[RAM_NUM-1:0];
    reg        ram_ren;

    //ram_waddr control
    // Advances with every accepted input beat and restarts at 0 after tlast.
    always @(posedge clk or negedge rst_n)
        if (!rst_n)
            ram_waddr <= 0;
        else if (in_last)
            ram_waddr <= 0;
        else if (in_fire)
            ram_waddr <= ram_waddr + 1;

    //data locate
    // ram_ren_buff marks the cycle in which the RAM read data is captured.
    reg ram_ren_buff;
    always @(posedge clk or negedge rst_n)
        if (!rst_n)
            ram_ren_buff <= 0;
        else
            ram_ren_buff <= ram_ren;

    // Byte buffer: loaded with the word read from the selected bank, then
    // rotated towards index 0 by one byte on every output handshake so that
    // element 0 always holds the byte currently presented on m_axis_tdata.
    // The data registers carry no reset, so they live in their own
    // clock-only block, and the copy is done element by element to stay
    // within plain Verilog.
    reg [7:0] ram_rdata_buff[RAM_NUM-1:0];
    integer k;
    always @(posedge clk)
        if (ram_ren_buff)
        begin
            for (k = 0; k < RAM_NUM; k = k + 1)
                ram_rdata_buff[k] <= select ? ram_pong_rdata[k] : ram_ping_rdata[k];
        end
        else if (out_fire)
        begin
            for (k = 0; k < RAM_NUM; k = k + 1)
                ram_rdata_buff[k] <= ram_rdata_buff[(k == RAM_NUM-1) ? 0 : k + 1];//shift right round
        end

    assign m_axis_tdata = ram_rdata_buff[0];

    //m_axis_tvalid control
    // Per word: issue a read, wait for the data, then present RAM_NUM bytes.
    // byte_idx counts the bytes of the current word, so the number of bytes
    // per word follows RAM_NUM instead of being fixed at four.
    reg [1:0]                  state;
    reg [$clog2(RAM_NUM+1)-1:0] byte_idx;
    always @(posedge clk or negedge rst_n)
        if (!rst_n)
        begin
            m_axis_tvalid <= 0;
            ram_ren       <= 0;
            state         <= ST_READ;
            byte_idx      <= 0;
        end
        else if (!output_flag)
        begin
            // No output run active: hold the idle values.
            m_axis_tvalid <= 0;
            ram_ren       <= 0;
            state         <= ST_READ;
            byte_idx      <= 0;
        end
        else
            case (state)
                ST_READ:
                begin
                    state         <= ST_WAIT;
                    m_axis_tvalid <= 0;
                    ram_ren       <= 1;
                end
                ST_WAIT:
                begin
                    state   <= ST_VALID;
                    ram_ren <= 0;
                end
                ST_VALID:
                begin
                    state         <= ST_SEND;
                    m_axis_tvalid <= 1;
                    byte_idx      <= 0;
                end
                default: // ST_SEND
                begin
                    if (out_fire)
                    begin
                        if (byte_idx == RAM_NUM-1)
                        begin
                            // Last byte of the word accepted: drop tvalid and
                            // fetch the next word right away.
                            state         <= ST_WAIT;
                            m_axis_tvalid <= 0;
                            ram_ren       <= 1;
                        end
                        else
                            byte_idx <= byte_idx + 1;
                    end
                end
            endcase

    //m_axis_tlast control
    // cnt counts accepted output beats within a packet. tlast is registered,
    // so it is set when the second-to-last beat is accepted and is therefore
    // visible during the last beat.
    reg [$clog2(OUT_BYTES)-1:0] cnt;
    always @(posedge clk or negedge rst_n)
        if (!rst_n)
        begin
            m_axis_tlast <= 0;
            cnt          <= 0;
        end
        else if (out_fire)
        begin
            m_axis_tlast <= (cnt == OUT_BYTES-2);
            if (cnt == OUT_BYTES-1)
                cnt <= 0;
            else
                cnt <= cnt + 1;
        end

    //ram_rd control
    // Restart at address 0 when an output run starts; advance once per
    // issued read.
    always @(posedge clk or negedge rst_n)
        if (!rst_n)
            ram_raddr <= 0;
        else if (start_output)
            ram_raddr <= 0;
        else if (ram_ren)
            ram_raddr <= ram_raddr + 1;

    // One pair of byte-wide RAMs per byte lane. Both banks share the
    // addresses and write data; select decides which one is written and
    // which one is read.
    genvar i;
    generate
        for (i = 0; i < RAM_NUM; i = i + 1)
        begin
            assign ram_wdata[i] = s_axis_tdata[8*(i+1)-1 -:8];

            dualram
            #(
                .WIDTH(8),
                .DEPTH(RAM_DEPTH)
            )
            u_ram_ping
            (
                .wr_clk(clk),
                .wr_addr(ram_waddr),
                .wr_data(ram_wdata[i]),
                .wr_en(s_axis_tvalid & s_axis_tready & select),
                .rd_clk(clk),
                .rd_addr(ram_raddr),
                .rd_data(ram_ping_rdata[i]),
                .rd_en(ram_ren & ~select)
            );

            dualram
            #(
                .WIDTH(8),
                .DEPTH(RAM_DEPTH)
            )
            u_ram_pong
            (
                .wr_clk(clk),
                .wr_addr(ram_waddr),
                .wr_data(ram_wdata[i]),
                .wr_en(s_axis_tvalid & s_axis_tready & ~select),
                .rd_clk(clk),
                .rd_addr(ram_raddr),
                .rd_data(ram_pong_rdata[i]),
                .rd_en(ram_ren & select)
            );
        end
    endgenerate

endmodule
tb_dataconvert.sv
`timescale 1ns / 1ps
// Company:
// Engineer:
//
// Create Date: 2020/12/10
// Author Name: Sniper
// Module Name: tb_dataconvert
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
// Self-running testbench for dataconvert.
//
// It repeatedly pulses trigger while the DUT is idle, sends a 1024-beat
// AXI-Stream packet after each of the first eight trigger pulses, applies
// random back-pressure on the byte output, dumps all signals to a VCD file
// and stops the simulation after 1_000_000 time units.
module tb_dataconvert;

    //parameter
    parameter BUS_DATA_WIDTH = 32;

    // Signals driven by the testbench.
    reg                      clk;
    reg                      rst_n;
    reg                      trigger;
    reg                      s_axis_tvalid;
    reg [BUS_DATA_WIDTH-1:0] s_axis_tdata;
    reg                      s_axis_tlast;
    reg                      m_axis_tready;

    // Signals driven by the DUT.
    wire       busy;
    wire       s_axis_tready;
    wire       m_axis_tvalid;
    wire [7:0] m_axis_tdata;
    wire       m_axis_tlast;

    // Random back-pressure: each cycle a value in 0..99 is drawn and the
    // output is ready unless the value is below percent.
    reg [7:0] rand_n;
    int       percent = 20;

    // Reset sequence followed by the endless random tready generator.
    initial
    begin
        clk           = 0;
        rst_n         = 0;
        trigger       = 0;
        m_axis_tready = 1;

        #100;
        rst_n = 1;

        $srandom(100);//random seed
        forever
        begin
            @(posedge clk);
            rand_n = $urandom_range(99);
            m_axis_tready <= (rand_n >= percent);
        end
    end

    //clock
    // Toggles every 5 time units.
    always
    begin
        #5;
        clk = ~clk;
    end

    //trigger
    // Small counter: step 0 clears trigger, steps 1 and 2 just count, step 3
    // waits until the DUT is idle and then raises trigger and restarts.
    reg [3:0] trig_state;
    always @(posedge clk or negedge rst_n)
    begin
        if (!rst_n)
        begin
            trigger    <= 0;
            trig_state <= 0;
        end
        else if (trig_state == 0)
        begin
            trigger    <= 0;
            trig_state <= 4'd1;
        end
        else if (trig_state == 3)
        begin
            if (!busy)
            begin
                trigger    <= 1;
                trig_state <= 0;
            end
        end
        else
            trig_state <= trig_state + 1;
    end

    //axis_write
    // Packet generator:
    //   step 0 - idle values, then go and wait for a trigger
    //   step 1 - on a trigger pulse start a packet, but only for the first
    //            eight pulses (trigger_cnt saturates at 8)
    //   step 2 - send 1024 beats with incrementing data, tlast on the last
    reg [7:0]  sys_state;
    reg [15:0] cnt;
    reg [7:0]  trigger_cnt;
    always @(posedge clk or negedge rst_n)
    begin
        if (!rst_n)
        begin
            s_axis_tvalid <= 0;
            s_axis_tdata  <= {8'd3,8'd2,8'd1,8'd0};
            s_axis_tlast  <= 0;
            sys_state     <= 0;
            cnt           <= 0;
            trigger_cnt   <= 0;
        end
        else if (sys_state == 0)
        begin
            s_axis_tvalid <= 0;
            s_axis_tlast  <= 0;
            cnt           <= 0;
            sys_state     <= 8'd1;
        end
        else if (sys_state == 1)
        begin
            if (trigger && (trigger_cnt < 8))
            begin
                trigger_cnt   <= trigger_cnt + 1;
                s_axis_tvalid <= 1;
                sys_state     <= 8'd2;
            end
        end
        else if (sys_state == 2)
        begin
            if (s_axis_tvalid & s_axis_tready)
            begin
                s_axis_tdata <= s_axis_tdata + 1;
                if (cnt == 1023)
                begin
                    // Final beat accepted: back to idle.
                    cnt           <= 0;
                    sys_state     <= 0;
                    s_axis_tvalid <= 0;
                    s_axis_tlast  <= 0;
                end
                else
                begin
                    cnt <= cnt + 1;
                    // Flag the upcoming beat as the last one.
                    if (cnt == 1022)
                        s_axis_tlast <= 1;
                end
            end
        end
        else
            sys_state <= 0;
    end

    //DUT
    dataconvert
    #(
        .BUS_DATA_WIDTH(BUS_DATA_WIDTH)
    )
    DUT
    (
        .clk(clk),
        .rst_n(rst_n),
        .trigger(trigger),
        .busy(busy),
        .s_axis_tvalid(s_axis_tvalid),
        .s_axis_tready(s_axis_tready),
        .s_axis_tdata(s_axis_tdata),
        .s_axis_tlast(s_axis_tlast),
        .m_axis_tvalid(m_axis_tvalid),
        .m_axis_tready(m_axis_tready),
        .m_axis_tdata(m_axis_tdata),
        .m_axis_tlast(m_axis_tlast)
    );

    // Waveform dump of the whole testbench hierarchy.
    initial
    begin
        $dumpfile("tb_dataconvert.vcd");
        $dumpvars(0,tb_dataconvert);
    end

    // Simulation time limit.
    initial
    begin
        #1_000_000;
        $finish;
    end

endmodule
Simulation results
vcs -R -sverilog dualram.v dataconvert.v tb_dataconvert.sv
Waveform with the output ready held at 1:
Waveform with a random output ready, used to verify the AXI-Stream flow control: