diff options
| -rw-r--r-- | gfx_hw.tcl | 3 | ||||
| -rw-r--r-- | rtl/gfx/fold_flow.sv | 55 | ||||
| -rw-r--r-- | rtl/gfx/fp_add.sv | 11 | ||||
| -rw-r--r-- | rtl/gfx/fp_mul.sv | 11 | ||||
| -rw-r--r-- | rtl/gfx/gfx.sv | 4 | ||||
| -rw-r--r-- | rtl/gfx/gfx_defs.sv | 7 | ||||
| -rw-r--r-- | rtl/gfx/horizontal_fold.sv | 68 | ||||
| -rw-r--r-- | rtl/gfx/mat_mat_mul.sv | 70 | ||||
| -rw-r--r-- | rtl/gfx/mat_vec_mul.sv | 29 | ||||
| -rw-r--r-- | rtl/gfx/pipeline_flow.sv | 40 | ||||
| -rw-r--r-- | rtl/gfx/pipelined_flow.sv | 26 | ||||
| -rw-r--r-- | rtl/gfx/vec_dot.sv | 15 |
12 files changed, 227 insertions, 112 deletions
@@ -46,7 +46,8 @@ add_fileset_file gfx_defs.sv SYSTEM_VERILOG PATH rtl/gfx/gfx_defs.sv
 add_fileset_file horizontal_fold.sv SYSTEM_VERILOG PATH rtl/gfx/horizontal_fold.sv
 add_fileset_file mat_mat_mul.sv SYSTEM_VERILOG PATH rtl/gfx/mat_mat_mul.sv
 add_fileset_file mat_vec_mul.sv SYSTEM_VERILOG PATH rtl/gfx/mat_vec_mul.sv
-add_fileset_file pipelined_flow.sv SYSTEM_VERILOG PATH rtl/gfx/pipelined_flow.sv
+add_fileset_file pipeline_flow.sv SYSTEM_VERILOG PATH rtl/gfx/pipeline_flow.sv
+add_fileset_file fold_flow.sv SYSTEM_VERILOG PATH rtl/gfx/fold_flow.sv
 add_fileset_file vec_dot.sv SYSTEM_VERILOG PATH rtl/gfx/vec_dot.sv
 
 
diff --git a/rtl/gfx/fold_flow.sv b/rtl/gfx/fold_flow.sv
new file mode 100644
index 0000000..718786e
--- /dev/null
+++ b/rtl/gfx/fold_flow.sv
@@ -0,0 +1,55 @@
+`include "gfx/gfx_defs.sv"
+
+module fold_flow // Flow control for the folding adder: one 2-bit round tag per `FP_ADD_STAGES slot
+(
+	input  logic clk,
+	             rst_n,
+
+	input  logic in_valid,
+	             out_ready,
+
+	output logic in_ready,
+	             out_valid,
+	             stall,
+	             feedback,
+	             feedback_last
+);
+
+	index4 rounds[`FP_ADD_STAGES], last_round;
+
+	assign stall = out_valid && !out_ready; // a finished result is waiting on the consumer
+	assign in_ready = !stall && !feedback; // no new input while stalled or while a tag is fed back
+	assign out_valid = last_round == `INDEX4_MAX;
+
+	assign feedback = last_round[1] ^ last_round[0]; // high for tags 01 and 10
+	assign feedback_last = last_round[1]; // high for tags 10 and 11
+
+	assign last_round = rounds[`FP_ADD_STAGES - 1];
+
+	always_ff @(posedge clk or negedge rst_n)
+		if (!rst_n)
+			rounds[0] <= `INDEX4_MIN;
+		else if (!stall)
+			unique case (last_round) // the tag leaving the last slot picks the tag entering slot 0
+				2'b01:
+					rounds[0] <= 2'b10;
+
+				2'b10:
+					rounds[0] <= 2'b11;
+
+				2'b00, 2'b11:
+					rounds[0] <= {1'b0, in_valid}; // 01 when a new input is offered, else 00
+			endcase
+
+	genvar i;
+	generate
+		for (i = 1; i < `FP_ADD_STAGES; ++i) begin: pipeline
+			always_ff @(posedge clk or negedge rst_n)
+				if (!rst_n)
+					rounds[i] <= `INDEX4_MIN;
+				else if (!stall) // not in_ready: that is low during feedback, which would freeze last_round at 01/10
+					rounds[i] <= rounds[i - 1];
+		end
+	endgenerate
+
+endmodule
diff --git a/rtl/gfx/fp_add.sv b/rtl/gfx/fp_add.sv
index 6cf4874..fad4768 100644
--- a/rtl/gfx/fp_add.sv
+++ b/rtl/gfx/fp_add.sv
@@ -3,25 +3,18 @@
 module fp_add
 (
 	input  logic clk,
-	             rst_n,
 
-	input  logic start,
 	input  fp    a,
 	             b,
+	input  logic stall,
 
-	output logic done,
 	output fp    q
 );
 
-	pipelined_flow #(.STAGES(`FP_ADD_STAGES)) stages
-	(
-		.*
-	);
-
 `ifndef VERILATOR
 	ip_fp_add ip_add
 	(
-		.en(1),
+		.en(!stall), // hold the adder IP while the flow controller stalls
 		.areset(0),
 		.*
 	);
diff --git a/rtl/gfx/fp_mul.sv b/rtl/gfx/fp_mul.sv
index c5aa56a..90d30fb 100644
--- a/rtl/gfx/fp_mul.sv
+++ b/rtl/gfx/fp_mul.sv
@@ -3,25 +3,18 @@
 module fp_mul
 (
 	input  logic clk,
-	             rst_n,
 
-	input  logic start,
 	input  fp    a,
 	             b,
+	input  logic stall,
 
-	output logic done,
 	output fp    q
 );
 
-	pipelined_flow #(.STAGES(`FP_MUL_STAGES)) stages
-	(
-		.*
-	);
-
 `ifndef VERILATOR
 	ip_fp_mul ip_mul
 	(
-		.en(1),
+		.en(!stall), // hold the multiplier IP while the flow controller stalls
 		.areset(0),
 		.*
 	);
diff --git a/rtl/gfx/gfx.sv b/rtl/gfx/gfx.sv
index ec2fb13..233d285 100644
--- a/rtl/gfx/gfx.sv
+++ b/rtl/gfx/gfx.sv
@@ -22,6 +22,10 @@ module gfx
 
 	mat_mat_mul mul
 	(
+		.in_ready(), // left unconnected
+		.in_valid(start),
+		.out_ready(1), // the result is always accepted
+		.out_valid(done),
 		.*
 	);
 
diff --git a/rtl/gfx/gfx_defs.sv b/rtl/gfx/gfx_defs.sv
index d01822a..648f9e6 100644
--- a/rtl/gfx/gfx_defs.sv
+++ b/rtl/gfx/gfx_defs.sv
@@ -1,6 +1,7 @@
 `ifndef GFX_DEFS_SV
 `define GFX_DEFS_SV
 
+// This is architectural; it cannot be adjusted without changing other things
 `define FLOAT_BITS     16
 `define FLOATS_PER_VEC 4
 `define VECS_PER_MAT   4
@@ -10,7 +11,13 @@
 `define FP_MUL_STAGES 3 // ~119 LUTs ~1 bloque DSP
 
 typedef logic[`FLOAT_BITS - 1:0] fp;
+typedef fp                       vec2[2];
 typedef fp                       vec4[`FLOATS_PER_VEC];
 typedef vec4                     mat4[`VECS_PER_MAT];
 
+typedef logic[1:0] index4;
+
+`define INDEX4_MIN 2'b00
+`define INDEX4_MAX 2'b11
+
 `endif
diff --git a/rtl/gfx/horizontal_fold.sv b/rtl/gfx/horizontal_fold.sv
index 127150e..513e3b1 100644
--- a/rtl/gfx/horizontal_fold.sv
+++ b/rtl/gfx/horizontal_fold.sv
@@ -1,50 +1,44 @@
 `include "gfx/gfx_defs.sv"
 
-// Asume que N es una potencia de 2
 module horizontal_fold
-#(parameter N=1)
 (
-	input  logic clk,
-	             rst_n,
+	input  logic clk,
 
-	input  logic start,
-	input  fp    vec[N - 1:0],
+	input  vec4  vec,
+	input  logic stall,
+	             feedback,
+	             feedback_last,
 
-	output logic done,
-	output fp    q
+	output fp    q
 );
 
-	fp q_left, q_right;
-	logic halves_done;
+	vec2 feedback_vec, queued[`FP_ADD_STAGES]; // queued: shift register of operand pairs awaiting feedback
+
+	assign feedback_vec = queued[`FP_ADD_STAGES - 1];
+
+	fp_add add
+	(
+		.a(feedback ? q : vec[0]), // on feedback the adder output is summed with a queued operand
+		.b(feedback ? feedback_vec[feedback_last] : vec[1]),
+		.*
+	);
+
+	always_ff @(posedge clk)
+		if (!stall) begin
+			if (feedback)
+				queued[0] <= feedback_vec; // recirculate the pair for the next feedback round
+			else begin
+				queued[0][0] <= vec[2]; // vec[0] + vec[1] is issued now; vec[2] and vec[3] wait here
+				queued[0][1] <= vec[3];
+			end
+		end
+
 	genvar i;
 	generate
-		if (N > 1) begin
-			horizontal_fold #(.N(N / 2)) left
-			(
-				.q(q_left),
-				.vec(vec[N - 1:N / 2]),
-				.done(halves_done),
-				.*
-			);
-
-			horizontal_fold #(.N(N / 2)) right
-			(
-				.q(q_right),
-				.vec(vec[N / 2 - 1:0]),
-				.done(),
-				.*
-			);
-
-			fp_add fold
-			(
-				.a(q_left),
-				.b(q_right),
-				.start(halves_done),
-				.*
-			);
-		end else begin
-			assign q = vec[0];
-			assign done = start;
+		for (i = 1; i < `FP_ADD_STAGES; ++i) begin: stages // bound follows queued's size, not the vector width
+			always_ff @(posedge clk)
+				if (!stall)
+					queued[i] <= queued[i - 1];
 		end
 	endgenerate
 
diff --git a/rtl/gfx/mat_mat_mul.sv b/rtl/gfx/mat_mat_mul.sv
index aa5e769..cf4bd35 100644
--- a/rtl/gfx/mat_mat_mul.sv
+++ b/rtl/gfx/mat_mat_mul.sv
@@ -5,29 +5,67 @@ module mat_mat_mul
 	input  logic clk,
 	             rst_n,
 
-	input  logic start,
 	input  mat4  a,
 	             b,
+	input  logic in_valid,
+	             out_ready,
 
-	output logic done,
-	output mat4  q
+	output mat4  q,
+	output logic in_ready,
+	             out_valid
 );
 
-	logic dones[`VECS_PER_MAT];
+	mat4 a_hold, b_hold, q_hold, mul_b;
+	vec4 mul_q;
+	logic mul_in_ready, mul_in_valid, mul_out_ready, mul_out_valid;
+	index4 in_index, out_index;
 
-	assign done = dones[0];
+	assign in_ready = mul_in_ready && in_index == `INDEX4_MIN; // a new a/b pair is taken only at index 0
+	assign out_valid = mul_out_valid && out_index == `INDEX4_MAX; // q is complete when the last index emerges
 
-	genvar i;
-	generate
-		for (i = 0; i < `VECS_PER_MAT; ++i) begin: columns
-			mat_vec_mul column_i
-			(
-				.x(b[i]),
-				.q(q[i]),
-				.done(dones[i]),
-				.*
-			);
+	assign mul_in_valid = in_valid || in_index != `INDEX4_MIN; // indices 1..3 are issued from the held copies
+	assign mul_out_ready = out_ready || out_index != `INDEX4_MAX; // only the last result waits on the consumer
+
+	mat_vec_mul mul
+	(
+		.a(in_index == `INDEX4_MIN ? a : a_hold),
+		.x(mul_b[in_index]),
+		.q(mul_q),
+		.in_ready(mul_in_ready),
+		.in_valid(mul_in_valid),
+		.out_ready(mul_out_ready),
+		.out_valid(mul_out_valid),
+		.*
+	);
+
+	always_comb begin
+		mul_b = b_hold;
+		mul_b[0] = b[0]; // index 0 bypasses b_hold, which is loaded on the same clock edge
+
+		q = q_hold;
+		q[`VECS_PER_MAT - 1] = mul_q; // the last entry is forwarded directly from the multiplier
+	end
+
+	always_ff @(posedge clk or negedge rst_n)
+		if (!rst_n) begin
+			in_index <= `INDEX4_MIN;
+			out_index <= `INDEX4_MIN;
+		end else begin
+			if (mul_in_ready && mul_in_valid)
+				in_index <= in_index + 1; // 2-bit counter wraps back to 0 after 3
+
+			if (mul_out_ready && mul_out_valid)
+				out_index <= out_index + 1;
+		end
+
+	always_ff @(posedge clk) begin
+		if (in_ready) begin
+			a_hold <= a;
+			b_hold <= b;
 		end
-	endgenerate
+
+		if (mul_out_ready && mul_out_valid)
+			q_hold[out_index] <= mul_q;
+	end
 
 endmodule
diff --git a/rtl/gfx/mat_vec_mul.sv b/rtl/gfx/mat_vec_mul.sv
index 43860c9..9f5dcae 100644
--- a/rtl/gfx/mat_vec_mul.sv
+++ b/rtl/gfx/mat_vec_mul.sv
@@ -5,27 +5,42 @@ module mat_vec_mul
 	input  logic clk,
 	             rst_n,
 
-	input  logic start,
 	input  mat4  a,
 	input  vec4  x,
+	input  logic in_valid,
+	             out_ready,
 
-	output logic done,
-	output vec4  q
+	output vec4  q,
+	output logic in_ready,
+	             out_valid
 );
 
-	logic dones[`FLOATS_PER_VEC];
+	logic stall_mul, stall_fold, mul_ready, mul_valid, feedback, feedback_last;
 
-	assign done = dones[0];
+	pipeline_flow #(.STAGES(`FP_MUL_STAGES)) mul // handshake for the `FP_MUL_STAGES-deep stage; drives stall_mul
+	(
+		.stall(stall_mul),
+		.out_ready(mul_ready),
+		.out_valid(mul_valid),
+		.*
+	);
+
+	fold_flow fold // consumes the first stage's handshake; drives stall_fold and the feedback selects
+	(
+		.stall(stall_fold),
+		.in_ready(mul_ready),
+		.in_valid(mul_valid),
+		.*
+	);
 
 	genvar i;
 	generate
-		for (i = 0; i < `FLOATS_PER_VEC; ++i) begin: dots
+		for (i = 0; i < `VECS_PER_MAT; ++i) begin: dots // one dot product per entry of a
 			vec_dot dot_i
 			(
 				.a(a[i]),
 				.b(x),
 				.q(q[i]),
-				.done(dones[i]),
 				.*
 			);
 		end
diff --git a/rtl/gfx/pipeline_flow.sv b/rtl/gfx/pipeline_flow.sv
new file mode 100644
index 0000000..2b9c891
--- /dev/null
+++ b/rtl/gfx/pipeline_flow.sv
@@ -0,0 +1,40 @@
+`include "gfx/gfx_defs.sv"
+
+module pipeline_flow // Valid-bit shift register with a ready/valid handshake on both ends
+#(parameter STAGES=0) // must be overridden with a value >= 1: valid[STAGES - 1] is read below
+(
+	input  logic clk,
+	             rst_n,
+
+	input  logic in_valid,
+	             out_ready,
+
+	output logic in_ready,
+	             out_valid,
+	             stall
+);
+
+	logic valid[STAGES];
+
+	assign stall = !in_ready;
+	assign in_ready = out_ready || !out_valid; // advance unless a valid output is being refused
+	assign out_valid = valid[STAGES - 1];
+
+	always_ff @(posedge clk or negedge rst_n)
+		if (!rst_n)
+			valid[0] <= 0;
+		else if (in_ready)
+			valid[0] <= in_valid;
+
+	genvar i;
+	generate
+		for (i = 1; i < STAGES; ++i) begin: pipeline
+			always_ff @(posedge clk or negedge rst_n)
+				if (!rst_n)
+					valid[i] <= 0;
+				else if (in_ready) // every stage moves together; here in_ready is exactly !stall
+					valid[i] <= valid[i - 1];
+		end
+	endgenerate
+
+endmodule
diff --git a/rtl/gfx/pipelined_flow.sv b/rtl/gfx/pipelined_flow.sv
deleted file mode 100644
index 1e3c1ce..0000000
--- a/rtl/gfx/pipelined_flow.sv
+++ /dev/null
@@ -1,26 +0,0 @@
-module pipelined_flow
-#(parameter STAGES=0)
-(
-	input  logic clk,
-	             rst_n,
-
-	input  logic start,
-	output logic done
-);
-
-	logic valid[STAGES];
-
-	assign done = valid[STAGES - 1];
-
-	always_ff @(posedge clk or negedge rst_n)
-		valid[0] <= !rst_n ? 0 : start;
-
-	genvar i;
-	generate
-		for (i = 1; i < STAGES; ++i) begin: pipeline
-			always_ff @(posedge clk or negedge rst_n)
-				valid[i] <= !rst_n ? 0 : valid[i - 1];
-		end
-	endgenerate
-
-endmodule
diff --git a/rtl/gfx/vec_dot.sv b/rtl/gfx/vec_dot.sv
index d984504..4e1fdee 100644
--- a/rtl/gfx/vec_dot.sv
+++ b/rtl/gfx/vec_dot.sv
@@ -3,23 +3,24 @@
 module vec_dot
 (
 	input  logic clk,
-	             rst_n,
 
-	input  logic start,
+	input  logic stall_mul,
+	             stall_fold,
+	             feedback,
+	             feedback_last,
+
 	input  vec4  a,
 	             b,
 
-	output logic done,
 	output fp    q
 );
 
 	vec4 products;
-	logic dones[`FLOATS_PER_VEC];
 
-	horizontal_fold #(.N(`FLOATS_PER_VEC)) fold
+	horizontal_fold fold
 	(
-		.start(dones[0]),
 		.vec(products),
+		.stall(stall_fold), // the fold stage has its own stall, separate from stall_mul
 		.*
 	);
 
@@ -30,8 +31,8 @@ module vec_dot
 			(
 				.a(a[i]),
 				.b(b[i]),
-				.done(dones[i]),
 				.q(products[i]),
+				.stall(stall_mul),
 				.*
 			);
 		end
