summary refs log tree commit diff
path: root/rtl/gfx
diff options
context:
space:
mode:
author Alejandro Soto <alejandro@34project.org> 2023-11-05 21:35:16 -0600
committer Alejandro Soto <alejandro@34project.org> 2023-11-10 01:43:02 -0600
commit 5c982f38139cd1b0c5b590f67e99b1bcc1a32c9b (patch)
tree 8085b34356b79eac8f8a22bc0c484bddfd676b73 /rtl/gfx
parent d5de20fade70a0d454e3aa0087313ca715ff8759 (diff)
rtl/gfx: implement fixed-point FMA
Diffstat (limited to 'rtl/gfx')
-rw-r--r-- rtl/gfx/gfx_defs.sv 8
-rw-r--r-- rtl/gfx/gfx_fixed_fma.sv 72
-rw-r--r-- rtl/gfx/gfx_fixed_fma_dot.sv 48
3 files changed, 125 insertions, 3 deletions
diff --git a/rtl/gfx/gfx_defs.sv b/rtl/gfx/gfx_defs.sv
index 1d2bd68..45e1a63 100644
--- a/rtl/gfx/gfx_defs.sv
+++ b/rtl/gfx/gfx_defs.sv
@@ -14,7 +14,6 @@
typedef logic[`FLOAT_BITS - 1:0] fp;
typedef fp vec2[2];
typedef fp vec4[`FLOATS_PER_VEC];
-typedef fp[`FLOATS_PER_VEC - 1:0] vec4_packed;
typedef vec4 mat4[`VECS_PER_MAT];
`define FP_UNIT 16'h3c00
@@ -48,10 +47,13 @@ typedef struct packed
logic[9:0] r, g, b;
} rgb30;
-typedef logic signed[31:0] fixed;
-
`define FIXED_FRAC 16
+`define FIXED_FMA_STAGES 5
+`define FIXED_FMA_DOT_STAGES (2 * `FIXED_FMA_STAGES)
+
+typedef logic signed[31:0] fixed;
+
typedef struct packed
{
fixed x, y;
diff --git a/rtl/gfx/gfx_fixed_fma.sv b/rtl/gfx/gfx_fixed_fma.sv
new file mode 100644
index 0000000..e1dd6bb
--- /dev/null
+++ b/rtl/gfx/gfx_fixed_fma.sv
@@ -0,0 +1,72 @@
+`include "gfx/gfx_defs.sv"
+
+/* Fixed-point a * b + c, designed to close timing easily on Cyclone V,
+ * where the DSP blocks support 18x18. The stages are:
+ * - fma_ab: Products of the a_lo/hi with b_lo/hi permutations
+ * - fma_pp: Recombination into partial FMAs
+ * - fma_lo: Low half of the result and pre-carry high half
+ * - fma_hi: Post-carry high half
+ *
+ * Note that this takes exactly the same amount of work as a * b
+ * (see rtl/core/mul.sv in project 2 of arqui 1).
+ */
+module gfx_fixed_fma
+(
+ input logic clk,
+
+ input fixed a, // multiplicand
+ b, // multiplier
+ c, // addend
+ input logic stall, // while high, no register in this module updates
+
+ output fixed q // registered result, five register stages after a/b/c
+);
+
+ fixed a_ab, b_ab, c_ab, c_pp; // operands as registered for each stage
+ logic[1:0] carry; // carry out of the low-half sum
+ logic[16:0] lo_left, lo_right; // 17 bits: 16-bit sums plus their carry
+ logic[17:0] lo_with_carry; // {carry, lo}
+ logic[35:0] ab_ll, ab_lh, ab_hl, ab_hh; // 18x18 products of zero-extended halves
+
+ logic[15:0] a_lo, a_hi, b_lo, b_hi, ab_ll_hi, ab_ll_lo, ab_hl_hi, ab_hl_lo,
+ ab_lh_hi, ab_lh_lo, ab_hh_hi, ab_hh_lo, c_hi, c_lo, hi, hi_left, hi_right, lo;
+
+ assign {a_hi, a_lo} = a_ab; // split the registered operands into 16-bit halves
+ assign {b_hi, b_lo} = b_ab;
+ assign {c_hi, c_lo} = c_pp;
+
+ assign {ab_ll_hi, ab_ll_lo} = ab_ll[31:0]; // ab_ll_lo is never read
+ assign {ab_lh_hi, ab_lh_lo} = ab_lh[31:0];
+ assign {ab_hl_hi, ab_hl_lo} = ab_hl[31:0];
+ assign {ab_hh_hi, ab_hh_lo} = ab_hh[31:0]; // ab_hh_hi is never read
+
+ assign {carry, lo} = lo_with_carry;
+
+ always @(posedge clk)
+ if (!stall) begin
+ a_ab <= a; // stage 1: register the inputs
+ b_ab <= b;
+ c_ab <= c;
+
+ /* Since the operands are small (16 bits), this is not synthesized,
+ * but rather routed through the nearest DSP blocks
+ */
+ ab_ll <= {2'd0, a_lo} * {2'd0, b_lo}; // stage 2. NOTE(review): all four products
+ ab_lh <= {2'd0, a_lo} * {2'd0, b_hi}; // are unsigned (zero-extended halves) although
+ ab_hl <= {2'd0, a_hi} * {2'd0, b_lo}; // 'fixed' is declared signed; confirm that
+ ab_hh <= {2'd0, a_hi} * {2'd0, b_hi}; // negative operands behave as intended
+
+ c_pp <= c_ab; // delay c one more cycle so it lines up with the products
+
+ hi_left <= ab_hh_lo + ab_lh_hi; // stage 3: 16-bit sum, carry out is dropped
+ lo_left <= {1'd0, ab_lh_lo} + {1'd0, ab_hl_lo}; // 17-bit sum keeps its carry
+ hi_right <= ab_hl_hi + c_hi; // 16-bit sum, carry out is dropped
+ lo_right <= {1'd0, ab_ll_hi} + {1'd0, c_lo}; // 17-bit sum keeps its carry
+
+ hi <= hi_left + hi_right; // stage 4: high half before the low-half carry
+ lo_with_carry <= {1'd0, lo_left} + {1'd0, lo_right}; // low half plus 2-bit carry
+
+ q <= {hi + {14'd0, carry}, lo}; // stage 5: fold the carry into the high half
+ end
+
+endmodule
diff --git a/rtl/gfx/gfx_fixed_fma_dot.sv b/rtl/gfx/gfx_fixed_fma_dot.sv
new file mode 100644
index 0000000..2831d08
--- /dev/null
+++ b/rtl/gfx/gfx_fixed_fma_dot.sv
@@ -0,0 +1,48 @@
+`include "gfx/gfx_defs.sv"
+
+module gfx_fixed_fma_dot // q = a1 * b1 + (a0 * b0 + c), built from two chained gfx_fixed_fma
+(
+ input logic clk,
+
+ input fixed a0, // first product: a0 * b0
+ b0,
+ a1, // second product: a1 * b1
+ b1,
+ c, // addend fed to the first FMA
+ input logic stall, // while high, the hold registers here do not update
+
+ output fixed q // output of the second FMA
+);
+
+ fixed q0, a1_hold[`FIXED_FMA_STAGES], b1_hold[`FIXED_FMA_STAGES]; // q0: fma0 result; *_hold: a1/b1 delay lines
+
+ gfx_fixed_fma fma0 // a0 * b0 + c
+ (
+ .a(a0),
+ .b(b0),
+ .q(q0),
+ .* // implicitly connects clk, c and stall
+ );
+
+ gfx_fixed_fma fma1 // delayed a1 * delayed b1 + q0
+ (
+ .a(a1_hold[`FIXED_FMA_STAGES - 1]), // last element of the delay line
+ .b(b1_hold[`FIXED_FMA_STAGES - 1]),
+ .c(q0),
+ .* // implicitly connects clk, stall and q
+ );
+
+ integer i;
+
+ always_ff @(posedge clk) // shift registers delaying a1/b1; intended to match fma0's latency
+ if (!stall) begin
+ a1_hold[0] <= a1; // new values enter at index 0
+ b1_hold[0] <= b1;
+
+ for (i = 1; i < `FIXED_FMA_STAGES; ++i) begin
+ a1_hold[i] <= a1_hold[i - 1]; // each element takes its predecessor's value
+ b1_hold[i] <= b1_hold[i - 1];
+ end
+ end
+
+endmodule