diff options
| author | Alejandro Soto <alejandro@34project.org> | 2024-03-08 04:36:35 -0600 |
|---|---|---|
| committer | Alejandro Soto <alejandro@34project.org> | 2024-03-08 04:36:35 -0600 |
| commit | 9456d0f772502c4d9891f35cdc433da8332f55ea (patch) | |
| tree | ebbddd6f410183289ed36e53005f156df6d11474 /platform/wavelet3d | |
| parent | c8b633207f42b85480635573fd2a271b842c1260 (diff) | |
platform/wavelet3d: refactor fpint pipeline
Diffstat (limited to 'platform/wavelet3d')
| -rw-r--r-- | platform/wavelet3d/gfx_fpint.sv | 78 | ||||
| -rw-r--r-- | platform/wavelet3d/gfx_fpint_lane.sv | 790 | ||||
| -rw-r--r-- | platform/wavelet3d/gfx_pkg.sv | 164 | ||||
| -rw-r--r-- | platform/wavelet3d/main.cpp | 128 | ||||
| -rw-r--r-- | platform/wavelet3d/mod.mk | 4 |
5 files changed, 832 insertions, 332 deletions
diff --git a/platform/wavelet3d/gfx_fpint.sv b/platform/wavelet3d/gfx_fpint.sv
new file mode 100644
index 0000000..babc916
--- /dev/null
+++ b/platform/wavelet3d/gfx_fpint.sv
@@ -0,0 +1,78 @@
+module gfx_fpint
+(
+	input  logic     clk,
+
+	input  gfx::word a,
+	                 b,
+	input  logic     setup_mul_float,
+	                 setup_unit_b,
+	                 mnorm_put_hi,
+	                 mnorm_put_lo,
+	                 mnorm_put_mul,
+	                 mnorm_zero_b,
+	                 mnorm_zero_flags,
+	                 minmax_copy_flags,
+	                 shiftr_int_signed,
+	                 addsub_copy_flags,
+	                 addsub_int_operand,
+	                 clz_force_nop,
+	                 shiftl_copy_flags,
+	                 round_copy_flags,
+	                 round_enable,
+	                 encode_enable,
+
+	output gfx::word q
+);
+
+	import gfx::*;
+
+	fpint_op op, stage_op[FPINT_STAGES];
+
+	assign stage_op[0] = op;
+
+	assign op.setup_mul_float = setup_mul_float;
+	assign op.setup_unit_b = setup_unit_b;
+	assign op.mnorm_put_hi = mnorm_put_hi;
+	assign op.mnorm_put_lo = mnorm_put_lo;
+	assign op.mnorm_put_mul = mnorm_put_mul;
+	assign op.mnorm_zero_b = mnorm_zero_b;
+	assign op.mnorm_zero_flags = mnorm_zero_flags;
+	assign op.minmax_copy_flags = minmax_copy_flags;
+	assign op.shiftr_int_signed = shiftr_int_signed;
+	assign op.addsub_copy_flags = addsub_copy_flags;
+	assign op.addsub_int_operand = addsub_int_operand;
+	assign op.clz_force_nop = clz_force_nop;
+	assign op.shiftl_copy_flags = shiftl_copy_flags;
+	assign op.round_copy_flags = round_copy_flags;
+	assign op.round_enable = round_enable;
+	assign op.encode_enable = encode_enable;
+
+	gfx_fpint_lane lane
+	(
+		.clk(clk),
+		.a(a),
+		.b(b),
+		.q(q),
+		.mul_float_0(stage_op[0].setup_mul_float),
+		.unit_b_0(stage_op[0].setup_unit_b),
+		.put_hi_2(stage_op[2].mnorm_put_hi),
+		.put_lo_2(stage_op[2].mnorm_put_lo),
+		.put_mul_2(stage_op[2].mnorm_put_mul),
+		.zero_b_2(stage_op[2].mnorm_zero_b),
+		.zero_flags_2(stage_op[2].mnorm_zero_flags),
+		.copy_flags_3(stage_op[3].minmax_copy_flags),
+		.int_signed_5(stage_op[5].shiftr_int_signed),
+		.copy_flags_6(stage_op[6].addsub_copy_flags),
+		.int_operand_6(stage_op[6].addsub_int_operand),
+		.force_nop_7(stage_op[7].clz_force_nop),
+		.copy_flags_11(stage_op[11].shiftl_copy_flags),
+		.copy_flags_12(stage_op[12].round_copy_flags),
+		.enable_12(stage_op[12].round_enable),
+		.enable_14(stage_op[14].encode_enable)
+	);
+
+	always_ff @(posedge clk)
+		for (int i = 1; i < FPINT_STAGES; ++i)
+			stage_op[i] <= stage_op[i - 1];
+
+endmodule
diff --git a/platform/wavelet3d/gfx_fpint_lane.sv b/platform/wavelet3d/gfx_fpint_lane.sv
index 63d56e2..8cb77a8 100644
--- a/platform/wavelet3d/gfx_fpint_lane.sv
+++ b/platform/wavelet3d/gfx_fpint_lane.sv
@@ -1,28 +1,41 @@
+/* The 15 stages are:
+ * - setup
+ * - mulclass
+ * - mnorm
+ * - minmax
+ * - expdiff
+ * - shiftr
+ * - addsub
+ * - clz0-clz3
+ * - shiftl
+ * - round
+ * - rnorm
+ * - encode
+ */
 module gfx_fpint_lane
 (
-	input  logic      clk,
-
-	input  gfx::float a,
-	                  b,
-
-	input  logic      mul_float_m1,
-	                  unit_b_m1,
-	                  float_a_1,
-	                  int_hi_a_1,
-	                  int_lo_a_1,
-	                  zero_flags_1,
-	                  zero_b_1,
-	                  copy_flags_2,
-	                  int_signed_4,
-	                  copy_flags_5,
-	                  int_operand_5,
-	                  enable_norm_6,
-	                  copy_flags_10,
-	                  copy_flags_11,
-	                  enable_round_11,
-	                  encode_special_13,
-
-	output gfx::float q
+	input  logic     clk,
+
+	input  gfx::word a,
+	                 b,
+	input  logic     mul_float_0,
+	                 unit_b_0,
+	                 put_hi_2,
+	                 put_lo_2,
+	                 put_mul_2,
+	                 zero_b_2,
+	                 zero_flags_2,
+	                 copy_flags_3,
+	                 int_signed_5,
+	                 copy_flags_6,
+	                 int_operand_6,
+	                 force_nop_7,
+	                 copy_flags_11,
+	                 copy_flags_12,
+	                 enable_12,
+	                 enable_14,
+
+	output gfx::word q
 );
 
 	import gfx::*;
@@ -69,169 +82,284 @@ module gfx_fpint_lane
 	 * el exponente.
 	 */
 
-	logic exp_step, guard_0, guard_1, guard_2, guard_3, guard_4, guard_5, guard_10,
-	      int_sign, lo_msb, lo_reduce, overflow_0, overflow_1, overflow_10, overflow_12,
-	      round_0, round_1, round_2, round_3, round_4, round_5, round_10, sign_0,
-	      sign_10, sign_11, sign_12, slow_1, slow_2, slow_3, slow_4, slow_5, slow_10,
-	      slow_11, slow_12, slow_in_1, slow_in_next, slow_out, sticky_1, sticky_2,
-	      sticky_3, sticky_4, sticky_5, sticky_10, sticky_last, zero_1, zero_2, zero_3,
-	      zero_4, zero_5, zero_10, zero_11, zero_12;
+	fpint_setup_mulclass setup_mulclass;
+	fpint_mulclass_mnorm mulclass_mnorm;
+	fpint_mnorm_minmax mnorm_minmax;
+	fpint_minmax_expdiff minmax_expdiff;
+	fpint_expdiff_shiftr expdiff_shiftr;
+	fpint_shiftr_addsub shiftr_addsub;
+	fpint_addsub_clz addsub_clz;
+	fpint_clz_shiftl clz_shiftl;
+	fpint_shiftl_round shiftl_round;
+	fpint_round_rnorm round_rnorm;
+	fpint_rnorm_encode rnorm_encode;
+
+	gfx_fpint_lane_setup stage_0
+	(
+		.clk(clk),
+		.a(a),
+		.b(b),
+		.out(setup_mulclass),
+		.unit_b(unit_b_0),
+		.mul_float(mul_float_0)
+	);
 
-	float a_add, a_m1, a_mul, b_add, b_0, b_m1, b_mul,
-	      max_2, max_3, max_4, max_5, min_2, min_3, min_4;
+	gfx_fpint_lane_mulclass stage_1
+	(
+		.clk(clk),
+		.in(setup_mulclass),
+		.out(mulclass_mnorm)
+	);
 
-	float_class a_class_0, a_class_1, b_class_0, b_class_1,
-	            max_class_2, max_class_3, min_class_2, min_class_3, min_class_4;
+	gfx_fpint_lane_mnorm stage_2
+	(
+		.clk(clk),
+		.in(mulclass_mnorm),
+		.out(mnorm_minmax),
+		.put_hi(put_hi_2),
+		.put_lo(put_lo_2),
+		.put_mul(put_mul_2),
+		.zero_b(zero_b_2),
+		.zero_flags(zero_flags_2)
+	);
 
-	word add_sub, clz_in, normalized, product_hi, product_lo;
-	dword product;
-	float_exp exp, exp_11, exp_10, exp_12, exp_delta;
-	float_mant mant_10, mant_11, mant_12;
-	float_mant_full hi;
-	logic[$bits(float_mant_full) - 3:0] lo;
+	gfx_fpint_lane_minmax stage_3
+	(
+		.clk(clk),
+		.in(mnorm_minmax),
+		.out(minmax_expdiff),
+		.copy_flags(copy_flags_3)
+	);
 
-	typedef logic[$bits(float_mant_full) + 1:0] extended_mant;
-	localparam bit[$clog2($bits(extended_mant)):0] MAX_SHIFT = 1 << $clog2($bits(extended_mant));
+	gfx_fpint_lane_expdiff stage_4
+	(
+		.clk(clk),
+		.in(minmax_expdiff),
+		.out(expdiff_shiftr)
+	);
 
-	extended_mant max_mant, min_mant, sticky_mask;
-	logic[$clog2(MAX_SHIFT):0] clz_shift, exp_shift;
+	gfx_fpint_lane_shiftr stage_5
+	(
+		.clk(clk),
+		.in(expdiff_shiftr),
+		.out(shiftr_addsub),
+		.int_signed(int_signed_5)
+	);
 
-	localparam int INT_SHIFT_REF = $bits(word) - 2;
-	localparam int SHIFT_WIDTH = {{($bits(int) - $bits(MAX_SHIFT)){1'b0}}, MAX_SHIFT};
-	localparam int CLZ_EXTEND_BITS = $bits(float_exp) - $bits(clz_shift) + 1;
+	gfx_fpint_lane_addsub stage_6
+	(
+		.clk(clk),
+		.in(shiftr_addsub),
+		.out(addsub_clz),
+		.copy_flags(copy_flags_6),
+		.int_operand(int_operand_6)
+	);
 
-	struct packed
-	{
-		float max;
-		logic guard,
-		      round,
-		      slow,
-		      sticky,
-		      zero;
-		word  add_sub;
-	} clz_hold[FADD_CLZ_STAGES], clz_hold_out;
+	gfx_fpint_lane_clz stage_7_8_9_10
+	(
+		.clk(clk),
+		.in(addsub_clz),
+		.out(clz_shiftl),
+		.force_nop(force_nop_7)
+	);
 
-	gfx_clz #($bits(word)) clz
+	gfx_fpint_lane_shiftl stage_11
 	(
 		.clk(clk),
-		.clz(clz_shift),
-		.value(clz_in)
+		.in(clz_shiftl),
+		.out(shiftl_round),
+		.copy_flags(copy_flags_11)
 	);
 
-	function extended_mant extend_min_max(float in, float_class in_class);
-		extend_min_max = {~in_class.exp_min, in.mant, 2'b00};
-	endfunction
+	gfx_fpint_lane_round stage_12
+	(
+		.clk(clk),
+		.in(shiftl_round),
+		.out(round_rnorm),
+		.enable(enable_12),
+		.copy_flags(copy_flags_12)
+	);
 
-	function word fp_add_sub_arg(extended_mant arg);
-		fp_add_sub_arg = {1'b0, arg, {($bits(fp_add_sub_arg) - $bits(arg) - 1){1'b0}}};
-	endfunction
+	gfx_fpint_lane_rnorm stage_13
+	(
+		.clk(clk),
+		.in(round_rnorm),
+		.out(rnorm_encode)
+	);
 
-	assign lo_msb = lo[$bits(lo) - 1];
-	assign slow_out = &exp_12 || slow_12 || overflow_12;
-	assign exp_delta = max_2.exp - min_2.exp;
-	assign lo_reduce = |lo[$bits(lo) - 2:0];
-	assign normalized = clz_hold_out.add_sub << clz_shift;
-	assign clz_hold_out = clz_hold[FADD_CLZ_STAGES - 1];
-	assign slow_in_next = is_float_special(a_class_0) | is_float_special(b_class_0);
-	assign {product_hi, product_lo} = product;
-	assign {hi, guard_0, round_0, lo} = product[2 * $bits(float_mant_full) - 1:0];
+	gfx_fpint_lane_encode stage_14
+	(
+		.clk(clk),
+		.q(q),
+		.in(rnorm_encode),
+		.enable(enable_14)
+	);
 
-	always_comb begin
-		clz_in = add_sub;
-		if (~enable_norm_6)
-			clz_in[$bits(clz_in) - 1:$bits(clz_in) - 2] = 2'b01;
-	end
+endmodule
 
-	always_ff @(posedge clk) begin
-		// Stage -1:
+// Stage 0: mul arguments
+module gfx_fpint_lane_setup
+(
+	input  logic                     clk,
 
-		a_m1 <= a;
-		b_m1 <= b;
-		a_mul <= a;
-		b_mul <= b;
+	input  gfx::word                 a,
+	                                 b,
+	input  logic                     mul_float,
+	                                 unit_b,
+
+	output gfx::fpint_setup_mulclass out
+);
+
+	always_ff @(posedge clk) begin
+		out.a <= a;
+		out.b <= b;
+		out.a_mul <= a;
+		out.b_mul <= b;
 
 		/* Nótese que el orden es sign-exp-mant. Esto coloca el '1.' implícito
 		 * en la posición correcta para multiplicar las mantisas.
 		 */
-		if (mul_float_m1) begin
-			a_mul.exp <= 1;
-			b_mul.exp <= 1;
-			a_mul.sign <= 0;
-			b_mul.sign <= 0;
+		if (mul_float) begin
+			out.a_mul.exp <= 1;
+			out.b_mul.exp <= 1;
+			out.a_mul.sign <= 0;
+			out.b_mul.sign <= 0;
 		end
 
-		if (unit_b_m1) begin
-			b_mul.exp <= 0;
-			b_mul.mant <= 1;
-			b_mul.sign <= 0;
+		if (unit_b) begin
+			out.b_mul.exp <= 0;
+			out.b_mul.mant <= 1;
+			out.b_mul.sign <= 0;
 		end
+	end
+
+endmodule
+
+// Stage 1: fp or integer multiplication
+module gfx_fpint_lane_mulclass
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_setup_mulclass in,
+
+	output gfx::fpint_mulclass_mnorm out
+);
+
+	import gfx::*;
+
+	always_ff @(posedge clk) begin
+		out.b <= in.b;
+		out.sign <= in.a.sign ^ in.b.sign;
+		out.a_class <= classify_float(in.a);
+		out.b_class <= classify_float(in.b);
+		out.product <= in.a_mul * in.b_mul;
+		{out.overflow, out.exp} <= {1'b0, in.a.exp} + {1'b0, in.b.exp} - {1'b0, FLOAT_EXP_BIAS};
+	end
+
+endmodule
 
-		// Stage 0: multiplicación de fp o enteros
+// Stage 2: normalization
+module gfx_fpint_lane_mnorm
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_mulclass_mnorm in,
+	input  logic                     put_hi,
+	                                 put_lo,
+	                                 put_mul,
+	                                 zero_b,
+	                                 zero_flags,
+
+	output gfx::fpint_mnorm_minmax   out
+);
+
+	import gfx::*;
 
-		b_0 <= b_m1;
-		sign_0 <= a_m1.sign ^ b_m1.sign;
-		product <= a_mul * b_mul;
-		a_class_0 <= classify_float(a_m1);
-		b_class_0 <= classify_float(b_m1);
-		{overflow_0, exp} <= {1'b0, a_m1.exp} + {1'b0, b_m1.exp} - {1'b0, FLOAT_EXP_BIAS};
+	word product_hi, product_lo;
+	logic guard, lo_msb, lo_reduce, round, slow_in_next;
+	float_mant_full hi;
+	logic[$bits(float_mant_full) - 3:0] lo;
 
-		// Stage 1: normalización
+	assign lo_msb = lo[$bits(lo) - 1];
+	assign lo_reduce = |lo[$bits(lo) - 2:0];
+	assign slow_in_next = is_float_special(in.a_class) | is_float_special(in.b_class);
+	assign {product_hi, product_lo} = in.product;
+	assign {hi, guard, round, lo} = in.product[2 * $bits(float_mant_full) - 1:0];
 
-		if (float_a_1) begin
-			slow_1 <= slow_in_next | (overflow_0 & ~a_class_0.exp_min & ~a_class_1.exp_min);
-			zero_1 <= a_class_0.exp_min | b_class_0.exp_min;
+	always_ff @(posedge clk) begin
+		if (put_mul) begin
+			out.slow <= slow_in_next | (in.overflow & ~in.a_class.exp_min & ~in.b_class.exp_min); // overflow is ignored when either operand has exp_min (both are checked for out.zero too)
+			out.zero <= in.a_class.exp_min | in.b_class.exp_min;
 		end else begin
-			slow_1 <= 0;
-			zero_1 <= 0;
+			out.slow <= 0;
+			out.zero <= 0;
 		end
 
-		overflow_1 <= 0;
-		a_add.sign <= sign_0;
+		out.a.sign <= in.sign;
+		out.overflow <= 0;
 
 		if (hi[$bits(hi) - 1]) begin
-			guard_1 <= guard_0;
-			round_1 <= round_0;
-			sticky_1 <= lo_msb | lo_reduce;
-			a_add.mant <= implicit_mant(hi);
-			{overflow_1, a_add.exp} <= {1'b0, exp} + 1;
+			out.guard <= guard;
+			out.round <= round;
+			out.sticky <= lo_msb | lo_reduce;
+			out.a.mant <= implicit_mant(hi);
+			{out.overflow, out.a.exp} <= {1'b0, in.exp} + 1;
 		end else begin
 			/* Bit antes de msb es necesariamente 1, ya que los msb de
 			 * ambos multiplicandos son 1. Ver assert en implicit_mant().
 			 */
-			guard_1 <= round_0;
-			round_1 <= lo[$bits(lo) - 1];
-			sticky_1 <= lo_reduce;
-			a_add.exp <= exp;
-			a_add.mant <= implicit_mant({hi[$bits(hi) - 2:0], guard_0});
+			out.guard <= round;
+			out.round <= lo_msb;
+			out.sticky <= lo_reduce;
+
+			out.a.exp <= in.exp;
+			out.a.mant <= implicit_mant({hi[$bits(hi) - 2:0], guard});
 		end
 
 		unique case (1'b1)
-			float_a_1: ;
+			put_mul: ;
 
-			int_hi_a_1:
-				a_add <= product_hi;
+			put_hi:
+				out.a <= product_hi;
 
-			int_lo_a_1:
-				a_add <= product_lo;
+			put_lo:
+				out.a <= product_lo;
 		endcase
 
-		a_class_1 <= a_class_0;
-		slow_in_1 <= slow_in_next;
+		out.a_class <= in.a_class;
+		out.slow_in <= slow_in_next;
 
-		if (zero_flags_1) begin
-			a_class_1 <= classify_float(0);
-			slow_in_1 <= 0;
+		if (zero_flags) begin
+			out.a_class <= classify_float(0);
+			out.slow_in <= 0;
 		end
 
-		if (zero_b_1) begin
-			b_add <= 0;
-			b_class_1 <= classify_float(0);
+		if (zero_b) begin
+			out.b <= 0;
+			out.b_class <= classify_float(0);
 		end else begin
-			b_add <= b_0;
-			b_class_1 <= b_class_0;
+			out.b <= in.b;
+			out.b_class <= in.b_class;
 		end
+	end
+
+endmodule
+
+// Stage 3: sort such that abs(max) >= abs(min)
+module gfx_fpint_lane_minmax
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_mnorm_minmax   in,
+	input  logic                     copy_flags,
 
-		/* Stage 2: ordenar tal que abs(max) >= abs(min). Wiki dice:
+	output gfx::fpint_minmax_expdiff out
+);
+
+	import gfx::*;
+
+	always_ff @(posedge clk) begin
+		/* Wiki says:
 		 *
 		 * A property of the single- and double-precision formats is that
 		 * their encoding allows one to easily sort them without using
@@ -241,164 +369,308 @@ module gfx_fpint_lane
 		 * floating-point representation also had this property for normalized
 		 * numbers).
 		 */
-		if ({b_add.exp, b_add.mant} > {a_add.exp, a_add.mant}) begin
-			max_2 <= b_add;
-			min_2 <= a_add;
-			max_class_2 <= b_class_1;
-			min_class_2 <= a_class_1;
+		if ({in.b.exp, in.b.mant} > {in.a.exp, in.a.mant}) begin
+			out.max <= in.b;
+			out.min <= in.a;
+			out.max_class <= in.b_class;
+			out.min_class <= in.a_class;
 		end else begin
-			max_2 <= a_add;
-			min_2 <= b_add;
-			max_class_2 <= a_class_1;
-			min_class_2 <= b_class_1;
+			out.max <= in.a;
+			out.min <= in.b;
+			out.max_class <= in.a_class;
+			out.min_class <= in.b_class;
 		end
 
-		guard_2 <= guard_1;
-		round_2 <= round_1;
-		sticky_2 <= sticky_1;
+		out.guard <= in.guard;
+		out.round <= in.round;
+		out.sticky <= in.sticky;
 
-		if (copy_flags_2) begin
-			slow_2 <= slow_1 | overflow_1;
-			zero_2 <= zero_1;
+		if (copy_flags) begin
+			out.slow <= in.slow | in.overflow;
+			out.zero <= in.zero;
 		end else begin
-			slow_2 <= slow_in_1;
-			zero_2 <= 0;
+			out.slow <= in.slow_in;
+			out.zero <= 0;
 		end
+	end
+
+endmodule
+
+// Stage 4: exp_shift amount
+module gfx_fpint_lane_expdiff
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_minmax_expdiff in,
+
+	output gfx::fpint_expdiff_shiftr out
+);
+
+	import gfx::*;
+
+	float_exp exp_delta;
+
+	assign exp_delta = in.max.exp - in.min.exp;
+
+	always_ff @(posedge clk) begin
+		out.max <= in.max;
+		out.min <= in.min;
+		out.slow <= in.slow;
+		out.zero <= in.zero;
+		out.guard <= in.guard;
+		out.round <= in.round;
+		out.sticky <= in.sticky;
+		out.max_class <= in.max_class;
+		out.min_class <= in.min_class;
+
+		out.exp_shift <= exp_delta[$bits(out.exp_shift) - 1:0];
+		if (exp_delta > {{($bits(exp_delta) - $bits(FPINT_MAX_SHIFT)){1'b0}}, FPINT_MAX_SHIFT})
+			out.exp_shift <= FPINT_MAX_SHIFT;
+	end
+
+endmodule
+
+// Stage 5: shifts and abs(max) for signed integers
+module gfx_fpint_lane_shiftr
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_expdiff_shiftr in,
+	input  logic                     int_signed,
+
+	output gfx::fpint_shiftr_addsub  out
+);
+
+	import gfx::*;
+
+	always_ff @(posedge clk) begin
+		out.min <= in.min;
+		out.slow <= in.slow;
+		out.zero <= in.zero;
+		out.guard <= in.guard;
+		out.round <= in.round;
+		out.sticky <= in.sticky;
+		out.min_class <= in.min_class;
+
+		out.max_mant <= float_prepare_round(in.max, in.max_class);
+		out.min_mant <= float_prepare_round(in.min, in.min_class) >> in.exp_shift;
+		out.sticky_mask <= {($bits(out.min_mant)){1'b1}} << in.exp_shift;
+
+		out.max <= in.max;
+		out.int_sign <= in.max[$bits(in.max) - 1];
+
+		if (int_signed & in.max[$bits(in.max) - 1])
+			out.max <= -in.max;
+	end
+
+endmodule
+
+// Stage 6: mantissa addition
+module gfx_fpint_lane_addsub
+(
+	input  logic                    clk,
+
+	input  gfx::fpint_shiftr_addsub in,
+	input  logic                    copy_flags,
+	                                int_operand,
 
-		// Stage 3: exp_shift amount
-
-		max_3 <= max_2;
-		min_3 <= min_2;
-		slow_3 <= slow_2;
-		zero_3 <= zero_2;
-		guard_3 <= guard_2;
-		round_3 <= round_2;
-		sticky_3 <= sticky_2;
-		max_class_3 <= max_class_2;
-		min_class_3 <= min_class_2;
-
-		exp_shift <= exp_delta[$bits(exp_shift) - 1:0];
-		if (exp_delta > {{($bits(exp_delta) - $bits(MAX_SHIFT)){1'b0}}, MAX_SHIFT})
-			exp_shift <= MAX_SHIFT;
-
-		// Stage 4: shifts y abs(max) para enteros con signo
-
-		min_4 <= min_3;
-		slow_4 <= slow_3;
-		zero_4 <= zero_3;
-		guard_4 <= guard_3;
-		round_4 <= round_3;
-		sticky_4 <= sticky_3;
-		min_class_4 <= min_class_3;
-
-		max_mant <= extend_min_max(max_3, max_class_3);
-		min_mant <= extend_min_max(min_3, min_class_3) >> exp_shift;
-		sticky_mask <= {($bits(min_mant)){1'b1}} << exp_shift;
-
-		max_4 <= max_3;
-		int_sign <= max_3[$bits(max_3) - 1];
-
-		if (int_signed_4 & max_3[$bits(max_3) - 1])
-			max_4 <= -max_3;
-
-		// Stage 5: suma de mantisas
-
-		max_5 <= max_4;
-		slow_5 <= slow_4;
-		zero_5 <= zero_4;
-		guard_5 <= guard_4;
-		round_5 <= round_4;
-
-		if (int_operand_5) begin
-			max_5.exp <= FLOAT_EXP_BIAS + INT_SHIFT_REF[$bits(float_exp) - 1:0];
-			max_5.sign <= int_sign;
+	output gfx::fpint_addsub_clz    out
+);
+
+	import gfx::*;
+
+	localparam int INT_SHIFT_REF = $bits(word) - 2;
+
+	function word fp_add_sub_arg(float_mant_ext arg);
+		fp_add_sub_arg = {1'b0, arg, {($bits(fp_add_sub_arg) - $bits(arg) - 1){1'b0}}};
+	endfunction
+
+	always_ff @(posedge clk) begin
+		out.max <= in.max;
+		out.slow <= in.slow;
+		out.zero <= in.zero;
+		out.guard <= in.guard;
+		out.round <= in.round;
+
+		if (int_operand) begin
+			out.max.exp <= FLOAT_EXP_BIAS + INT_SHIFT_REF[$bits(float_exp) - 1:0];
+			out.max.sign <= in.int_sign;
 		end
 
-		if (copy_flags_5)
-			sticky_5 <= sticky_4;
+		if (copy_flags)
+			out.sticky <= in.sticky;
 		else
-			sticky_5 <= |(extend_min_max(min_4, min_class_4) & ~sticky_mask);
+			out.sticky <= |(float_prepare_round(in.min, in.min_class) & ~in.sticky_mask);
 
-		if (int_operand_5)
-			add_sub <= max_4;
-		else if (max_4.sign ^ min_4.sign)
-			add_sub <= fp_add_sub_arg(max_mant) - fp_add_sub_arg(min_mant);
+		if (int_operand)
+			out.add_sub <= in.max;
+		else if (in.max.sign ^ in.min.sign)
+			out.add_sub <= fp_add_sub_arg(in.max_mant) - fp_add_sub_arg(in.min_mant);
 		else
-			add_sub <= fp_add_sub_arg(max_mant) + fp_add_sub_arg(min_mant);
+			out.add_sub <= fp_add_sub_arg(in.max_mant) + fp_add_sub_arg(in.min_mant);
+	end
+
+endmodule
 
-		// Stages 6-9: clz
+// Stages 7-10: find the most significant 1
+module gfx_fpint_lane_clz
+(
+	input  logic                 clk,
+
+	input  gfx::fpint_addsub_clz in,
+	input  logic                 force_nop,
+
+	output gfx::fpint_clz_shiftl out
+);
 
-		clz_hold[0].max <= max_5;
-		clz_hold[0].slow <= slow_5;
-		clz_hold[0].zero <= zero_5;
-		clz_hold[0].guard <= guard_5;
-		clz_hold[0].round <= round_5;
-		clz_hold[0].sticky <= sticky_5;
-		clz_hold[0].add_sub <= add_sub;
+	import gfx::*;
 
-		for (int i = 1; i < FADD_CLZ_STAGES; ++i)
-			clz_hold[i] <= clz_hold[i - 1];
+	word clz_in;
+	fpint_clz_hold hold[FPINT_CLZ_STAGES];
 
-		// Stage 10: normalización
+	assign out.hold = hold[FPINT_CLZ_STAGES - 1];
 
-		sign_10 <= clz_hold_out.max.sign;
-		slow_10 <= clz_hold_out.slow;
-		zero_10 <= clz_hold_out.zero;
-		sticky_10 <= clz_hold_out.sticky;
+	gfx_clz #($bits(word)) clz
+	(
+		.clk(clk),
+		.clz(out.shift),
+		.value(clz_in)
+	);
 
-		{mant_10, guard_10, round_10, sticky_last} <=
+	always_comb begin
+		clz_in = in.add_sub;
+		if (force_nop)
+			clz_in[$bits(clz_in) - 1:$bits(clz_in) - 2] = 2'b01;
+	end
+
+	always_ff @(posedge clk) begin
+		hold[0] <= in;
+
+		for (int i = 1; i < FPINT_CLZ_STAGES; ++i)
+			hold[i] <= hold[i - 1];
+	end
+
+endmodule
+
+// Stage 11: normalization
+module gfx_fpint_lane_shiftl
+(
+	input  logic                   clk,
+
+	input  gfx::fpint_clz_shiftl   in,
+	input  logic                   copy_flags,
+
+	output gfx::fpint_shiftl_round out
+);
+
+	import gfx::*;
+
+	localparam int CLZ_EXTEND_BITS = $bits(float_exp) - $bits(in.shift) + 1;
+
+	word normalized;
+
+	assign normalized = in.hold.add_sub << in.shift;
+
+	always_ff @(posedge clk) begin
+		out.slow <= in.hold.slow;
+		out.zero <= in.hold.zero;
+		out.sticky <= in.hold.sticky;
+		out.val.sign <= in.hold.max.sign;
+
+		{out.val.mant, out.guard, out.round, out.sticky_last} <=
 			normalized[$bits(normalized) - 2:$bits(normalized) - $bits(float_mant) - 4];
 
-		{overflow_10, exp_10} <=
-			{1'b0, clz_hold_out.max.exp} - {{CLZ_EXTEND_BITS{1'b0}}, clz_shift} + 1;
+		{out.overflow, out.val.exp} <=
+			{1'b0, in.hold.max.exp} - {{CLZ_EXTEND_BITS{1'b0}}, in.shift} + 1;
 
-		if (clz_shift[$bits(clz_shift) - 1])
-			zero_10 <= 1;
+		if (in.shift[$bits(in.shift) - 1])
+			out.zero <= 1;
 
-		if (copy_flags_10) begin
-			guard_10 <= clz_hold_out.guard;
-			round_10 <= clz_hold_out.round;
-			sticky_last <= 0;
-			overflow_10 <= 0;
+		if (copy_flags) begin
+			out.guard <= in.hold.guard;
+			out.round <= in.hold.round;
+			out.overflow <= 0;
+			out.sticky_last <= 0;
 		end
+	end
+
+endmodule
+
+// Stage 12: rounding
+module gfx_fpint_lane_round
+(
+	input  logic                   clk,
+
+	input  gfx::fpint_shiftl_round in,
+	input  logic                   copy_flags,
+	                               enable,
+
+	output gfx::fpint_round_rnorm  out
+);
+
+	import gfx::*;
 
-		// Stage 11: redondeo
+	always_ff @(posedge clk) begin
+		out.val <= in.val;
+		out.slow <= in.slow | (~copy_flags & in.overflow & ~in.zero);
+		out.zero <= in.zero;
+		out.exp_step <= 0;
+
+		// This is the most usual rounding mode: round to nearest, ties to even
+		if (enable & in.guard & (in.round | in.sticky | in.sticky_last | in.val.mant[0]))
+			{out.exp_step, out.val.mant} <= {1'b0, in.val.mant} + 1; // increment this stage's input, not the previous contents of the output register
+	end
+
+endmodule
 
-		exp_11 <= exp_10;
-		mant_11 <= mant_10;
-		sign_11 <= sign_10;
-		slow_11 <= slow_10 | (~copy_flags_11 & overflow_10 & ~zero_10);
-		zero_11 <= zero_10;
-		exp_step <= 0;
+// Stage 13: exponent adjustment due to rounding
+module gfx_fpint_lane_rnorm
+(
+	input  logic                   clk,
 
-		// Este es el modo más común: round to nearest, ties to even
-		if (enable_round_11 & guard_10 & (round_10 | sticky_10 | sticky_last | mant_10[0]))
-			{exp_step, mant_11} <= {1'b0, mant_10} + 1;
+	input  gfx::fpint_round_rnorm  in,
 
-		// Stage 12: ajuste de exponente por redondeo
+	output gfx::fpint_rnorm_encode out
+);
 
-		sign_12 <= sign_11;
-		slow_12 <= slow_11;
-		zero_12 <= zero_11;
-		mant_12 <= mant_11;
-		overflow_12 <= 0;
+	import gfx::*;
 
-		if (exp_step)
-			{overflow_12, exp_12} <= {1'b0, exp_11} + 1;
+	always_ff @(posedge clk) begin
+		out.slow <= in.slow;
+		out.zero <= in.zero;
+		out.overflow <= 0;
+		out.val.mant <= in.val.mant;
+		out.val.sign <= in.val.sign;
+
+		if (in.exp_step)
+			{out.overflow, out.val.exp} <= {1'b0, in.val.exp} + 1;
 		else
-			exp_12 <= exp_11;
+			out.val.exp <= in.val.exp;
+	end
+
+endmodule
+
+// Stage 14: output and encoding of zeros and NaNs
+module gfx_fpint_lane_encode
+(
+	input  logic                   clk,
+
+	input  gfx::fpint_rnorm_encode in,
+	input  logic                   enable,
+
+	output gfx::float              q
+);
 
-		// Stage 13: ceros y NaNs
+	import gfx::*;
 
-		q.exp <= exp_12;
-		q.mant <= mant_12;
-		q.sign <= sign_12;
+	always_ff @(posedge clk) begin
+		q <= in.val;
 
-		if (encode_special_13) begin
-			if (slow_out) begin
+		if (enable) begin
+			if (&in.val.exp | in.slow | in.overflow) begin
 				q.exp <= FLOAT_EXP_MAX;
 				q.mant <= 1;
-			end else if (zero_12) begin
+			end else if (in.zero) begin
 				q.exp <= 0;
 				q.mant <= 0;
 			end
diff --git a/platform/wavelet3d/gfx_pkg.sv b/platform/wavelet3d/gfx_pkg.sv
index cfab6a5..5c420cc 100644
--- a/platform/wavelet3d/gfx_pkg.sv
+++ b/platform/wavelet3d/gfx_pkg.sv
@@ -5,7 +5,8 @@ package gfx;
 
 	typedef logic[7:0] float_exp;
 	typedef logic[$bits(word) - $bits(float_exp) - 2:0] float_mant;
-	typedef logic[$bits(float_mant):0] float_mant_full; // Incluye '1.' explícito
+	typedef logic[$bits(float_mant):0] float_mant_full; // Includes the explicit '1.'
+	typedef logic[$bits(float_mant_full) + 1:0] float_mant_ext; // Accounts for overflow
 
 	localparam float_exp FLOAT_EXP_BIAS = (1 << ($bits(float_exp) - 1)) - 1;
 	localparam float_exp FLOAT_EXP_MAX = {($bits(float_exp)){1'b1}};
@@ -56,14 +57,163 @@ package gfx;
 		is_float_special = in.exp_max | (in.exp_min & ~in.mant_zero);
 	endfunction
 
-	/* -> 4,4,4,4,4,4,4,4 -> 8,8,8,8 -> 16,16 -> 32
-	 */
-	localparam FADD_CLZ_STAGES = 4;
+	function float_mant_ext float_prepare_round(float in, float_class in_class);
+		float_prepare_round = {~in_class.exp_min, in.mant, 2'b00};
+	endfunction
+
+	// -> 4,4,4,4,4,4,4,4 -> 8,8,8,8 -> 16,16 -> 32
+	localparam int FPINT_CLZ_STAGES = 4;
+	localparam int FPINT_STAGES = 7 + FPINT_CLZ_STAGES + 4;
+
+	localparam bit[$clog2($bits(float_mant_ext)):0] FPINT_MAX_SHIFT
+		= 1 << $clog2($bits(float_mant_ext));
+
+	typedef logic[$clog2(FPINT_MAX_SHIFT):0] fpint_shift;
+
+	typedef struct packed
+	{
+		logic setup_mul_float,
+		      setup_unit_b,
+		      mnorm_put_hi,
+		      mnorm_put_lo,
+		      mnorm_put_mul,
+		      mnorm_zero_b,
+		      mnorm_zero_flags,
+		      minmax_copy_flags,
+		      shiftr_int_signed,
+		      addsub_copy_flags,
+		      addsub_int_operand,
+		      clz_force_nop,
+		      shiftl_copy_flags,
+		      round_copy_flags,
+		      round_enable,
+		      encode_enable;
+	} fpint_op;
+
+	typedef struct packed
+	{
+		float a,
+		      b,
+		      a_mul,
+		      b_mul;
+	} fpint_setup_mulclass;
+
+	typedef struct packed
+	{
+		float       b;
+		float_exp   exp;
+		float_class a_class,
+		            b_class;
+		dword       product;
+		logic       sign,
+		            overflow;
+	} fpint_mulclass_mnorm;
+
+	typedef struct packed
+	{
+		float       a,
+		            b;
+		float_class a_class,
+		            b_class;
+		logic       slow,
+		            zero,
+		            guard,
+		            round,
+		            sticky,
+		            slow_in,
+		            overflow;
+	} fpint_mnorm_minmax;
+
+	typedef struct packed
+	{
+		float       max,
+		            min;
+		float_class max_class,
+		            min_class;
+		logic       slow,
+		            zero,
+		            guard,
+		            round,
+		            sticky;
+	} fpint_minmax_expdiff;
+
+	typedef struct packed
+	{
+		float       max,
+		            min;
+		float_class max_class,
+		            min_class;
+		fpint_shift exp_shift;
+		logic       slow,
+		            zero,
+		            guard,
+		            round,
+		            sticky;
+	} fpint_expdiff_shiftr;
+
+	typedef struct packed
+	{
+		float          max,
+		               min;
+		float_class    max_class,
+		               min_class;
+		float_mant_ext max_mant,
+		               min_mant,
+		               sticky_mask;
+		logic          slow,
+		               zero,
+		               guard,
+		               round,
+		               sticky,
+		               int_sign;
+	} fpint_shiftr_addsub;
 
 	typedef struct packed
 	{
-		logic fadd,
-		      fmul;
-	} arith_op;
+		float max;
+		word  add_sub;
+		logic slow,
+		      zero,
+		      guard,
+		      round,
+		      sticky;
+	} fpint_clz_hold;
+
+	typedef fpint_clz_hold fpint_addsub_clz;
+
+	typedef struct packed
+	{
+		fpint_clz_hold hold;
+		fpint_shift    shift;
+	} fpint_clz_shiftl;
+
+	typedef struct packed
+	{
+		float val;
+		logic slow,
+		      zero,
+		      guard,
+		      round,
+		      sticky,
+		      overflow,
+		      sticky_last;
+	} fpint_shiftl_round;
+
+	typedef struct packed
+	{
+		float val;
+		logic slow,
+		      zero,
+		      exp_step,
+		      overflow;
+	} fpint_round_rnorm;
+
+	typedef struct packed
+	{
+		float val;
+		logic slow,
+		      zero,
+		      overflow;
+	} fpint_rnorm_encode;
 
 endpackage
diff --git a/platform/wavelet3d/main.cpp b/platform/wavelet3d/main.cpp
index 037aee4..49c96c1 100644
--- a/platform/wavelet3d/main.cpp
+++ b/platform/wavelet3d/main.cpp
@@ -30,76 +30,76 @@ int main(int argc, char **argv)
 	std::cin >> a >> b;
 
 	// int->fp
-	top.mul_float_m1      = 0;
-	top.unit_b_m1         = 1;
-	top.float_a_1         = 0;
-	top.int_hi_a_1        = 0;
-	top.int_lo_a_1        = 1;
-	top.zero_flags_1      = 1;
-	top.zero_b_1          = 1;
-	top.copy_flags_2      = 0;
-	top.int_signed_4      = 1;
-	top.int_operand_5     = 1;
-	top.copy_flags_5      = 1;
-	top.enable_norm_6     = 1;
-	top.copy_flags_10     = 0;
-	top.copy_flags_11     = 0;
-	top.enable_round_11   = 1;
-	top.encode_special_13 = 1;
+	top.setup_mul_float    = 0;
+	top.setup_unit_b       = 1;
+	top.mnorm_put_hi       = 0;
+	top.mnorm_put_lo       = 1;
+	top.mnorm_put_mul      = 0;
+	top.mnorm_zero_flags   = 1;
+	top.mnorm_zero_b       = 1;
+	top.minmax_copy_flags  = 0;
+	top.shiftr_int_signed  = 1;
+	top.addsub_int_operand = 1;
+	top.addsub_copy_flags  = 1;
+	top.clz_force_nop      = 0; // inverse of the old enable_norm_6 = 1: normalization must run
+	top.shiftl_copy_flags  = 0;
+	top.round_copy_flags   = 0;
+	top.round_enable       = 1;
+	top.encode_enable      = 1;
 
 	// mul int
-	//top.mul_float_m1      = 0;
-	//top.unit_b_m1         = 0;
-	//top.float_a_1         = 0;
-	//top.int_hi_a_1        = 0;
-	//top.int_lo_a_1        = 1;
-	//top.zero_flags_1      = 1;
-	//top.zero_b_1          = 1;
-	//top.copy_flags_2      = 1;
-	//top.int_signed_4      = 0;
-	//top.int_operand_5     = 0;
-	//top.copy_flags_5      = 1;
-	//top.enable_norm_6     = 0;
-	//top.copy_flags_10     = 1;
-	//top.copy_flags_11     = 1;
-	//top.enable_round_11   = 0;
-	//top.encode_special_13 = 0;
+	//top.setup_mul_float    = 0;
+	//top.setup_unit_b       = 0;
+	//top.mnorm_put_hi       = 0;
+	//top.mnorm_put_lo       = 1;
+	//top.mnorm_put_mul      = 0;
+	//top.mnorm_zero_flags   = 1;
+	//top.mnorm_zero_b       = 1;
+	//top.minmax_copy_flags  = 1;
+	//top.shiftr_int_signed  = 0;
+	//top.addsub_int_operand = 0;
+	//top.addsub_copy_flags  = 1;
+	//top.clz_force_nop      = 1; // inverse of the old enable_norm_6 = 0
+	//top.shiftl_copy_flags  = 1;
+	//top.round_copy_flags   = 1;
+	//top.round_enable       = 0;
+	//top.encode_enable      = 0;
 
 	// mul fp
-	//top.mul_float_m1      = 1;
-	//top.unit_b_m1         = 0;
-	//top.float_a_1         = 1;
-	//top.int_hi_a_1        = 0;
-	//top.int_lo_a_1        = 0;
-	//top.zero_flags_1      = 0;
-	//top.zero_b_1          = 1;
-	//top.copy_flags_2      = 1;
-	//top.copy_flags_5      = 1;
-	//top.int_signed_4      = 0;
-	//top.int_operand_5     = 0;
-	//top.enable_norm_6     = 1;
-	//top.copy_flags_10     = 1;
-	//top.copy_flags_11     = 1;
-	//top.enable_round_11   = 1;
-	//top.encode_special_13 = 1;
+	//top.setup_mul_float    = 1;
+	//top.setup_unit_b       = 0;
+	//top.mnorm_put_hi       = 0;
+	//top.mnorm_put_lo       = 0;
+	//top.mnorm_put_mul      = 1;
+	//top.mnorm_zero_flags   = 0;
+	//top.mnorm_zero_b       = 1;
+	//top.minmax_copy_flags  = 1;
+	//top.shiftr_int_signed  = 0;
+	//top.addsub_int_operand = 0;
+	//top.addsub_copy_flags  = 1;
+	//top.clz_force_nop      = 0; // inverse of the old enable_norm_6 = 1
+	//top.shiftl_copy_flags  = 1;
+	//top.round_copy_flags   = 1;
+	//top.round_enable       = 1;
+	//top.encode_enable      = 1;
 
 	// suma/resta
-	//top.mul_float_m1      = 0;
-	//top.unit_b_m1         = 1;
-	//top.float_a_1         = 0;
-	//top.int_hi_a_1        = 0;
-	//top.int_lo_a_1        = 1;
-	//top.zero_flags_1      = 0;
-	//top.zero_b_1          = 0;
-	//top.copy_flags_2      = 0;
-	//top.copy_flags_5      = 0;
-	//top.int_signed_4      = 0;
-	//top.int_operand_5     = 0;
-	//top.enable_norm_6     = 1;
-	//top.copy_flags_10     = 0;
-	//top.copy_flags_11     = 0;
-	//top.enable_round_11   = 1;
-	//top.encode_special_13 = 1;
+	//top.setup_mul_float    = 0;
+	//top.setup_unit_b       = 1;
+	//top.mnorm_put_hi       = 0;
+	//top.mnorm_put_lo       = 1;
+	//top.mnorm_put_mul      = 0;
+	//top.mnorm_zero_flags   = 0;
+	//top.mnorm_zero_b       = 0;
+	//top.minmax_copy_flags  = 0;
+	//top.shiftr_int_signed  = 0;
+	//top.addsub_int_operand = 0;
+	//top.addsub_copy_flags  = 0;
+	//top.clz_force_nop      = 0; // inverse of the old enable_norm_6 = 1
+	//top.shiftl_copy_flags  = 0;
+	//top.round_copy_flags   = 0;
+	//top.round_enable       = 1;
+	//top.encode_enable      = 1;
 
 	top.a = *reinterpret_cast<unsigned*>(&a);
 	top.b = *reinterpret_cast<unsigned*>(&b);
diff --git a/platform/wavelet3d/mod.mk b/platform/wavelet3d/mod.mk
index 232d808..6b8bd47 100644
--- a/platform/wavelet3d/mod.mk
+++ b/platform/wavelet3d/mod.mk
@@ -1,9 +1,9 @@
 define core
   $(this)/deps :=
 
-  $(this)/rtl_top := gfx_fpint_lane
+  $(this)/rtl_top := gfx_fpint
   $(this)/rtl_dirs := .
-  $(this)/rtl_files := gfx_pkg.sv gfx_fpint_lane.sv
+  $(this)/rtl_files := gfx_pkg.sv gfx_fpint.sv
 
   $(this)/vl_main := main.cpp
   $(this)/vl_pkgconfig := python3-embed
