5 files changed, 832 insertions, 332 deletions
diff --git a/platform/wavelet3d/gfx_fpint.sv b/platform/wavelet3d/gfx_fpint.sv
new file mode 100644
index 0000000..babc916
--- /dev/null
+++ b/platform/wavelet3d/gfx_fpint.sv
@@ -0,0 +1,78 @@
+module gfx_fpint // Single-lane wrapper: delays every control bit to the lane stage that consumes it
+(
+	input  logic         clk,
+
+	input  gfx::word     a, // operand words, passed straight through to the lane
+	                     b,
+	input logic          setup_mul_float, // control bits of one operation; the name prefix is the lane stage that consumes the bit
+	                     setup_unit_b,
+	                     mnorm_put_hi,
+	                     mnorm_put_lo,
+	                     mnorm_put_mul,
+	                     mnorm_zero_b,
+	                     mnorm_zero_flags,
+	                     minmax_copy_flags,
+	                     shiftr_int_signed,
+	                     addsub_copy_flags,
+	                     addsub_int_operand,
+	                     clz_force_nop,
+	                     shiftl_copy_flags,
+	                     round_copy_flags,
+	                     round_enable,
+	                     encode_enable,
+
+	output gfx::word     q // lane result, passed straight through
+);
+
+	import gfx::*;
+
+	fpint_op op, stage_op[FPINT_STAGES]; // stage_op[i] holds op as it was i clock edges ago
+
+	assign stage_op[0] = op; // entry 0 is combinational. NOTE(review): the always_ff below writes the other entries of the same array; some tools flag mixed drivers -- confirm
+
+	assign op.setup_mul_float = setup_mul_float; // pack the flat inputs into the fpint_op struct, field by field
+	assign op.setup_unit_b = setup_unit_b;
+	assign op.mnorm_put_hi = mnorm_put_hi;
+	assign op.mnorm_put_lo = mnorm_put_lo;
+	assign op.mnorm_put_mul = mnorm_put_mul;
+	assign op.mnorm_zero_b = mnorm_zero_b;
+	assign op.mnorm_zero_flags = mnorm_zero_flags;
+	assign op.minmax_copy_flags = minmax_copy_flags;
+	assign op.shiftr_int_signed = shiftr_int_signed;
+	assign op.addsub_copy_flags = addsub_copy_flags;
+	assign op.addsub_int_operand = addsub_int_operand;
+	assign op.clz_force_nop = clz_force_nop;
+	assign op.shiftl_copy_flags = shiftl_copy_flags;
+	assign op.round_copy_flags = round_copy_flags;
+	assign op.round_enable = round_enable;
+	assign op.encode_enable = encode_enable;
+
+	gfx_fpint_lane lane
+	(
+		.clk(clk),
+		.a(a),
+		.b(b),
+		.q(q),
+		.mul_float_0(stage_op[0].setup_mul_float), // the _N suffix of each lane port equals the stage_op index it taps
+		.unit_b_0(stage_op[0].setup_unit_b),
+		.put_hi_2(stage_op[2].mnorm_put_hi),
+		.put_lo_2(stage_op[2].mnorm_put_lo),
+		.put_mul_2(stage_op[2].mnorm_put_mul),
+		.zero_b_2(stage_op[2].mnorm_zero_b),
+		.zero_flags_2(stage_op[2].mnorm_zero_flags),
+		.copy_flags_3(stage_op[3].minmax_copy_flags),
+		.int_signed_5(stage_op[5].shiftr_int_signed),
+		.copy_flags_6(stage_op[6].addsub_copy_flags),
+		.int_operand_6(stage_op[6].addsub_int_operand),
+		.force_nop_7(stage_op[7].clz_force_nop),
+		.copy_flags_11(stage_op[11].shiftl_copy_flags),
+		.copy_flags_12(stage_op[12].round_copy_flags),
+		.enable_12(stage_op[12].round_enable),
+		.enable_14(stage_op[14].encode_enable)
+	);
+
+	always_ff @(posedge clk) // shift register: every entry above 0 takes its predecessor on each clock
+		for (int i = 1; i < FPINT_STAGES; ++i)
+			stage_op[i] <= stage_op[i - 1];
+
+endmodule
diff --git a/platform/wavelet3d/gfx_fpint_lane.sv b/platform/wavelet3d/gfx_fpint_lane.sv
index 63d56e2..8cb77a8 100644
--- a/platform/wavelet3d/gfx_fpint_lane.sv
+++ b/platform/wavelet3d/gfx_fpint_lane.sv
@@ -1,28 +1,41 @@
+/* Las 15 etapas son:
+ * - setup
+ * - mulclass
+ * - mnorm
+ * - minmax
+ * - expdiff
+ * - shiftr
+ * - addsub
+ * - clz0-clz3
+ * - shiftl
+ * - round
+ * - rnorm
+ * - encode
+ */
 module gfx_fpint_lane
 (
-	input  logic      clk,
-
-	input  gfx::float a,
-	                  b,
-
-	input  logic      mul_float_m1,
-	                  unit_b_m1,
-	                  float_a_1,
-	                  int_hi_a_1,
-	                  int_lo_a_1,
-	                  zero_flags_1,
-	                  zero_b_1,
-	                  copy_flags_2,
-	                  int_signed_4,
-	                  copy_flags_5,
-	                  int_operand_5,
-	                  enable_norm_6,
-	                  copy_flags_10,
-	                  copy_flags_11,
-	                  enable_round_11,
-	                  encode_special_13,
-
-	output gfx::float q
+	input  logic         clk,
+
+	input  gfx::word     a,
+	                     b,
+	input logic          mul_float_0,
+	                     unit_b_0,
+	                     put_hi_2,
+	                     put_lo_2,
+	                     put_mul_2,
+	                     zero_b_2,
+	                     zero_flags_2,
+	                     copy_flags_3,
+	                     int_signed_5,
+	                     copy_flags_6,
+	                     int_operand_6,
+	                     force_nop_7,
+	                     copy_flags_11,
+	                     copy_flags_12,
+	                     enable_12,
+	                     enable_14,
+
+	output gfx::word     q
 );
 
 	import gfx::*;
@@ -69,169 +82,284 @@ module gfx_fpint_lane
 	 * el exponente.
 	 */
 
-	logic exp_step, guard_0, guard_1, guard_2, guard_3, guard_4, guard_5, guard_10,
-	      int_sign, lo_msb, lo_reduce, overflow_0, overflow_1, overflow_10, overflow_12,
-	      round_0, round_1, round_2, round_3, round_4, round_5, round_10, sign_0,
-	      sign_10, sign_11, sign_12, slow_1, slow_2, slow_3, slow_4, slow_5, slow_10,
-	      slow_11, slow_12, slow_in_1, slow_in_next, slow_out, sticky_1, sticky_2,
-	      sticky_3, sticky_4, sticky_5, sticky_10, sticky_last, zero_1, zero_2, zero_3,
-	      zero_4, zero_5, zero_10, zero_11, zero_12;
+	fpint_setup_mulclass setup_mulclass;
+	fpint_mulclass_mnorm mulclass_mnorm;
+	fpint_mnorm_minmax   mnorm_minmax;
+	fpint_minmax_expdiff minmax_expdiff;
+	fpint_expdiff_shiftr expdiff_shiftr;
+	fpint_shiftr_addsub  shiftr_addsub;
+	fpint_addsub_clz     addsub_clz;
+	fpint_clz_shiftl     clz_shiftl;
+	fpint_shiftl_round   shiftl_round;
+	fpint_round_rnorm    round_rnorm;
+	fpint_rnorm_encode   rnorm_encode;
+
+	gfx_fpint_lane_setup stage_0
+	(
+		.clk(clk),
+		.a(a),
+		.b(b),
+		.out(setup_mulclass),
+		.unit_b(unit_b_0),
+		.mul_float(mul_float_0)
+	);
 
-	float a_add, a_m1, a_mul, b_add, b_0, b_m1, b_mul,
-	      max_2, max_3, max_4, max_5, min_2, min_3, min_4;
+	gfx_fpint_lane_mulclass stage_1
+	(
+		.clk(clk),
+		.in(setup_mulclass),
+		.out(mulclass_mnorm)
+	);
 
-	float_class a_class_0, a_class_1, b_class_0, b_class_1,
-	            max_class_2, max_class_3, min_class_2, min_class_3, min_class_4;
+	gfx_fpint_lane_mnorm stage_2
+	(
+		.clk(clk),
+		.in(mulclass_mnorm),
+		.out(mnorm_minmax),
+		.put_hi(put_hi_2),
+		.put_lo(put_lo_2),
+		.put_mul(put_mul_2),
+		.zero_b(zero_b_2),
+		.zero_flags(zero_flags_2)
+	);
 
-	word add_sub, clz_in, normalized, product_hi, product_lo;
-	dword product;
-	float_exp exp, exp_11, exp_10, exp_12, exp_delta;
-	float_mant mant_10, mant_11, mant_12;
-	float_mant_full hi;
-	logic[$bits(float_mant_full) - 3:0] lo;
+	gfx_fpint_lane_minmax stage_3
+	(
+		.clk(clk),
+		.in(mnorm_minmax),
+		.out(minmax_expdiff),
+		.copy_flags(copy_flags_3)
+	);
 
-	typedef logic[$bits(float_mant_full) + 1:0] extended_mant;
-	localparam bit[$clog2($bits(extended_mant)):0] MAX_SHIFT = 1 << $clog2($bits(extended_mant));
+	gfx_fpint_lane_expdiff stage_4
+	(
+		.clk(clk),
+		.in(minmax_expdiff),
+		.out(expdiff_shiftr)
+	);
 
-	extended_mant max_mant, min_mant, sticky_mask;
-	logic[$clog2(MAX_SHIFT):0] clz_shift, exp_shift;
+	gfx_fpint_lane_shiftr stage_5
+	(
+		.clk(clk),
+		.in(expdiff_shiftr),
+		.out(shiftr_addsub),
+		.int_signed(int_signed_5)
+	);
 
-	localparam int INT_SHIFT_REF   = $bits(word) - 2;
-	localparam int SHIFT_WIDTH     = {{($bits(int) - $bits(MAX_SHIFT)){1'b0}}, MAX_SHIFT};
-	localparam int CLZ_EXTEND_BITS = $bits(float_exp) - $bits(clz_shift) + 1;
+	gfx_fpint_lane_addsub stage_6
+	(
+		.clk(clk),
+		.in(shiftr_addsub),
+		.out(addsub_clz),
+		.copy_flags(copy_flags_6),
+		.int_operand(int_operand_6)
+	);
 
-	struct packed
-	{
-		float    max;
-		logic    guard,
-		         round,
-		         slow,
-		         sticky,
-		         zero;
-		word     add_sub;
-	} clz_hold[FADD_CLZ_STAGES], clz_hold_out;
+	gfx_fpint_lane_clz stage_7_8_9_10
+	(
+		.clk(clk),
+		.in(addsub_clz),
+		.out(clz_shiftl),
+		.force_nop(force_nop_7)
+	);
 
-	gfx_clz #($bits(word)) clz
+	gfx_fpint_lane_shiftl stage_11
 	(
 		.clk(clk),
-		.clz(clz_shift),
-		.value(clz_in)
+		.in(clz_shiftl),
+		.out(shiftl_round),
+		.copy_flags(copy_flags_11)
 	);
 
-	function extended_mant extend_min_max(float in, float_class in_class);
-		extend_min_max = {~in_class.exp_min, in.mant, 2'b00};
-	endfunction
+	gfx_fpint_lane_round stage_12
+	(
+		.clk(clk),
+		.in(shiftl_round),
+		.out(round_rnorm),
+		.enable(enable_12),
+		.copy_flags(copy_flags_12)
+	);
 
-	function word fp_add_sub_arg(extended_mant arg);
-		fp_add_sub_arg = {1'b0, arg, {($bits(fp_add_sub_arg) - $bits(arg) - 1){1'b0}}};
-	endfunction
+	gfx_fpint_lane_rnorm stage_13
+	(
+		.clk(clk),
+		.in(round_rnorm),
+		.out(rnorm_encode)
+	);
 
-	assign lo_msb = lo[$bits(lo) - 1];
-	assign slow_out = &exp_12 || slow_12 || overflow_12;
-	assign exp_delta = max_2.exp - min_2.exp;
-	assign lo_reduce = |lo[$bits(lo) - 2:0];
-	assign normalized = clz_hold_out.add_sub << clz_shift;
-	assign clz_hold_out = clz_hold[FADD_CLZ_STAGES - 1];
-	assign slow_in_next = is_float_special(a_class_0) | is_float_special(b_class_0);
-	assign {product_hi, product_lo} = product;
-	assign {hi, guard_0, round_0, lo} = product[2 * $bits(float_mant_full) - 1:0];
+	gfx_fpint_lane_encode stage_14
+	(
+		.clk(clk),
+		.q(q),
+		.in(rnorm_encode),
+		.enable(enable_14)
+	);
 
-	always_comb begin
-		clz_in = add_sub;
-		if (~enable_norm_6)
-			clz_in[$bits(clz_in) - 1:$bits(clz_in) - 2] = 2'b01;
-	end
+endmodule
 
-	always_ff @(posedge clk) begin
-		// Stage -1: 
+// Stage 0: multiplier operand setup (registers a/b and builds a_mul/b_mul)
+module gfx_fpint_lane_setup
+(
+	input  logic                     clk,
 
-		a_m1 <= a;
-		b_m1 <= b;
-		a_mul <= a;
-		b_mul <= b;
+	input  gfx::word                 a, // raw operand words
+	                                 b,
+	input  logic                     mul_float, // force exp = 1 and sign = 0 on both multiplier operands
+	                                 unit_b, // replace the multiplier's b operand with the word 1
+
+	output gfx::fpint_setup_mulclass out
+);
+
 	always_ff @(posedge clk) begin
+		out.a <= a; // unmodified copies; stage 1 reads sign, exponent and class from these
+		out.b <= b;
+		out.a_mul <= a; // multiplier operands; individual fields may be overridden below
+		out.b_mul <= b;
 
 		/* Nótese que el orden es sign-exp-mant. Esto coloca el '1.' implícito
 		 * en la posición correcta para multiplicar las mantisas.
 		 */
-		if (mul_float_m1) begin
-			a_mul.exp <= 1;
-			b_mul.exp <= 1;
-			a_mul.sign <= 0;
-			b_mul.sign <= 0;
+		if (mul_float) begin // with the sign-exp-mant packing, exp = 1 leaves a single 1 right above mant
+			out.a_mul.exp <= 1;
+			out.b_mul.exp <= 1;
+			out.a_mul.sign <= 0;
+			out.b_mul.sign <= 0;
 		end
 
-		if (unit_b_m1) begin
-			b_mul.exp <= 0;
-			b_mul.mant <= 1;
-			b_mul.sign <= 0;
+		if (unit_b) begin // assigned last, so for b_mul this wins over the mul_float branch
+			out.b_mul.exp <= 0;
+			out.b_mul.mant <= 1;
+			out.b_mul.sign <= 0;
 		end
+	end
+
+endmodule
+
+// Stage 1: float or integer multiplication, plus operand classification
+module gfx_fpint_lane_mulclass
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_setup_mulclass in,
+
+	output gfx::fpint_mulclass_mnorm out
+);
+
+	import gfx::*;
+
+	always_ff @(posedge clk) begin
+		out.b <= in.b; // raw b travels on to stage 2
+		out.sign <= in.a.sign ^ in.b.sign; // sign of the product
+		out.a_class <= classify_float(in.a); // classes come from the raw operands, not from a_mul/b_mul
+		out.b_class <= classify_float(in.b);
+		out.product <= in.a_mul * in.b_mul; // result is stored in a dword
+		{out.overflow, out.exp} <= {1'b0, in.a.exp} + {1'b0, in.b.exp} - {1'b0, FLOAT_EXP_BIAS}; // a.exp + b.exp - bias, computed one bit wider; the extra top bit lands in overflow
+	end
+
+endmodule
 
-		// Stage 0: multiplicación de fp o enteros
+// Stage 2: product normalization; selects what enters the adder path as operand a
+module gfx_fpint_lane_mnorm
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_mulclass_mnorm in,
+	input  logic                     put_hi, // out.a = upper word of the product
+	                                 put_lo, // out.a = lower word of the product
+	                                 put_mul, // out.a = normalized float product
+	                                 zero_b, // replace operand b by an all-zero word
+	                                 zero_flags, // clear a_class / slow_in
+
+	output gfx::fpint_mnorm_minmax   out
+);
+
+	import gfx::*;
 
-		b_0 <= b_m1;
-		sign_0 <= a_m1.sign ^ b_m1.sign;
-		product <= a_mul * b_mul;
-		a_class_0 <= classify_float(a_m1);
-		b_class_0 <= classify_float(b_m1);
-		{overflow_0, exp} <= {1'b0, a_m1.exp} + {1'b0, b_m1.exp} - {1'b0, FLOAT_EXP_BIAS};
+	word product_hi, product_lo; // the two words of the dword product
+	logic guard, lo_msb, lo_reduce, round, slow_in_next;
+	float_mant_full hi; // top slice of the mantissa product
+	logic[$bits(float_mant_full) - 3:0] lo; // bits below guard/round
 
-		// Stage 1: normalización
+	assign lo_msb = lo[$bits(lo) - 1];
+	assign lo_reduce = |lo[$bits(lo) - 2:0]; // OR of every lo bit below its msb
+	assign slow_in_next = is_float_special(in.a_class) | is_float_special(in.b_class);
+	assign {product_hi, product_lo} = in.product;
+	assign {hi, guard, round, lo} = in.product[2 * $bits(float_mant_full) - 1:0]; // only the low 2 * $bits(float_mant_full) product bits are split here
 
-		if (float_a_1) begin
-			slow_1 <= slow_in_next | (overflow_0 & ~a_class_0.exp_min & ~a_class_1.exp_min);
-			zero_1 <= a_class_0.exp_min | b_class_0.exp_min;
+	always_ff @(posedge clk) begin
+		if (put_mul) begin // float product: derive slow/zero from both operand classes
+			out.slow <= slow_in_next | (in.overflow & ~in.a_class.exp_min & ~in.b_class.exp_min); // fix: second term tested a_class twice; overflow is ignored when either factor has exp_min (zero is flagged instead)
+			out.zero <= in.a_class.exp_min | in.b_class.exp_min;
 		end else begin
-			slow_1 <= 0;
-			zero_1 <= 0;
+			out.slow <= 0;
+			out.zero <= 0;
 		end
 
-		overflow_1 <= 0;
-		a_add.sign <= sign_0;
+		out.a.sign <= in.sign;
+		out.overflow <= 0; // default; the exponent increment below may overwrite it
 
 		if (hi[$bits(hi) - 1]) begin
-			guard_1 <= guard_0;
-			round_1 <= round_0;
-			sticky_1 <= lo_msb | lo_reduce;
-			a_add.mant <= implicit_mant(hi);
-			{overflow_1, a_add.exp} <= {1'b0, exp} + 1;
+			out.guard <= guard; // top product bit set: use hi as is and bump the exponent by one
+			out.round <= round;
+			out.sticky <= lo_msb | lo_reduce;
+			out.a.mant <= implicit_mant(hi);
+			{out.overflow, out.a.exp} <= {1'b0, in.exp} + 1;
 		end else begin
 			/* Bit antes de msb es necesariamente 1, ya que los msb de
 			 * ambos multiplicandos son 1. Ver assert en implicit_mant().
 			 */
-			guard_1 <= round_0;
-			round_1 <= lo[$bits(lo) - 1];
-			sticky_1 <= lo_reduce;
-			a_add.exp <= exp;
-			a_add.mant <= implicit_mant({hi[$bits(hi) - 2:0], guard_0});
+			out.guard <= round; // top product bit clear: everything moves up one position
+			out.round <= lo_msb;
+			out.sticky <= lo_reduce;
+
+			out.a.exp <= in.exp;
+			out.a.mant <= implicit_mant({hi[$bits(hi) - 2:0], guard});
 		end
 
 		unique case (1'b1)
-			float_a_1: ;
+			put_mul: ; // keep the float fields assigned above
 
-			int_hi_a_1:
-				a_add <= product_hi;
+			put_hi: // integer result: the upper product word replaces all of out.a
+				out.a <= product_hi;
 
-			int_lo_a_1:
-				a_add <= product_lo;
+			put_lo: // integer result: the lower product word replaces all of out.a
+				out.a <= product_lo;
 		endcase
 
-		a_class_1 <= a_class_0;
-		slow_in_1 <= slow_in_next;
+		out.a_class <= in.a_class;
+		out.slow_in <= slow_in_next;
 
-		if (zero_flags_1) begin
-			a_class_1 <= classify_float(0);
-			slow_in_1 <= 0;
+		if (zero_flags) begin // a_class becomes the class of an all-zero word and slow_in is cleared
+			out.a_class <= classify_float(0);
+			out.slow_in <= 0;
 		end
 
-		if (zero_b_1) begin
-			b_add <= 0;
-			b_class_1 <= classify_float(0);
+		if (zero_b) begin // drop operand b: pass an all-zero word together with its class
+			out.b <= 0;
+			out.b_class <= classify_float(0);
 		end else begin
-			b_add <= b_0;
-			b_class_1 <= b_class_0;
+			out.b <= in.b;
+			out.b_class <= in.b_class;
 		end
+	end
+
+endmodule
+
+// Stage 3: ordenar tal que abs(max) >= abs(min)
+module gfx_fpint_lane_minmax
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_mnorm_minmax   in,
+	input  logic                     copy_flags,
 
-		/* Stage 2: ordenar tal que abs(max) >= abs(min). Wiki dice:
+	output gfx::fpint_minmax_expdiff out
+);
+
+	import gfx::*;
+
+	always_ff @(posedge clk) begin
+		/* Wiki dice:
 		 *
 		 * A property of the single- and double-precision formats is that
 		 * their encoding allows one to easily sort them without using
@@ -241,164 +369,308 @@ module gfx_fpint_lane
 		 * floating-point representation also had this property for normalized
 		 * numbers).
 		 */
-		if ({b_add.exp, b_add.mant} > {a_add.exp, a_add.mant}) begin
-			max_2 <= b_add;
-			min_2 <= a_add;
-			max_class_2 <= b_class_1;
-			min_class_2 <= a_class_1;
+		if ({in.b.exp, in.b.mant} > {in.a.exp, in.a.mant}) begin
+			out.max <= in.b;
+			out.min <= in.a;
+			out.max_class <= in.b_class;
+			out.min_class <= in.a_class;
 		end else begin
-			max_2 <= a_add;
-			min_2 <= b_add;
-			max_class_2 <= a_class_1;
-			min_class_2 <= b_class_1;
+			out.max <= in.a;
+			out.min <= in.b;
+			out.max_class <= in.a_class;
+			out.min_class <= in.b_class;
 		end
 
-		guard_2 <= guard_1;
-		round_2 <= round_1;
-		sticky_2 <= sticky_1;
+		out.guard <= in.guard;
+		out.round <= in.round;
+		out.sticky <= in.sticky;
 
-		if (copy_flags_2) begin
-			slow_2 <= slow_1 | overflow_1;
-			zero_2 <= zero_1;
+		if (copy_flags) begin
+			out.slow <= in.slow | in.overflow;
+			out.zero <= in.zero;
 		end else begin
-			slow_2 <= slow_in_1;
-			zero_2 <= 0;
+			out.slow <= in.slow_in;
+			out.zero <= 0;
 		end
+	end
+
+endmodule
+
+// Stage 4: right-shift amount for min, taken from the exponent difference
+module gfx_fpint_lane_expdiff
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_minmax_expdiff in,
+
+	output gfx::fpint_expdiff_shiftr out
+);
+
+	import gfx::*;
+
+	float_exp exp_delta;
+
+	assign exp_delta = in.max.exp - in.min.exp; // stage 3 orders the operands by {exp, mant}, so max.exp >= min.exp
+
+	always_ff @(posedge clk) begin
+		out.max <= in.max; // everything else is forwarded unchanged
+		out.min <= in.min;
+		out.slow <= in.slow;
+		out.zero <= in.zero;
+		out.guard <= in.guard;
+		out.round <= in.round;
+		out.sticky <= in.sticky;
+		out.max_class <= in.max_class;
+		out.min_class <= in.min_class;
+
+		out.exp_shift <= exp_delta[$bits(out.exp_shift) - 1:0]; // default: low bits of the difference
+		if (exp_delta > {{($bits(exp_delta) - $bits(FPINT_MAX_SHIFT)){1'b0}}, FPINT_MAX_SHIFT}) // FPINT_MAX_SHIFT zero-extended to exp_delta's width
+			out.exp_shift <= FPINT_MAX_SHIFT; // saturate; this later assignment wins
+	end
+
+endmodule
+
+// Stage 5: align min to max with a right shift, and take abs(max) for signed integers
+module gfx_fpint_lane_shiftr
+(
+	input  logic                     clk,
+
+	input  gfx::fpint_expdiff_shiftr in,
+	input  logic                     int_signed, // treat in.max as a signed integer word
+
+	output gfx::fpint_shiftr_addsub  out
+);
+
+	import gfx::*;
+
+	always_ff @(posedge clk) begin
+		out.min <= in.min; // forwarded unchanged
+		out.slow <= in.slow;
+		out.zero <= in.zero;
+		out.guard <= in.guard;
+		out.round <= in.round;
+		out.sticky <= in.sticky;
+		out.min_class <= in.min_class;
+
+		out.max_mant <= float_prepare_round(in.max, in.max_class); // extended mantissa of max, unshifted
+		out.min_mant <= float_prepare_round(in.min, in.min_class) >> in.exp_shift; // extended mantissa of min, aligned to max
+		out.sticky_mask <= {($bits(out.min_mant)){1'b1}} << in.exp_shift; // the cleared low bits mark the positions min loses in the shift above
+
+		out.max <= in.max; // default, overridden below for a negative signed integer
+		out.int_sign <= in.max[$bits(in.max) - 1]; // top bit of the max word
+
+		if (int_signed & in.max[$bits(in.max) - 1])
+			out.max <= -in.max; // negate the whole word
+	end
+
+endmodule
+
+// Stage 6: mantissa add/subtract, or integer pass-through
+module gfx_fpint_lane_addsub
+(
+	input  logic                    clk,
+
+	input  gfx::fpint_shiftr_addsub in,
+	input  logic                    copy_flags, // forward in.sticky instead of recomputing it
+	                                int_operand, // in.max is an integer word, not a float
 
-		// Stage 3: exp_shift amount
-
-		max_3 <= max_2;
-		min_3 <= min_2;
-		slow_3 <= slow_2;
-		zero_3 <= zero_2;
-		guard_3 <= guard_2;
-		round_3 <= round_2;
-		sticky_3 <= sticky_2;
-		max_class_3 <= max_class_2;
-		min_class_3 <= min_class_2;
-
-		exp_shift <= exp_delta[$bits(exp_shift) - 1:0];
-		if (exp_delta > {{($bits(exp_delta) - $bits(MAX_SHIFT)){1'b0}}, MAX_SHIFT})
-			exp_shift <= MAX_SHIFT;
-
-		// Stage 4: shifts y abs(max) para enteros con signo
-
-		min_4 <= min_3;
-		slow_4 <= slow_3;
-		zero_4 <= zero_3;
-		guard_4 <= guard_3;
-		round_4 <= round_3;
-		sticky_4 <= sticky_3;
-		min_class_4 <= min_class_3;
-
-		max_mant <= extend_min_max(max_3, max_class_3);
-		min_mant <= extend_min_max(min_3, min_class_3) >> exp_shift;
-		sticky_mask <= {($bits(min_mant)){1'b1}} << exp_shift;
-
-		max_4 <= max_3;
-		int_sign <= max_3[$bits(max_3) - 1];
-
-		if (int_signed_4 & max_3[$bits(max_3) - 1])
-			max_4 <= -max_3;
-
-		// Stage 5: suma de mantisas
-
-		max_5 <= max_4;
-		slow_5 <= slow_4;
-		zero_5 <= zero_4;
-		guard_5 <= guard_4;
-		round_5 <= round_4;
-
-		if (int_operand_5) begin
-			max_5.exp <= FLOAT_EXP_BIAS + INT_SHIFT_REF[$bits(float_exp) - 1:0];
-			max_5.sign <= int_sign;
+	output gfx::fpint_addsub_clz    out
+);
+
+	import gfx::*;
+
+	localparam int INT_SHIFT_REF = $bits(word) - 2; // added to FLOAT_EXP_BIAS below as the exponent given to integer operands
+
+	function word fp_add_sub_arg(float_mant_ext arg); // widen arg to a word: one zero bit on top, zero fill below
+		fp_add_sub_arg = {1'b0, arg, {($bits(fp_add_sub_arg) - $bits(arg) - 1){1'b0}}};
+	endfunction
+
+	always_ff @(posedge clk) begin
+		out.max <= in.max; // forwarded; exp and sign are overridden below for integers
+		out.slow <= in.slow;
+		out.zero <= in.zero;
+		out.guard <= in.guard;
+		out.round <= in.round;
+
+		if (int_operand) begin // integer input: fixed exponent, sign as captured by stage 5
+			out.max.exp <= FLOAT_EXP_BIAS + INT_SHIFT_REF[$bits(float_exp) - 1:0];
+			out.max.sign <= in.int_sign;
 		end
 
-		if (copy_flags_5)
-			sticky_5 <= sticky_4;
+		if (copy_flags)
+			out.sticky <= in.sticky;
 		else
-			sticky_5 <= |(extend_min_max(min_4, min_class_4) & ~sticky_mask);
+			out.sticky <= |(float_prepare_round(in.min, in.min_class) & ~in.sticky_mask); // any bit of min that the stage-5 right shift dropped
 
-		if (int_operand_5)
-			add_sub <= max_4;
-		else if (max_4.sign ^ min_4.sign)
-			add_sub <= fp_add_sub_arg(max_mant) - fp_add_sub_arg(min_mant);
+		if (int_operand)
+			out.add_sub <= in.max; // integer: the whole word goes on unchanged
+		else if (in.max.sign ^ in.min.sign) // signs differ: subtract the aligned min from max
+			out.add_sub <= fp_add_sub_arg(in.max_mant) - fp_add_sub_arg(in.min_mant);
 		else
-			add_sub <= fp_add_sub_arg(max_mant) + fp_add_sub_arg(min_mant);
+			out.add_sub <= fp_add_sub_arg(in.max_mant) + fp_add_sub_arg(in.min_mant); // same sign: add
+	end
+
+endmodule
 
-		// Stages 6-9: clz
+// Stages 7-10: find the most significant 1 (count leading zeros of add_sub)
+module gfx_fpint_lane_clz
+(
+	input  logic                 clk,
+
+	input  gfx::fpint_addsub_clz in,
+	input  logic                 force_nop, // feed gfx_clz a value whose top two bits are 01
+
+	output gfx::fpint_clz_shiftl out
+);
 
-		clz_hold[0].max <= max_5;
-		clz_hold[0].slow <= slow_5;
-		clz_hold[0].zero <= zero_5;
-		clz_hold[0].guard <= guard_5;
-		clz_hold[0].round <= round_5;
-		clz_hold[0].sticky <= sticky_5;
-		clz_hold[0].add_sub <= add_sub;
+	import gfx::*;
 
-		for (int i = 1; i < FADD_CLZ_STAGES; ++i)
-			clz_hold[i] <= clz_hold[i - 1];
+	word clz_in;
+	fpint_clz_hold hold[FPINT_CLZ_STAGES]; // delay line for the payload; presumably as deep as gfx_clz's latency (gfx_clz is not visible here)
 
-		// Stage 10: normalización
+	assign out.hold = hold[FPINT_CLZ_STAGES - 1]; // oldest entry leaves together with out.shift
 
-		sign_10 <= clz_hold_out.max.sign;
-		slow_10 <= clz_hold_out.slow;
-		zero_10 <= clz_hold_out.zero;
-		sticky_10 <= clz_hold_out.sticky;
+	gfx_clz #($bits(word)) clz
+	(
+		.clk(clk),
+		.clz(out.shift),
+		.value(clz_in)
+	);
 
-		{mant_10, guard_10, round_10, sticky_last} <=
+	always_comb begin
+		clz_in = in.add_sub;
+		if (force_nop) // active-high; the removed code tested ~enable_norm_6. NOTE(review): main.cpp in this patch copies the old enable_norm_6 values into clz_force_nop unchanged -- confirm the polarity
+			clz_in[$bits(clz_in) - 1:$bits(clz_in) - 2] = 2'b01; // only the value seen by gfx_clz changes; hold[] keeps the real add_sub
+	end
+
+	always_ff @(posedge clk) begin
+		hold[0] <= in;
+
+		for (int i = 1; i < FPINT_CLZ_STAGES; ++i) // shift the payload one entry per clock
+			hold[i] <= hold[i - 1];
+	end
+
+endmodule
+
+// Stage 11: normalization (left shift by the leading-zero count)
+module gfx_fpint_lane_shiftl
+(
+	input  logic                   clk,
+
+	input  gfx::fpint_clz_shiftl   in,
+	input  logic                   copy_flags, // keep guard/round from earlier stages
+
+	output gfx::fpint_shiftl_round out
+);
+
+	import gfx::*;
+
+	localparam int CLZ_EXTEND_BITS = $bits(float_exp) - $bits(in.shift) + 1; // zero bits needed to widen in.shift to $bits(float_exp) + 1
+
+	word normalized;
+
+	assign normalized = in.hold.add_sub << in.shift;
+
+	always_ff @(posedge clk) begin
+		out.slow <= in.hold.slow;
+		out.zero <= in.hold.zero; // may be forced to 1 below
+		out.sticky <= in.hold.sticky;
+		out.val.sign <= in.hold.max.sign;
+
+		{out.val.mant, out.guard, out.round, out.sticky_last} <= // slice starts one bit below the top bit of normalized
 			normalized[$bits(normalized) - 2:$bits(normalized) - $bits(float_mant) - 4];
 
-		{overflow_10, exp_10} <=
-			{1'b0, clz_hold_out.max.exp} - {{CLZ_EXTEND_BITS{1'b0}}, clz_shift} + 1;
+		{out.overflow, out.val.exp} <= // max.exp - shift + 1, one bit wider; the extra top bit lands in overflow
+			{1'b0, in.hold.max.exp} - {{CLZ_EXTEND_BITS{1'b0}}, in.shift} + 1;
 
-		if (clz_shift[$bits(clz_shift) - 1])
-			zero_10 <= 1;
+		if (in.shift[$bits(in.shift) - 1]) // top bit of the count set; presumably add_sub was all zeros -- confirm against gfx_clz
+			out.zero <= 1;
 
-		if (copy_flags_10) begin
-			guard_10 <= clz_hold_out.guard;
-			round_10 <= clz_hold_out.round;
-			sticky_last <= 0;
-			overflow_10 <= 0;
+		if (copy_flags) begin // later assignments win over the ones above
+			out.guard <= in.hold.guard;
+			out.round <= in.hold.round;
+			out.overflow <= 0;
+			out.sticky_last <= 0;
 		end
+	end
+
+endmodule
+
+// Stage 12: rounding
+module gfx_fpint_lane_round
+(
+	input  logic                   clk,
+
+	input  gfx::fpint_shiftl_round in,
+	input  logic                   copy_flags, // when set, in.overflow does not raise slow
+	                               enable, // allow the mantissa increment
+
+	output gfx::fpint_round_rnorm  out
+);
+
+	import gfx::*;
 
-		// Stage 11: redondeo
+	always_ff @(posedge clk) begin
+		out.val <= in.val; // default; the mantissa is overwritten below when rounding up
+		out.slow <= in.slow | (~copy_flags & in.overflow & ~in.zero);
+		out.zero <= in.zero;
+		out.exp_step <= 0; // default: no carry out of the mantissa
+
+		// This is the most usual rounding mode: round to nearest, ties to even
+		if (enable & in.guard & (in.round | in.sticky | in.sticky_last | in.val.mant[0]))
+			{out.exp_step, out.val.mant} <= {1'b0, in.val.mant} + 1; // fix: increment the incoming mantissa, not the previous cycle's output; the carry becomes exp_step
+	end
+
+endmodule
 
-		exp_11 <= exp_10;
-		mant_11 <= mant_10;
-		sign_11 <= sign_10;
-		slow_11 <= slow_10 | (~copy_flags_11 & overflow_10 & ~zero_10);
-		zero_11 <= zero_10;
-		exp_step <= 0;
+// Stage 13: exponent adjustment after rounding
+module gfx_fpint_lane_rnorm
+(
+	input  logic                   clk,
 
-		// Este es el modo más común: round to nearest, ties to even
-		if (enable_round_11 & guard_10 & (round_10 | sticky_10 | sticky_last | mant_10[0]))
-			{exp_step, mant_11} <= {1'b0, mant_10} + 1;
+	input  gfx::fpint_round_rnorm  in,
 
-		// Stage 12: ajuste de exponente por redondeo
+	output gfx::fpint_rnorm_encode out
+);
 
-		sign_12 <= sign_11;
-		slow_12 <= slow_11;
-		zero_12 <= zero_11;
-		mant_12 <= mant_11;
-		overflow_12 <= 0;
+	import gfx::*;
 
-		if (exp_step)
-			{overflow_12, exp_12} <= {1'b0, exp_11} + 1;
+	always_ff @(posedge clk) begin
+		out.slow <= in.slow; // flags, mantissa and sign are forwarded unchanged
+		out.zero <= in.zero;
+		out.overflow <= 0; // default; overwritten when the exponent increment below carries out
+		out.val.mant <= in.val.mant;
+		out.val.sign <= in.val.sign;
+
+		if (in.exp_step) // stage 12 carried out of the mantissa
+			{out.overflow, out.val.exp} <= {1'b0, in.val.exp} + 1; // exponent + 1, one bit wider; the extra top bit lands in overflow
 		else
-			exp_12 <= exp_11;
+			out.val.exp <= in.val.exp;
+	end
+
+endmodule
+
+// Stage 14: salida y codificación de ceros y NaNs
+module gfx_fpint_lane_encode
+(
+	input  logic                   clk,
+
+	input  gfx::fpint_rnorm_encode in,
+	input  logic                   enable,
+
+	output gfx::float              q
+);
 
-		// Stage 13: ceros y NaNs
+	import gfx::*;
 
-		q.exp <= exp_12;
-		q.mant <= mant_12;
-		q.sign <= sign_12;
+	always_ff @(posedge clk) begin
+		q <= in.val;
 
-		if (encode_special_13) begin
-			if (slow_out) begin
+		if (enable) begin
+			if (&in.val.exp | in.slow | in.overflow) begin
 				q.exp <= FLOAT_EXP_MAX;
 				q.mant <= 1;
-			end else if (zero_12) begin
+			end else if (in.zero) begin
 				q.exp <= 0;
 				q.mant <= 0;
 			end
diff --git a/platform/wavelet3d/gfx_pkg.sv b/platform/wavelet3d/gfx_pkg.sv
index cfab6a5..5c420cc 100644
--- a/platform/wavelet3d/gfx_pkg.sv
+++ b/platform/wavelet3d/gfx_pkg.sv
@@ -5,7 +5,8 @@ package gfx;
 	typedef logic[7:0]  float_exp;
 
 	typedef logic[$bits(word) - $bits(float_exp) - 2:0] float_mant;
-	typedef logic[$bits(float_mant):0] float_mant_full; // Incluye '1.' explícito
+	typedef logic[$bits(float_mant):0]                  float_mant_full; // Incluye '1.' explícito
+	typedef logic[$bits(float_mant_full) + 1:0]         float_mant_ext;  // Considera overflow
 
 	localparam float_exp FLOAT_EXP_BIAS = (1 << ($bits(float_exp) - 1)) - 1;
 	localparam float_exp FLOAT_EXP_MAX  = {($bits(float_exp)){1'b1}};
@@ -56,14 +57,163 @@ package gfx;
 		is_float_special = in.exp_max | (in.exp_min & ~in.mant_zero);
 	endfunction
 
-	/* -> 4,4,4,4,4,4,4,4 -> 8,8,8,8 -> 16,16 -> 32
-	 */
-	localparam FADD_CLZ_STAGES = 4;
+	function float_mant_ext float_prepare_round(float in, float_class in_class); // extended mantissa: {leading bit, mant, two zero low bits}
+		float_prepare_round = {~in_class.exp_min, in.mant, 2'b00}; // leading bit is 0 when the class has exp_min set, 1 otherwise
+	endfunction
+
+	// -> 4,4,4,4,4,4,4,4 -> 8,8,8,8 -> 16,16 -> 32 (four steps; presumably the group widths inside gfx_clz -- confirm)
+	localparam int FPINT_CLZ_STAGES = 4;
+	localparam int FPINT_STAGES     = 7 + FPINT_CLZ_STAGES + 4; // 7 stages before clz (setup..addsub), the clz stages, 4 after (shiftl, round, rnorm, encode)
+
+	localparam bit[$clog2($bits(float_mant_ext)):0] FPINT_MAX_SHIFT // smallest power of two >= $bits(float_mant_ext)
+		= 1 << $clog2($bits(float_mant_ext));
+
+	typedef logic[$clog2(FPINT_MAX_SHIFT):0] fpint_shift; // wide enough to hold FPINT_MAX_SHIFT itself
+
+	typedef struct packed // control bits of one operation; gfx_fpint shifts this struct along with the data
+	{
+		logic setup_mul_float, // the field prefix names the lane stage that consumes the bit
+		      setup_unit_b,
+		      mnorm_put_hi,
+		      mnorm_put_lo,
+		      mnorm_put_mul,
+		      mnorm_zero_b,
+		      mnorm_zero_flags,
+		      minmax_copy_flags,
+		      shiftr_int_signed,
+		      addsub_copy_flags,
+		      addsub_int_operand,
+		      clz_force_nop,
+		      shiftl_copy_flags,
+		      round_copy_flags,
+		      round_enable,
+		      encode_enable;
+	} fpint_op;
+
+	typedef struct packed // registered output of stage 0 (setup), input of stage 1 (mulclass)
+	{
+		float a, // raw operands
+		      b,
+		      a_mul, // operands as fed to the multiplier
+		      b_mul;
+	} fpint_setup_mulclass;
+
+	typedef struct packed // registered output of stage 1 (mulclass), input of stage 2 (mnorm)
+	{
+		float       b; // raw operand b
+		float_exp   exp; // a.exp + b.exp - FLOAT_EXP_BIAS, truncated to float_exp
+		float_class a_class, // classify_float() of the raw operands
+		            b_class;
+		dword       product; // a_mul * b_mul
+		logic       sign, // a.sign ^ b.sign
+		            overflow; // extra top bit of the exponent computation
+	} fpint_mulclass_mnorm;
+
+	typedef struct packed // registered output of stage 2 (mnorm), input of stage 3 (minmax)
+	{
+		float       a, // normalized product or one product word, as selected in stage 2
+		            b; // raw operand b, or zero
+		float_class a_class,
+		            b_class;
+		logic       slow, // flags derived from the product
+		            zero,
+		            guard,
+		            round,
+		            sticky,
+		            slow_in, // either input class is special (is_float_special), unless cleared by zero_flags
+		            overflow;
+	} fpint_mnorm_minmax;
+
+	typedef struct packed // registered output of stage 3 (minmax), input of stage 4 (expdiff)
+	{
+		float       max, // operand with the larger {exp, mant}
+		            min; // the other operand
+		float_class max_class,
+		            min_class;
+		logic       slow,
+		            zero,
+		            guard,
+		            round,
+		            sticky;
+	} fpint_minmax_expdiff;
+
+	typedef struct packed // registered output of stage 4 (expdiff), input of stage 5 (shiftr)
+	{
+		float       max,
+		            min;
+		float_class max_class,
+		            min_class;
+		fpint_shift exp_shift; // max.exp - min.exp, saturated at FPINT_MAX_SHIFT
+		logic       slow,
+		            zero,
+		            guard,
+		            round,
+		            sticky;
+	} fpint_expdiff_shiftr;
+
+	typedef struct packed // registered output of stage 5 (shiftr), input of stage 6 (addsub)
+	{
+		float          max, // negated in stage 5 when it is a negative signed integer
+		               min;
+		float_class    max_class,
+		               min_class;
+		float_mant_ext max_mant, // float_prepare_round() of max
+		               min_mant, // float_prepare_round() of min, shifted right by exp_shift
+		               sticky_mask; // all ones shifted left by exp_shift
+		logic          slow,
+		               zero,
+		               guard,
+		               round,
+		               sticky,
+		               int_sign; // top bit of the max word before negation
+	} fpint_shiftr_addsub;
 
 	typedef struct packed
 	{
-		logic fadd,
-		      fmul;
-	} arith_op;
+		float max; // supplies sign and exponent to stage 11
+		word  add_sub; // value whose leading zeros are counted
+		logic slow,
+		      zero,
+		      guard,
+		      round,
+		      sticky;
+	} fpint_clz_hold; // payload delayed inside the clz stages while the count is computed
+
+	typedef fpint_clz_hold fpint_addsub_clz; // stage 6 (addsub) hands the same record to the clz stages
+
+	typedef struct packed // output of the clz stages, input of stage 11 (shiftl)
+	{
+		fpint_clz_hold hold; // delayed payload
+		fpint_shift    shift; // count produced by gfx_clz
+	} fpint_clz_shiftl;
+
+	typedef struct packed // registered output of stage 11 (shiftl), input of stage 12 (round)
+	{
+		float val; // normalized result before rounding
+		logic slow,
+		      zero,
+		      guard,
+		      round,
+		      sticky,
+		      overflow, // extra top bit of the stage-11 exponent computation
+		      sticky_last; // bit of the normalized word just below round
+	} fpint_shiftl_round;
+
+	typedef struct packed // registered output of stage 12 (round), input of stage 13 (rnorm)
+	{
+		float val; // result with the rounded mantissa
+		logic slow,
+		      zero,
+		      exp_step, // rounding carried out of the mantissa; stage 13 increments the exponent
+		      overflow; // NOTE(review): not assigned by stage 12 nor read by stage 13 in this patch
+	} fpint_round_rnorm;
+
+	typedef struct packed // registered output of stage 13 (rnorm), input of stage 14 (encode)
+	{
+		float val; // final sign, exponent and mantissa before special-case encoding
+		logic slow,
+		      zero,
+		      overflow; // carry out of the stage-13 exponent increment
+	} fpint_rnorm_encode;
 
 endpackage
diff --git a/platform/wavelet3d/main.cpp b/platform/wavelet3d/main.cpp
index 037aee4..49c96c1 100644
--- a/platform/wavelet3d/main.cpp
+++ b/platform/wavelet3d/main.cpp
@@ -30,76 +30,76 @@ int main(int argc, char **argv)
 	std::cin >> a >> b;
 
 	// int->fp
-	top.mul_float_m1 = 0;
-	top.unit_b_m1 = 1;
-	top.float_a_1 = 0;
-	top.int_hi_a_1 = 0;
-	top.int_lo_a_1 = 1;
-	top.zero_flags_1 = 1;
-	top.zero_b_1 = 1;
-	top.copy_flags_2 = 0;
-	top.int_signed_4 = 1;
-	top.int_operand_5 = 1;
-	top.copy_flags_5 = 1;
-	top.enable_norm_6 = 1;
-	top.copy_flags_10 = 0;
-	top.copy_flags_11 = 0;
-	top.enable_round_11 = 1;
-	top.encode_special_13 = 1;
+	top.setup_mul_float = 0;
+	top.setup_unit_b = 1;
+	top.mnorm_put_hi = 0;
+	top.mnorm_put_lo = 1;
+	top.mnorm_put_mul = 0;
+	top.mnorm_zero_flags = 1;
+	top.mnorm_zero_b = 1;
+	top.minmax_copy_flags = 0;
+	top.shiftr_int_signed = 1;
+	top.addsub_int_operand = 1;
+	top.addsub_copy_flags = 1;
+	top.clz_force_nop = 1;
+	top.shiftl_copy_flags = 0;
+	top.round_copy_flags = 0;
+	top.round_enable = 1;
+	top.encode_enable = 1;
 
 	// mul int
-	//top.mul_float_m1 = 0;
-	//top.unit_b_m1 = 0;
-	//top.float_a_1 = 0;
-	//top.int_hi_a_1 = 0;
-	//top.int_lo_a_1 = 1;
-	//top.zero_flags_1 = 1;
-	//top.zero_b_1 = 1;
-	//top.copy_flags_2 = 1;
-	//top.int_signed_4 = 0;
-	//top.int_operand_5 = 0;
-	//top.copy_flags_5 = 1;
-	//top.enable_norm_6 = 0;
-	//top.copy_flags_10 = 1;
-	//top.copy_flags_11 = 1;
-	//top.enable_round_11 = 0;
-	//top.encode_special_13 = 0;
+	//top.setup_mul_float = 0;
+	//top.setup_unit_b = 0;
+	//top.mnorm_put_hi = 0;
+	//top.mnorm_put_lo = 1;
+	//top.mnorm_put_mul = 0;
+	//top.mnorm_zero_flags = 1;
+	//top.mnorm_zero_b = 1;
+	//top.minmax_copy_flags = 1;
+	//top.shiftr_int_signed = 0;
+	//top.addsub_int_operand = 0;
+	//top.addsub_copy_flags = 1;
+	//top.clz_force_nop = 0;
+	//top.shiftl_copy_flags = 1;
+	//top.round_copy_flags = 1;
+	//top.round_enable = 0;
+	//top.encode_enable = 0;
 
 	// mul fp
-	//top.mul_float_m1 = 1;
-	//top.unit_b_m1 = 0;
-	//top.float_a_1 = 1;
-	//top.int_hi_a_1 = 0;
-	//top.int_lo_a_1 = 0;
-	//top.zero_flags_1 = 0;
-	//top.zero_b_1 = 1;
-	//top.copy_flags_2 = 1;
-	//top.copy_flags_5 = 1;
-	//top.int_signed_4 = 0;
-	//top.int_operand_5 = 0;
-	//top.enable_norm_6 = 1;
-	//top.copy_flags_10 = 1;
-	//top.copy_flags_11 = 1;
-	//top.enable_round_11 = 1;
-	//top.encode_special_13 = 1;
+	//top.setup_mul_float = 1;
+	//top.setup_unit_b = 0;
+	//top.mnorm_put_hi = 0;
+	//top.mnorm_put_lo = 0;
+	//top.mnorm_put_mul = 1;
+	//top.mnorm_zero_flags = 0;
+	//top.mnorm_zero_b = 1;
+	//top.minmax_copy_flags = 1;
+	//top.shiftr_int_signed = 0;
+	//top.addsub_int_operand = 0;
+	//top.addsub_copy_flags = 1;
+	//top.clz_force_nop = 1;
+	//top.shiftl_copy_flags = 1;
+	//top.round_copy_flags = 1;
+	//top.round_enable = 1;
+	//top.encode_enable = 1;
 
 	// suma/resta
-	//top.mul_float_m1 = 0;
-	//top.unit_b_m1 = 1;
-	//top.float_a_1 = 0;
-	//top.int_hi_a_1 = 0;
-	//top.int_lo_a_1 = 1;
-	//top.zero_flags_1 = 0;
-	//top.zero_b_1 = 0;
-	//top.copy_flags_2 = 0;
-	//top.copy_flags_5 = 0;
-	//top.int_signed_4 = 0;
-	//top.int_operand_5 = 0;
-	//top.enable_norm_6 = 1;
-	//top.copy_flags_10 = 0;
-	//top.copy_flags_11 = 0;
-	//top.enable_round_11 = 1;
-	//top.encode_special_13 = 1;
+	//top.setup_mul_float = 0;
+	//top.setup_unit_b = 1;
+	//top.mnorm_put_hi = 0;
+	//top.mnorm_put_lo = 1;
+	//top.mnorm_put_mul = 0;
+	//top.mnorm_zero_flags = 0;
+	//top.mnorm_zero_b = 0;
+	//top.minmax_copy_flags = 0;
+	//top.shiftr_int_signed = 0;
+	//top.addsub_int_operand = 0;
+	//top.addsub_copy_flags = 0;
+	//top.clz_force_nop = 1;
+	//top.shiftl_copy_flags = 0;
+	//top.round_copy_flags = 0;
+	//top.round_enable = 1;
+	//top.encode_enable = 1;
 
 	top.a = *reinterpret_cast<unsigned*>(&a);
 	top.b = *reinterpret_cast<unsigned*>(&b);
diff --git a/platform/wavelet3d/mod.mk b/platform/wavelet3d/mod.mk
index 232d808..6b8bd47 100644
--- a/platform/wavelet3d/mod.mk
+++ b/platform/wavelet3d/mod.mk
@@ -1,9 +1,9 @@
 define core
   $(this)/deps :=
 
-  $(this)/rtl_top := gfx_fpint_lane
+  $(this)/rtl_top := gfx_fpint
   $(this)/rtl_dirs := .
-  $(this)/rtl_files := gfx_pkg.sv gfx_fpint_lane.sv
+  $(this)/rtl_files := gfx_pkg.sv gfx_fpint.sv
 
   $(this)/vl_main := main.cpp
   $(this)/vl_pkgconfig := python3-embed