summaryrefslogtreecommitdiff
path: root/rtl/gfx
diff options
context:
space:
mode:
authorAlejandro Soto <alejandro@34project.org>2024-05-05 17:38:55 -0600
committerAlejandro Soto <alejandro@34project.org>2024-05-05 18:12:08 -0600
commitca02833f22b08ceeeff501107371aa6667426115 (patch)
treef864c5fc238a292082d2096ce546270badce9f1d /rtl/gfx
parent081a8a3ba8bfe036f31da53f9c041a2caa30fce2 (diff)
rtl/gfx: rename platform/wavelet3d -> rtl/gfx
Diffstat (limited to 'rtl/gfx')
-rw-r--r--rtl/gfx/gfx_axib.sv81
-rw-r--r--rtl/gfx/gfx_axil.sv61
-rw-r--r--rtl/gfx/gfx_axil2regblock.sv30
-rw-r--r--rtl/gfx/gfx_beats.sv29
-rw-r--r--rtl/gfx/gfx_bootrom.sv66
-rw-r--r--rtl/gfx/gfx_clz.sv68
-rw-r--r--rtl/gfx/gfx_ctz.sv18
-rw-r--r--rtl/gfx/gfx_fifo.sv102
-rw-r--r--rtl/gfx/gfx_fixed_dotadd.sv55
-rw-r--r--rtl/gfx/gfx_fixed_muladd.sv77
-rw-r--r--rtl/gfx/gfx_front_back.sv37
-rw-r--r--rtl/gfx/gfx_isa.sv84
-rw-r--r--rtl/gfx/gfx_pipes.sv24
-rw-r--r--rtl/gfx/gfx_pkg.sv271
-rw-r--r--rtl/gfx/gfx_pkts.sv29
-rw-r--r--rtl/gfx/gfx_raster.sv930
-rw-r--r--rtl/gfx/gfx_regfile_io.sv106
-rw-r--r--rtl/gfx/gfx_rst_sync.sv13
-rw-r--r--rtl/gfx/gfx_sched.sv141
-rw-r--r--rtl/gfx/gfx_shader.sv77
-rw-r--r--rtl/gfx/gfx_shader_back.sv335
-rw-r--r--rtl/gfx/gfx_shader_fpint.sv932
-rw-r--r--rtl/gfx/gfx_shader_front.sv746
-rw-r--r--rtl/gfx/gfx_shader_group.sv17
-rw-r--r--rtl/gfx/gfx_shader_mem.sv17
-rw-r--r--rtl/gfx/gfx_shader_regs.sv302
-rw-r--r--rtl/gfx/gfx_shader_schedif.rdl91
-rw-r--r--rtl/gfx/gfx_shader_setup.sv37
-rw-r--r--rtl/gfx/gfx_shader_sfu.sv17
-rw-r--r--rtl/gfx/gfx_shake.sv24
-rw-r--r--rtl/gfx/gfx_sim_debug.sv50
-rw-r--r--rtl/gfx/gfx_skid_buf.sv20
-rw-r--r--rtl/gfx/gfx_skid_flow.sv31
-rw-r--r--rtl/gfx/gfx_wb.sv51
-rw-r--r--rtl/gfx/gfx_xbar_sched.sv146
-rw-r--r--rtl/gfx/mod.mk18
36 files changed, 5133 insertions, 0 deletions
diff --git a/rtl/gfx/gfx_axib.sv b/rtl/gfx/gfx_axib.sv
new file mode 100644
index 0000000..7b3cbdc
--- /dev/null
+++ b/rtl/gfx/gfx_axib.sv
@@ -0,0 +1,81 @@
+// AXI4 with burst support
+interface gfx_axib; // Signal bundle for a burst-capable AXI4 link; addresses and data are gfx::word wide
+
+ import gfx::word;
+
+ logic awvalid, // Write address channel (AW)
+ awready;
+ logic[7:0] awlen;
+ logic[1:0] awburst;
+ word awaddr;
+
+ logic wlast; // Write data channel (W)
+ logic wvalid;
+ logic wready;
+ word wdata;
+
+ logic bvalid; // Write response channel (B); no bresp signal is declared
+ logic bready;
+
+ logic arvalid, // Read address channel (AR)
+ arready;
+ logic[7:0] arlen;
+ logic[1:0] arburst;
+ word araddr;
+
+ logic rlast; // Read data channel (R); no rresp signal is declared
+ logic rvalid;
+ logic rready;
+ word rdata;
+
+ modport m // Master view: samples ready/response/read-data signals, drives everything else
+ (
+ input awready,
+ wready,
+ bvalid,
+ arready,
+ rlast,
+ rvalid,
+ rdata,
+
+ output awlen,
+ awburst,
+ awvalid,
+ awaddr,
+ wlast,
+ wvalid,
+ wdata,
+ bready,
+ arlen,
+ arburst,
+ arvalid,
+ araddr,
+ rready
+ );
+
+ modport s // Slave view: same signals as modport m with every direction reversed
+ (
+ input awlen,
+ awburst,
+ awvalid,
+ awaddr,
+ wlast,
+ wvalid,
+ wdata,
+ bready,
+ arlen,
+ arburst,
+ arvalid,
+ araddr,
+ rready,
+
+ output awready,
+ wready,
+ bvalid,
+ arready,
+ rlast,
+ rvalid,
+ rdata
+ );
+
+endinterface
diff --git a/rtl/gfx/gfx_axil.sv b/rtl/gfx/gfx_axil.sv
new file mode 100644
index 0000000..c254e26
--- /dev/null
+++ b/rtl/gfx/gfx_axil.sv
@@ -0,0 +1,61 @@
+// AXI4-Lite, without wstrb or axprot
+interface gfx_axil; // Signal bundle for a reduced AXI4-Lite link; addresses and data are gfx::word wide
+ import gfx::*;
+
+ logic awvalid; // Write address channel (AW)
+ logic awready;
+ word awaddr;
+
+ logic wvalid; // Write data channel (W)
+ logic wready;
+ word wdata;
+
+ logic bvalid; // Write response channel (B); no bresp signal is declared
+ logic bready;
+
+ logic arvalid; // Read address channel (AR)
+ logic arready;
+ word araddr;
+
+ logic rvalid; // Read data channel (R); no rresp signal is declared
+ logic rready;
+ word rdata;
+
+ modport m // Master view: samples ready/response/read-data signals, drives everything else
+ (
+ input awready,
+ wready,
+ bvalid,
+ arready,
+ rvalid,
+ rdata,
+
+ output awvalid,
+ awaddr,
+ wvalid,
+ wdata,
+ bready,
+ arvalid,
+ araddr,
+ rready
+ );
+
+ modport s // Slave view: same signals as modport m with every direction reversed
+ (
+ input awvalid,
+ awaddr,
+ wvalid,
+ wdata,
+ bready,
+ arvalid,
+ araddr,
+ rready,
+
+ output awready,
+ wready,
+ bvalid,
+ arready,
+ rvalid,
+ rdata
+ );
+endinterface
diff --git a/rtl/gfx/gfx_axil2regblock.sv b/rtl/gfx/gfx_axil2regblock.sv
new file mode 100644
index 0000000..2449b05
--- /dev/null
+++ b/rtl/gfx/gfx_axil2regblock.sv
@@ -0,0 +1,30 @@
+module gfx_axil2regblock // Purely combinational bridge from a gfx_axil slave port to an axi4lite_intf master port
+(
+ gfx_axil.s axis,
+ axi4lite_intf.master axim
+);
+
+ assign axis.rdata = axim.RDATA; // Read data, valid and ready signals flow back to the gfx_axil side
+ assign axis.rvalid = axim.RVALID;
+ assign axis.bvalid = axim.BVALID;
+ assign axis.wready = axim.WREADY;
+ assign axis.arready = axim.ARREADY;
+ assign axis.awready = axim.AWREADY;
+
+ assign axim.AWVALID = axis.awvalid;
+ assign axim.AWADDR = axis.awaddr[$bits(axim.AWADDR) - 1:0]; // Keep only the low bits that fit in AWADDR
+ assign axim.AWPROT = '0; // Nothing on the gfx_axil side supplies protection bits
+
+ assign axim.WVALID = axis.wvalid;
+ assign axim.WDATA = axis.wdata;
+ assign axim.WSTRB = '1; // Nothing on the gfx_axil side supplies strobes, so every strobe bit is set
+
+ assign axim.BREADY = axis.bready;
+
+ assign axim.ARVALID = axis.arvalid;
+ assign axim.ARADDR = axis.araddr[$bits(axim.ARADDR) - 1:0]; // Keep only the low bits that fit in ARADDR
+ assign axim.ARPROT = '0;
+
+ assign axim.RREADY = axis.rready;
+
+endmodule
diff --git a/rtl/gfx/gfx_beats.sv b/rtl/gfx/gfx_beats.sv
new file mode 100644
index 0000000..fcbb091
--- /dev/null
+++ b/rtl/gfx/gfx_beats.sv
@@ -0,0 +1,29 @@
+interface gfx_beats // Valid/ready stream carrying WIDTH-bit beats
+#(int WIDTH = $bits(gfx::word));
+
+ logic[WIDTH - 1:0] data;
+ logic ready;
+ logic valid;
+
+ modport tx // Producer view: drives data and valid, samples ready
+ (
+ input ready,
+ output data,
+ valid
+ );
+
+ modport rx // Consumer view: samples data and valid, drives ready
+ (
+ input data,
+ valid,
+ output ready
+ );
+
+ modport peek // Passive observer view: every signal is an input
+ (
+ input data,
+ ready,
+ valid
+ );
+
+endinterface
diff --git a/rtl/gfx/gfx_bootrom.sv b/rtl/gfx/gfx_bootrom.sv
new file mode 100644
index 0000000..2c4581e
--- /dev/null
+++ b/rtl/gfx/gfx_bootrom.sv
@@ -0,0 +1,66 @@
+module gfx_bootrom // Read-only memory behind a gfx_axil slave port, loaded from gfx_bootrom.hex
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ gfx_axil.s axis
+);
+
+ localparam ROM_WORDS_LOG = 8; // The ROM holds 1 << 8 = 256 words
+
+ enum int unsigned
+ {
+ WAIT, // Idle until a read address is presented
+ READ, // One cycle of delay while the read pipeline advances
+ RDATA, // rvalid is raised on the way out of this state
+ READY // Data is being offered; waits for rready
+ } state;
+
+ word read, rom[1 << ROM_WORDS_LOG];
+ logic[ROM_WORDS_LOG - 1:0] read_addr;
+
+ assign axis.bvalid = 0; // Write channels are tied off: a write is never accepted or answered
+ assign axis.wready = 0;
+ assign axis.awready = 0;
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ state <= WAIT;
+ axis.rvalid <= 0;
+ axis.arready <= 0;
+ end else begin
+ axis.arready <= 0; // Default value; the READY state overrides it, making arready a one-cycle pulse
+
+ unique case (state)
+ WAIT:
+ if (axis.arvalid & ~axis.arready) // ~arready skips the cycle in which the previous request is acknowledged
+ state <= READ;
+
+ READ:
+ state <= RDATA;
+
+ RDATA: begin
+ state <= READY;
+ axis.rvalid <= 1;
+ end
+
+ READY:
+ if (axis.rready) begin
+ state <= WAIT;
+ axis.rvalid <= 0;
+ axis.arready <= 1; // The address is only acknowledged once its data has been taken
+ end
+ endcase
+ end
+
+ always_ff @(posedge clk) begin // Free-running, reset-less read pipeline: araddr -> read_addr -> read -> rdata
+ read <= rom[read_addr];
+ read_addr <= axis.araddr[$bits(read_addr) + SUBWORD_BITS - 1:SUBWORD_BITS]; // Word index: the low SUBWORD_BITS bits are dropped
+ axis.rdata <= read;
+ end
+
+ initial
+ $readmemh("gfx_bootrom.hex", rom);
+
+endmodule
diff --git a/rtl/gfx/gfx_clz.sv b/rtl/gfx/gfx_clz.sv
new file mode 100644
index 0000000..8d6f100
--- /dev/null
+++ b/rtl/gfx/gfx_clz.sv
@@ -0,0 +1,68 @@
+/* Tree implementation of count leading zeros (CLZ).
+ * WIDTH must be a power of 2.
+ */
+module gfx_clz
+#(int WIDTH = 0)
+(
+ input logic clk,
+
+ input logic[WIDTH - 1:0] value,
+ output logic[$clog2(WIDTH):0] clz // One extra bit so that the count WIDTH (all zeros) is representable
+);
+
+ genvar i;
+ generate
+ if (WIDTH <= 1) begin
+ always_ff @(posedge clk)
+ clz <= !value; // 1 when the single bit is zero
+ end else if (WIDTH == 2) begin
+ always_ff @(posedge clk)
+ unique case (value)
+ 2'b00: clz <= 2'b10;
+ 2'b01: clz <= 2'b01;
+ 2'b10: clz <= 2'b00;
+ 2'b11: clz <= 2'b00;
+ endcase
+ end else if (WIDTH == 4) begin
+ // Efficient on FPGAs with 4-LUTs
+ always_ff @(posedge clk)
+ if (value[3])
+ clz <= 3'b000;
+ else if (value[2])
+ clz <= 3'b001;
+ else if (value[1])
+ clz <= 3'b010;
+ else if (value[0])
+ clz <= 3'b011;
+ else
+ clz <= 3'b100;
+ end else begin // Recursive case: split in halves, each level adds one register stage
+ logic msb_right;
+ logic[$clog2(WIDTH) - 1:0] clz_left, clz_right;
+ logic[$clog2(WIDTH) - 2:0] tail_right;
+
+ assign {msb_right, tail_right} = clz_right;
+
+ gfx_clz #(WIDTH / 2) left
+ (
+ .clk(clk),
+ .clz(clz_left),
+ .value(value[WIDTH - 1:WIDTH / 2])
+ );
+
+ gfx_clz #(WIDTH / 2) right
+ (
+ .clk(clk),
+ .clz(clz_right),
+ .value(value[WIDTH / 2 - 1:0])
+ );
+
+ always_ff @(posedge clk)
+ if (clz_left[$clog2(WIDTH) - 1]) // Top bit set means the upper half is entirely zero
+ clz <= {msb_right, ~msb_right, tail_right}; // Equals WIDTH / 2 + clz_right without using an adder
+ else
+ clz <= {1'b0, clz_left};
+ end
+ endgenerate
+
+endmodule
diff --git a/rtl/gfx/gfx_ctz.sv b/rtl/gfx/gfx_ctz.sv
new file mode 100644
index 0000000..2713f8a
--- /dev/null
+++ b/rtl/gfx/gfx_ctz.sv
@@ -0,0 +1,18 @@
+// Count trailing zeros (ctz): clz applied to the bit-reversed input
+module gfx_ctz
+#(int WIDTH = 0)
+(
+ input logic clk,
+
+ input logic[WIDTH - 1:0] value,
+ output logic[$clog2(WIDTH):0] ctz
+);
+
+ gfx_clz #(WIDTH) clz
+ (
+ .clk,
+ .value({<<{value}}), // Streaming operator reverses the bit order of value
+ .clz(ctz)
+ );
+
+endmodule
diff --git a/rtl/gfx/gfx_fifo.sv b/rtl/gfx/gfx_fifo.sv
new file mode 100644
index 0000000..7174e4d
--- /dev/null
+++ b/rtl/gfx/gfx_fifo.sv
@@ -0,0 +1,102 @@
+module gfx_fifo // FIFO of DEPTH entries of WIDTH bits between two gfx_beats ports
+#(int WIDTH = 0,
+ int DEPTH = 0)
+(
+ input logic clk,
+ rst_n,
+
+ gfx_beats.rx in,
+ gfx_beats.tx out
+);
+
+ logic do_read, do_write, full_if_eq, in_stall, out_stall,
+ may_read, may_write, read, read_ok, write;
+
+ logic[WIDTH - 1:0] fifo[DEPTH], read_data, write_data;
+ logic[$clog2(DEPTH) - 1:0] read_ptr, write_ptr; // NOTE(review): wrap-around presumably relies on DEPTH being a power of two - confirm
+
+ assign do_read = read & may_read;
+ assign do_write = write & may_write;
+
+ always_comb begin
+ may_read = full_if_eq; // Pointers equal: full_if_eq tells full (1) apart from empty (0)
+ may_write = !full_if_eq;
+
+ if (read)
+ may_write = 1; // A read request also allows a write when the pointers are equal
+
+ if (read_ptr != write_ptr) begin // Pointers differ: neither empty nor full
+ may_read = 1;
+ may_write = 1;
+ end
+ end
+
+ gfx_skid_flow in_flow // Handshake logic on the write side; gfx_skid_flow is defined elsewhere
+ (
+ .clk,
+ .rst_n,
+ .stall(in_stall),
+ .in_ready(in.ready),
+ .in_valid(in.valid),
+ .out_ready(may_write),
+ .out_valid(write)
+ );
+
+ gfx_skid_flow out_flow // Handshake logic on the read side
+ (
+ .clk,
+ .rst_n,
+ .stall(out_stall),
+ .in_ready(read),
+ .in_valid(read_ok),
+ .out_ready(out.ready),
+ .out_valid(out.valid)
+ );
+
+ gfx_skid_buf #(WIDTH) in_skid // Data path matching in_flow; gfx_skid_buf is defined elsewhere
+ (
+ .clk,
+ .in(in.data),
+ .out(write_data),
+ .stall(in_stall)
+ );
+
+ gfx_skid_buf #(WIDTH) out_skid // Data path matching out_flow
+ (
+ .clk,
+ .in(read_data),
+ .out(out.data),
+ .stall(out_stall)
+ );
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ read_ok <= 0;
+ read_ptr <= 0;
+ write_ptr <= 0;
+ full_if_eq <= 0; // Equal pointers after reset mean empty
+ end else begin
+ if (~out_stall)
+ read_ok <= read && may_read; // Registered alongside read_data below
+
+ if (do_read)
+ read_ptr <= read_ptr + 1;
+
+ if (do_write)
+ write_ptr <= write_ptr + 1;
+
+ if (do_read & ~do_write)
+ full_if_eq <= 0; // Last operation was a lone read: equal pointers would mean empty
+ else if (~do_read & do_write)
+ full_if_eq <= 1; // Last operation was a lone write: equal pointers would mean full
+ end
+
+ always_ff @(posedge clk) begin // Storage and read register; no reset
+ if (~out_stall)
+ read_data <= fifo[read_ptr];
+
+ if (may_write)
+ fifo[write_ptr] <= write_data; // Gated by may_write, not do_write: the slot only counts once write_ptr advances
+ end
+
+endmodule
diff --git a/rtl/gfx/gfx_fixed_dotadd.sv b/rtl/gfx/gfx_fixed_dotadd.sv
new file mode 100644
index 0000000..fdd5ffd
--- /dev/null
+++ b/rtl/gfx/gfx_fixed_dotadd.sv
@@ -0,0 +1,55 @@
+module gfx_fixed_dotadd // Two chained gfx_fixed_muladd stages: (a0, b0, c) first, then (a1, b1) plus the first result
+(
+ input logic clk,
+
+ input gfx::fixed a0,
+ b0,
+ a1,
+ b1,
+ c,
+ input logic stall,
+
+ output gfx::fixed q
+);
+
+ import gfx::*;
+
+ fixed q0, a1_hold, b1_hold;
+
+ gfx_fixed_muladd muladd_0 // First stage: operands a0, b0 and c, result q0
+ (
+ .clk,
+ .a(a0),
+ .b(b0),
+ .c,
+ .q(q0),
+ .stall
+ );
+
+ gfx_pipes #(.WIDTH($bits(fixed)), .DEPTH(FIXED_MULADD_DEPTH)) a_pipes // Delays a1 by FIXED_MULADD_DEPTH stages before the second muladd
+ (
+ .clk,
+ .in(a1),
+ .out(a1_hold),
+ .stall
+ );
+
+ gfx_pipes #(.WIDTH($bits(fixed)), .DEPTH(FIXED_MULADD_DEPTH)) b_pipes // Same delay for b1
+ (
+ .clk,
+ .in(b1),
+ .out(b1_hold),
+ .stall
+ );
+
+ gfx_fixed_muladd muladd_1 // Second stage: delayed a1, b1 with q0 as the addend, result q
+ (
+ .clk,
+ .a(a1_hold),
+ .b(b1_hold),
+ .c(q0),
+ .q,
+ .stall
+ );
+
+endmodule
diff --git a/rtl/gfx/gfx_fixed_muladd.sv b/rtl/gfx/gfx_fixed_muladd.sv
new file mode 100644
index 0000000..22b7247
--- /dev/null
+++ b/rtl/gfx/gfx_fixed_muladd.sv
@@ -0,0 +1,77 @@
+module gfx_fixed_muladd // Pipelined fixed-point multiply-add on a, b and c; the pipeline only advances while stall is low
+(
+ input logic clk,
+
+ input gfx::fixed a,
+ b,
+ c,
+ input logic stall,
+
+ output gfx::fixed q
+);
+
+ import gfx::*;
+
+`ifndef VERILATOR
+ logic[2 * $bits(fixed) - $bits(fixed_frac) - 1:0] q_ext; // Outside Verilator: lpm_mult primitive (presumably the vendor LPM library - confirm)
+
+ assign q = q_ext[$bits(fixed) - 1:0];
+
+ lpm_mult mult
+ (
+ .aclr(0),
+ .clock(clk),
+ .clken(!stall),
+
+ .sum({c, {`FIXED_FRAC{1'b0}}}), // c with FIXED_FRAC zero bits appended below it; the macro is defined elsewhere
+ .dataa(a),
+ .datab(b),
+ .result(q_ext)
+ );
+
+ defparam
+ mult.lpm_widtha = $bits(fixed),
+ mult.lpm_widthb = $bits(fixed),
+ mult.lpm_widths = $bits(fixed) + $bits(fixed_frac),
+ /* This is crucial. It is not documented anywhere (apart from a
+ * comment on r/fpga). If lpm_widthp < lpm_widtha + lpm_widthb,
+ * then result holds the lpm_widthp most significant bits of the
+ * product, not the least significant ones as would make sense.
+ */
+ mult.lpm_widthp = 2 * $bits(fixed) - $bits(fixed_frac),
+ mult.lpm_representation = "SIGNED",
+ mult.lpm_pipeline = FIXED_MULADD_DEPTH;
+`else
+ logic[$bits(fixed) + $bits(fixed_frac) - 1:0] q_ext; // Under Verilator: behavioral model, inputs delayed then combined combinationally
+
+ fixed a_hold, b_hold, c_hold;
+
+ assign q = q_ext[$bits(fixed) + $bits(fixed_frac) - 1:$bits(fixed_frac)] + c_hold; // Drop the low fixed_frac bits of the product, then add c
+ assign q_ext = a_hold * b_hold;
+
+ gfx_pipes #(.WIDTH($bits(a)), .DEPTH(FIXED_MULADD_DEPTH)) a_pipes // Same depth as lpm_pipeline in the other branch
+ (
+ .clk,
+ .in(a),
+ .out(a_hold),
+ .stall
+ );
+
+ gfx_pipes #(.WIDTH($bits(b)), .DEPTH(FIXED_MULADD_DEPTH)) b_pipes
+ (
+ .clk,
+ .in(b),
+ .out(b_hold),
+ .stall
+ );
+
+ gfx_pipes #(.WIDTH($bits(c)), .DEPTH(FIXED_MULADD_DEPTH)) c_pipes
+ (
+ .clk,
+ .in(c),
+ .out(c_hold),
+ .stall
+ );
+`endif
+
+endmodule
diff --git a/rtl/gfx/gfx_front_back.sv b/rtl/gfx/gfx_front_back.sv
new file mode 100644
index 0000000..b768532
--- /dev/null
+++ b/rtl/gfx/gfx_front_back.sv
@@ -0,0 +1,37 @@
+interface gfx_front_back // Signals exchanged between the modport 'front' side and the modport 'back' side
+import gfx::*;; // Not a typo: the first ';' ends the import, the second ends the interface header
+
+ struct
+ {
+ wave_exec wave;
+ fpint_op p0; // One operation record per p0..p3 slot
+ mem_op p1;
+ sfu_op p2;
+ group_op p3;
+ } execute;
+
+ struct
+ {
+ logic valid;
+ group_id group;
+ } loop;
+
+ shader_dispatch dispatch;
+
+ modport front // Drives execute and dispatch, receives loop
+ (
+ input loop,
+
+ output execute,
+ dispatch
+ );
+
+ modport back // Receives execute and dispatch, drives loop
+ (
+ input execute,
+ dispatch,
+
+ output loop
+ );
+
+endinterface
diff --git a/rtl/gfx/gfx_isa.sv b/rtl/gfx/gfx_isa.sv
new file mode 100644
index 0000000..7239478
--- /dev/null
+++ b/rtl/gfx/gfx_isa.sv
@@ -0,0 +1,84 @@
+package gfx_isa; // Instruction word layout and register-number types
+
+ typedef logic[3:0] sgpr_num; // 4-bit register number
+ typedef logic[2:0] vgpr_num; // 3-bit register number
+
+ typedef logic signed[7:0] pc_offset;
+
+ typedef union packed // The same 4 bits viewed as either kind of register number
+ {
+ sgpr_num sgpr;
+
+ struct packed
+ {
+ logic[$bits(sgpr_num) - $bits(vgpr_num) - 1:0] reserved; // Pads the vgpr view up to the sgpr width
+ vgpr_num num;
+ } vgpr;
+ } xgpr_num;
+
+ typedef struct packed // 32 bits in total: 2 + 22 + 1 + 5 + 2
+ {
+ enum logic[1:0]
+ {
+ REGS_SVS = 2'b00,
+ REGS_SSS = 2'b01,
+ REGS_VVS = 2'b10,
+ REGS_VVV = 2'b11
+ } reg_mode;
+
+ union packed
+ {
+ struct packed
+ {
+ logic b_is_imm;
+
+ union packed
+ {
+ logic[12:0] imm; // Same 13-bit width as the 'read' view below (1 + 8 + 4)
+
+ struct packed
+ {
+ logic from_consts;
+ logic[7:0] reserved;
+ xgpr_num r;
+ } read;
+ } b;
+
+ xgpr_num ra,
+ rd;
+ } rr;
+ } dst_src;
+
+ logic reg_rev;
+
+ union packed
+ {
+ struct packed
+ {
+ enum logic[4:0]
+ {
+ INSN_FPINT_MOV = 0,
+ INSN_FPINT_FMUL = 1,
+ INSN_FPINT_IMUL = 2,
+ INSN_FPINT_FADD = 3,
+ INSN_FPINT_RES4 = 4,
+ INSN_FPINT_FMAX = 5,
+ INSN_FPINT_RES6 = 6,
+ INSN_FPINT_FMIN = 7,
+ INSN_FPINT_RES8 = 8,
+ INSN_FPINT_FCVT = 9,
+ INSN_FPINT_RES[10:31] // Expands to INSN_FPINT_RES10 .. INSN_FPINT_RES31, filling the 5-bit space
+ } op;
+ } fpint;
+ } by_class;
+
+ enum logic[1:0]
+ {
+ INSN_FPINT = 0,
+ INSN_MEM = 1,
+ INSN_SFU = 2,
+ INSN_GROUP = 3
+ } insn_class;
+ } insn_word;
+
+endpackage
diff --git a/rtl/gfx/gfx_pipes.sv b/rtl/gfx/gfx_pipes.sv
new file mode 100644
index 0000000..2fa875a
--- /dev/null
+++ b/rtl/gfx/gfx_pipes.sv
@@ -0,0 +1,24 @@
+module gfx_pipes // Shift register: delays 'in' by DEPTH clock cycles, frozen while stall is high
+#(int WIDTH=0, int DEPTH=0)
+(
+ input logic clk,
+
+ input logic[WIDTH - 1:0] in,
+ input logic stall,
+
+ output logic[WIDTH - 1:0] out
+);
+
+ logic[WIDTH - 1:0] pipes[DEPTH];
+
+ assign out = pipes[DEPTH - 1]; // Last stage; requires DEPTH >= 1
+
+ always_ff @(posedge clk) // No reset: contents are undefined until DEPTH unstalled cycles have passed
+ if (~stall) begin
+ pipes[0] <= in;
+
+ for (integer i = 1; i < DEPTH; ++i)
+ pipes[i] <= pipes[i - 1];
+ end
+
+endmodule
diff --git a/rtl/gfx/gfx_pkg.sv b/rtl/gfx/gfx_pkg.sv
new file mode 100644
index 0000000..7072967
--- /dev/null
+++ b/rtl/gfx/gfx_pkg.sv
@@ -0,0 +1,271 @@
+package gfx; // Shared types, constants and helper functions for the gfx RTL
+
+ typedef logic[31:0] word;
+
+ typedef word uword;
+ typedef logic signed[$bits(word) - 1:0] sword;
+ typedef logic[$bits(word) / 2 - 1:0] uhword;
+ typedef logic signed[$bits(word) / 2 - 1:0] shword;
+ typedef logic[2 * $bits(word) - 1:0] udword;
+ typedef logic signed[2 * $bits(word) - 1:0] sdword;
+ typedef logic signed[4 * $bits(word) - 1:0] qword;
+ typedef logic signed[8 * $bits(word) - 1:0] oword;
+
+ localparam int SUBWORD_BITS = $clog2($bits(word)) - $clog2($bits(byte)); // Byte-offset bits within a word: 5 - 3 = 2
+ localparam int BYTES_PER_WORD = 1 << SUBWORD_BITS;
+
+ typedef logic[$bits(word) - SUBWORD_BITS - 1:0] word_ptr; // A word with the byte-offset bits removed
+ typedef logic[$bits(word_ptr) - 1 - 1:0] dword_ptr;
+ typedef logic[$bits(word_ptr) - 2 - 1:0] qword_ptr;
+ typedef logic[$bits(word_ptr) - 3 - 1:0] oword_ptr;
+
+ typedef logic[7:0] float_exp;
+ typedef logic[$bits(word) - $bits(float_exp) - 2:0] float_mant;
+ typedef logic[$bits(float_mant):0] float_mant_full; // Includes the explicit leading '1.'
+ typedef logic[$bits(float_mant_full) + 1:0] float_mant_ext; // Accounts for overflow
+
+ localparam float_exp FLOAT_EXP_BIAS = (1 << ($bits(float_exp) - 1)) - 1; // (1 << 7) - 1 = 127
+ localparam float_exp FLOAT_EXP_MAX = {($bits(float_exp)){1'b1}};
+
+ function float_mant_full full_mant(float_mant in); // Prepends the leading 1 bit
+ full_mant = {1'b1, in};
+ endfunction
+
+ function float_mant implicit_mant(float_mant_full in); // Drops the leading bit, asserting that it was 1
+ assert (in[$bits(in) - 1]);
+ implicit_mant = in[$bits(in) - 2:0];
+ endfunction
+
+ typedef struct packed
+ {
+ logic sign;
+ float_exp exp;
+ float_mant mant;
+ } float;
+
+ /* Explanation of guard, round, sticky:
+ * https://drilian.com/2023/01/10/floating-point-numbers-and-rounding/
+ */
+ typedef struct packed
+ {
+ float normal;
+ logic slow,
+ zero,
+ guard,
+ round,
+ sticky;
+ } float_round;
+
+ typedef struct packed
+ {
+ logic exp_max,
+ exp_min,
+ mant_zero;
+ } float_class;
+
+ function float_class classify_float(float in);
+ classify_float.exp_max = &in.exp; // Exponent is all ones
+ classify_float.exp_min = ~|in.exp; // Exponent is all zeros
+ classify_float.mant_zero = ~|in.mant; // Mantissa is all zeros
+ endfunction
+
+ function logic is_float_special(float_class in); // All-ones exponent, or zero exponent with a non-zero mantissa
+ is_float_special = in.exp_max | (in.exp_min & ~in.mant_zero);
+ endfunction
+
+ function float_mant_ext float_prepare_round(float in, float_class in_class);
+ float_prepare_round = {~in_class.exp_min, in.mant, 2'b00}; // Leading bit (0 when exp_min), mantissa, two zero low bits
+ endfunction
+
+ typedef struct packed
+ {
+ logic setup_mul_float,
+ setup_unit_b,
+ mnorm_put_hi,
+ mnorm_put_lo,
+ mnorm_put_mul,
+ mnorm_zero_b,
+ mnorm_zero_flags,
+ minmax_abs,
+ minmax_swap,
+ minmax_zero_min,
+ minmax_copy_flags,
+ shiftr_int_signed,
+ addsub_copy_flags,
+ addsub_int_operand,
+ clz_force_nop,
+ shiftl_copy_flags,
+ round_copy_flags,
+ round_enable,
+ encode_enable,
+ writeback;
+ } fpint_op;
+
+ typedef struct packed
+ {
+ logic todo; // Placeholder field
+ } mem_op;
+
+ typedef struct packed
+ {
+ logic todo; // Placeholder field
+ } sfu_op;
+
+ typedef struct packed
+ {
+ logic todo; // Placeholder field
+ } group_op;
+
+ // Q22.10
+ typedef logic[9:0] fixed_frac;
+ typedef logic[$bits(word) - $bits(fixed_frac) - 1:0] fixed_int;
+
+ typedef struct packed signed
+ {
+ fixed_int fint; // 'int' is a keyword
+ fixed_frac frac;
+ } fixed;
+
+ typedef struct packed
+ {
+ fixed x,
+ y;
+ } fixed_xy;
+
+ typedef struct packed
+ {
+ fixed a,
+ b,
+ c;
+ } vtx_fixed;
+
+ typedef struct packed
+ {
+ fixed_xy a,
+ b,
+ c;
+ } vtx_xy;
+
+ localparam int RASTER_BITS = 2;
+ localparam int RASTER_SUB_BITS = 4;
+ localparam int RASTER_SIZE = 1 << RASTER_BITS;
+ localparam int RASTER_COARSE_FRAGS = RASTER_SIZE * RASTER_SIZE;
+
+ typedef logic[RASTER_BITS - 1:0] raster_index;
+
+ // Case RASTER_BITS = 2: -> 4,4,4,4 -> 8,8 -> 16
+ localparam int RASTER_OUT_CLZ_DEPTH = 3;
+
+ // Assumes RASTER_BITS == 2; must be adjusted if that changes
+ typedef struct packed
+ {
+ // This saves a lot of flops
+ //
+ // offsets[0] = inc * 0 = 0
+ // offsets[1] = inc * 1 = raster2_times1
+ // offsets[2] = inc * 2 = raster2_times1 << 1
+ // offsets[3] = inc * 3 = raster2_times3
+ fixed raster2_times1,
+ raster2_times3;
+ } raster_offsets;
+
+ function fixed raster_idx(raster_offsets offsets, raster_index idx); // Returns inc * idx rebuilt from the two stored multiples
+ unique case (idx)
+ RASTER_BITS'(0):
+ return '0;
+
+ RASTER_BITS'(1):
+ return offsets.raster2_times1;
+
+ RASTER_BITS'(2):
+ return offsets.raster2_times1 << 1;
+
+ RASTER_BITS'(3):
+ return offsets.raster2_times3;
+ endcase
+ endfunction
+
+ function raster_offsets make_raster_offsets(fixed inc); // Stores inc and inc * 3
+ make_raster_offsets.raster2_times1 = inc;
+ make_raster_offsets.raster2_times3 = inc + (inc << 1);
+ endfunction
+
+ typedef struct packed
+ {
+ raster_offsets x,
+ y;
+ } raster_offsets_xy;
+
+ typedef struct packed // Same width as fixed_frac: RASTER_SUB_BITS bits plus the remaining bits
+ {
+ logic[RASTER_SUB_BITS - 1:0] num;
+ logic[$bits(fixed_frac) - RASTER_SUB_BITS - 1:0] prec;
+ } raster_sub;
+
+ localparam int RASTER_COARSE_DIM_BITS = $bits(fixed) - $bits(raster_index) - $bits(raster_sub); // 32 - 2 - 10 = 20
+
+ typedef logic signed[RASTER_COARSE_DIM_BITS - 1:0] raster_coarse_dim;
+
+ typedef struct packed
+ {
+ raster_coarse_dim x,
+ y;
+ } raster_coarse_xy;
+
+ typedef struct packed signed // Same total width as fixed, split into coarse / fine / sub fields
+ {
+ raster_coarse_dim coarse;
+ raster_index fine;
+ raster_sub sub;
+ } raster_prec;
+
+ typedef struct packed
+ {
+ raster_prec x,
+ y;
+ } raster_prec_xy;
+
+ // Deriving the number of lanes from the rasterizer dimensions is a
+ // crucial decision; the entire design depends on it
+
+ localparam int SHADER_LANES = RASTER_COARSE_FRAGS;
+
+ typedef logic[RASTER_SIZE - 1:0] lane_no; // NOTE(review): 4 bits; equals $clog2(SHADER_LANES) only while RASTER_BITS == 2 - confirm intent
+ typedef logic[SHADER_LANES - 1:0] lane_mask;
+
+ typedef logic[5:0] group_id;
+
+ localparam int REGFILE_STAGES = 3;
+ localparam int REG_READ_STAGES = 2 + REGFILE_STAGES + 1;
+
+ typedef gfx_isa::sgpr_num sgpr_num;
+ typedef gfx_isa::vgpr_num vgpr_num;
+ typedef gfx_isa::xgpr_num xgpr_num;
+ typedef gfx_isa::pc_offset pc_offset;
+
+ typedef struct packed
+ {
+ // Does not include p0 because p0 has no ready signal
+ logic p1,
+ p2,
+ p3,
+ valid;
+ } shader_dispatch;
+
+ typedef struct
+ {
+ group_id group;
+ xgpr_num dest;
+ logic dest_scalar;
+ } wave_exec;
+
+ localparam int FIXED_MULADD_DEPTH = 5;
+ localparam int FIXED_DOTADD_DEPTH = 2 * FIXED_MULADD_DEPTH; // Two muladd stages back to back
+
+ localparam word BOOTROM_BASE = 32'h0010_0000;
+
+ localparam int SCHED_BRAM_WORDS = 2048; // 8KiB
+
+ typedef word irq_lines;
+
+endpackage
diff --git a/rtl/gfx/gfx_pkts.sv b/rtl/gfx/gfx_pkts.sv
new file mode 100644
index 0000000..41399ce
--- /dev/null
+++ b/rtl/gfx/gfx_pkts.sv
@@ -0,0 +1,29 @@
+interface gfx_pkts // Valid/ready stream of WIDTH-bit words with a tlast marker
+#(parameter int WIDTH = $bits(gfx::word));
+
+ import gfx::*;
+
+ logic tlast;
+ logic tready;
+ logic tvalid;
+ logic[WIDTH - 1:0] tdata;
+
+ modport tx // Producer view: drives tdata, tlast and tvalid, samples tready
+ (
+ input tready,
+
+ output tdata,
+ tlast,
+ tvalid
+ );
+
+ modport rx // Consumer view: samples tdata, tlast and tvalid, drives tready
+ (
+ input tdata,
+ tlast,
+ tvalid,
+
+ output tready
+ );
+
+endinterface
diff --git a/rtl/gfx/gfx_raster.sv b/rtl/gfx/gfx_raster.sv
new file mode 100644
index 0000000..a57a672
--- /dev/null
+++ b/rtl/gfx/gfx_raster.sv
@@ -0,0 +1,930 @@
+module gfx_raster // Rasterizer top level: chains the bounds, edges, coarse and fine stages between geometry and coverage
+(
+ input logic clk,
+ rst_n,
+
+ gfx_pkts.rx geometry,
+
+ gfx_pkts.tx coverage
+);
+
+ import gfx::*;
+
+ // Inter-stage signals are declared ahead of the instances that use them, so no implicit nets are created
+ word bounds_edges_geom_id; // setup_bounds -> setup_edges
+ logic bounds_edges_ready, bounds_edges_valid;
+ vtx_xy bounds_edges_vtx;
+ fixed_xy bounds_edges_ref;
+ raster_prec_xy bounds_edges_span;
+
+ word edges_coarse_geom_id; // setup_edges -> coarse
+ fixed edges_coarse_base;
+ logic edges_coarse_ready, edges_coarse_valid;
+ fixed_xy edges_coarse_ref;
+ raster_prec_xy edges_coarse_span;
+ raster_offsets_xy edges_coarse_offsets;
+
+ word coarse_fine_geom_id; // coarse -> fine
+ fixed coarse_fine_corner;
+ logic coarse_fine_ready, coarse_fine_valid;
+ fixed_xy coarse_fine_ref;
+ raster_offsets_xy coarse_fine_offsets;
+
+ gfx_raster_bounds setup_bounds // Consumes the geometry packets
+ (
+ .clk,
+ .rst_n,
+
+ .geometry,
+
+ .edges_ref(bounds_edges_ref),
+ .edges_vtx(bounds_edges_vtx),
+ .edges_span(bounds_edges_span),
+ .edges_ready(bounds_edges_ready),
+ .edges_valid(bounds_edges_valid),
+ .edges_geom_id(bounds_edges_geom_id)
+ );
+
+ gfx_raster_edges setup_edges
+ (
+ .clk,
+ .rst_n,
+
+ .bounds_ref(bounds_edges_ref),
+ .bounds_vtx(bounds_edges_vtx),
+ .bounds_span(bounds_edges_span),
+ .bounds_ready(bounds_edges_ready),
+ .bounds_valid(bounds_edges_valid),
+ .bounds_geom_id(bounds_edges_geom_id),
+
+ .coarse_ref(edges_coarse_ref),
+ .coarse_base(edges_coarse_base),
+ .coarse_span(edges_coarse_span),
+ .coarse_ready(edges_coarse_ready),
+ .coarse_valid(edges_coarse_valid),
+ .coarse_geom_id(edges_coarse_geom_id),
+ .coarse_offsets(edges_coarse_offsets)
+ );
+
+ gfx_raster_coarse coarse
+ (
+ .clk,
+ .rst_n,
+
+ .edges_ref(edges_coarse_ref),
+ .edges_base(edges_coarse_base),
+ .edges_span(edges_coarse_span),
+ .edges_ready(edges_coarse_ready),
+ .edges_valid(edges_coarse_valid),
+ .edges_geom_id(edges_coarse_geom_id),
+ .edges_offsets(edges_coarse_offsets),
+
+ .fine_ref(coarse_fine_ref),
+ .fine_ready(coarse_fine_ready),
+ .fine_valid(coarse_fine_valid),
+ .fine_corner(coarse_fine_corner),
+ .fine_geom_id(coarse_fine_geom_id),
+ .fine_offsets(coarse_fine_offsets)
+ );
+
+ gfx_raster_fine fine // Produces the coverage packets
+ (
+ .clk,
+ .rst_n,
+
+ .coarse_ref(coarse_fine_ref),
+ .coarse_ready(coarse_fine_ready),
+ .coarse_valid(coarse_fine_valid),
+ .coarse_corner(coarse_fine_corner),
+ .coarse_geom_id(coarse_fine_geom_id),
+ .coarse_offsets(coarse_fine_offsets),
+
+ .coverage
+ );
+
+endmodule
+
+module gfx_raster_bounds // Receives one triangle per packet and outputs its vertices plus a bounding reference point and span
+(
+ input logic clk,
+ rst_n,
+
+ gfx_pkts.rx geometry,
+
+ input logic edges_ready,
+ output logic edges_valid,
+ output gfx::word edges_geom_id,
+ output gfx::fixed_xy edges_ref,
+ output gfx::raster_prec_xy edges_span,
+ output gfx::vtx_xy edges_vtx
+);
+
+ import gfx::*;
+
+ enum int unsigned // Packet layout: one geometry id word, then three words per dimension
+ {
+ IN_GEOM_ID,
+ IN_DIM_X,
+ IN_DIM_Y
+ } in_state;
+
+ enum int unsigned // Which vertex of the current dimension is expected next
+ {
+ VTX_A,
+ VTX_B,
+ VTX_C
+ } vtx_state;
+
+ logic a_lt_b, a_lt_c, b_lt_c, edges_handshake, geom_complete, geom_last,
+ geom_recv, in_vtx, next_dim, new_vtx;
+
+ logic end_new_dim, end_valid, vtx_valid, lt_new_dim, lt_valid, minmax_new_dim, minmax_valid;
+
+ fixed geom_data;
+ vtx_fixed dim_vtx, dim_vtx_x, dim_vtx_y;
+ raster_prec max, min;
+
+ assign geom_recv = geometry.tready & geometry.tvalid;
+ assign edges_handshake = edges_valid & edges_ready;
+
+ assign edges_vtx.a.x = dim_vtx_x.a;
+ assign edges_vtx.a.y = dim_vtx_y.a;
+ assign edges_vtx.b.x = dim_vtx_x.b;
+ assign edges_vtx.b.y = dim_vtx_y.b;
+ assign edges_vtx.c.x = dim_vtx_x.c;
+ assign edges_vtx.c.y = dim_vtx_y.c;
+
+ assign geometry.tready = edges_handshake | ~geom_complete; // Stop accepting input once a whole triangle is held, until it is handed over
+
+ always_comb begin
+ unique case (vtx_state)
+ VTX_C: next_dim = geom_recv; // Third vertex word of a dimension was accepted
+ default: next_dim = 0;
+ endcase
+
+ unique case (in_state)
+ IN_DIM_Y: geom_last = next_dim; // Third vertex word of the second dimension ends the triangle
+ default: geom_last = 0;
+ endcase
+ end
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ in_state <= IN_GEOM_ID;
+ vtx_state <= VTX_A;
+
+ in_vtx <= 0;
+ new_vtx <= 0;
+ geom_complete <= 0;
+
+ lt_valid <= 0;
+ end_valid <= 0;
+ vtx_valid <= 0;
+ edges_valid <= 0;
+ minmax_valid <= 0;
+
+ lt_new_dim <= 0;
+ end_new_dim <= 0;
+ minmax_new_dim <= 0;
+
+ edges_geom_id <= 'x;
+ end else begin
+ end_valid <= 0; // Valid chain: end_valid -> vtx_valid -> lt_valid -> minmax_valid -> edges_valid
+ vtx_valid <= end_valid;
+ lt_valid <= vtx_valid;
+ minmax_valid <= lt_valid;
+
+ if (~edges_valid | edges_ready) // Hold edges_valid while the downstream stage is not ready
+ edges_valid <= minmax_valid;
+
+ geom_complete <= (geom_complete | geom_last) & ~edges_handshake; // Set by the last word, cleared by the output handshake
+
+ unique case (in_state)
+ IN_GEOM_ID:
+ if (geom_recv) begin
+ in_state <= IN_DIM_X;
+
+ in_vtx <= 1;
+ edges_geom_id <= geometry.tdata; // First word of the packet is the geometry id
+ end
+
+ IN_DIM_X:
+ if (next_dim)
+ in_state <= IN_DIM_Y;
+
+ IN_DIM_Y:
+ if (next_dim) begin
+ in_state <= IN_GEOM_ID;
+
+ in_vtx <= 0;
+ end_valid <= 1; // Starts the valid chain above
+ end
+ endcase
+
+ new_vtx <= 0; // Default values; the cases below may override them for one cycle
+
+ lt_new_dim <= 0;
+ minmax_new_dim <= lt_new_dim;
+ end_new_dim <= minmax_new_dim;
+
+ unique case (vtx_state)
+ VTX_A: begin
+ if (in_vtx & geom_recv) begin
+ new_vtx <= 1;
+ vtx_state <= VTX_B;
+ end
+
+ if (new_vtx) begin
+ dim_vtx.c <= geom_data; // Word accepted in VTX_C, seen one cycle later through geom_data
+ lt_new_dim <= 1; // All three vertices of this dimension are now stored
+ end
+ end
+
+ VTX_B: begin
+ if (geom_recv) begin
+ new_vtx <= 1;
+ vtx_state <= VTX_C;
+ end
+
+ if (new_vtx)
+ dim_vtx.a <= geom_data; // Word accepted in VTX_A, seen one cycle later through geom_data
+ end
+
+ VTX_C: begin
+ if (geom_recv) begin
+ new_vtx <= 1;
+ vtx_state <= VTX_A;
+ end
+
+ if (new_vtx)
+ dim_vtx.b <= geom_data; // Word accepted in VTX_B, seen one cycle later through geom_data
+ end
+ endcase
+
+ if (in_state == IN_DIM_Y & next_dim)
+ assert (geometry.tlast); // The last vertex word is expected to carry tlast
+ end
+
+ always_ff @(posedge clk) begin // Data path, no reset
+ geom_data <= geometry.tdata;
+
+ a_lt_b <= $signed(dim_vtx.a) < $signed(dim_vtx.b);
+ a_lt_c <= $signed(dim_vtx.a) < $signed(dim_vtx.c);
+ b_lt_c <= $signed(dim_vtx.b) < $signed(dim_vtx.c);
+
+ // These are not really 'x' or 'y' until edges_valid = 1
+ if (lt_new_dim) begin
+ dim_vtx_y <= dim_vtx; // Two-entry shift: the newest dimension enters 'y', the previous one moves to 'x'
+ dim_vtx_x <= dim_vtx_y;
+ end
+
+ if (a_lt_b) begin // Select minimum and maximum of a, b, c from the three comparison flags
+ min <= a_lt_c ? dim_vtx_y.a : dim_vtx_y.c;
+ max <= b_lt_c ? dim_vtx_y.c : dim_vtx_y.b;
+ end else begin
+ min <= b_lt_c ? dim_vtx_y.b : dim_vtx_y.c;
+ max <= a_lt_c ? dim_vtx_y.c : dim_vtx_y.a;
+ end
+
+ {min.fine, min.sub} <= '0; // Later assignment wins: only the coarse field of min and max is kept
+ {max.fine, max.sub} <= '0;
+
+ if (end_new_dim) begin
+ edges_ref.y <= min; // Same two-entry shift as dim_vtx_x / dim_vtx_y
+ edges_ref.x <= edges_ref.y;
+
+ edges_span.y <= max - min;
+ edges_span.x <= edges_span.y;
+ end
+ end
+
+endmodule
+
+module gfx_raster_edges // Walks the three triangle edges through a shared dot-product pipeline and hands the results to the coarse stage
+(
+ input logic clk,
+ rst_n,
+
+ input logic bounds_valid,
+ input gfx::word bounds_geom_id,
+ input gfx::fixed_xy bounds_ref,
+ input gfx::raster_prec_xy bounds_span,
+ input gfx::vtx_xy bounds_vtx,
+ output logic bounds_ready,
+
+ input logic coarse_ready,
+ output logic coarse_valid,
+ output gfx::word coarse_geom_id,
+ output gfx::fixed_xy coarse_ref,
+ output gfx::raster_prec_xy coarse_span,
+ output gfx::fixed coarse_base,
+ output gfx::raster_offsets_xy coarse_offsets
+);
+
+ import gfx::*;
+
+ enum int unsigned
+ {
+ EDGE_AB,
+ EDGE_BC,
+ EDGE_CA,
+ // EDGE_CA doubles as OFFSETS_AB
+ OFFSETS_BC,
+ OFFSETS_CA,
+ OUT
+ } state;
+
+ struct
+ {
+ fixed_xy cur,
+ delay1,
+ delay2;
+ } inc;
+
+ logic coarse_handshake, coarse_stall, offsets_flow;
+ fixed_xy delta, p, q;
+
+ // - 2 because coarse_valid goes at the end
+ logic[FIXED_DOTADD_DEPTH - 2:0] dotadd_valid;
+
+ assign coarse_stall = coarse_valid & ~coarse_ready;
+ assign coarse_handshake = coarse_valid & coarse_ready;
+
+ gfx_fixed_dotadd edge_base // Combines delta and inc.cur into coarse_base; frozen while the output is stalled
+ (
+ .clk,
+ .c(0),
+ .q(coarse_base),
+ .a0(delta.x),
+ .b0(inc.cur.x),
+ .a1(delta.y),
+ .b1(inc.cur.y),
+ .stall(coarse_stall)
+ );
+
+ always_comb
+ unique case (state)
+ OUT: offsets_flow = coarse_handshake; // In OUT the offsets only advance when an output is taken
+ default: offsets_flow = 1;
+ endcase
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ state <= EDGE_AB;
+
+ p <= 'x;
+ q <= 'x;
+ coarse_ref <= 'x;
+ coarse_geom_id <= 'x;
+
+ bounds_ready <= 0;
+ coarse_valid <= 0;
+
+ for (int i = 0; i < $bits(dotadd_valid); ++i) // Every stage is cleared, including the last one that feeds coarse_valid
+ dotadd_valid[i] <= 0;
+ end else begin
+ for (int i = 1; i < $bits(dotadd_valid); ++i) // Valid bit travels alongside the dot-product pipeline
+ dotadd_valid[i] <= dotadd_valid[i - 1];
+
+ if (~coarse_stall)
+ coarse_valid <= dotadd_valid[$bits(dotadd_valid) - 1];
+
+ bounds_ready <= 0; // Default values; the cases below may override them for one cycle
+ dotadd_valid[0] <= 0;
+
+ unique case (state)
+ EDGE_AB: begin
+ if (bounds_valid)
+ state <= EDGE_BC;
+
+ coarse_ref <= bounds_ref; // Inputs are captured every cycle spent in this state
+ coarse_span <= bounds_span;
+ coarse_geom_id <= bounds_geom_id;
+
+ p <= bounds_vtx.a;
+ q <= bounds_vtx.b;
+ end
+
+ EDGE_BC: begin
+ state <= EDGE_CA;
+ bounds_ready <= 1; // One-cycle acknowledge towards the bounds stage
+
+ p <= bounds_vtx.b;
+ q <= bounds_vtx.c;
+ end
+
+ EDGE_CA: begin
+ state <= OFFSETS_BC;
+
+ p <= bounds_vtx.c;
+ q <= bounds_vtx.a;
+
+ // This happens exactly at the moment when ab, bc, ca
+ // all land in their correct places in the pipeline
+ dotadd_valid[0] <= 1;
+ end
+
+ OFFSETS_BC:
+ state <= OFFSETS_CA;
+
+ OFFSETS_CA:
+ state <= OUT;
+
+ OUT:
+ if (coarse_handshake)
+ state <= EDGE_AB;
+ endcase
+ end
+
+ always_ff @(posedge clk) begin // Data path, no reset
+ delta.x <= coarse_ref.x - q.x;
+ delta.y <= coarse_ref.y - q.y;
+
+ inc.cur.x <= p.y - q.y;
+ inc.cur.y <= q.x - p.x;
+
+ //TODO: top-left rule
+ if (offsets_flow) begin
+ inc.delay1 <= inc.cur;
+ inc.delay2 <= inc.delay1;
+
+ coarse_offsets.x <= make_raster_offsets(inc.delay2.x);
+ coarse_offsets.y <= make_raster_offsets(inc.delay2.y);
+ end
+ end
+
+endmodule
+
+// Coarse raster stage: walks a geometry's span one block position at a time.
+//
+// In SETUP it latches the span, reference position and geometry id from the
+// edges_* inputs. It then loops TEST_AB -> TEST_BC -> TEST_CA -> OUT once per
+// position; in OUT the position is offered on fine_* when `mask` is set and is
+// skipped otherwise. stride.x / stride.y count the positions left in the
+// current row / column, and the FSM returns to SETUP once both are zero.
+module gfx_raster_coarse
+(
+ input logic clk,
+ rst_n,
+
+ // Input side: edge data for one geometry (edges_ready is high only in SETUP)
+ input logic edges_valid,
+ input gfx::word edges_geom_id,
+ input gfx::fixed_xy edges_ref,
+ input gfx::raster_prec_xy edges_span,
+ input gfx::fixed edges_base,
+ input gfx::raster_offsets_xy edges_offsets,
+ output logic edges_ready,
+
+ // Output side: fine_valid can only be raised in the OUT state
+ input logic fine_ready,
+ output logic fine_valid,
+ output gfx::word fine_geom_id,
+ output gfx::fixed_xy fine_ref,
+ output gfx::fixed fine_corner,
+ output gfx::raster_offsets_xy fine_offsets
+);
+
+ import gfx::*;
+
+ // Control FSM; one SETUP per geometry, then TEST_* / OUT once per position
+ enum int unsigned
+ {
+ SETUP,
+ TEST_AB,
+ TEST_BC,
+ TEST_CA,
+ OUT
+ } state;
+
+ // Three-entry rotating register sets (cur/next/prev). They all advance
+ // together whenever test_flow is high.
+ struct
+ {
+ fixed cur,
+ next,
+ prev;
+ } corner, edge_fn, vertical;
+
+ struct
+ {
+ raster_offsets_xy cur,
+ next,
+ prev;
+ } offsets;
+
+ logic edges_recv, end_block, end_x, end_y, first_run,
+ mask, mask_reset, new_geom, test_flow, out_flow;
+
+ fixed edge_test, reference_x, vertical_inc;
+ fixed_xy max_offset, min_offset, test_offset;
+ raster_coarse_xy stride;
+ raster_coarse_dim width;
+ raster_offsets_xy next_offsets;
+
+ // raster_idx() entry 1 of `offsets`, shifted left by RASTER_BITS.
+ // NOTE(review): presumably the edge-function step between adjacent blocks;
+ // raster_idx is declared outside this file - confirm.
+ function fixed coarse_offset(raster_offsets offsets);
+ return raster_idx(offsets, RASTER_BITS'(1)) << RASTER_BITS;
+ endfunction
+
+ // End-of-row / end-of-column / last-position flags from the stride counters
+ assign end_x = stride.x == '0;
+ assign end_y = stride.y == '0;
+ assign end_block = end_x & end_y;
+
+ assign edge_test = edge_fn.cur + test_offset.x + test_offset.y;
+ assign vertical_inc = vertical.cur + coarse_offset(offsets.cur.y);
+
+ assign fine_corner = corner.cur;
+ assign fine_offsets = offsets.cur; // Returns to cur after 3 cycles
+
+ // Candidate test offsets: raster_idx() at the first and last index of the
+ // offsets that will be loaded into offsets.cur next
+ assign min_offset.x = raster_idx(next_offsets.x, RASTER_BITS'(0));
+ assign min_offset.y = raster_idx(next_offsets.y, RASTER_BITS'(0));
+ assign max_offset.x = raster_idx(next_offsets.x, RASTER_BITS'(RASTER_SIZE - 1));
+ assign max_offset.y = raster_idx(next_offsets.y, RASTER_BITS'(RASTER_SIZE - 1));
+ assign next_offsets = edges_recv ? edges_offsets : offsets.next;
+
+ // Per-state control decode
+ always_comb begin
+ // Geometry-wide registers are (re)loaded while waiting in SETUP
+ unique case (state)
+ SETUP: new_geom = 1;
+ default: new_geom = 0;
+ endcase
+
+ // mask restarts on the first test of every position
+ unique case (state)
+ TEST_AB: mask_reset = 1;
+ default: mask_reset = 0;
+ endcase
+
+ unique case (state)
+ SETUP: edges_ready = 1;
+ default: edges_ready = 0;
+ endcase
+
+ // Edge data is taken from the edges_* inputs in SETUP, and also in
+ // TEST_AB / TEST_BC during the first pass of a geometry (first_run)
+ unique case (state)
+ SETUP:
+ edges_recv = 1;
+
+ TEST_AB, TEST_BC:
+ edges_recv = first_run;
+
+ default:
+ edges_recv = 0;
+ endcase
+
+ unique case (state)
+ OUT: fine_valid = mask;
+ default: fine_valid = 0;
+ endcase
+
+ // In OUT the rotating registers are frozen (test_flow = 0) and the
+ // position advances once it is either masked out or accepted downstream
+ unique case (state)
+ OUT: begin
+ out_flow = ~mask | fine_ready;
+ test_flow = 0;
+ end
+
+ default: begin
+ out_flow = 0;
+ test_flow = 1;
+ end
+ endcase
+ end
+
+ // State register. first_run is high after reset and is rewritten with
+ // end_block on every cycle spent in OUT.
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ state <= SETUP;
+ first_run <= 1;
+ end else
+ unique case (state)
+ SETUP:
+ if (edges_valid)
+ state <= TEST_AB;
+
+ TEST_AB:
+ state <= TEST_BC;
+
+ TEST_BC:
+ state <= TEST_CA;
+
+ TEST_CA:
+ state <= OUT;
+
+ OUT: begin
+ first_run <= end_block;
+ if (out_flow)
+ state <= end_block ? SETUP : TEST_AB;
+ end
+ endcase
+
+ // Datapath registers (no reset)
+ always_ff @(posedge clk) begin
+ if (new_geom) begin
+ width <= edges_span.x.coarse;
+ stride.x <= edges_span.x.coarse;
+ stride.y <= edges_span.y.coarse;
+ reference_x <= edges_ref.x;
+
+ fine_ref <= edges_ref;
+ fine_geom_id <= edges_geom_id;
+ end
+
+ // Advance to the next position: step right by RASTER_SIZE, or at the end
+ // of a row return to reference_x and step down by RASTER_SIZE
+ if (out_flow) begin
+ stride.x <= stride.x - 1;
+ fine_ref.x.fint <= fine_ref.x.fint + ($bits(fixed_int))'(RASTER_SIZE);
+
+ if (end_x) begin
+ fine_ref.x <= reference_x;
+ fine_ref.y.fint <= fine_ref.y.fint + ($bits(fixed_int))'(RASTER_SIZE);
+
+ stride.x <= width;
+ stride.y <= stride.y - 1;
+ end
+ end
+
+ if (test_flow) begin
+ offsets.cur <= next_offsets;
+ offsets.next <= offsets.prev;
+ offsets.prev <= offsets.cur;
+
+ vertical.cur <= vertical.next;
+ vertical.next <= vertical.prev;
+ vertical.prev <= vertical.cur;
+
+ // The entry leaving cur is stepped horizontally before re-entering
+ // the rotation
+ edge_fn.cur <= edge_fn.next;
+ edge_fn.next <= edge_fn.prev;
+ edge_fn.prev <= edge_fn.cur + coarse_offset(offsets.cur.x);
+
+ // At the end of a row, restart from the row-start value stepped
+ // vertically instead (overrides the assignments above)
+ if (end_x) begin
+ edge_fn.prev <= vertical_inc;
+ vertical.prev <= vertical_inc;
+ end
+
+ corner.cur <= corner.next;
+ corner.next <= corner.prev;
+ corner.prev <= edge_fn.cur;
+
+ // Pick the max or min offset per axis depending on the sign of the
+ // coarse step of the incoming offsets
+ if (coarse_offset(next_offsets.x) >= 'sd0)
+ test_offset.x <= max_offset.x;
+ else
+ test_offset.x <= min_offset.x;
+
+ if (coarse_offset(next_offsets.y) >= 'sd0)
+ test_offset.y <= max_offset.y;
+ else
+ test_offset.y <= min_offset.y;
+
+ // NOTE(review): the edge_test comparison is commented out, so mask
+ // becomes 1 at every TEST_AB and no position is ever rejected here.
+ // Looks like a debugging bypass - confirm before relying on it.
+ mask <= (mask | mask_reset) & 1/*(edge_test >= 'sd0)*/;
+ end
+
+ // Incoming edge data overrides the rotation for the cur entries
+ if (edges_recv) begin
+ edge_fn.cur <= edges_base;
+ vertical.cur <= edges_base;
+ end
+ end
+
+endmodule
+
+// Fine raster stage: turns one block from the coarse stage into a coverage
+// packet on the `coverage` stream.
+//
+// Input side (in_state): a block is accepted in IN_C while the output side is
+// idle, and its per-edge corner/offsets are captured over IN_C, IN_A and IN_B.
+// A per-lane mask (mask_in) accumulates the sign test of every edge and is
+// complete in IN_MASK.
+//
+// Output side (out_state): if the mask is non-empty the packet is sent as
+// geometry id, position, lane mask, and then three words (C, A, B) for every
+// lane set in the mask. tlast is raised on the final OUT_BARY_B word.
+module gfx_raster_fine
+(
+ input logic clk,
+ rst_n,
+
+ input logic coarse_valid,
+ input gfx::word coarse_geom_id,
+ input gfx::fixed_xy coarse_ref,
+ input gfx::fixed coarse_corner,
+ input gfx::raster_offsets_xy coarse_offsets,
+ output logic coarse_ready,
+
+ gfx_pkts.tx coverage
+);
+
+ import gfx::*;
+
+ // Input FSM: which edge's data is currently on coarse_corner/coarse_offsets
+ enum int unsigned
+ {
+ IN_C,
+ IN_A,
+ IN_B,
+ IN_MASK
+ } in_state;
+
+ // Output FSM: which packet word is currently on coverage.tdata
+ enum int unsigned
+ {
+ OUT_ACCEPT,
+ OUT_GEOM_ID,
+ OUT_POS,
+ OUT_MASK,
+ OUT_BARY_C,
+ OUT_BARY_A,
+ OUT_BARY_B
+ } out_state;
+
+ // Three-entry rotating copies of the per-edge corner values and offsets
+ struct
+ {
+ fixed cur,
+ next,
+ prev;
+ } corner;
+
+ struct
+ {
+ raster_offsets_xy cur,
+ next,
+ prev;
+ } offsets;
+
+ logic begin_bary, hold_block, in_valid, mask_in_clean,
+ mask_in_reset, new_block, out_last;
+
+ word geom_id;
+ fixed bary_coord;
+ lane_no lane, lane_ctz, lane_hold;
+ fixed_xy block_ref;
+ lane_mask mask_in, mask, mask_ctz;
+ raster_index lane_x, lane_y;
+ logic[$bits(lane_ctz):0] ctz_count;
+
+ // Low $bits(shword) bits of the `coarse` field of a coordinate
+ function shword ref_half(raster_prec dim);
+ return dim.coarse[$bits(shword) - 1:0];
+ endfunction
+
+ assign lane_ctz = ctz_count[$bits(lane_ctz) - 1:0];
+ assign in_valid = mask_in_clean & |mask_in;
+ assign out_last = ~|mask;
+ assign {lane_y, lane_x} = lane;
+
+ // **IMPORTANT**: this will break for RASTER_BITS >= 3, since the FSM
+ // assumes that ctz finishes in 3 cycles or fewer
+
+ gfx_ctz #(RASTER_COARSE_FRAGS) ctz
+ (
+ .clk,
+ .value(mask_ctz),
+ .ctz(ctz_count)
+ );
+
+ always_comb begin
+ // The output side is idle and may take a new block
+ unique case (out_state)
+ OUT_ACCEPT: new_block = 1;
+ default: new_block = 0;
+ endcase
+
+ // ctz runs on the incoming mask while idle and on the working mask
+ // afterwards
+ unique case (out_state)
+ OUT_ACCEPT: mask_ctz = mask_in;
+ default: mask_ctz = mask;
+ endcase
+
+ unique case (out_state)
+ OUT_ACCEPT: coverage.tvalid = 0;
+ default: coverage.tvalid = 1;
+ endcase
+
+ // A new lane is selected when the mask word or the last word of the
+ // previous lane is accepted
+ unique case (out_state)
+ OUT_MASK, OUT_BARY_B:
+ begin_bary = coverage.tready;
+
+ default:
+ begin_bary = 0;
+ endcase
+
+ unique case (out_state)
+ OUT_BARY_B: coverage.tlast = out_last;
+ default: coverage.tlast = 0;
+ endcase
+
+ unique case (out_state)
+ OUT_GEOM_ID:
+ coverage.tdata = geom_id;
+
+ // Both halves come from block_ref, the copy latched when the block was
+ // accepted. coarse_ref is only guaranteed to describe this block up to
+ // the input handshake, so it must not be sampled here.
+ OUT_POS:
+ coverage.tdata = {ref_half(block_ref.y), ref_half(block_ref.x)};
+
+ OUT_MASK:
+ coverage.tdata = {{($bits(word) - $bits(mask)){1'b0}}, mask};
+
+ OUT_BARY_C, OUT_BARY_A, OUT_BARY_B:
+ coverage.tdata = bary_coord;
+
+ default:
+ coverage.tdata = 'x;
+ endcase
+
+ // During OUT_MASK the lane comes straight from ctz; afterwards from the
+ // value latched in lane_hold
+ unique case (out_state)
+ OUT_MASK:
+ lane = lane_ctz;
+
+ default:
+ lane = lane_hold;
+ endcase
+
+ unique case (in_state)
+ IN_C: coarse_ready = new_block;
+ default: coarse_ready = 0;
+ endcase
+
+ // Cycles in which coarse_corner/coarse_offsets are captured
+ unique case (in_state)
+ IN_C: hold_block = new_block;
+ IN_A: hold_block = 1;
+ IN_B: hold_block = 1;
+ IN_MASK: hold_block = 0;
+ endcase
+
+ // mask_in restarts with the first edge of a block
+ unique case (in_state)
+ IN_C: mask_in_reset = 1;
+ default: mask_in_reset = 0;
+ endcase
+
+ // mask_in holds the combined result of all captured edges
+ unique case (in_state)
+ IN_MASK: mask_in_clean = 1;
+ default: mask_in_clean = 0;
+ endcase
+ end
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ in_state <= IN_C;
+ out_state <= OUT_ACCEPT;
+ end else begin
+ unique case (in_state)
+ IN_C:
+ if (coarse_valid & new_block)
+ in_state <= IN_A;
+
+ IN_A:
+ in_state <= IN_B;
+
+ IN_B:
+ in_state <= IN_MASK;
+
+ IN_MASK:
+ in_state <= IN_C;
+ endcase
+
+ unique case (out_state)
+ OUT_ACCEPT:
+ if (in_valid)
+ out_state <= OUT_GEOM_ID;
+
+ OUT_GEOM_ID:
+ if (coverage.tready)
+ out_state <= OUT_POS;
+
+ OUT_POS:
+ if (coverage.tready)
+ out_state <= OUT_MASK;
+
+ OUT_MASK:
+ if (coverage.tready)
+ out_state <= OUT_BARY_C;
+
+ OUT_BARY_C:
+ if (coverage.tready)
+ out_state <= OUT_BARY_A;
+
+ OUT_BARY_A:
+ if (coverage.tready)
+ out_state <= OUT_BARY_B;
+
+ // Loop over the remaining lanes until the working mask is empty
+ OUT_BARY_B:
+ if (coverage.tready)
+ out_state <= out_last ? OUT_ACCEPT : OUT_BARY_C;
+ endcase
+ end
+
+ always_ff @(posedge clk) begin
+ // Parallel sign test; this does the heavy lifting of the fine raster.
+ // Note that many of the adders will be removed during synthesis.
+ // NOTE(review): the comparison against 'sd0 is only a signed comparison
+ // if every operand is a signed type - confirm how gfx::fixed and
+ // raster_idx are declared.
+ for (int i = 0; i < RASTER_SIZE; ++i)
+ for (int j = 0; j < RASTER_SIZE; ++j)
+ mask_in[i * RASTER_SIZE + j] <=
+ (mask_in[i * RASTER_SIZE + j] | mask_in_reset)
+ & (coarse_corner
+ + raster_idx(coarse_offsets.y, RASTER_BITS'(i))
+ + raster_idx(coarse_offsets.x, RASTER_BITS'(j))
+ >= 'sd0);
+
+ // The barycentric coordinates of every fragment that was not discarded
+ // are recomputed here. The reason is to avoid storing and then
+ // multiplexing the coordinates of a whole block (48 words).
+ if (coverage.tready)
+ bary_coord <= corner.next
+ + raster_idx(offsets.next.y, RASTER_BITS'(lane_y))
+ + raster_idx(offsets.next.x, RASTER_BITS'(lane_x));
+
+ // Latch the per-block metadata while the block is being accepted
+ if (new_block & mask_in_reset) begin
+ geom_id <= coarse_geom_id;
+ block_ref <= coarse_ref;
+ end
+
+ // new_block = 0 => coverage.tvalid = 1
+ if (new_block | coverage.tready) begin
+ corner.cur <= corner.next;
+ corner.next <= corner.prev;
+ corner.prev <= corner.cur;
+
+ offsets.cur <= offsets.next;
+ offsets.next <= offsets.prev;
+ offsets.prev <= offsets.cur;
+ end
+
+ if (hold_block) begin
+ // Written to prev rather than cur so that the first values end up in
+ // cur exactly when OUT_BARY_C is reached
+ corner.prev <= coarse_corner;
+ offsets.prev <= coarse_offsets;
+ end
+
+ if (new_block)
+ mask <= mask_in;
+
+ // Clear the lowest set bit of the working mask and remember which lane
+ // it was (overrides the load above when both apply)
+ if (begin_bary) begin
+ mask <= mask & (mask - 1);
+ lane_hold <= lane_ctz;
+ end
+ end
+
+endmodule
diff --git a/rtl/gfx/gfx_regfile_io.sv b/rtl/gfx/gfx_regfile_io.sv
new file mode 100644
index 0000000..2459049
--- /dev/null
+++ b/rtl/gfx/gfx_regfile_io.sv
@@ -0,0 +1,106 @@
+// Register-file I/O bundle: operand read requests, operand data, SGPR/VGPR
+// write ports, and per-group PC / lane-mask read and write-back ports.
+// Each modport below lists the signals one user of the interface may drive
+// (output) or observe (input).
+interface gfx_regfile_io;
+
+ import gfx::*;
+
+ // Operand read request for operands a and b
+ struct
+ {
+ group_id group;
+ sgpr_num a_sgpr,
+ b_sgpr;
+ vgpr_num a_vgpr,
+ b_vgpr;
+ logic[12:0] b_imm;
+ logic a_scalar,
+ b_scalar,
+ b_is_imm,
+ b_is_const,
+ scalar_rev;
+ } op;
+
+ // Scalar register write port
+ struct
+ {
+ logic write;
+ group_id group;
+ sgpr_num sgpr;
+ word data;
+ } sgpr_write;
+
+ // Vector register write port; `mask` has one bit per lane
+ struct
+ {
+ lane_mask mask;
+ group_id group;
+ vgpr_num vgpr;
+ word data[SHADER_LANES];
+ } vgpr_write;
+
+ // a / b: operand data, one word per lane
+ word a[SHADER_LANES], b[SHADER_LANES], sgpr_write_data, vgpr_write_data[SHADER_LANES];
+ logic mask_wb_write, pc_wb_write;
+ // pc_* / mask_*: values read for a group (*_back, *_front) and values
+ // written back for a group (*_wb), each with its own group selector
+ word_ptr pc_back, pc_front, pc_wb;
+ group_id mask_back_group, mask_wb_group, pc_back_group, pc_front_group, pc_wb_group;
+ lane_mask mask_back, mask_wb;
+
+ // Consumer of the operand data
+ modport ab
+ (
+ input a,
+ b
+ );
+
+ // Issuer of operand read requests
+ modport read
+ (
+ output op
+ );
+
+ // Front-side PC lookup: drives the group selector, observes the PC
+ modport bind_
+ (
+ input pc_front,
+
+ output pc_front_group
+ );
+
+ // Write-back side: drives all write ports and the back-side selectors
+ modport wb
+ (
+ input pc_back,
+ mask_back,
+
+ output sgpr_write,
+ vgpr_write,
+
+ pc_back_group,
+ mask_back_group,
+
+ pc_wb,
+ pc_wb_group,
+ pc_wb_write,
+
+ mask_wb,
+ mask_wb_group,
+ mask_wb_write
+ );
+
+ // Register-file side: observes every request and write port, drives the
+ // read data
+ modport regs
+ (
+ input op,
+ sgpr_write,
+ vgpr_write,
+
+ pc_back_group,
+ pc_front_group,
+ mask_back_group,
+
+ pc_wb,
+ pc_wb_group,
+ pc_wb_write,
+
+ mask_wb,
+ mask_wb_group,
+ mask_wb_write,
+
+ output a,
+ b,
+
+ pc_back,
+ pc_front,
+ mask_back
+ );
+
+endinterface
diff --git a/rtl/gfx/gfx_rst_sync.sv b/rtl/gfx/gfx_rst_sync.sv
new file mode 100644
index 0000000..2a8ea3b
--- /dev/null
+++ b/rtl/gfx/gfx_rst_sync.sv
@@ -0,0 +1,13 @@
+// Reset synchronizer: srst_n is asserted (driven low) asynchronously together
+// with rst_n and is released synchronously to clk.
+//
+// Two flops are used. When rst_n is released close to a clock edge the first
+// flop may go metastable; the second flop gives it a full clock period to
+// settle before srst_n is seen by downstream logic. A single flop, as used
+// previously, exposes that metastability directly to its fan-out.
+//
+// srst_n goes high on the second rising clk edge after rst_n is released.
+module gfx_rst_sync
+(
+ input logic clk,
+ rst_n,
+
+ output logic srst_n
+);
+
+ // First synchronizer stage; only ever read by the srst_n flop
+ logic srst_meta_n;
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ srst_meta_n <= 0;
+ srst_n <= 0;
+ end else begin
+ srst_meta_n <= 1;
+ srst_n <= srst_meta_n;
+ end
+
+endmodule
diff --git a/rtl/gfx/gfx_sched.sv b/rtl/gfx/gfx_sched.sv
new file mode 100644
index 0000000..0ffaecd
--- /dev/null
+++ b/rtl/gfx/gfx_sched.sv
@@ -0,0 +1,141 @@
+// Scheduler processor: a picorv32 core with a local word memory (`bram`) and
+// an AXI-lite master port (axim) for everything else.
+//
+// An access is served by the local memory when all address bits above the
+// bram index range are zero (select_bram); otherwise it is forwarded to the
+// picorv32_axi_adapter instance.
+module gfx_sched
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+ srst_n,
+
+ gfx_axil.m axim,
+
+ input irq_lines irq
+);
+
+ // verilator tracing_off
+
+ logic axi_ready, axi_valid, bram_ready, bram_read, bram_write, bram_write_next,
+ mem_instr, mem_la_read, mem_la_write, mem_ready, mem_valid, select_bram;
+
+ word bram[SCHED_BRAM_WORDS];
+ word axi_rdata, bram_rdata, mem_addr, mem_la_addr, mem_rdata, mem_wdata;
+ logic[$bits(word) / $bits(byte) - 1:0] mem_wstrb;
+
+ logic[$clog2(SCHED_BRAM_WORDS) - 1:0] bram_addr;
+
+ // Word index into bram: the address bits just above the sub-word bits
+ assign bram_addr = mem_addr[$bits(bram_addr) + SUBWORD_BITS - 1:SUBWORD_BITS];
+ // The core sees a single ready/rdata pair, merged from both targets
+ assign mem_ready = (axi_valid & axi_ready) | bram_ready;
+ assign mem_rdata = bram_ready ? bram_rdata : axi_rdata;
+ // Decoded from the look-ahead address, i.e. before mem_addr is registered
+ assign select_bram = ~|mem_la_addr[$bits(mem_la_addr) - 1:$bits(bram_addr) + SUBWORD_BITS];
+ assign bram_write_next = mem_la_write & select_bram;
+
+ // Core configuration
+ defparam core.ENABLE_COUNTERS = 0;
+ defparam core.ENABLE_COUNTERS64 = 0;
+ defparam core.BARREL_SHIFTER = 1;
+ defparam core.COMPRESSED_ISA = 1;
+ defparam core.CATCH_MISALIGN = 0;
+ defparam core.CATCH_ILLINSN = 0;
+ defparam core.ENABLE_MUL = 1;
+ defparam core.ENABLE_DIV = 1;
+ defparam core.ENABLE_IRQ = 1;
+ defparam core.ENABLE_IRQ_QREGS = 0;
+ defparam core.ENABLE_IRQ_TIMER = 0;
+ defparam core.PROGADDR_RESET = BOOTROM_BASE;
+
+ // The core uses the synchronous reset srst_n; the handshake flops at the
+ // bottom of this module use the asynchronous rst_n
+ picorv32 core
+ (
+ .clk,
+ .resetn(srst_n),
+ .trap(),
+
+ .mem_valid,
+ .mem_instr,
+ .mem_ready,
+
+ .mem_addr,
+ .mem_wdata,
+ .mem_wstrb,
+ .mem_rdata,
+
+ .mem_la_read,
+ .mem_la_write,
+ .mem_la_addr,
+ .mem_la_wdata(),
+ .mem_la_wstrb(),
+
+ // Co-processor interface is unused
+ .pcpi_valid(),
+ .pcpi_insn(),
+ .pcpi_rs1(),
+ .pcpi_rs2(),
+ .pcpi_wr(),
+ .pcpi_rd(),
+ .pcpi_wait(0),
+ .pcpi_ready(0),
+
+ .irq,
+ .eoi(),
+
+ .trace_valid(),
+ .trace_data()
+ );
+
+ picorv32_axi_adapter axi
+ (
+ .clk,
+ .resetn(srst_n),
+
+ .mem_axi_awvalid(axim.awvalid),
+ .mem_axi_awready(axim.awready),
+ .mem_axi_awaddr(axim.awaddr),
+ .mem_axi_awprot(),
+
+ .mem_axi_wvalid(axim.wvalid),
+ .mem_axi_wready(axim.wready),
+ .mem_axi_wdata(axim.wdata),
+ // Write strobes are dropped, so sub-word stores through axim are not
+ // distinguishable from full-word stores
+ .mem_axi_wstrb(), // Potential surprises
+
+ .mem_axi_bvalid(axim.bvalid),
+ .mem_axi_bready(axim.bready),
+
+ .mem_axi_arvalid(axim.arvalid),
+ .mem_axi_arready(axim.arready),
+ .mem_axi_araddr(axim.araddr),
+ .mem_axi_arprot(),
+
+ .mem_axi_rvalid(axim.rvalid),
+ .mem_axi_rready(axim.rready),
+ .mem_axi_rdata(axim.rdata),
+
+ // The adapter only sees the request while axi_valid marks it as non-bram
+ .mem_valid(mem_valid & axi_valid),
+ .mem_instr,
+ .mem_ready(axi_ready),
+ .mem_addr,
+ .mem_wdata,
+ .mem_wstrb,
+ .mem_rdata(axi_rdata)
+ );
+
+ // Local memory port: write under strobe control, otherwise registered read
+ always_ff @(posedge clk) begin
+ if (bram_write) begin
+ // NOTE(review): the loop index i runs over the strobe bits but is also
+ // used to index bram[bram_addr] and mem_wdata directly. If gfx::word is
+ // a flat bit vector this writes single bits 0..N-1 rather than byte
+ // lanes - confirm against the declaration of gfx::word.
+ for (int i = 0; i < $bits(mem_wstrb); ++i)
+ if (mem_wstrb[i])
+ bram[bram_addr][i] <= mem_wdata[i];
+
+ bram_rdata <= 'x;
+ end else
+ bram_rdata <= bram[bram_addr];
+ end
+
+
+ // Request tracking. axi_valid is set by a non-bram look-ahead address and
+ // held until the adapter reports ready. bram_ready follows a write request
+ // by one cycle and a read request by two (through bram_read).
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ axi_valid <= 0;
+ bram_read <= 0;
+ bram_ready <= 0;
+ bram_write <= 0;
+ end else begin
+ axi_valid <= ~select_bram | (axi_valid & ~axi_ready);
+ bram_read <= mem_la_read & select_bram;
+ bram_write <= bram_write_next;
+ bram_ready <= bram_read | bram_write_next;
+ end
+
+endmodule
diff --git a/rtl/gfx/gfx_shader.sv b/rtl/gfx/gfx_shader.sv
new file mode 100644
index 0000000..322ffb5
--- /dev/null
+++ b/rtl/gfx/gfx_shader.sv
@@ -0,0 +1,77 @@
+// Shader core top level. Wires together the front end, the back end, the
+// register file and the scheduler-facing register block (gfx_shader_schedif).
+//
+// insn_mem is the burst AXI master used by the front end for fetches; sched is
+// the AXI-lite slave through which the register block is accessed.
+module gfx_shader
+import gfx::*;
+import gfx_shader_schedif_pkg::*;
+(
+ input logic clk,
+ rst_n,
+
+ gfx_axib.m insn_mem,
+
+ gfx_axil.s sched
+);
+
+ // Bridge from the sched port to the register block's AXI-lite interface
+ axi4lite_intf #(.ADDR_WIDTH(GFX_SHADER_SCHEDIF_MIN_ADDR_WIDTH)) regblock();
+
+ gfx_axil2regblock axil2regblock
+ (
+ .axis(sched),
+ .axim(regblock.master)
+ );
+
+ // Hardware-side views of the register block fields
+ gfx_shader_schedif__in_t schedif_in;
+ gfx_shader_schedif__out_t schedif_out;
+
+ gfx_front_back front_back();
+ gfx_regfile_io regfile();
+ gfx_shader_setup setup();
+
+ // Completion flags reported by the core set the *_DONE fields
+ assign schedif_in.SETUP_CTRL.GPR_DONE.hwset = setup.sched.set_done.gpr;
+ assign schedif_in.SETUP_CTRL.MASK_DONE.hwset = setup.sched.set_done.mask;
+ assign schedif_in.SETUP_CTRL.SUBMIT_DONE.hwset = setup.sched.set_done.submit;
+
+ // Setup requests: field values, plus the swmod strobe of the PC, GPR value
+ // and MASK fields as the corresponding *_set request
+ assign setup.sched.write.pc = schedif_out.SETUP_SUBMIT.PC.value;
+ assign setup.sched.write.gpr = schedif_out.SETUP_CTRL.XGPR.value;
+ assign setup.sched.write.mask = schedif_out.SETUP_MASK.MASK.value;
+ assign setup.sched.write.group = schedif_out.SETUP_CTRL.GROUP.value;
+ assign setup.sched.write.pc_set = schedif_out.SETUP_SUBMIT.PC.swmod;
+ assign setup.sched.write.gpr_set = schedif_out.SETUP_GPR.VALUE.swmod;
+ assign setup.sched.write.mask_set = schedif_out.SETUP_MASK.MASK.swmod;
+ assign setup.sched.write.gpr_value = schedif_out.SETUP_GPR.VALUE.value;
+
+ gfx_shader_front frontend
+ (
+ .clk,
+ .rst_n,
+ .front(front_back.front),
+ .reg_bind(regfile.bind_),
+ .reg_read(regfile.read),
+ .fetch_mem(insn_mem),
+ .icache_flush(schedif_out.CORE.IFLUSH.value)
+ );
+
+ gfx_shader_back backend
+ (
+ .clk,
+ .rst_n,
+ .back(front_back.back),
+ .setup(setup.core),
+ .reg_wb(regfile.wb),
+ .read_data(regfile.ab)
+ );
+
+ gfx_shader_regs regs
+ (
+ .clk,
+ .io(regfile.regs)
+ );
+
+ gfx_shader_schedif schedif
+ (
+ .clk,
+ .arst_n(rst_n),
+ .s_axil(regblock.slave),
+ .hwif_in(schedif_in),
+ .hwif_out(schedif_out)
+ );
+
+endmodule
diff --git a/rtl/gfx/gfx_shader_back.sv b/rtl/gfx/gfx_shader_back.sv
new file mode 100644
index 0000000..4929192
--- /dev/null
+++ b/rtl/gfx/gfx_shader_back.sv
@@ -0,0 +1,335 @@
+// Shader back end: four execution pipes (p0 fpint, p1 mem, p2 sfu, p3 group)
+// whose write-back requests are merged by an arbiter and committed to the
+// register file by gfx_shader_writeback.
+module gfx_shader_back
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ gfx_front_back.back back,
+
+ gfx_regfile_io.ab read_data,
+ gfx_regfile_io.wb reg_wb,
+
+ gfx_shader_setup.core setup
+);
+
+ // Tells p0 to drop the operation it has just started (see p0_abort)
+ logic abort;
+
+ // One write-back link per pipe plus the arbitrated output; one input
+ // handshake per pipe other than p0
+ gfx_wb out_wb(), p0_wb(), p1_wb(), p2_wb(), p3_wb();
+ gfx_shake p1_shake(), p2_shake(), p3_shake();
+
+ // Watches the p1..p3 input handshakes to derive abort
+ gfx_shader_abort p0_abort
+ (
+ .clk,
+ .p1(p1_shake.peek),
+ .p2(p2_shake.peek),
+ .p3(p3_shake.peek),
+ .abort
+ );
+
+ // p0 has no input handshake: it is fed by dispatch.valid and cancelled
+ // through abort
+ gfx_shader_fpint p0
+ (
+ .clk,
+ .rst_n,
+ .op(back.execute.p0),
+ .wb(p0_wb.tx),
+ .wave(back.execute.wave),
+ .abort,
+ .read_data,
+ .in_valid(back.dispatch.valid)
+ );
+
+ gfx_shader_mem p1
+ (
+ .clk,
+ .rst_n,
+ .op(back.execute.p1),
+ .wb(p1_wb.tx),
+ .wave(back.execute.wave),
+ .in_shake(p1_shake.rx),
+ .read_data
+ );
+
+ gfx_shader_sfu p2
+ (
+ .clk,
+ .rst_n,
+ .op(back.execute.p2),
+ .wb(p2_wb.tx),
+ .wave(back.execute.wave),
+ .in_shake(p2_shake.rx),
+ .read_data
+ );
+
+ gfx_shader_group p3
+ (
+ .clk,
+ .rst_n,
+ .op(back.execute.p3),
+ .wb(p3_wb.tx),
+ .wave(back.execute.wave),
+ .in_shake(p3_shake.rx),
+ .read_data
+ );
+
+ gfx_shader_writeback_arbiter4 writeback_arbiter
+ (
+ .clk,
+ .rst_n,
+ .p0(p0_wb.rx),
+ .p1(p1_wb.rx),
+ .p2(p2_wb.rx),
+ .p3(p3_wb.rx),
+ .out(out_wb.tx)
+ );
+
+ // Commits results and reports finished groups back on back.loop
+ gfx_shader_writeback writeback
+ (
+ .clk,
+ .rst_n,
+ .wb(out_wb.rx),
+ .regs(reg_wb),
+ .setup,
+ .loop_group(back.loop.group),
+ .loop_valid(back.loop.valid)
+ );
+
+endmodule
+
+// Abort generator: `abort` is registered high for the cycle following any
+// cycle in which one of the observed handshakes (p1, p2 or p3) has both valid
+// and ready set.
+module gfx_shader_abort
+(
+ input logic clk,
+
+ gfx_shake.peek p1,
+ p2,
+ p3,
+
+ output logic abort
+);
+
+ // One "transfer happening" flag per observed handshake
+ logic p1_fire, p2_fire, p3_fire;
+
+ assign p1_fire = p1.valid & p1.ready;
+ assign p2_fire = p2.valid & p2.ready;
+ assign p3_fire = p3.valid & p3.ready;
+
+ always_ff @(posedge clk)
+ abort <= p1_fire | p2_fire | p3_fire;
+
+endmodule
+
+// Four-input write-back arbiter, built as a two-level tree of
+// gfx_shader_writeback_arbiter2_prio instances: (p0, p1) and (p2, p3) are
+// arbitrated first, and the two intermediate links are then arbitrated into
+// `out`.
+module gfx_shader_writeback_arbiter4
+(
+ input logic clk,
+ rst_n,
+
+ gfx_wb.rx p0,
+ p1,
+ p2,
+ p3,
+
+ gfx_wb.tx out
+);
+
+ // Outside of reset, p0 and the output must be ready on every cycle
+ assert property (
+ @(posedge clk)
+ disable iff (~rst_n)
+
+ (p0.ready & out.ready)
+ );
+
+ // Intermediate links between the first and second tree level
+ gfx_wb p0_p1(), p2_p3();
+
+ gfx_shader_writeback_arbiter2_prio arbiter_p0_p1
+ (
+ .clk,
+ .rst_n,
+ .a(p0),
+ .b(p1),
+ .out(p0_p1.tx)
+ );
+
+ gfx_shader_writeback_arbiter2_prio arbiter_p2_p3
+ (
+ .clk,
+ .rst_n,
+ .a(p2),
+ .b(p3),
+ .out(p2_p3.tx)
+ );
+
+ // Both inputs of the final stage are receivers, so each intermediate link
+ // is consumed through its rx modport (its tx side belongs to the first-level
+ // arbiter that drives it)
+ gfx_shader_writeback_arbiter2_prio arbiter_out
+ (
+ .clk,
+ .rst_n,
+ .a(p0_p1.rx),
+ .b(p2_p3.rx),
+ .out
+ );
+
+endmodule
+
+// Two-input write-back arbiter (placeholder implementation).
+//
+// Input `a` always wins: every field of `a` is forwarded to `out` unchanged
+// and `a` sees the downstream ready. Input `b` is never accepted.
+module gfx_shader_writeback_arbiter2_prio
+(
+ input logic clk,
+ rst_n,
+
+ gfx_wb.rx a,
+ b,
+
+ gfx_wb.tx out
+);
+
+ //TODO
+
+ // Back-pressure: b is permanently stalled, a follows the output
+ assign b.ready = 0;
+ assign a.ready = out.ready;
+
+ // Straight pass-through of a's request
+ always_comb begin
+ // Handshake and destination
+ out.valid = a.valid;
+ out.group = a.group;
+ out.dest = a.dest;
+ out.scalar = a.scalar;
+ out.writeback = a.writeback;
+ out.lanes = a.lanes;
+
+ // Lane-mask update
+ out.mask_update = a.mask_update;
+ out.mask = a.mask;
+
+ // Program-counter update
+ out.pc_update = a.pc_update;
+ out.pc_inc = a.pc_inc;
+ out.pc_add = a.pc_add;
+ end
+
+endmodule
+
+// Write-back stage: commits results arriving on `wb` to the register file
+// ports in `regs`, and shares those ports with the setup requests in `setup`.
+//
+// Scalar results are written in the cycle they arrive. Everything else
+// (vector data, PC update, mask update) is delayed through a REGFILE_STAGES
+// deep shift register (loop_hold) and committed from its last entry.
+// loop_valid / loop_group report the group leaving that shift register, or a
+// group submitted through setup.
+module gfx_shader_writeback
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ gfx_wb.rx wb,
+
+ gfx_regfile_io.wb regs,
+
+ output logic loop_valid,
+ output group_id loop_group,
+
+ gfx_shader_setup.core setup
+);
+
+ // Delayed copy of a write-back request; loop_out is the oldest entry
+ struct
+ {
+ group_id group;
+ word lanes[SHADER_LANES];
+ pc_offset pc_add;
+ lane_mask mask;
+ vgpr_num vgpr;
+ logic pc_update,
+ mask_update,
+ vgpr_update;
+ } loop_hold[REGFILE_STAGES], loop_out;
+
+ // setup_*: a setup request of that kind is pending
+ logic loop_valid_hold[REGFILE_STAGES], loop_out_valid, mask_wb, scalar_wb,
+ setup_gpr, setup_mask, setup_submit;
+
+ // This stage never back-pressures
+ assign wb.ready = 1;
+
+ assign loop_out = loop_hold[REGFILE_STAGES - 1];
+ assign loop_out_valid = loop_valid_hold[REGFILE_STAGES - 1];
+
+ assign loop_valid = loop_out_valid | setup_submit;
+
+ // PC and mask of the incoming group are looked up as soon as it arrives
+ assign regs.pc_back_group = wb.group;
+ assign regs.mask_back_group = wb.group;
+
+ // Each write port fires for a pipeline write or a pending setup request
+ assign regs.pc_wb_write = (loop_out_valid & loop_out.pc_update) | setup_submit;
+ assign regs.mask_wb_write = mask_wb | setup_mask;
+ assign regs.sgpr_write.write = scalar_wb | setup_gpr;
+
+ assign regs.vgpr_write.vgpr = loop_out.vgpr;
+ assign regs.vgpr_write.group = loop_out.group;
+
+ assign mask_wb = loop_out_valid & loop_out.mask_update;
+ assign scalar_wb = wb.valid & wb.writeback & wb.scalar;
+
+ // Write-port multiplexing: setup values are the default and a pipeline
+ // write overrides them
+ always_comb begin
+ loop_group = setup.write.group;
+ regs.pc_wb = setup.write.pc;
+ regs.pc_wb_group = setup.write.group;
+
+ if (loop_out_valid) begin
+ loop_group = loop_out.group;
+ regs.pc_wb = regs.pc_back + word_ptr'(loop_out.pc_add);
+ regs.pc_wb_group = loop_out.group;
+ end
+
+ regs.mask_wb = setup.write.mask;
+ regs.mask_wb_group = setup.write.group;
+
+ if (mask_wb) begin
+ regs.mask_wb = loop_out.mask;
+ regs.mask_wb_group = loop_out.group;
+ end
+
+ regs.sgpr_write.data = setup.write.gpr_value;
+ regs.sgpr_write.sgpr = setup.write.gpr.sgpr;
+ regs.sgpr_write.group = setup.write.group;
+
+ if (scalar_wb) begin
+ regs.sgpr_write.data = wb.lanes[0];
+ regs.sgpr_write.sgpr = wb.dest.sgpr;
+ regs.sgpr_write.group = wb.group;
+ end
+
+ for (int i = 0; i < SHADER_LANES; ++i)
+ regs.vgpr_write.data[i] = loop_out.lanes[i];
+
+ // Vector writes use the group's current lane mask, or no lanes at all
+ // when there is nothing to write
+ regs.vgpr_write.mask = regs.mask_back;
+ if (~loop_out_valid | ~loop_out.vgpr_update)
+ regs.vgpr_write.mask = '0;
+ end
+
+ always_ff @(posedge clk) begin
+ // Blocking assignments because of a Verilator bug (see the lanes for-loop
+ // below). The shift must therefore run from the oldest entry down, before
+ // entry 0 is overwritten.
+
+ for (int i = REGFILE_STAGES - 1; i > 0; --i)
+ loop_hold[i] = loop_hold[i - 1];
+
+ loop_hold[0].mask = wb.mask;
+ loop_hold[0].vgpr = wb.dest.vgpr.num;
+ loop_hold[0].group = wb.group;
+ loop_hold[0].pc_add = wb.pc_add;
+ loop_hold[0].pc_update = wb.pc_update;
+ loop_hold[0].mask_update = wb.mask_update;
+ loop_hold[0].vgpr_update = wb.writeback & ~wb.scalar;
+
+ // https://github.com/verilator/verilator/issues/4804
+ for (int i = 0; i < SHADER_LANES; ++i)
+ loop_hold[0].lanes[i] = wb.lanes[i];
+
+ // pc_inc replaces the requested offset with a step of 1
+ if (wb.pc_inc)
+ loop_hold[0].pc_add = pc_offset'(1);
+ end
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ setup_gpr <= 0;
+ setup_mask <= 0;
+ setup_submit <= 0;
+
+ setup.set_done.gpr <= 0;
+ setup.set_done.mask <= 0;
+ setup.set_done.submit <= 0;
+
+ for (int i = 0; i < $size(loop_valid_hold); ++i)
+ loop_valid_hold[i] <= 0;
+ end else begin
+ // A setup request stays pending while the pipeline is using the same
+ // port in this cycle, and is (re)armed by its *_set strobe
+ setup_gpr <= (setup_gpr & scalar_wb) | setup.write.gpr_set;
+ setup_mask <= (setup_mask & mask_wb) | setup.write.mask_set;
+ setup_submit <= (setup_submit & loop_out_valid) | setup.write.pc_set;
+
+ // Done is reported for the cycle in which the pending request actually
+ // owned the port
+ setup.set_done.gpr <= setup_gpr & ~scalar_wb;
+ setup.set_done.mask <= setup_mask & ~mask_wb;
+ setup.set_done.submit <= setup_submit & ~loop_out_valid;
+
+ loop_valid_hold[0] <= wb.valid;
+ for (int i = 1; i < REGFILE_STAGES; ++i)
+ loop_valid_hold[i] <= loop_valid_hold[i - 1];
+ end
+
+endmodule
diff --git a/rtl/gfx/gfx_shader_fpint.sv b/rtl/gfx/gfx_shader_fpint.sv
new file mode 100644
index 0000000..a418dcc
--- /dev/null
+++ b/rtl/gfx/gfx_shader_fpint.sv
@@ -0,0 +1,932 @@
+// -> 4,4,4,4,4,4,4,4 -> 8,8,8,8 -> 16,16 -> 32
+// (presumably the group widths handled by successive CLZ stages)
+localparam int FPINT_CLZ_STAGES = 4;
+
+// $bits(gfx::float_mant_ext) rounded up to the next power of two
+localparam bit[$clog2($bits(gfx::float_mant_ext)):0] FPINT_MAX_SHIFT
+ = 1 << $clog2($bits(gfx::float_mant_ext));
+
+// Shift amount, wide enough to hold FPINT_MAX_SHIFT itself
+typedef logic[$clog2(FPINT_MAX_SHIFT):0] fpint_shift;
+
+/* The 15 stages are:
+ * - setup
+ * - mulclass
+ * - mnorm
+ * - minmax
+ * - expdiff
+ * - shiftr
+ * - addsub
+ * - clz0-clz3
+ * - shiftl
+ * - round
+ * - rnorm
+ * - encode
+ */
+
+// Each struct below is the register bundle passed from one stage to the next;
+// the type name is <producer stage>_<consumer stage>.
+
+// setup -> mulclass
+typedef struct
+{
+ gfx::float a,
+ b,
+ a_mul,
+ b_mul;
+} fpint_setup_mulclass;
+
+// mulclass -> mnorm
+typedef struct
+{
+ gfx::float b;
+ gfx::float_exp exp;
+ gfx::float_class a_class,
+ b_class;
+ gfx::udword product;
+ logic sign,
+ overflow;
+} fpint_mulclass_mnorm;
+
+// mnorm -> minmax
+typedef struct
+{
+ gfx::float a,
+ b;
+ gfx::float_class a_class,
+ b_class;
+ logic slow,
+ zero,
+ guard,
+ round,
+ sticky,
+ slow_in,
+ overflow;
+} fpint_mnorm_minmax;
+
+// minmax -> expdiff
+typedef struct
+{
+ gfx::float max,
+ min;
+ gfx::float_class max_class,
+ min_class;
+ logic slow,
+ zero,
+ guard,
+ round,
+ sticky;
+} fpint_minmax_expdiff;
+
+// expdiff -> shiftr
+typedef struct
+{
+ gfx::float max,
+ min;
+ gfx::float_class max_class,
+ min_class;
+ fpint_shift exp_shift;
+ logic slow,
+ zero,
+ guard,
+ round,
+ sticky;
+} fpint_expdiff_shiftr;
+
+// shiftr -> addsub
+typedef struct
+{
+ gfx::float max,
+ min;
+ gfx::float_class max_class,
+ min_class;
+ gfx::float_mant_ext max_mant,
+ min_mant,
+ sticky_mask;
+ logic slow,
+ zero,
+ guard,
+ round,
+ sticky,
+ int_sign;
+} fpint_shiftr_addsub;
+
+// Values carried unchanged alongside the CLZ stages
+typedef struct
+{
+ gfx::float max;
+ gfx::word add_sub;
+ logic slow,
+ zero,
+ guard,
+ round,
+ sticky;
+} fpint_clz_hold;
+
+// addsub -> clz (same contents as the values held through CLZ)
+typedef fpint_clz_hold fpint_addsub_clz;
+
+// clz -> shiftl
+typedef struct
+{
+ fpint_clz_hold hold;
+ fpint_shift shift;
+} fpint_clz_shiftl;
+
+// shiftl -> round
+typedef struct
+{
+ gfx::float val;
+ logic slow,
+ zero,
+ guard,
+ round,
+ sticky,
+ overflow,
+ sticky_last;
+} fpint_shiftl_round;
+
+// round -> rnorm
+typedef struct
+{
+ gfx::float val;
+ logic slow,
+ zero,
+ exp_step,
+ overflow;
+} fpint_round_rnorm;
+
+// rnorm -> encode
+typedef struct
+{
+ gfx::float val;
+ logic slow,
+ zero,
+ overflow;
+} fpint_rnorm_encode;
+
+// Floating-point / integer pipe (p0). Instantiates one gfx_shader_fpint_lane
+// per shader lane and carries the operation descriptor, the wave information
+// and a valid bit alongside the lanes in FPINT_STAGES-deep shift registers.
+//
+// The pipe has no input back-pressure: in_valid starts an operation and
+// `abort` cancels the operation that is one stage in.
+module gfx_shader_fpint
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ input fpint_op op,
+ input wave_exec wave,
+ input logic abort,
+ in_valid,
+
+ gfx_regfile_io.ab read_data,
+
+ gfx_wb.tx wb
+);
+
+ // 7 stages up to addsub, the CLZ stages, then shiftl/round/rnorm/encode
+ localparam int FPINT_STAGES = 7 + FPINT_CLZ_STAGES + 4;
+
+ // stage[k] holds the descriptor of the operation that entered k + 1 cycles
+ // ago, so lane stage N is controlled by stage[N - 1]
+ struct
+ {
+ fpint_op op;
+ wave_exec wave;
+ } stage[FPINT_STAGES];
+
+ logic stage_valid[FPINT_STAGES];
+
+ // Write-back request, taken from the last descriptor entry. This pipe
+ // always asks for a PC increment and never updates the lane mask.
+ assign wb.dest = stage[FPINT_STAGES - 1].wave.dest;
+ assign wb.mask = 'x;
+ assign wb.group = stage[FPINT_STAGES - 1].wave.group;
+ assign wb.pc_add = 'x;
+ assign wb.pc_inc = 1;
+ assign wb.scalar = stage[FPINT_STAGES - 1].wave.dest_scalar;
+ assign wb.pc_update = wb.writeback;
+ assign wb.writeback = stage[FPINT_STAGES - 1].op.writeback;
+ assign wb.mask_update = 0;
+
+ // Note: stage_valid[0] is driven directly from the input, unlike stage[0],
+ // which is registered
+ assign stage_valid[0] = in_valid;
+
+ // Stage-0 controls come straight from `op`; every later stage N takes its
+ // controls from stage[N - 1]
+ genvar lane;
+ generate
+ for (lane = 0; lane < SHADER_LANES; ++lane) begin: lanes
+ gfx_shader_fpint_lane unit
+ (
+ .clk(clk),
+ .a(read_data.a[lane]),
+ .b(read_data.b[lane]),
+ .q(wb.lanes[lane]),
+ .mul_float_0(op.setup_mul_float),
+ .unit_b_0(op.setup_unit_b),
+ .put_hi_2(stage[2 - 1].op.mnorm_put_hi),
+ .put_lo_2(stage[2 - 1].op.mnorm_put_lo),
+ .put_mul_2(stage[2 - 1].op.mnorm_put_mul),
+ .zero_b_2(stage[2 - 1].op.mnorm_zero_b),
+ .zero_flags_2(stage[2 - 1].op.mnorm_zero_flags),
+ .abs_3(stage[3 - 1].op.minmax_abs),
+ .swap_3(stage[3 - 1].op.minmax_swap),
+ .zero_min_3(stage[3 - 1].op.minmax_zero_min),
+ .copy_flags_3(stage[3 - 1].op.minmax_copy_flags),
+ .int_signed_5(stage[5 - 1].op.shiftr_int_signed),
+ .copy_flags_6(stage[6 - 1].op.addsub_copy_flags),
+ .int_operand_6(stage[6 - 1].op.addsub_int_operand),
+ .force_nop_7(stage[7 - 1].op.clz_force_nop),
+ .copy_flags_11(stage[11 - 1].op.shiftl_copy_flags),
+ .copy_flags_12(stage[12 - 1].op.round_copy_flags),
+ .enable_12(stage[12 - 1].op.round_enable),
+ .enable_14(stage[14 - 1].op.encode_enable)
+ );
+ end
+ endgenerate
+
+ // Descriptor shift register (no reset; qualified by stage_valid)
+ always_ff @(posedge clk) begin
+ stage[0].op <= op;
+ stage[0].wave <= wave;
+
+ for (int i = 1; i < FPINT_STAGES; ++i)
+ stage[i] <= stage[i - 1];
+ end
+
+ // Valid shift register
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ for (int i = 1; i < FPINT_STAGES; ++i)
+ stage_valid[i] <= 0;
+
+ wb.valid <= 0;
+ end else begin
+ for (int i = 1; i < FPINT_STAGES; ++i)
+ stage_valid[i] <= stage_valid[i - 1];
+
+ // Raised 1 cycle after in_valid (presumably refers to abort), hence the
+ // cancellation at this position; overrides the loop above for entry 2
+ stage_valid[2] <= stage_valid[1] & ~abort;
+
+ wb.valid <= stage_valid[FPINT_STAGES - 1];
+ end
+
+endmodule
+
+// One lane of the fpint pipe: a linear chain of stage modules (stage_0 to
+// stage_14) connected by the fpint_* register bundles. Operands a and b enter
+// stage 0 and the result leaves stage 14 on q.
+//
+// The numeric suffix of every control input is the number of the stage that
+// consumes it.
+module gfx_shader_fpint_lane
+import gfx::*;
+(
+ input logic clk,
+
+ input word a,
+ b,
+
+ input logic mul_float_0,
+ unit_b_0,
+ put_hi_2,
+ put_lo_2,
+ put_mul_2,
+ zero_b_2,
+ zero_flags_2,
+ abs_3,
+ swap_3,
+ zero_min_3,
+ copy_flags_3,
+ int_signed_5,
+ copy_flags_6,
+ int_operand_6,
+ force_nop_7,
+ copy_flags_11,
+ copy_flags_12,
+ enable_12,
+ enable_14,
+
+ output word q
+);
+
+ /* Implementation notes for floating point
+ *
+ * === PRODUCT ===
+ *
+ * We want to compute q = a * b.
+ *
+ * Where a = (-1)^s * 1.m * 2^f,
+ * b = (-1)^t * 1.n * 2^g
+ *
+ * Then q = (-1)^(s + t) (1.m * 1.n) 2^(f + g)
+ *
+ * The product is between numbers >= 1.0 and < 2.0:
+ * Best case: 1.000... * 1.000... ~ 1.000...
+ * Worst case: 1.999... * 1.999... ~ 3.999... = 2^1 * 1.999
+ *
+ * So, if the product is >= 2, the mantissa has to be shifted >> 1
+ * and 1 added to the exponent in order to normalize.
+ *
+ *
+ * === ADD/SUB ===
+ *
+ * We want to compute q = a + b. Curiously, this is more complicated than
+ * a * b. The exponent of the smaller of a and b has to be adjusted to match
+ * that of the larger (denormalizing), then the operation is performed and
+ * finally the result is renormalized. Whether an addition or a subtraction
+ * is performed depends on the relation between the signs, not on the input
+ * operation (the latter only xors the sign of b).
+ * Remember that IEEE 754 is a kind of sign-magnitude, not two's complement.
+ *
+ * In the case of a subtraction, the normalized exponent can be much smaller
+ * than either of the input exponents. We therefore need CLZ (count leading
+ * zeros) logic to renormalize.
+ *
+ *
+ * === INTEGER->FP CONVERSION ===
+ *
+ * This simply reuses the fadd datapath, with the abs of the integer as the
+ * clz input. The reference exponent is fixed at 30 (alluding to the second
+ * msb of a 32-bit integer). From that point on it is identical to an fadd;
+ * the clz stages take care of adjusting the exponent.
+ */
+
+ // Inter-stage register bundles, named <producer>_<consumer>
+ fpint_setup_mulclass setup_mulclass;
+ fpint_mulclass_mnorm mulclass_mnorm;
+ fpint_mnorm_minmax mnorm_minmax;
+ fpint_minmax_expdiff minmax_expdiff;
+ fpint_expdiff_shiftr expdiff_shiftr;
+ fpint_shiftr_addsub shiftr_addsub;
+ fpint_addsub_clz addsub_clz;
+ fpint_clz_shiftl clz_shiftl;
+ fpint_shiftl_round shiftl_round;
+ fpint_round_rnorm round_rnorm;
+ fpint_rnorm_encode rnorm_encode;
+
+ gfx_shader_fpint_setup stage_0
+ (
+ .clk(clk),
+ .a(a),
+ .b(b),
+ .out(setup_mulclass),
+ .unit_b(unit_b_0),
+ .mul_float(mul_float_0)
+ );
+
+ gfx_shader_fpint_mulclass stage_1
+ (
+ .clk(clk),
+ .in(setup_mulclass),
+ .out(mulclass_mnorm)
+ );
+
+ gfx_shader_fpint_mnorm stage_2
+ (
+ .clk(clk),
+ .in(mulclass_mnorm),
+ .out(mnorm_minmax),
+ .put_hi(put_hi_2),
+ .put_lo(put_lo_2),
+ .put_mul(put_mul_2),
+ .zero_b(zero_b_2),
+ .zero_flags(zero_flags_2)
+ );
+
+ gfx_shader_fpint_minmax stage_3
+ (
+ .clk(clk),
+ .in(mnorm_minmax),
+ .out(minmax_expdiff),
+ .abs(abs_3),
+ .swap(swap_3),
+ .zero_min(zero_min_3),
+ .copy_flags(copy_flags_3)
+ );
+
+ gfx_shader_fpint_expdiff stage_4
+ (
+ .clk(clk),
+ .in(minmax_expdiff),
+ .out(expdiff_shiftr)
+ );
+
+ gfx_shader_fpint_shiftr stage_5
+ (
+ .clk(clk),
+ .in(expdiff_shiftr),
+ .out(shiftr_addsub),
+ .int_signed(int_signed_5)
+ );
+
+ gfx_shader_fpint_addsub stage_6
+ (
+ .clk(clk),
+ .in(shiftr_addsub),
+ .out(addsub_clz),
+ .copy_flags(copy_flags_6),
+ .int_operand(int_operand_6)
+ );
+
+ // Single instance covering all four CLZ stages
+ gfx_shader_fpint_clz stage_7_8_9_10
+ (
+ .clk(clk),
+ .in(addsub_clz),
+ .out(clz_shiftl),
+ .force_nop(force_nop_7)
+ );
+
+ gfx_shader_fpint_shiftl stage_11
+ (
+ .clk(clk),
+ .in(clz_shiftl),
+ .out(shiftl_round),
+ .copy_flags(copy_flags_11)
+ );
+
+ gfx_shader_fpint_round stage_12
+ (
+ .clk(clk),
+ .in(shiftl_round),
+ .out(round_rnorm),
+ .enable(enable_12),
+ .copy_flags(copy_flags_12)
+ );
+
+ gfx_shader_fpint_rnorm stage_13
+ (
+ .clk(clk),
+ .in(round_rnorm),
+ .out(rnorm_encode)
+ );
+
+ gfx_shader_fpint_encode stage_14
+ (
+ .clk(clk),
+ .q(q),
+ .in(rnorm_encode),
+ .enable(enable_14)
+ );
+
+endmodule
+
+// Stage 0: multiplier arguments.
+//
+// Registers the raw operands (out.a, out.b) together with the versions that
+// are fed to the multiplier (out.a_mul, out.b_mul):
+// - mul_float: both multiplier operands get exp = 1 and sign = 0.
+// - unit_b: the b multiplier operand becomes exp = 0, mant = 1, sign = 0;
+// where both flags touch the same field, unit_b wins.
+module gfx_shader_fpint_setup
+import gfx::*;
+(
+ input logic clk,
+
+ input word a,
+ b,
+ input logic mul_float,
+ unit_b,
+
+ output fpint_setup_mulclass out
+);
+
+ // Multiplier operands as they will be registered
+ float a_mul_next, b_mul_next;
+
+ always_comb begin
+ a_mul_next = a;
+ b_mul_next = b;
+
+ /* Note that the field order is sign-exp-mant. This places the implicit
+ * '1.' in the right position for multiplying the mantissas.
+ */
+ if (mul_float) begin
+ a_mul_next.sign = 0;
+ a_mul_next.exp = 1;
+ b_mul_next.sign = 0;
+ b_mul_next.exp = 1;
+ end
+
+ if (unit_b) begin
+ b_mul_next.sign = 0;
+ b_mul_next.exp = 0;
+ b_mul_next.mant = 1;
+ end
+ end
+
+ always_ff @(posedge clk) begin
+ out.a <= a;
+ out.b <= b;
+ out.a_mul <= a_mul_next;
+ out.b_mul <= b_mul_next;
+ end
+
+endmodule
+
+// Stage 1: floating-point or integer multiplication
+//
+// Multiplies the prepared operands and, in parallel, derives the sign, the
+// biased exponent sum and the operand classes of the floating-point product.
+module gfx_shader_fpint_mulclass
+import gfx::*;
+(
+    input  logic                clk,
+
+    input  fpint_setup_mulclass in,
+
+    output fpint_mulclass_mnorm out
+);
+
+    // Sum of both exponents minus the bias. It is as wide as
+    // {out.overflow, out.exp}, so the top bit lands in out.overflow.
+    logic[$bits(out.overflow) + $bits(out.exp) - 1:0] product_exp;
+
+    assign product_exp = {1'b0, in.a.exp} + {1'b0, in.b.exp} - {1'b0, FLOAT_EXP_BIAS};
+
+    always_ff @(posedge clk) begin
+        // Multiplier datapath
+        out.product <= in.a_mul * in.b_mul;
+
+        // Sign and exponent of the floating-point product
+        out.sign <= in.a.sign ^ in.b.sign;
+        {out.overflow, out.exp} <= product_exp;
+
+        // Operand classification; b itself is forwarded unchanged
+        out.a_class <= classify_float(in.a);
+        out.b_class <= classify_float(in.b);
+        out.b <= in.b;
+    end
+
+endmodule
+
+// Stage 2: normalization of the multiplier result
+//
+// Selects what is forwarded as operand `a` of the following stages:
+//   put_mul     the normalized floating-point product (sign, exp, mant plus
+//               guard/round/sticky bits)
+//   put_hi      the upper word of the raw product
+//   put_lo      the lower word of the raw product
+// Exactly one of put_mul/put_hi/put_lo is expected to be set (unique case).
+//
+//   zero_b      forward b = 0 (and the class of 0) instead of in.b
+//   zero_flags  replace a_class by the class of 0 and clear slow_in
+module gfx_shader_fpint_mnorm
+import gfx::*;
+(
+    input  logic                clk,
+
+    input  fpint_mulclass_mnorm in,
+    input  logic                put_hi,
+                                put_lo,
+                                put_mul,
+                                zero_b,
+                                zero_flags,
+
+    output fpint_mnorm_minmax   out
+);
+
+    word product_hi, product_lo;
+    logic guard, lo_msb, lo_reduce, round, slow_in_next;
+    float_mant_full hi;
+    logic[$bits(float_mant_full) - 3:0] lo;
+
+    assign lo_msb = lo[$bits(lo) - 1];
+    assign lo_reduce = |lo[$bits(lo) - 2:0];
+    assign slow_in_next = is_float_special(in.a_class) | is_float_special(in.b_class);
+    assign {product_hi, product_lo} = in.product;
+    assign {hi, guard, round, lo} = in.product[2 * $bits(float_mant_full) - 1:0];
+
+    always_ff @(posedge clk) begin
+        if (put_mul) begin
+            // An exponent overflow only forces the slow path when neither
+            // operand has the minimum exponent; otherwise the product is
+            // flagged as zero below. The original tested a_class twice and
+            // never looked at b_class.
+            out.slow <= slow_in_next | (in.overflow & ~in.a_class.exp_min & ~in.b_class.exp_min);
+            out.zero <= in.a_class.exp_min | in.b_class.exp_min;
+        end else begin
+            out.slow <= 0;
+            out.zero <= 0;
+        end
+
+        out.a.sign <= in.sign;
+        out.overflow <= 0;
+
+        if (hi[$bits(hi) - 1]) begin
+            // Product msb is set: take hi as the mantissa and bump the exponent
+            out.guard <= guard;
+            out.round <= round;
+            out.sticky <= lo_msb | lo_reduce;
+            out.a.mant <= implicit_mant(hi);
+            {out.overflow, out.a.exp} <= {1'b0, in.exp} + 1;
+        end else begin
+            /* The bit right below the msb is necessarily 1, since the msbs
+             * of both multiplicands are 1. See the assert in implicit_mant().
+             */
+            out.guard <= round;
+            out.round <= lo_msb;
+            out.sticky <= lo_reduce;
+
+            out.a.exp <= in.exp;
+            out.a.mant <= implicit_mant({hi[$bits(hi) - 2:0], guard});
+        end
+
+        // Integer results replace the whole of out.a assembled above
+        unique case (1'b1)
+            put_mul: ;
+
+            put_hi:
+                out.a <= product_hi;
+
+            put_lo:
+                out.a <= product_lo;
+        endcase
+
+        out.a_class <= in.a_class;
+        out.slow_in <= slow_in_next;
+
+        if (zero_flags) begin
+            out.a_class <= classify_float(0);
+            out.slow_in <= 0;
+        end
+
+        if (zero_b) begin
+            out.b <= 0;
+            out.b_class <= classify_float(0);
+        end else begin
+            out.b <= in.b;
+            out.b_class <= in.b_class;
+        end
+    end
+
+endmodule
+
+// Stage 3: sort the operands into (max, min)
+//
+//   abs         compare magnitudes only, so that abs(max) >= abs(min)
+//               (otherwise the comparison is signed)
+//   swap        invert the selection (max and min exchange roles)
+//   zero_min    force out.min to 0 and its class to the class of 0
+//   copy_flags  forward slow/zero from the previous stage (with overflow folded
+//               into slow); otherwise slow comes from slow_in and zero is 0
+module gfx_shader_fpint_minmax
+import gfx::*;
+(
+    input  logic                clk,
+
+    input  fpint_mnorm_minmax   in,
+    input  logic                abs,
+                                swap,
+                                zero_min,
+                                copy_flags,
+
+    output fpint_minmax_expdiff out
+);
+
+    logic abs_a_gt_abs_b, abs_b_gt_abs_a, b_gt_a;
+
+    /* Wikipedia says:
+     *
+     * A property of the single- and double-precision formats is that
+     * their encoding allows one to easily sort them without using
+     * floating-point hardware, as if the bits represented sign-magnitude
+     * integers, although it is unclear whether this was a design
+     * consideration (it seems noteworthy that the earlier IBM hexadecimal
+     * floating-point representation also had this property for normalized
+     * numbers).
+     */
+    assign abs_b_gt_abs_a = {in.b.exp, in.b.mant} > {in.a.exp, in.a.mant};
+    assign abs_a_gt_abs_b = {in.a.exp, in.a.mant} > {in.b.exp, in.b.mant};
+
+    always_comb begin
+        unique case ({in.b.sign, in.a.sign})
+            2'b00: b_gt_a = abs_b_gt_abs_a;
+            2'b01: b_gt_a = 1;
+            2'b10: b_gt_a = 0;
+            // Both negative: sign-magnitude ordering is reversed, so b is the
+            // greater value when its magnitude is the smaller one. The
+            // original reused abs_b_gt_abs_a here, which picked the more
+            // negative operand as the maximum.
+            2'b11: b_gt_a = abs_a_gt_abs_b;
+        endcase
+
+        if (abs)
+            b_gt_a = abs_b_gt_abs_a;
+    end
+
+    always_ff @(posedge clk) begin
+        if (b_gt_a ^ swap) begin
+            out.max <= in.b;
+            out.min <= in.a;
+            out.max_class <= in.b_class;
+            out.min_class <= in.a_class;
+        end else begin
+            out.max <= in.a;
+            out.min <= in.b;
+            out.max_class <= in.a_class;
+            out.min_class <= in.b_class;
+        end
+
+        // Overrides the selection above
+        if (zero_min) begin
+            out.min <= 0;
+            out.min_class <= classify_float(0);
+        end
+
+        out.guard <= in.guard;
+        out.round <= in.round;
+        out.sticky <= in.sticky;
+
+        if (copy_flags) begin
+            out.slow <= in.slow | in.overflow;
+            out.zero <= in.zero;
+        end else begin
+            out.slow <= in.slow_in;
+            out.zero <= 0;
+        end
+    end
+
+endmodule
+
+// Stage 4: exponent-alignment shift amount
+//
+// Computes max.exp - min.exp and saturates it at FPINT_MAX_SHIFT; everything
+// else is forwarded to the next stage unchanged.
+module gfx_shader_fpint_expdiff
+import gfx::*;
+(
+    input  logic                clk,
+
+    input  fpint_minmax_expdiff in,
+
+    output fpint_expdiff_shiftr out
+);
+
+    float_exp exp_gap;
+    logic     gap_saturates;
+
+    assign exp_gap = in.max.exp - in.min.exp;
+
+    // FPINT_MAX_SHIFT is zero-extended to the exponent width for the compare
+    assign gap_saturates =
+        exp_gap > {{($bits(exp_gap) - $bits(FPINT_MAX_SHIFT)){1'b0}}, FPINT_MAX_SHIFT};
+
+    always_ff @(posedge clk) begin
+        if (gap_saturates)
+            out.exp_shift <= FPINT_MAX_SHIFT;
+        else
+            out.exp_shift <= exp_gap[$bits(out.exp_shift) - 1:0];
+
+        // Operands and their classes
+        out.max <= in.max;
+        out.min <= in.min;
+        out.max_class <= in.max_class;
+        out.min_class <= in.min_class;
+
+        // Flags and rounding bits
+        out.slow <= in.slow;
+        out.zero <= in.zero;
+        out.guard <= in.guard;
+        out.round <= in.round;
+        out.sticky <= in.sticky;
+    end
+
+endmodule
+
+// Stage 5: alignment shifts, and abs(max) for signed integers
+//
+//   int_signed  treat in.max as a signed integer: when its top bit is set,
+//               forward its negation in out.max
+module gfx_shader_fpint_shiftr
+import gfx::*;
+(
+    input  logic                clk,
+
+    input  fpint_expdiff_shiftr in,
+    input  logic                int_signed,
+
+    output fpint_shiftr_addsub  out
+);
+
+    // Top bit of max when it is viewed as a plain word
+    logic max_top_bit;
+
+    assign max_top_bit = in.max[$bits(in.max) - 1];
+
+    always_ff @(posedge clk) begin
+        // Integer path
+        out.int_sign <= max_top_bit;
+
+        if (int_signed & max_top_bit)
+            out.max <= -in.max;
+        else
+            out.max <= in.max;
+
+        // Floating-point path: the smaller operand is shifted right by the
+        // exponent difference; the mask has its low exp_shift bits cleared.
+        out.max_mant <= float_prepare_round(in.max, in.max_class);
+        out.min_mant <= float_prepare_round(in.min, in.min_class) >> in.exp_shift;
+        out.sticky_mask <= {($bits(out.min_mant)){1'b1}} << in.exp_shift;
+
+        // Pass-through
+        out.min <= in.min;
+        out.min_class <= in.min_class;
+        out.slow <= in.slow;
+        out.zero <= in.zero;
+        out.guard <= in.guard;
+        out.round <= in.round;
+        out.sticky <= in.sticky;
+    end
+
+endmodule
+
+// Stage 6: mantissa addition
+//
+// Produces out.add_sub, the word that the leading-zero counter and the
+// normalizing shifter operate on.
+//
+//   copy_flags   forward in.sticky instead of recomputing sticky here
+//   int_operand  the operand is an integer: in.max is forwarded as-is in
+//                add_sub, and out.max gets a fixed exponent plus in.int_sign
+module gfx_shader_fpint_addsub
+import gfx::*;
+(
+ input logic clk,
+
+ input fpint_shiftr_addsub in,
+ input logic copy_flags,
+ int_operand,
+
+ output fpint_addsub_clz out
+);
+
+ // Exponent offset used for integer operands: FLOAT_EXP_BIAS + $bits(word) - 2
+ localparam int INT_SHIFT_REF = $bits(word) - 2;
+
+ // Places an extended mantissa right below the msb of a word: one leading
+ // zero bit, then arg, then zero padding down to bit 0.
+ function word fp_add_sub_arg(float_mant_ext arg);
+ fp_add_sub_arg = {1'b0, arg, {($bits(fp_add_sub_arg) - $bits(arg) - 1){1'b0}}};
+ endfunction
+
+ always_ff @(posedge clk) begin
+ out.max <= in.max;
+ out.slow <= in.slow;
+ out.zero <= in.zero;
+ out.guard <= in.guard;
+ out.round <= in.round;
+
+ // Overrides the exp and sign fields of out.max assigned above
+ if (int_operand) begin
+ out.max.exp <= FLOAT_EXP_BIAS + INT_SHIFT_REF[$bits(float_exp) - 1:0];
+ out.max.sign <= in.int_sign;
+ end
+
+ // Otherwise sticky is the OR of the min mantissa bits not covered by
+ // in.sticky_mask (presumably the bits shifted out in the previous
+ // stage -- confirm against gfx_shader_fpint_shiftr).
+ if (copy_flags)
+ out.sticky <= in.sticky;
+ else
+ out.sticky <= |(float_prepare_round(in.min, in.min_class) & ~in.sticky_mask);
+
+ // Differing signs subtract the smaller mantissa, equal signs add
+ if (int_operand)
+ out.add_sub <= in.max;
+ else if (in.max.sign ^ in.min.sign)
+ out.add_sub <= fp_add_sub_arg(in.max_mant) - fp_add_sub_arg(in.min_mant);
+ else
+ out.add_sub <= fp_add_sub_arg(in.max_mant) + fp_add_sub_arg(in.min_mant);
+ end
+
+endmodule
+
+// Stages 7-10: find the most significant 1
+//
+// Feeds in.add_sub to a gfx_clz instance and delays the rest of the stage
+// input through FPINT_CLZ_STAGES registers (presumably matching the latency
+// of gfx_clz -- confirm against gfx_clz.sv).
+//
+//   force_nop  force the two top bits of the counted value to 2'b01
+module gfx_shader_fpint_clz
+import gfx::*;
+(
+ input logic clk,
+
+ input fpint_addsub_clz in,
+ input logic force_nop,
+
+ output fpint_clz_shiftl out
+);
+
+ word clz_in;
+ fpint_clz_hold hold[FPINT_CLZ_STAGES];
+
+ // The oldest entry of the delay line leaves the stage
+ assign out.hold = hold[FPINT_CLZ_STAGES - 1];
+
+ gfx_clz #($bits(word)) clz
+ (
+ .clk(clk),
+ .clz(out.shift),
+ .value(clz_in)
+ );
+
+ always_comb begin
+ clz_in = in.add_sub;
+ // With the top bits forced to 01 the value has exactly one leading zero
+ if (force_nop)
+ clz_in[$bits(clz_in) - 1:$bits(clz_in) - 2] = 2'b01;
+ end
+
+ always_ff @(posedge clk) begin
+ hold[0] <= in;
+
+ for (int i = 1; i < FPINT_CLZ_STAGES; ++i)
+ hold[i] <= hold[i - 1];
+ end
+
+endmodule
+
+// Stage 11: normalization
+//
+// Shifts add_sub left by the leading-zero count, extracts the mantissa and
+// the rounding bits from the result and adjusts the exponent accordingly.
+//
+//   copy_flags  take guard/round from the held flags instead of the shifted
+//               value, and clear overflow and sticky_last
+module gfx_shader_fpint_shiftl
+import gfx::*;
+(
+ input logic clk,
+
+ input fpint_clz_shiftl in,
+ input logic copy_flags,
+
+ output fpint_shiftl_round out
+);
+
+ // Zero padding that widens in.shift to one bit more than float_exp
+ localparam int CLZ_EXTEND_BITS = $bits(float_exp) - $bits(in.shift) + 1;
+
+ word normalized;
+
+ assign normalized = in.hold.add_sub << in.shift;
+
+ always_ff @(posedge clk) begin
+ out.slow <= in.hold.slow;
+ out.zero <= in.hold.zero;
+ out.sticky <= in.hold.sticky;
+ out.val.sign <= in.hold.max.sign;
+
+ // The msb of normalized is skipped; the bits below it are, in order,
+ // mantissa, guard, round and sticky_last.
+ {out.val.mant, out.guard, out.round, out.sticky_last} <=
+ normalized[$bits(normalized) - 2:$bits(normalized) - $bits(float_mant) - 4];
+
+ // exp = max.exp - shift + 1; the extra top bit is captured as overflow
+ {out.overflow, out.val.exp} <=
+ {1'b0, in.hold.max.exp} - {{CLZ_EXTEND_BITS{1'b0}}, in.shift} + 1;
+
+ // NOTE(review): the top bit of the count presumably means that add_sub
+ // was all zeros -- confirm against gfx_clz.
+ if (in.shift[$bits(in.shift) - 1])
+ out.zero <= 1;
+
+ // Overrides the guard/round/overflow/sticky_last values assigned above
+ if (copy_flags) begin
+ out.guard <= in.hold.guard;
+ out.round <= in.hold.round;
+ out.overflow <= 0;
+ out.sticky_last <= 0;
+ end
+ end
+
+endmodule
+
+// Stage 12: rounding
+//
+//   enable      apply rounding to the mantissa
+//   copy_flags  when clear, an exponent overflow from the previous stage on a
+//               non-zero value marks the result as slow
+//
+// out.exp_step is the carry out of the mantissa increment; the next stage
+// uses it to bump the exponent.
+module gfx_shader_fpint_round
+import gfx::*;
+(
+    input  logic              clk,
+
+    input  fpint_shiftl_round in,
+    input  logic              copy_flags,
+                              enable,
+
+    output fpint_round_rnorm  out
+);
+
+    always_ff @(posedge clk) begin
+        out.val <= in.val;
+        out.slow <= in.slow | (~copy_flags & in.overflow & ~in.zero);
+        out.zero <= in.zero;
+        out.exp_step <= 0;
+
+        // This is the most usual rounding mode: round to nearest, ties to even
+        //
+        // The increment must read in.val.mant. Reading out.val.mant would
+        // sample the register before this edge's update, i.e. the mantissa
+        // of the previous operation in the pipeline.
+        if (enable & in.guard & (in.round | in.sticky | in.sticky_last | in.val.mant[0]))
+            {out.exp_step, out.val.mant} <= {1'b0, in.val.mant} + 1;
+    end
+
+endmodule
+
+// Stage 13: exponent adjustment after rounding
+//
+// When rounding carried out of the mantissa (in.exp_step) the exponent is
+// incremented, and the carry of that increment is reported as out.overflow.
+module gfx_shader_fpint_rnorm
+import gfx::*;
+(
+    input  logic              clk,
+
+    input  fpint_round_rnorm  in,
+
+    output fpint_rnorm_encode out
+);
+
+    always_ff @(posedge clk) begin
+        if (in.exp_step) begin
+            {out.overflow, out.val.exp} <= {1'b0, in.val.exp} + 1;
+        end else begin
+            out.overflow <= 0;
+            out.val.exp <= in.val.exp;
+        end
+
+        // Sign and mantissa are not affected by the adjustment
+        out.val.sign <= in.val.sign;
+        out.val.mant <= in.val.mant;
+
+        out.zero <= in.zero;
+        out.slow <= in.slow;
+    end
+
+endmodule
+
+// Stage 14: output, and encoding of zeros and NaNs
+//
+//   enable  when clear, in.val is registered to q untouched
+module gfx_shader_fpint_encode
+import gfx::*;
+(
+    input  logic              clk,
+
+    input  fpint_rnorm_encode in,
+    input  logic              enable,
+
+    output float              q
+);
+
+    logic emit_nan, emit_zero;
+
+    // An all-ones exponent, the slow flag or an exponent overflow all produce
+    // the same encoding: exp = FLOAT_EXP_MAX, mant = 1.
+    assign emit_nan = enable & (&in.val.exp | in.slow | in.overflow);
+    assign emit_zero = enable & in.zero;
+
+    always_ff @(posedge clk) begin
+        q <= in.val;
+
+        // Only exp and mant are rewritten; the sign of in.val is kept
+        if (emit_nan) begin
+            q.exp <= FLOAT_EXP_MAX;
+            q.mant <= 1;
+        end else if (emit_zero) begin
+            q.exp <= 0;
+            q.mant <= 0;
+        end
+    end
+
+endmodule
diff --git a/rtl/gfx/gfx_shader_front.sv b/rtl/gfx/gfx_shader_front.sv
new file mode 100644
index 0000000..52074fd
--- /dev/null
+++ b/rtl/gfx/gfx_shader_front.sv
@@ -0,0 +1,746 @@
+// Instruction travelling through the shader front end.
+typedef struct
+{
+ logic valid,
+ retry;
+ gfx::group_id group;
+ gfx_isa::insn_word insn;
+} front_wave;
+
+// Destination register information that is carried alongside the register
+// read pipeline.
+typedef struct
+{
+ gfx::xgpr_num dest;
+ logic dest_scalar;
+} front_reg_passthru;
+
+// Index of a line in the instruction cache (5 bits)
+typedef logic[4:0] icache_line_num;
+
+// Remaining upper bits of an oword pointer once the line index is removed
+typedef logic[$bits(gfx::oword_ptr) - $bits(icache_line_num) - 1:0] icache_tag;
+
+// Address of a cache line: tag in the upper bits, line index in the lower bits
+typedef struct packed
+{
+ icache_tag tag;
+ icache_line_num line;
+} icache_line_tag;
+
+// Address of a single word: cache line address plus a 3-bit word index
+typedef struct packed
+{
+ icache_line_tag line_tag;
+ logic[2:0] word_num;
+} icache_ptr;
+
+// Shader front end: chains instruction bind/fetch (gfx_shader_bind), register
+// read setup (gfx_shader_read_regs), instruction class decode
+// (gfx_shader_decode_class) and the decoder for port 0
+// (gfx_shader_decode_fpint).
+module gfx_shader_front
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ gfx_axib.m fetch_mem,
+
+ input logic icache_flush,
+
+ gfx_regfile_io.read reg_read,
+ gfx_regfile_io.bind_ reg_bind,
+
+ gfx_front_back.front front
+);
+
+ // NOTE(review): fetch_insn, port_insn and fetch_hit are declared but not
+ // referenced anywhere in this module.
+ word fetch_insn, port_insn;
+ logic fetch_hit, p0_writeback;
+ front_wave bind_wave, dec_wave, port_dec_wave;
+ front_reg_passthru reg_passthru;
+
+ // Destination register info comes out of the register read pipeline
+ assign front.execute.wave.dest = reg_passthru.dest;
+ assign front.execute.wave.dest_scalar = reg_passthru.dest_scalar;
+
+ // Picks the groups to run and fetches their instructions
+ gfx_shader_bind bind_
+ (
+ .clk,
+ .rst_n,
+ .mem(fetch_mem),
+ .wave(bind_wave),
+ .regs(reg_bind),
+ .loop_valid(front.loop.valid),
+ .loop_group(front.loop.group),
+ .icache_flush
+ );
+
+ // Issues the register file read for the fetched instruction
+ gfx_shader_read_regs reg_dec
+ (
+ .clk,
+ .rst_n,
+ .in(bind_wave),
+ .out(dec_wave),
+ .read(reg_read),
+ .passthru(reg_passthru)
+ );
+
+ // Decodes the instruction class into per-port dispatch flags
+ gfx_shader_decode_class class_dec
+ (
+ .clk,
+ .rst_n,
+ .wave(dec_wave),
+ .out_group(front.execute.wave.group),
+ .port_wave(port_dec_wave),
+ .dispatch(front.dispatch),
+ .p0_writeback
+ );
+
+ // Decodes the control word of the fp/int pipeline (port 0)
+ gfx_shader_decode_fpint p0_dec
+ (
+ .clk,
+ .op(front.execute.p0),
+ .insn(port_dec_wave.insn),
+ .writeback(p0_writeback)
+ );
+
+endmodule
+
+// Bind stage: queues the groups reported through loop_valid/loop_group,
+// requests the PC of each dequeued group from the register file and looks the
+// instruction up in the instruction cache. Cache misses are turned into AXI
+// read bursts on `mem`; the write channels of `mem` are tied off.
+module gfx_shader_bind
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ gfx_axib.m mem,
+
+ input logic icache_flush,
+
+ input logic loop_valid,
+ input group_id loop_group,
+
+ gfx_regfile_io.bind_ regs,
+
+ output front_wave wave
+);
+
+ localparam int ICACHE_STAGES = 6;
+ // Depth of the group/valid delay lines that run next to the PC read and
+ // the icache lookup
+ localparam int BIND_STAGES = REGFILE_STAGES + ICACHE_STAGES;
+
+ gfx_beats #($bits(group_id)) runnable_in(), runnable_out();
+
+ logic ar_stall, request_ready, request_valid, valids[BIND_STAGES];
+ group_id groups[BIND_STAGES];
+ icache_line_tag araddr, request_addr;
+
+ // Read-only master: the write address/data/response channels are idle
+ assign mem.bready = 0;
+ assign mem.wvalid = 0;
+ assign mem.awvalid = 0;
+
+ // One burst transfers a whole oword, one word per beat
+ assign mem.arlen = ($bits(mem.arlen))'($bits(oword) / $bits(word) - 1);
+ // Line address followed by zeros for the word-in-oword and sub-word bits
+ assign mem.araddr = {araddr, ($clog2($bits(oword)) - $clog2($bits(word)) + SUBWORD_BITS)'('0)};
+ assign mem.arburst = 2'b01; // Incremental mode
+
+ assign runnable_in.tx.data = loop_group;
+ assign runnable_in.tx.valid = loop_valid;
+
+ // The FIFO output is always consumed; it selects the PC to read
+ assign regs.pc_front_group = runnable_out.rx.data;
+ assign runnable_out.rx.ready = 1;
+
+ assign wave.group = groups[$size(groups) - 1];
+
+ gfx_skid_buf #($bits(araddr)) ar_skid
+ (
+ .clk,
+ .in(request_addr),
+ .out(araddr),
+ .stall(ar_stall)
+ );
+
+ gfx_skid_flow ar_flow
+ (
+ .clk,
+ .rst_n,
+ .stall(ar_stall),
+ .in_ready(request_ready),
+ .in_valid(request_valid),
+ .out_ready(mem.arready),
+ .out_valid(mem.arvalid)
+ );
+
+ //TODO: We could remove ~25 entries without affecting throughput, latency or correctness
+ gfx_fifo #(.WIDTH($bits(group_id)), .DEPTH(1 << $bits(group_id))) runnable
+ (
+ .clk,
+ .rst_n,
+ .in(runnable_in.rx),
+ .out(runnable_out.tx)
+ );
+
+ gfx_shader_bind_icache icache
+ (
+ .clk,
+ .rst_n,
+
+ .icache_flush,
+ .read_addr(regs.pc_front),
+ .read_valid(valids[REGFILE_STAGES - 1]),
+
+ .request_addr,
+ .request_valid,
+ .request_ready,
+
+ .fetch_data(mem.rdata),
+ .fetch_last(mem.rlast),
+ .fetch_valid(mem.rvalid),
+ .fetch_ready(mem.rready),
+
+ .insn(wave.insn),
+ .insn_retry(wave.retry),
+ .insn_valid(wave.valid)
+ );
+
+ // Group ids are delayed without reset; only the valid bits need one
+ always_ff @(posedge clk) begin
+ groups[0] <= runnable_out.rx.data;
+ for (int i = 1; i < $size(groups); ++i)
+ groups[i] <= groups[i - 1];
+ end
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n)
+ for (int i = 0; i < $size(valids); ++i)
+ valids[i] <= 0;
+ else begin
+ valids[0] <= runnable_out.rx.valid;
+ for (int i = 1; i < $size(valids); ++i)
+ valids[i] <= valids[i - 1];
+ end
+
+endmodule
+
+// Instruction cache of the bind stage.
+//
+// Lines are indexed by read_addr.line_tag.line and hold one oword each. A
+// lookup is pipelined; the numeric suffixes (_1 .. _5) name the pipeline
+// register a signal belongs to. Every lookup produces insn/insn_valid;
+// insn_retry is set when the lookup did not hit a valid line.
+//
+//   read_*     lookup request
+//   request_*  line fetch request towards the memory read address channel
+//   fetch_*    read data beats coming back from memory
+//   insn*      lookup result
+module gfx_shader_bind_icache
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ input logic icache_flush,
+
+ input logic read_valid,
+ input icache_ptr read_addr,
+
+ input logic fetch_last,
+ fetch_valid,
+ input word fetch_data,
+ output logic fetch_ready,
+
+ input logic request_ready,
+ output logic request_valid,
+ output icache_line_tag request_addr,
+
+ output logic insn_valid,
+ insn_retry,
+ output word insn
+);
+
+ // Dan Gisselquist limits to (1 << 3) bursts by default.
+ // See LGMAXBURST in axixbar.v
+ localparam int PENDING_FIFO_DEPTH = 8;
+
+ // FLUSH clears one line per cycle; RUN is normal operation
+ enum int unsigned
+ {
+ FLUSH,
+ RUN
+ } state;
+
+ // Cache storage plus the two registers of the read path
+ struct
+ {
+ logic valid,
+ accessed,
+ hit;
+ icache_tag tag;
+ oword data;
+ } cache[1 << $bits(icache_line_num)], read, read_hold;
+
+ // FIFO of line addresses whose fetch has been requested
+ gfx_beats #($bits(icache_line_tag)) pending_in(), pending_out();
+
+ logic accessed_write, accessed_write_enable, burst, fetch_done, hit_write,
+ in_flush, hit_commit, hit_write_enable, retry_4, retry_5, rollback,
+ tag_hit, valid_1, valid_2, valid_3, valid_4, valid_5, valid_write,
+ valid_write_enable;
+
+ icache_ptr read_addr_1, read_addr_2, read_addr_3, read_addr_4, read_addr_5;
+ icache_tag tag_write;
+ icache_line_num accessed_write_line, flush_ptr, hit_write_line, valid_write_line;
+ icache_line_tag pending_pop;
+
+ oword data_write;
+ word[1:0] data_5;
+ word[7:0] fetch_shift;
+ qword[1:0] data_3;
+ udword[1:0] data_4;
+
+ assign data_3 = read.data;
+ assign tag_hit = read.tag == read_addr_3.line_tag.tag;
+ // No further beats are accepted until the completed line has been handled
+ assign fetch_ready = ~fetch_done;
+ assign pending_pop = pending_out.rx.data;
+
+ // The memory request and the push into the pending FIFO are cross-gated:
+ // each side is only valid when the other one is ready.
+ assign request_addr = read_addr_4.line_tag;
+ assign request_valid = burst & pending_in.tx.ready;
+ assign pending_in.tx.data = read_addr_4.line_tag;
+ assign pending_in.tx.valid = burst & request_ready;
+ assign pending_out.rx.ready = fetch_done & ~hit_commit & ~rollback;
+
+ gfx_fifo #(.WIDTH($bits(icache_line_tag)), .DEPTH(PENDING_FIFO_DEPTH)) pending
+ (
+ .clk,
+ .rst_n,
+ .in(pending_in.rx),
+ .out(pending_out.tx)
+ );
+
+ always_comb
+ unique case (state)
+ FLUSH: in_flush = 1;
+ RUN: in_flush = 0;
+ endcase
+
+ // Control state with reset
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ state <= FLUSH;
+ flush_ptr <= '0;
+ fetch_done <= 0;
+
+ valid_1 <= 0;
+ valid_2 <= 0;
+ valid_3 <= 0;
+ valid_4 <= 0;
+ valid_5 <= 0;
+
+ burst <= 0;
+ end else begin
+ // A flush ends once icache_flush is low and flush_ptr is all ones
+ unique case (state)
+ FLUSH:
+ if (~icache_flush & &flush_ptr)
+ state <= RUN;
+
+ RUN:
+ if (icache_flush)
+ state <= FLUSH;
+ endcase
+
+ // Free-running line counter, held at zero while a flush is requested
+ flush_ptr <= flush_ptr + 1;
+ if (icache_flush)
+ flush_ptr <= '0;
+
+ // fetch_done is set by the last beat of a burst and stays set while
+ // the pending entry cannot be consumed in this cycle
+ if (fetch_done)
+ fetch_done <= hit_commit | ~pending_out.rx.valid | rollback;
+ else if (fetch_ready & fetch_valid)
+ fetch_done <= fetch_last;
+
+ valid_1 <= read_valid;
+ valid_2 <= valid_1;
+ valid_3 <= valid_2;
+ valid_4 <= valid_3;
+ valid_5 <= valid_4;
+
+ // Request a line fetch on a tag mismatch, unless the line is marked
+ // accessed or is valid without its hit flag set
+ burst <= valid_3 & ~tag_hit & ~read.accessed & (~read.valid | read.hit);
+ end
+
+ // Datapath and cache array, no reset
+ always_ff @(posedge clk) begin
+ // Line fill: tag and data of the completed burst
+ tag_write <= pending_pop.tag;
+ data_write <= fetch_shift;
+
+ valid_write <= 1;
+ valid_write_line <= pending_pop.line;
+ valid_write_enable <= fetch_done & ~hit_commit & pending_out.rx.valid & ~rollback;
+
+ // `accessed` is cleared on rollback and on line fill, otherwise it is
+ // set for the line addressed by an incoming lookup
+ accessed_write <= 0;
+ accessed_write_enable <= 1;
+
+ if (rollback)
+ accessed_write_line <= read_addr_5.line_tag.line;
+ else if (fetch_done & ~hit_commit & pending_out.rx.valid)
+ accessed_write_line <= pending_pop.line;
+ else begin
+ accessed_write <= 1;
+ accessed_write_line <= read_addr.line_tag.line;
+ accessed_write_enable <= read_valid;
+ end
+
+ // `hit` is set by a lookup hit and cleared when the line is refilled
+ hit_write <= hit_commit;
+ if (hit_commit) begin
+ hit_write_line <= read_addr_4.line_tag.line;
+ hit_write_enable <= 1;
+ end else begin
+ hit_write_line <= pending_pop.line;
+ hit_write_enable <= fetch_done & pending_out.rx.valid & ~rollback;
+ end
+
+ // While flushing, all three write ports clear the line at flush_ptr;
+ // this overrides everything assigned above
+ if (in_flush) begin
+ valid_write <= 0;
+ valid_write_line <= flush_ptr;
+ valid_write_enable <= 1;
+
+ accessed_write <= 0;
+ accessed_write_line <= flush_ptr;
+ accessed_write_enable <= 1;
+
+ hit_write <= 0;
+ hit_write_line <= flush_ptr;
+ hit_write_enable <= 1;
+ end
+
+ // Cache array writes, driven by the registered controls above
+ if (valid_write_enable) begin
+ cache[valid_write_line].tag <= tag_write;
+ cache[valid_write_line].data <= data_write;
+ cache[valid_write_line].valid <= valid_write;
+ end
+
+ if (accessed_write_enable)
+ cache[accessed_write_line].accessed <= accessed_write;
+
+ if (hit_write_enable)
+ cache[hit_write_line].hit <= hit_write;
+
+ // Stage 1: register the lookup address
+ read_addr_1 <= read_addr;
+
+ // Stage 2: cache array read
+ read_hold <= cache[read_addr_1.line_tag.line];
+ read_addr_2 <= read_addr_1;
+
+ // Stage 3: output register of the array read; tag_hit compares here
+ read <= read_hold;
+ read_addr_3 <= read_addr_2;
+
+ // Stage 4: select by word_num[2], record hit/miss
+ data_4 <= data_3[read_addr_3.word_num[2]];
+ retry_4 <= ~tag_hit | ~read.valid;
+ hit_commit <= valid_3 & tag_hit & read.valid;
+ read_addr_4 <= read_addr_3;
+
+ // Stage 5: select by word_num[1]; rollback flags a burst request that
+ // was not accepted by both the memory port and the pending FIFO
+ data_5 <= data_4[read_addr_4.word_num[1]];
+ retry_5 <= retry_4;
+ rollback <= burst & (~request_valid | ~pending_in.tx.valid);
+ read_addr_5 <= read_addr_4;
+
+ // Output: select by word_num[0]
+ insn <= data_5[read_addr_5.word_num[0]];
+ insn_retry <= retry_5;
+ insn_valid <= valid_5;
+
+ // Incoming beats are shifted in at index 0, so the first beat of a
+ // burst ends up at the highest index
+ if (fetch_ready & fetch_valid) begin
+ fetch_shift[0] <= fetch_data;
+ for (int i = 1; i < $size(fetch_shift); ++i)
+ fetch_shift[i] <= fetch_shift[i - 1];
+ end
+ end
+
+endmodule
+
+// Register read setup: translates the register fields of the incoming
+// instruction into a register file read operation (read.op) and delays the
+// instruction and its destination register info by HOLD_DEPTH and
+// PASSTHRU_DEPTH cycles respectively.
+module gfx_shader_read_regs
+import gfx::*;
+import gfx_isa::*;
+(
+ input logic clk,
+ rst_n,
+
+ input front_wave in,
+
+ gfx_regfile_io.read read,
+
+ output front_wave out,
+ output front_reg_passthru passthru
+);
+
+ // + 1 for the next-cycle of read.op
+ localparam int PASSTHRU_DEPTH = REG_READ_STAGES + 1 - 2;
+ localparam int HOLD_DEPTH = PASSTHRU_DEPTH - 2;
+
+ logic reg_rev;
+ logic valid[HOLD_DEPTH];
+ front_wave out_hold[HOLD_DEPTH];
+ front_reg_passthru passthru_hold[PASSTHRU_DEPTH];
+
+ assign passthru = passthru_hold[$size(passthru_hold) - 1];
+
+ assign reg_rev = in.insn.reg_rev;
+
+ // The valid bit travels in its own resettable delay line and replaces the
+ // one stored in out_hold
+ always_comb begin
+ out = out_hold[$size(out_hold) - 1];
+ out.valid = valid[$size(valid) - 1];
+ end
+
+ always_ff @(posedge clk) begin
+ out_hold[0] <= in;
+ for (int i = 1; i < $size(out_hold); ++i)
+ out_hold[i] <= out_hold[i - 1];
+
+ // Destination register; it is scalar for the SVS and SSS modes
+ passthru_hold[0].dest <= in.insn.dst_src.rr.rd;
+ unique case (in.insn.reg_mode)
+ REGS_SVS, REGS_SSS:
+ passthru_hold[0].dest_scalar <= 1;
+
+ REGS_VVS, REGS_VVV:
+ passthru_hold[0].dest_scalar <= 0;
+ endcase
+
+ for (int i = 1; i < $size(passthru_hold); ++i)
+ passthru_hold[i] <= passthru_hold[i - 1];
+
+ read.op.group <= in.group;
+
+ // Every possible interpretation of the source fields is forwarded; the
+ // *_scalar/b_is_* flags tell the register file which one applies
+ read.op.b_imm <= in.insn.dst_src.rr.b.imm;
+ read.op.a_sgpr <= in.insn.dst_src.rr.ra.sgpr;
+ read.op.b_sgpr <= in.insn.dst_src.rr.b.read.r.sgpr;
+ read.op.a_vgpr <= in.insn.dst_src.rr.ra.vgpr.num;
+ read.op.b_vgpr <= in.insn.dst_src.rr.b.read.r.vgpr.num;
+ read.op.b_is_imm <= in.insn.dst_src.rr.b_is_imm;
+ read.op.b_is_const <= in.insn.dst_src.rr.b.read.from_consts;
+ read.op.scalar_rev <= reg_rev;
+
+ unique case (in.insn.reg_mode)
+ // Mixed modes: exactly one source is scalar, reg_rev picks which
+ REGS_SVS, REGS_VVS: begin
+ read.op.a_scalar <= reg_rev;
+ read.op.b_scalar <= ~reg_rev;
+ end
+
+ REGS_SSS: begin
+ read.op.a_scalar <= 1;
+ read.op.b_scalar <= 1;
+ end
+
+ REGS_VVV: begin
+ read.op.a_scalar <= 0;
+ read.op.b_scalar <= 0;
+ end
+ endcase
+ end
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n)
+ for (int i = 0; i < HOLD_DEPTH; ++i)
+ valid[i] <= 0;
+ else begin
+ valid[0] <= in.valid;
+
+ for (int i = 1; i < HOLD_DEPTH; ++i)
+ valid[i] <= valid[i - 1];
+ end
+
+endmodule
+
+// Instruction class decode: holds the incoming instruction for one cycle
+// (port_wave) and derives the dispatch flags for ports 1-3 and the writeback
+// enable of port 0.
+module gfx_shader_decode_class
+import gfx::*;
+import gfx_isa::*;
+(
+ input logic clk,
+ rst_n,
+
+ input front_wave wave,
+ output front_wave port_wave,
+ output group_id out_group,
+
+ output shader_dispatch dispatch,
+ output logic p0_writeback
+);
+
+ // is_fsu is the flag set for INSN_SFU instructions
+ logic is_fsu, is_mem, is_group, hold_valid, retry;
+ front_wave hold_wave;
+
+ // Port 0 writes back only when the held instruction belongs to none of the
+ // other classes and is not a retry
+ assign p0_writeback = ~(is_mem | is_fsu | is_group | retry);
+
+ always_comb begin
+ port_wave = hold_wave;
+ port_wave.valid = hold_valid;
+ end
+
+ always_ff @(posedge clk) begin
+ hold_wave <= wave;
+ out_group <= port_wave.group;
+ end
+
+ always_ff @(posedge clk or negedge rst_n)
+ // Intentionally repetitive
+ if (~rst_n) begin
+ is_fsu <= 0;
+ is_mem <= 0;
+ is_group <= 0;
+
+ retry <= 0;
+ hold_valid <= 0;
+
+ dispatch <= '0;
+ end else begin
+ is_fsu <= 0;
+ is_mem <= 0;
+ is_group <= 0;
+
+ retry <= wave.retry;
+ hold_valid <= wave.valid;
+
+ unique case (wave.insn.insn_class)
+ INSN_FPINT: ; // p0 has no ready
+ INSN_MEM: is_mem <= 1;
+ INSN_SFU: is_fsu <= 1;
+ INSN_GROUP: is_group <= 1;
+
+ default:
+ {is_mem, is_fsu, is_group} <= 'x;
+ endcase
+
+ // Uses the registered class flags, i.e. the class of hold_wave
+ dispatch.p1 <= is_mem;
+ dispatch.p2 <= is_fsu;
+ dispatch.p3 <= is_group;
+
+ // Invalid or retried instructions are not dispatched to ports 1-3
+ if (~hold_valid | retry) begin
+ dispatch.p1 <= 0;
+ dispatch.p2 <= 0;
+ dispatch.p3 <= 0;
+ end
+
+ dispatch.valid <= hold_valid;
+ end
+
+endmodule
+
+// Decoder for the fp/int pipeline: maps the fpint opcode of `insn` to the
+// per-stage control bits of fpint_op. The field prefixes (setup_, mnorm_,
+// minmax_, shiftr_, addsub_, clz_, shiftl_, round_, encode_) name the
+// pipeline stage that consumes each bit. op.writeback is taken from the
+// `writeback` input regardless of the opcode.
+module gfx_shader_decode_fpint
+import gfx::*;
+import gfx_isa::*;
+(
+ input logic clk,
+
+ input insn_word insn,
+ input logic writeback,
+
+ output fpint_op op
+);
+
+ always_ff @(posedge clk) begin
+ unique case (insn.by_class.fpint.op)
+ // Move: low word of a * 1, rest of the pipeline in copy mode
+ INSN_FPINT_MOV: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 1;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 1;
+ op.mnorm_zero_b <= 1;
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 1;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 1;
+ op.shiftl_copy_flags <= 1;
+ op.round_copy_flags <= 1;
+ op.round_enable <= 1;
+ op.encode_enable <= 1;
+ end
+
+ // Float multiply: normalized product, b zeroed for the adder
+ INSN_FPINT_FMUL: begin
+ op.setup_mul_float <= 1;
+ op.setup_unit_b <= 0;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 0;
+ op.mnorm_put_mul <= 1;
+ op.mnorm_zero_flags <= 0;
+ op.mnorm_zero_b <= 1;
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 1;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 1;
+ op.shiftl_copy_flags <= 1;
+ op.round_copy_flags <= 1;
+ op.round_enable <= 1;
+ op.encode_enable <= 1;
+ end
+
+ // Integer multiply: low word of the raw product, no rounding/encoding
+ INSN_FPINT_IMUL: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 0;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 1;
+ op.mnorm_zero_b <= 1;
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 1;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 1;
+ op.shiftl_copy_flags <= 1;
+ op.round_copy_flags <= 1;
+ op.round_enable <= 0;
+ op.encode_enable <= 0;
+ end
+
+ // Float add: a passes the multiplier as a * 1, the adder path is live
+ INSN_FPINT_FADD: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 1;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 0;
+ op.mnorm_zero_b <= 0;
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 0;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 0;
+ op.clz_force_nop <= 0;
+ op.shiftl_copy_flags <= 0;
+ op.round_copy_flags <= 0;
+ op.round_enable <= 1;
+ op.encode_enable <= 1;
+ end
+
+ // Float max/min: signed compare, FMIN swaps the selection, min is zeroed
+ INSN_FPINT_FMAX, INSN_FPINT_FMIN: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 1;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 0;
+ op.mnorm_zero_b <= 0;
+ op.minmax_abs <= 0;
+ op.minmax_swap <= insn.by_class.fpint.op == INSN_FPINT_FMIN;
+ op.minmax_zero_min <= 1;
+ op.minmax_copy_flags <= 1;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 1;
+ op.shiftl_copy_flags <= 1;
+ op.round_copy_flags <= 1;
+ op.round_enable <= 0;
+ op.encode_enable <= 0;
+ end
+
+ // Convert: signed integer operand, normalized and rounded as a float
+ INSN_FPINT_FCVT: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 1;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 1;
+ op.mnorm_zero_b <= 1;
+
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 0;
+ op.shiftr_int_signed <= 1;
+ op.addsub_int_operand <= 1;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 0;
+ op.shiftl_copy_flags <= 0;
+ op.round_copy_flags <= 0;
+ op.round_enable <= 1;
+ op.encode_enable <= 1;
+ end
+
+ // Unknown opcode: propagate X through every control bit
+ default:
+ op <= 'x;
+ endcase
+
+ // Assigned last so that it also overrides the 'x of the default branch
+ op.writeback <= writeback;
+ end
+
+endmodule
diff --git a/rtl/gfx/gfx_shader_group.sv b/rtl/gfx/gfx_shader_group.sv
new file mode 100644
index 0000000..e668877
--- /dev/null
+++ b/rtl/gfx/gfx_shader_group.sv
@@ -0,0 +1,17 @@
+// Execution unit for group instructions.
+// NOTE(review): port list only, the module body is empty in this revision.
+module gfx_shader_group
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ input group_op op,
+ input wave_exec wave,
+
+ gfx_regfile_io.ab read_data,
+
+ gfx_shake.rx in_shake,
+
+ gfx_wb.tx wb
+);
+
+endmodule
diff --git a/rtl/gfx/gfx_shader_mem.sv b/rtl/gfx/gfx_shader_mem.sv
new file mode 100644
index 0000000..403c9e4
--- /dev/null
+++ b/rtl/gfx/gfx_shader_mem.sv
@@ -0,0 +1,17 @@
+// Execution unit for memory instructions.
+// NOTE(review): port list only, the module body is empty in this revision.
+module gfx_shader_mem
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ input mem_op op,
+ input wave_exec wave,
+
+ gfx_regfile_io.ab read_data,
+
+ gfx_shake.rx in_shake,
+
+ gfx_wb.tx wb
+);
+
+endmodule
diff --git a/rtl/gfx/gfx_shader_regs.sv b/rtl/gfx/gfx_shader_regs.sv
new file mode 100644
index 0000000..ef3a129
--- /dev/null
+++ b/rtl/gfx/gfx_shader_regs.sv
@@ -0,0 +1,302 @@
+module gfx_shader_regs // Shader register state: PC and lane-mask tables, constant ROM, SGPR/VGPR files, operand select
+import gfx::*;
+(
+    input logic clk,
+
+    gfx_regfile_io.regs io
+);
+
+    // verilator tracing_off
+
+    localparam PC_TABLE_PORTS = 2;   // read ports of the PC table: [0] back, [1] front
+    localparam MASK_TABLE_PORTS = 1; // read ports of the lane-mask table: [0] back
+
+    word hold_imm[REGFILE_STAGES], imm_out, read_a_data_sgpr, read_b_data_scalar,
+         read_b_data_sgpr, read_const, read_a_data_vgpr[SHADER_LANES],
+         read_b_data_vgpr[SHADER_LANES], sgpr_out_a, sgpr_out_b;
+
+    group_id mask_read_groups[MASK_TABLE_PORTS], pc_read_groups[PC_TABLE_PORTS];
+    word_ptr pc_read[PC_TABLE_PORTS];
+    lane_mask mask_read[MASK_TABLE_PORTS];
+
+    logic a_scalar_out, b_is_const_out, b_is_imm_out, b_scalar_out, scalar_rev_out;
+    group_id hold_read_group_1, hold_read_group_2;
+    sgpr_num hold_read_a_sgpr;
+    vgpr_num hold_read_a_vgpr_1, hold_read_a_vgpr_2, hold_read_b_vgpr_1, hold_read_b_vgpr_2;
+    logic[REGFILE_STAGES - 1:0] hold_b_is_imm, hold_b_is_const;  // consumed when read_b_data_scalar is registered
+    logic[REGFILE_STAGES + 1 - 1:0] hold_scalar_rev;             // consumed one register stage later (sgpr_out_*)
+    logic[REGFILE_STAGES + 2 - 1:0] hold_a_scalar, hold_b_scalar; // consumed one stage after that (io.a / io.b)
+    // PC table: port 0 serves the back side, port 1 the front side.
+    assign io.pc_back = pc_read[0];
+    assign io.pc_front = pc_read[1];
+    assign pc_read_groups[0] = io.pc_back_group;
+    assign pc_read_groups[1] = io.pc_front_group;
+    // Lane-mask table: its single port must be addressed through mask_read_groups, not pc_read_groups.
+    assign io.mask_back = mask_read[0];
+    assign mask_read_groups[0] = io.mask_back_group;
+    // The newest bit / last element of each hold chain is the value that is consumed.
+    assign imm_out = hold_imm[$size(hold_imm) - 1];
+    assign a_scalar_out = hold_a_scalar[$bits(hold_a_scalar) - 1];
+    assign b_scalar_out = hold_b_scalar[$bits(hold_b_scalar) - 1];
+    assign b_is_imm_out = hold_b_is_imm[$bits(hold_b_is_imm) - 1];
+    assign b_is_const_out = hold_b_is_const[$bits(hold_b_is_const) - 1];
+    assign scalar_rev_out = hold_scalar_rev[$bits(hold_scalar_rev) - 1];
+
+    gfx_shader_table #(.DATA_WIDTH($bits(word_ptr)), .READ_PORTS(PC_TABLE_PORTS)) pc_table
+    (
+        .clk,
+        .read(pc_read),
+        .write(io.pc_wb),
+        .read_groups(pc_read_groups),
+        .write_group(io.pc_wb_group),
+        .write_enable(io.pc_wb_write)
+    );
+
+    gfx_shader_table #(.DATA_WIDTH($bits(lane_mask)), .READ_PORTS(MASK_TABLE_PORTS)) mask_table
+    (
+        .clk,
+        .read(mask_read),
+        .write(io.mask_wb),
+        .read_groups(mask_read_groups),
+        .write_group(io.mask_wb_group),
+        .write_enable(io.mask_wb_write)
+    );
+
+    gfx_shader_consts consts // constant ROM, indexed by the B scalar register number
+    (
+        .clk,
+        .num(io.op.b_sgpr),
+        .value(read_const)
+    );
+
+    gfx_shader_regfile #($bits(group_id) + $bits(sgpr_num)) sgprs // scalar file, addressed by {group, sgpr}
+    (
+        .clk,
+
+        .read_a_num({hold_read_group_1, hold_read_a_sgpr}), // A address is issued one cycle after B
+        .read_b_num({io.op.group, io.op.b_sgpr}),
+        .read_a_data(read_a_data_sgpr),
+        .read_b_data(read_b_data_sgpr),
+
+        .write(io.sgpr_write.write),
+        .write_num({io.sgpr_write.group, io.sgpr_write.sgpr}),
+        .write_data(io.sgpr_write.data)
+    );
+
+    generate
+        for (genvar i = 0; i < SHADER_LANES; ++i) begin: vgprs // one vector file per lane
+            gfx_shader_regfile #($bits(group_id) + $bits(vgpr_num)) vgprs
+            (
+                .clk,
+
+                .read_a_num({hold_read_group_2, hold_read_a_vgpr_2}), // addresses delayed by two cycles
+                .read_b_num({hold_read_group_2, hold_read_b_vgpr_2}),
+                .read_a_data(read_a_data_vgpr[i]),
+                .read_b_data(read_b_data_vgpr[i]),
+
+                .write(io.vgpr_write.mask[i]), // per-lane write enable
+                .write_num({io.vgpr_write.group, io.vgpr_write.vgpr}),
+                .write_data(io.vgpr_write.data[i])
+            );
+        end
+    endgenerate
+
+    always_ff @(posedge clk) begin
+        hold_imm[0] <= {{($bits(word) - $bits(io.op.b_imm)){1'b0}}, io.op.b_imm}; // zero-extend to a word
+        hold_a_scalar[0] <= io.op.a_scalar;
+        hold_b_scalar[0] <= io.op.b_scalar;
+        hold_b_is_imm[0] <= io.op.b_is_imm;
+        hold_b_is_const[0] <= io.op.b_is_const;
+        hold_scalar_rev[0] <= io.op.scalar_rev;
+
+        for (int i = 1; i < REGFILE_STAGES; ++i) begin // stages shared by every hold chain
+            hold_imm[i] <= hold_imm[i - 1];
+            hold_a_scalar[i] <= hold_a_scalar[i - 1];
+            hold_b_scalar[i] <= hold_b_scalar[i - 1];
+            hold_b_is_imm[i] <= hold_b_is_imm[i - 1];
+            hold_b_is_const[i] <= hold_b_is_const[i - 1];
+            hold_scalar_rev[i] <= hold_scalar_rev[i - 1];
+        end
+
+        for (int i = REGFILE_STAGES; i < REGFILE_STAGES + 2; ++i) begin // two extra stages for the scalar selects
+            hold_a_scalar[i] <= hold_a_scalar[i - 1];
+            hold_b_scalar[i] <= hold_b_scalar[i - 1];
+        end
+
+        hold_scalar_rev[REGFILE_STAGES] <= hold_scalar_rev[REGFILE_STAGES - 1]; // one extra stage
+
+        hold_read_a_sgpr <= io.op.a_sgpr;
+        hold_read_group_1 <= io.op.group;
+        hold_read_group_2 <= hold_read_group_1;
+
+        hold_read_a_vgpr_1 <= io.op.a_vgpr;
+        hold_read_a_vgpr_2 <= hold_read_a_vgpr_1;
+
+        hold_read_b_vgpr_1 <= io.op.b_vgpr;
+        hold_read_b_vgpr_2 <= hold_read_b_vgpr_1;
+
+        if (b_is_imm_out) // scalar B source priority: immediate, then constant ROM, then SGPR
+            read_b_data_scalar <= imm_out;
+        else if (b_is_const_out)
+            read_b_data_scalar <= read_const;
+        else
+            read_b_data_scalar <= read_b_data_sgpr;
+
+        if (scalar_rev_out) begin // swap the two scalar operands
+            sgpr_out_a <= read_b_data_scalar;
+            sgpr_out_b <= read_a_data_sgpr;
+        end else begin
+            sgpr_out_a <= read_a_data_sgpr;
+            sgpr_out_b <= read_b_data_scalar;
+        end
+
+        for (int i = 0; i < SHADER_LANES; ++i) begin // scalar operands are broadcast to every lane
+            io.a[i] <= a_scalar_out ? sgpr_out_a : read_a_data_vgpr[i];
+            io.b[i] <= b_scalar_out ? sgpr_out_b : read_b_data_vgpr[i]; // B lane data comes from VGPR port B
+        end
+    end
+
+endmodule
+
+module gfx_shader_consts // Registered lookup of a small table of fixed words, indexed by an SGPR number
+import gfx::*;
+(
+    input logic clk,
+
+    input sgpr_num num,
+    output word value
+);
+
+    sgpr_num index_q;
+    word rom[1 << $bits(sgpr_num)], fetch_q;
+
+    initial begin // only entries 0..4 are given a value
+        rom[0] = 'hffff_ffff; // -1
+        rom[1] = 'h7fff_ffff; // 2^31 - 1, useful for fp abs
+        rom[2] = 'h8000_0000; // 2^31, useful for fp neg
+        rom[3] = 'h3f80_0000; // +1.0
+        rom[4] = 'hbf80_0000; // -1.0
+    end
+
+    always_ff @(posedge clk) begin // three register stages: index, fetched word, output
+        index_q <= num;
+        fetch_q <= rom[index_q];
+        value <= fetch_q;
+    end
+
+endmodule
+
+module gfx_shader_regfile // Two-read, one-write register file built from two single-read ports
+import gfx::*;
+#(int DEPTH_LOG = 0)
+(
+    input logic clk,
+
+    input logic[DEPTH_LOG - 1:0] read_a_num,
+                                 read_b_num,
+    output word read_a_data,
+                read_b_data,
+
+    input logic write,
+    input logic[DEPTH_LOG - 1:0] write_num,
+    input word write_data
+);
+
+    gfx_shader_regfile_port #(.DEPTH_LOG(DEPTH_LOG)) a // serves read port A
+    (
+        .clk,
+        .read_num(read_a_num),
+        .read_data(read_a_data),
+        .write,
+        .write_num,
+        .write_data
+    );
+
+    gfx_shader_regfile_port #(.DEPTH_LOG(DEPTH_LOG)) b // serves read port B; receives the same writes as `a`
+    (
+        .clk,
+        .read_num(read_b_num),
+        .read_data(read_b_data),
+        .write,
+        .write_num,
+        .write_data
+    );
+
+endmodule
+
+module gfx_shader_regfile_port // One read port and one write port over a locally declared word array
+import gfx::*;
+#(int DEPTH_LOG = 0) // the array holds 1 << DEPTH_LOG words
+(
+ input logic clk,
+
+ input logic[DEPTH_LOG - 1:0] read_num,
+ output word read_data, // appears three clock edges after read_num is presented
+
+ input logic write,
+ input logic[DEPTH_LOG - 1:0] write_num,
+ input word write_data
+);
+
+ word file[1 << DEPTH_LOG], hold_read_data, hold_write_data;
+ logic hold_write;
+ logic[DEPTH_LOG - 1:0] hold_read_num, hold_write_num;
+
+ // hold_write needs no rst_n because any initial write is harmless
+
+ always_ff @(posedge clk) begin
+ hold_write <= write; // write request is registered once before it reaches the array
+ hold_read_num <= read_num;
+ hold_write_num <= write_num;
+ hold_write_data <= write_data;
+
+ hold_read_data <= file[hold_read_num]; // nonblocking: a same-edge write to this entry is not seen yet
+ if (hold_write)
+ file[hold_write_num] <= hold_write_data;
+
+ read_data <= hold_read_data; // output register
+ end
+
+endmodule
+
+module gfx_shader_table // Per-group table with one write port and READ_PORTS registered read ports
+import gfx::*;
+#(int DATA_WIDTH = 0,
+  int READ_PORTS = 0)
+(
+    input logic clk,
+
+    input group_id write_group,
+                   read_groups[READ_PORTS],
+
+    input logic[DATA_WIDTH - 1:0] write,
+    input logic write_enable,
+
+    output logic[DATA_WIDTH - 1:0] read[READ_PORTS]
+);
+
+    // Each read port owns its own copy of the array; every copy takes the same write.
+
+    generate
+        for (genvar i = 0; i < READ_PORTS; ++i) begin: ports
+            logic[DATA_WIDTH - 1:0] data[1 << $bits(group_id)], write_q, read_q;
+            group_id write_group_q, read_group_q;
+            logic write_enable_q;
+
+            always_ff @(posedge clk) begin
+                // Write path: request registered once, then committed to the array.
+                write_enable_q <= write_enable;
+                write_group_q <= write_group;
+                write_q <= write;
+
+                if (write_enable_q)
+                    data[write_group_q] <= write_q;
+
+                read_group_q <= read_groups[i];
+                read_q <= data[read_group_q];
+                read[i] <= read_q;
+            end
+        end
+    endgenerate
+
+endmodule
diff --git a/rtl/gfx/gfx_shader_schedif.rdl b/rtl/gfx/gfx_shader_schedif.rdl
new file mode 100644
index 0000000..c846da9
--- /dev/null
+++ b/rtl/gfx/gfx_shader_schedif.rdl
@@ -0,0 +1,91 @@
+addrmap gfx_shader_schedif { // register map with five 32-bit registers at offsets 0x00..0x10
+ name = "Scheduler<->core interface";
+
+ default hw = r; // map-wide defaults: hardware reads, software writes
+ default sw = w;
+ default regwidth = 32;
+
+ reg {
+ name = "Shader core control register";
+
+ field {
+ desc = "Set this field to flush the instruction cache";
+
+ singlepulse; // see the SystemRDL `singlepulse` property for the exact pulse semantics
+ } IFLUSH[0:0] = 0;
+ } CORE @ 0x00;
+
+ reg {
+ name = "Wavefront setup control register";
+
+ default hw = na; // register-local defaults; GROUP and XGPR override them below
+ default sw = r;
+ default precedence = hw;
+
+ field {
+ desc = "Wavefront group number";
+
+ hw = r;
+ sw = rw;
+ } GROUP[5:0];
+
+ field { // NOTE(review): desc says SGPR, but SETUP_GPR is described as SGPR/VGPR; confirm which is intended
+ desc = "Destination SGPR number";
+
+ hw = r;
+ sw = rw;
+ } XGPR[11:8];
+
+ field { // status bit: set by hardware (hwset), cleared when software reads it (rclr)
+ desc = "PC table update done, group submitted";
+
+ rclr;
+ hwset;
+ } SUBMIT_DONE[16:16] = 0;
+
+ field { // status bit: hwset + rclr, as above
+ desc = "General-purpose register update done";
+
+ rclr;
+ hwset;
+ } GPR_DONE[17:17] = 0;
+
+ field { // status bit: hwset + rclr, as above
+ desc = "Lane mask update done";
+
+ rclr;
+ hwset;
+ } MASK_DONE[18:18] = 0;
+ } SETUP_CTRL @ 0x04;
+
+ reg {
+ name = "SGPR/VGPR write register";
+
+ field {
+ desc = "Value to write";
+
+ swmod; // see the SystemRDL `swmod` property: hardware is told when software modifies the field
+ } VALUE[31:0];
+ } SETUP_GPR @ 0x08;
+
+ reg {
+ name = "Lane mask write register";
+
+ field {
+ desc = "Mask value to write";
+
+ swmod;
+ } MASK[15:0]; // 16-bit field; the upper half of the register is unused
+ } SETUP_MASK @ 0x0c;
+
+ reg {
+ name = "Group submit register";
+
+ field {
+ desc = "Initial group program counter, submits group on write";
+
+ swmod;
+ } PC[31:2]; // bits 1:0 are not part of the field
+ } SETUP_SUBMIT @ 0x10;
+};
+
diff --git a/rtl/gfx/gfx_shader_setup.sv b/rtl/gfx/gfx_shader_setup.sv
new file mode 100644
index 0000000..f46fb66
--- /dev/null
+++ b/rtl/gfx/gfx_shader_setup.sv
@@ -0,0 +1,37 @@
+interface gfx_shader_setup // Setup channel between the `sched` side (drives write) and the `core` side (drives set_done)
+import gfx::*;;
+
+ struct // request bundle, driven through the `sched` modport
+ {
+ group_id group;
+ word_ptr pc;
+ xgpr_num gpr;
+ word gpr_value;
+ lane_mask mask;
+ logic pc_set, // one strobe per kind of update (PC, GPR, lane mask)
+ gpr_set,
+ mask_set;
+ } write;
+
+ struct // completion flags, driven through the `core` modport
+ {
+ logic gpr,
+ mask,
+ submit;
+ } set_done;
+
+ modport core
+ (
+ input write,
+
+ output set_done
+ );
+
+ modport sched
+ (
+ input set_done,
+
+ output write
+ );
+
+endinterface
diff --git a/rtl/gfx/gfx_shader_sfu.sv b/rtl/gfx/gfx_shader_sfu.sv
new file mode 100644
index 0000000..d65e522
--- /dev/null
+++ b/rtl/gfx/gfx_shader_sfu.sv
@@ -0,0 +1,17 @@
+module gfx_shader_sfu // Stub: only the port list is declared; the body is empty, so no output is driven here
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ input sfu_op op, // operation descriptor; sfu_op is not declared in this file (presumably package gfx)
+ input wave_exec wave, // wave_exec is likewise declared elsewhere
+
+ gfx_regfile_io.ab read_data, // register-file interface, `ab` modport
+
+ gfx_shake.rx in_shake, // handshake, receiving side: valid is an input, ready is an output
+
+ gfx_wb.tx wb // writeback interface, transmitting side: ready is the only input
+);
+
+endmodule
diff --git a/rtl/gfx/gfx_shake.sv b/rtl/gfx/gfx_shake.sv
new file mode 100644
index 0000000..baae0c3
--- /dev/null
+++ b/rtl/gfx/gfx_shake.sv
@@ -0,0 +1,24 @@
+interface gfx_shake; // Two-wire valid/ready handshake
+
+    // valid is driven by the tx side, ready by the rx side
+    logic valid, ready;
+
+    modport tx
+    (
+        output valid,
+        input ready
+    );
+
+    modport rx
+    (
+        output ready,
+        input valid
+    );
+
+    modport peek // observer: both wires are inputs
+    (
+        input valid,
+              ready
+    );
+
+endinterface
diff --git a/rtl/gfx/gfx_sim_debug.sv b/rtl/gfx/gfx_sim_debug.sv
new file mode 100644
index 0000000..4b4622a
--- /dev/null
+++ b/rtl/gfx/gfx_sim_debug.sv
@@ -0,0 +1,50 @@
+module gfx_sim_debug // Simulation character sink: $display's the low byte of each accepted write beat
+import gfx::*;
+(
+    input logic clk,
+                rst_n,
+
+    gfx_axil.s axis
+);
+
+    enum int unsigned
+    {
+        INPUT,
+        STALL
+    } state;
+
+    assign axis.awready = 1; // write addresses are always accepted
+    assign axis.arready = 0; // read addresses are never accepted
+    assign axis.rvalid = 0;
+
+    always_comb
+        unique case (state)
+            STALL: begin // response still pending: keep bvalid up and hold off new data
+                axis.bvalid = 1;
+                axis.wready = 0;
+            end
+
+            INPUT: begin // response is offered in the same cycle as the data beat
+                axis.bvalid = axis.wvalid;
+                axis.wready = 1;
+            end
+        endcase
+
+    always_ff @(posedge clk or negedge rst_n)
+        if (!rst_n)
+            state <= INPUT;
+        else
+            unique case (state)
+                STALL:
+                    if (axis.bready)
+                        state <= INPUT;
+
+                INPUT:
+                    if (axis.wvalid) begin
+                        $display("%c", axis.wdata[7:0]);
+                        if (~axis.bready)
+                            state <= STALL;
+                    end
+            endcase
+
+endmodule
diff --git a/rtl/gfx/gfx_skid_buf.sv b/rtl/gfx/gfx_skid_buf.sv
new file mode 100644
index 0000000..e3e5247
--- /dev/null
+++ b/rtl/gfx/gfx_skid_buf.sv
@@ -0,0 +1,20 @@
+module gfx_skid_buf // Data half of a skid buffer: passes `in` through, or replays the last unstalled value
+#(int WIDTH = 0)
+(
+    input logic clk,
+
+    input logic[WIDTH - 1:0] in,
+    input logic stall,
+
+    output logic[WIDTH - 1:0] out
+);
+
+    logic[WIDTH - 1:0] held; // last value seen on `in` while not stalled
+
+    always_ff @(posedge clk)
+        if (!stall)
+            held <= in;
+
+    assign out = stall ? held : in;
+
+endmodule
diff --git a/rtl/gfx/gfx_skid_flow.sv b/rtl/gfx/gfx_skid_flow.sv
new file mode 100644
index 0000000..7890ae3
--- /dev/null
+++ b/rtl/gfx/gfx_skid_flow.sv
@@ -0,0 +1,31 @@
+module gfx_skid_flow // Control half of a skid buffer: derives in_ready, out_valid and stall from registered state
+(
+    input logic clk,
+                rst_n,
+
+    input logic in_valid,
+                out_ready,
+
+    output logic in_ready,
+                 out_valid,
+                 stall
+);
+
+    logic prev_ready, prev_valid; // out_ready from the last cycle; in_valid from the last unstalled cycle
+
+    assign in_ready = prev_ready || !prev_valid;
+    assign stall = !in_ready;
+    assign out_valid = stall || in_valid;
+
+    always_ff @(posedge clk or negedge rst_n)
+        if (!rst_n) begin
+            prev_valid <= 0;
+            prev_ready <= 0;
+        end else begin
+            if (!stall)
+                prev_valid <= in_valid;
+
+            prev_ready <= out_ready;
+        end
+
+endmodule
diff --git a/rtl/gfx/gfx_wb.sv b/rtl/gfx/gfx_wb.sv
new file mode 100644
index 0000000..20c7c64
--- /dev/null
+++ b/rtl/gfx/gfx_wb.sv
@@ -0,0 +1,51 @@
+interface gfx_wb; // Writeback channel: tx drives every signal except ready, rx drives only ready
+
+ import gfx::*;
+
+ word lanes[SHADER_LANES]; // one data word per lane
+ logic mask_update, pc_inc, pc_update, ready, scalar, valid, writeback;
+ group_id group;
+ xgpr_num dest;
+ lane_mask mask;
+ pc_offset pc_add;
+
+ modport tx // producer side
+ (
+ input ready,
+
+ output dest,
+ group,
+ lanes,
+ valid,
+ scalar,
+ writeback,
+
+ mask,
+ mask_update,
+
+ pc_add,
+ pc_inc,
+ pc_update
+ );
+
+ modport rx // consumer side: mirror image of tx
+ (
+ input dest,
+ group,
+ lanes,
+ valid,
+ scalar,
+ writeback,
+
+ mask,
+ mask_update,
+
+ pc_add,
+ pc_inc,
+ pc_update,
+
+ output ready
+ );
+
+
+endinterface
diff --git a/rtl/gfx/gfx_xbar_sched.sv b/rtl/gfx/gfx_xbar_sched.sv
new file mode 100644
index 0000000..95e4afb
--- /dev/null
+++ b/rtl/gfx/gfx_xbar_sched.sv
@@ -0,0 +1,146 @@
+module gfx_xbar_sched // Wraps an `axilxbar` instance: one upstream port (sched) fanned out to bootrom, debug and shader_0
+import gfx::*;
+(
+ input logic clk,
+ srst_n,
+
+ gfx_axil.s sched,
+
+ gfx_axil.m debug,
+ gfx_axil.m bootrom,
+ gfx_axil.m shader_0
+);
+
+ localparam word BOOTROM_MASK = 32'hfff0_0000; // BOOTROM_BASE is not declared here; presumably it comes from package gfx
+ localparam word DEBUG_BASE = 32'h0020_0000;
+ localparam word DEBUG_MASK = 32'hfff0_0000;
+ localparam word SHADER_0_BASE = 32'h0100_0000;
+ localparam word SHADER_0_MASK = 32'hfff0_0000;
+
+ defparam xbar.NM = 1; // one port on the S_AXI_* side
+ defparam xbar.NS = 3; // three ports on the M_AXI_* side
+ defparam xbar.OPT_LOWPOWER = 0;
+
+ defparam xbar.SLAVE_ADDR = { // same order as every M_AXI_* concatenation below: bootrom is the least significant entry
+ SHADER_0_BASE,
+ DEBUG_BASE,
+ BOOTROM_BASE
+ };
+
+ defparam xbar.SLAVE_MASK = {
+ SHADER_0_MASK,
+ DEBUG_MASK,
+ BOOTROM_MASK
+ };
+
+ axilxbar xbar // module defined outside this file
+ (
+ .S_AXI_ACLK(clk),
+ .S_AXI_ARESETN(srst_n),
+
+ .S_AXI_AWVALID(sched.awvalid),
+ .S_AXI_AWREADY(sched.awready),
+ .S_AXI_AWADDR(sched.awaddr),
+ .S_AXI_AWPROT('0),
+
+ .S_AXI_WVALID(sched.wvalid),
+ .S_AXI_WREADY(sched.wready),
+ .S_AXI_WDATA(sched.wdata),
+ .S_AXI_WSTRB('1), // all byte strobes forced on
+
+ .S_AXI_BVALID(sched.bvalid),
+ .S_AXI_BREADY(sched.bready),
+ .S_AXI_BRESP(), // response code left unconnected
+
+ .S_AXI_ARVALID(sched.arvalid),
+ .S_AXI_ARREADY(sched.arready),
+ .S_AXI_ARADDR(sched.araddr),
+ .S_AXI_ARPROT('0),
+
+ .S_AXI_RVALID(sched.rvalid),
+ .S_AXI_RREADY(sched.rready),
+ .S_AXI_RDATA(sched.rdata),
+ .S_AXI_RRESP(), // response code left unconnected
+
+ .M_AXI_AWADDR({
+ shader_0.awaddr,
+ debug.awaddr,
+ bootrom.awaddr
+ }),
+ .M_AXI_AWPROT(),
+ .M_AXI_AWVALID({
+ shader_0.awvalid,
+ debug.awvalid,
+ bootrom.awvalid
+ }),
+ .M_AXI_AWREADY({
+ shader_0.awready,
+ debug.awready,
+ bootrom.awready
+ }),
+
+ .M_AXI_WDATA({
+ shader_0.wdata,
+ debug.wdata,
+ bootrom.wdata
+ }),
+ .M_AXI_WSTRB(), // strobes are not forwarded to the downstream ports
+ .M_AXI_WVALID({
+ shader_0.wvalid,
+ debug.wvalid,
+ bootrom.wvalid
+ }),
+ .M_AXI_WREADY({
+ shader_0.wready,
+ debug.wready,
+ bootrom.wready
+ }),
+
+ .M_AXI_BRESP('0), // downstream write responses tied to zero
+ .M_AXI_BVALID({
+ shader_0.bvalid,
+ debug.bvalid,
+ bootrom.bvalid
+ }),
+ .M_AXI_BREADY({
+ shader_0.bready,
+ debug.bready,
+ bootrom.bready
+ }),
+
+ .M_AXI_ARADDR({
+ shader_0.araddr,
+ debug.araddr,
+ bootrom.araddr
+ }),
+ .M_AXI_ARPROT(),
+ .M_AXI_ARVALID({
+ shader_0.arvalid,
+ debug.arvalid,
+ bootrom.arvalid
+ }),
+ .M_AXI_ARREADY({
+ shader_0.arready,
+ debug.arready,
+ bootrom.arready
+ }),
+
+ .M_AXI_RDATA({
+ shader_0.rdata,
+ debug.rdata,
+ bootrom.rdata
+ }),
+ .M_AXI_RRESP('0), // downstream read responses tied to zero
+ .M_AXI_RVALID({
+ shader_0.rvalid,
+ debug.rvalid,
+ bootrom.rvalid
+ }),
+ .M_AXI_RREADY({
+ shader_0.rready,
+ debug.rready,
+ bootrom.rready
+ })
+ );
+
+endmodule
diff --git a/rtl/gfx/mod.mk b/rtl/gfx/mod.mk
new file mode 100644
index 0000000..7525276
--- /dev/null
+++ b/rtl/gfx/mod.mk
@@ -0,0 +1,18 @@
+cores := gfx_shader_schedif
+
+define core
+ $(this)/deps := axixbar gfx_shader_schedif picorv32
+
+ $(this)/rtl_top := gfx_top
+ $(this)/rtl_dirs := .
+ $(this)/rtl_files := gfx_isa.sv gfx_pkg.sv
+endef
+
+define core/gfx_shader_schedif
+ $(this)/hooks := regblock
+
+ $(this)/regblock_rdl := gfx_shader_schedif.rdl
+ $(this)/regblock_top := gfx_shader_schedif
+ $(this)/regblock_args := --default-reset arst_n
+ $(this)/regblock_cpuif := axi4-lite
+endef