From ca02833f22b08ceeeff501107371aa6667426115 Mon Sep 17 00:00:00 2001 From: Alejandro Soto Date: Sun, 5 May 2024 17:38:55 -0600 Subject: rtl/gfx: rename platform/wavelet3d -> rtl/gfx --- platform/wavelet3d/gfx_axib.sv | 81 --- platform/wavelet3d/gfx_axil.sv | 61 -- platform/wavelet3d/gfx_axil2regblock.sv | 30 - platform/wavelet3d/gfx_beats.sv | 29 - platform/wavelet3d/gfx_bootrom.sv | 66 --- platform/wavelet3d/gfx_clz.sv | 68 --- platform/wavelet3d/gfx_ctz.sv | 18 - platform/wavelet3d/gfx_fifo.sv | 102 ---- platform/wavelet3d/gfx_fixed_dotadd.sv | 55 -- platform/wavelet3d/gfx_fixed_muladd.sv | 77 --- platform/wavelet3d/gfx_front_back.sv | 37 -- platform/wavelet3d/gfx_isa.sv | 84 --- platform/wavelet3d/gfx_pipes.sv | 24 - platform/wavelet3d/gfx_pkg.sv | 271 --------- platform/wavelet3d/gfx_pkts.sv | 29 - platform/wavelet3d/gfx_raster.sv | 930 ----------------------------- platform/wavelet3d/gfx_regfile_io.sv | 106 ---- platform/wavelet3d/gfx_rst_sync.sv | 13 - platform/wavelet3d/gfx_sched.sv | 139 ----- platform/wavelet3d/gfx_shader.sv | 77 --- platform/wavelet3d/gfx_shader_back.sv | 335 ----------- platform/wavelet3d/gfx_shader_fpint.sv | 932 ------------------------------ platform/wavelet3d/gfx_shader_front.sv | 746 ------------------------ platform/wavelet3d/gfx_shader_group.sv | 17 - platform/wavelet3d/gfx_shader_mem.sv | 17 - platform/wavelet3d/gfx_shader_regs.sv | 302 ---------- platform/wavelet3d/gfx_shader_schedif.rdl | 91 --- platform/wavelet3d/gfx_shader_setup.sv | 37 -- platform/wavelet3d/gfx_shader_sfu.sv | 17 - platform/wavelet3d/gfx_shake.sv | 24 - platform/wavelet3d/gfx_sim_debug.sv | 50 -- platform/wavelet3d/gfx_skid_buf.sv | 20 - platform/wavelet3d/gfx_skid_flow.sv | 31 - platform/wavelet3d/gfx_top.sv | 160 ----- platform/wavelet3d/gfx_wb.sv | 51 -- platform/wavelet3d/gfx_xbar_sched.sv | 146 ----- platform/wavelet3d/mod.mk | 17 +- platform/wavelet3d/w3d_top.sv | 160 +++++ rtl/gfx/gfx_axib.sv | 81 +++ rtl/gfx/gfx_axil.sv | 61 ++ 
rtl/gfx/gfx_axil2regblock.sv | 30 + rtl/gfx/gfx_beats.sv | 29 + rtl/gfx/gfx_bootrom.sv | 66 +++ rtl/gfx/gfx_clz.sv | 68 +++ rtl/gfx/gfx_ctz.sv | 18 + rtl/gfx/gfx_fifo.sv | 102 ++++ rtl/gfx/gfx_fixed_dotadd.sv | 55 ++ rtl/gfx/gfx_fixed_muladd.sv | 77 +++ rtl/gfx/gfx_front_back.sv | 37 ++ rtl/gfx/gfx_isa.sv | 84 +++ rtl/gfx/gfx_pipes.sv | 24 + rtl/gfx/gfx_pkg.sv | 271 +++++++++ rtl/gfx/gfx_pkts.sv | 29 + rtl/gfx/gfx_raster.sv | 930 +++++++++++++++++++++++++++++ rtl/gfx/gfx_regfile_io.sv | 106 ++++ rtl/gfx/gfx_rst_sync.sv | 13 + rtl/gfx/gfx_sched.sv | 141 +++++ rtl/gfx/gfx_shader.sv | 77 +++ rtl/gfx/gfx_shader_back.sv | 335 +++++++++++ rtl/gfx/gfx_shader_fpint.sv | 932 ++++++++++++++++++++++++++++++ rtl/gfx/gfx_shader_front.sv | 746 ++++++++++++++++++++++++ rtl/gfx/gfx_shader_group.sv | 17 + rtl/gfx/gfx_shader_mem.sv | 17 + rtl/gfx/gfx_shader_regs.sv | 302 ++++++++++ rtl/gfx/gfx_shader_schedif.rdl | 91 +++ rtl/gfx/gfx_shader_setup.sv | 37 ++ rtl/gfx/gfx_shader_sfu.sv | 17 + rtl/gfx/gfx_shake.sv | 24 + rtl/gfx/gfx_sim_debug.sv | 50 ++ rtl/gfx/gfx_skid_buf.sv | 20 + rtl/gfx/gfx_skid_flow.sv | 31 + rtl/gfx/gfx_wb.sv | 51 ++ rtl/gfx/gfx_xbar_sched.sv | 146 +++++ rtl/gfx/mod.mk | 18 + rtl/mod.mk | 2 +- 75 files changed, 5297 insertions(+), 5288 deletions(-) delete mode 100644 platform/wavelet3d/gfx_axib.sv delete mode 100644 platform/wavelet3d/gfx_axil.sv delete mode 100644 platform/wavelet3d/gfx_axil2regblock.sv delete mode 100644 platform/wavelet3d/gfx_beats.sv delete mode 100644 platform/wavelet3d/gfx_bootrom.sv delete mode 100644 platform/wavelet3d/gfx_clz.sv delete mode 100644 platform/wavelet3d/gfx_ctz.sv delete mode 100644 platform/wavelet3d/gfx_fifo.sv delete mode 100644 platform/wavelet3d/gfx_fixed_dotadd.sv delete mode 100644 platform/wavelet3d/gfx_fixed_muladd.sv delete mode 100644 platform/wavelet3d/gfx_front_back.sv delete mode 100644 platform/wavelet3d/gfx_isa.sv delete mode 100644 platform/wavelet3d/gfx_pipes.sv delete mode 100644 
platform/wavelet3d/gfx_pkg.sv delete mode 100644 platform/wavelet3d/gfx_pkts.sv delete mode 100644 platform/wavelet3d/gfx_raster.sv delete mode 100644 platform/wavelet3d/gfx_regfile_io.sv delete mode 100644 platform/wavelet3d/gfx_rst_sync.sv delete mode 100644 platform/wavelet3d/gfx_sched.sv delete mode 100644 platform/wavelet3d/gfx_shader.sv delete mode 100644 platform/wavelet3d/gfx_shader_back.sv delete mode 100644 platform/wavelet3d/gfx_shader_fpint.sv delete mode 100644 platform/wavelet3d/gfx_shader_front.sv delete mode 100644 platform/wavelet3d/gfx_shader_group.sv delete mode 100644 platform/wavelet3d/gfx_shader_mem.sv delete mode 100644 platform/wavelet3d/gfx_shader_regs.sv delete mode 100644 platform/wavelet3d/gfx_shader_schedif.rdl delete mode 100644 platform/wavelet3d/gfx_shader_setup.sv delete mode 100644 platform/wavelet3d/gfx_shader_sfu.sv delete mode 100644 platform/wavelet3d/gfx_shake.sv delete mode 100644 platform/wavelet3d/gfx_sim_debug.sv delete mode 100644 platform/wavelet3d/gfx_skid_buf.sv delete mode 100644 platform/wavelet3d/gfx_skid_flow.sv delete mode 100644 platform/wavelet3d/gfx_top.sv delete mode 100644 platform/wavelet3d/gfx_wb.sv delete mode 100644 platform/wavelet3d/gfx_xbar_sched.sv create mode 100644 platform/wavelet3d/w3d_top.sv create mode 100644 rtl/gfx/gfx_axib.sv create mode 100644 rtl/gfx/gfx_axil.sv create mode 100644 rtl/gfx/gfx_axil2regblock.sv create mode 100644 rtl/gfx/gfx_beats.sv create mode 100644 rtl/gfx/gfx_bootrom.sv create mode 100644 rtl/gfx/gfx_clz.sv create mode 100644 rtl/gfx/gfx_ctz.sv create mode 100644 rtl/gfx/gfx_fifo.sv create mode 100644 rtl/gfx/gfx_fixed_dotadd.sv create mode 100644 rtl/gfx/gfx_fixed_muladd.sv create mode 100644 rtl/gfx/gfx_front_back.sv create mode 100644 rtl/gfx/gfx_isa.sv create mode 100644 rtl/gfx/gfx_pipes.sv create mode 100644 rtl/gfx/gfx_pkg.sv create mode 100644 rtl/gfx/gfx_pkts.sv create mode 100644 rtl/gfx/gfx_raster.sv create mode 100644 rtl/gfx/gfx_regfile_io.sv create mode 
100644 rtl/gfx/gfx_rst_sync.sv create mode 100644 rtl/gfx/gfx_sched.sv create mode 100644 rtl/gfx/gfx_shader.sv create mode 100644 rtl/gfx/gfx_shader_back.sv create mode 100644 rtl/gfx/gfx_shader_fpint.sv create mode 100644 rtl/gfx/gfx_shader_front.sv create mode 100644 rtl/gfx/gfx_shader_group.sv create mode 100644 rtl/gfx/gfx_shader_mem.sv create mode 100644 rtl/gfx/gfx_shader_regs.sv create mode 100644 rtl/gfx/gfx_shader_schedif.rdl create mode 100644 rtl/gfx/gfx_shader_setup.sv create mode 100644 rtl/gfx/gfx_shader_sfu.sv create mode 100644 rtl/gfx/gfx_shake.sv create mode 100644 rtl/gfx/gfx_sim_debug.sv create mode 100644 rtl/gfx/gfx_skid_buf.sv create mode 100644 rtl/gfx/gfx_skid_flow.sv create mode 100644 rtl/gfx/gfx_wb.sv create mode 100644 rtl/gfx/gfx_xbar_sched.sv create mode 100644 rtl/gfx/mod.mk diff --git a/platform/wavelet3d/gfx_axib.sv b/platform/wavelet3d/gfx_axib.sv deleted file mode 100644 index 7b3cbdc..0000000 --- a/platform/wavelet3d/gfx_axib.sv +++ /dev/null @@ -1,81 +0,0 @@ -// AXI4 con burst -interface gfx_axib; - - import gfx::word; - - logic awvalid, - awready; - logic[7:0] awlen; - logic[1:0] awburst; - word awaddr; - - logic wlast; - logic wvalid; - logic wready; - word wdata; - - logic bvalid; - logic bready; - - logic arvalid, - arready; - logic[7:0] arlen; - logic[1:0] arburst; - word araddr; - - logic rlast; - logic rvalid; - logic rready; - word rdata; - - modport m - ( - input awready, - wready, - bvalid, - arready, - rlast, - rvalid, - rdata, - - output awlen, - awburst, - awvalid, - awaddr, - wlast, - wvalid, - wdata, - bready, - arlen, - arburst, - arvalid, - araddr, - rready - ); - - modport s - ( - input awlen, - awburst, - awvalid, - awaddr, - wlast, - wvalid, - wdata, - bready, - arlen, - arburst, - arvalid, - araddr, - rready, - - output awready, - wready, - bvalid, - arready, - rlast, - rvalid, - rdata - ); - -endinterface diff --git a/platform/wavelet3d/gfx_axil.sv b/platform/wavelet3d/gfx_axil.sv deleted file mode 100644 
index c254e26..0000000 --- a/platform/wavelet3d/gfx_axil.sv +++ /dev/null @@ -1,61 +0,0 @@ -// AXI4-Lite, sin wstrb ni axprot -interface gfx_axil; - import gfx::*; - - logic awvalid; - logic awready; - word awaddr; - - logic wvalid; - logic wready; - word wdata; - - logic bvalid; - logic bready; - - logic arvalid; - logic arready; - word araddr; - - logic rvalid; - logic rready; - word rdata; - - modport m - ( - input awready, - wready, - bvalid, - arready, - rvalid, - rdata, - - output awvalid, - awaddr, - wvalid, - wdata, - bready, - arvalid, - araddr, - rready - ); - - modport s - ( - input awvalid, - awaddr, - wvalid, - wdata, - bready, - arvalid, - araddr, - rready, - - output awready, - wready, - bvalid, - arready, - rvalid, - rdata - ); -endinterface diff --git a/platform/wavelet3d/gfx_axil2regblock.sv b/platform/wavelet3d/gfx_axil2regblock.sv deleted file mode 100644 index 2449b05..0000000 --- a/platform/wavelet3d/gfx_axil2regblock.sv +++ /dev/null @@ -1,30 +0,0 @@ -module gfx_axil2regblock -( - gfx_axil.s axis, - axi4lite_intf.master axim -); - - assign axis.rdata = axim.RDATA; - assign axis.rvalid = axim.RVALID; - assign axis.bvalid = axim.BVALID; - assign axis.wready = axim.WREADY; - assign axis.arready = axim.ARREADY; - assign axis.awready = axim.AWREADY; - - assign axim.AWVALID = axis.awvalid; - assign axim.AWADDR = axis.awaddr[$bits(axim.AWADDR) - 1:0]; - assign axim.AWPROT = '0; - - assign axim.WVALID = axis.wvalid; - assign axim.WDATA = axis.wdata; - assign axim.WSTRB = '1; - - assign axim.BREADY = axis.bready; - - assign axim.ARVALID = axis.arvalid; - assign axim.ARADDR = axis.araddr[$bits(axim.ARADDR) - 1:0]; - assign axim.ARPROT = '0; - - assign axim.RREADY = axis.rready; - -endmodule diff --git a/platform/wavelet3d/gfx_beats.sv b/platform/wavelet3d/gfx_beats.sv deleted file mode 100644 index fcbb091..0000000 --- a/platform/wavelet3d/gfx_beats.sv +++ /dev/null @@ -1,29 +0,0 @@ -interface gfx_beats -#(int WIDTH = $bits(gfx::word)); - - logic[WIDTH 
- 1:0] data; - logic ready; - logic valid; - - modport tx - ( - input ready, - output data, - valid - ); - - modport rx - ( - input data, - valid, - output ready - ); - - modport peek - ( - input data, - ready, - valid - ); - -endinterface diff --git a/platform/wavelet3d/gfx_bootrom.sv b/platform/wavelet3d/gfx_bootrom.sv deleted file mode 100644 index 2c4581e..0000000 --- a/platform/wavelet3d/gfx_bootrom.sv +++ /dev/null @@ -1,66 +0,0 @@ -module gfx_bootrom -import gfx::*; -( - input logic clk, - rst_n, - - gfx_axil.s axis -); - - localparam ROM_WORDS_LOG = 8; - - enum int unsigned - { - WAIT, - READ, - RDATA, - READY - } state; - - word read, rom[1 << ROM_WORDS_LOG]; - logic[ROM_WORDS_LOG - 1:0] read_addr; - - assign axis.bvalid = 0; - assign axis.wready = 0; - assign axis.awready = 0; - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - state <= WAIT; - axis.rvalid <= 0; - axis.arready <= 0; - end else begin - axis.arready <= 0; - - unique case (state) - WAIT: - if (axis.arvalid & ~axis.arready) - state <= READ; - - READ: - state <= RDATA; - - RDATA: begin - state <= READY; - axis.rvalid <= 1; - end - - READY: - if (axis.rready) begin - state <= WAIT; - axis.rvalid <= 0; - axis.arready <= 1; - end - endcase - end - - always_ff @(posedge clk) begin - read <= rom[read_addr]; - read_addr <= axis.araddr[$bits(read_addr) + SUBWORD_BITS - 1:SUBWORD_BITS]; - axis.rdata <= read; - end - - initial - $readmemh("gfx_bootrom.hex", rom); - -endmodule diff --git a/platform/wavelet3d/gfx_clz.sv b/platform/wavelet3d/gfx_clz.sv deleted file mode 100644 index 8d6f100..0000000 --- a/platform/wavelet3d/gfx_clz.sv +++ /dev/null @@ -1,68 +0,0 @@ -/* Implementación en árbol de count leading zeros (CLZ). - * WIDTH debe ser una potencia de 2. 
- */ -module gfx_clz -#(int WIDTH = 0) -( - input logic clk, - - input logic[WIDTH - 1:0] value, - output logic[$clog2(WIDTH):0] clz -); - - genvar i; - generate - if (WIDTH <= 1) begin - always_ff @(posedge clk) - clz <= !value; - end else if (WIDTH == 2) begin - always_ff @(posedge clk) - unique case (value) - 2'b00: clz <= 2'b10; - 2'b01: clz <= 2'b01; - 2'b10: clz <= 2'b00; - 2'b11: clz <= 2'b00; - endcase - end else if (WIDTH == 4) begin - // Eficiente en FPGAs con 4-LUTs - always_ff @(posedge clk) - if (value[3]) - clz <= 3'b000; - else if (value[2]) - clz <= 3'b001; - else if (value[1]) - clz <= 3'b010; - else if (value[0]) - clz <= 3'b011; - else - clz <= 3'b100; - end else begin - logic msb_right; - logic[$clog2(WIDTH) - 1:0] clz_left, clz_right; - logic[$clog2(WIDTH) - 2:0] tail_right; - - assign {msb_right, tail_right} = clz_right; - - gfx_clz #(WIDTH / 2) left - ( - .clk(clk), - .clz(clz_left), - .value(value[WIDTH - 1:WIDTH / 2]) - ); - - gfx_clz #(WIDTH / 2) right - ( - .clk(clk), - .clz(clz_right), - .value(value[WIDTH / 2 - 1:0]) - ); - - always_ff @(posedge clk) - if (clz_left[$clog2(WIDTH) - 1]) - clz <= {msb_right, ~msb_right, tail_right}; - else - clz <= {1'b0, clz_left}; - end - endgenerate - -endmodule diff --git a/platform/wavelet3d/gfx_ctz.sv b/platform/wavelet3d/gfx_ctz.sv deleted file mode 100644 index 2713f8a..0000000 --- a/platform/wavelet3d/gfx_ctz.sv +++ /dev/null @@ -1,18 +0,0 @@ -// Count trailing zeros (ctz), clz al revés -module gfx_ctz -#(int WIDTH = 0) -( - input logic clk, - - input logic[WIDTH - 1:0] value, - output logic[$clog2(WIDTH):0] ctz -); - - gfx_clz #(WIDTH) clz - ( - .clk, - .value({<<{value}}), - .clz(ctz) - ); - -endmodule diff --git a/platform/wavelet3d/gfx_fifo.sv b/platform/wavelet3d/gfx_fifo.sv deleted file mode 100644 index 7174e4d..0000000 --- a/platform/wavelet3d/gfx_fifo.sv +++ /dev/null @@ -1,102 +0,0 @@ -module gfx_fifo -#(int WIDTH = 0, - int DEPTH = 0) -( - input logic clk, - rst_n, - - gfx_beats.rx in, 
- gfx_beats.tx out -); - - logic do_read, do_write, full_if_eq, in_stall, out_stall, - may_read, may_write, read, read_ok, write; - - logic[WIDTH - 1:0] fifo[DEPTH], read_data, write_data; - logic[$clog2(DEPTH) - 1:0] read_ptr, write_ptr; - - assign do_read = read & may_read; - assign do_write = write & may_write; - - always_comb begin - may_read = full_if_eq; - may_write = !full_if_eq; - - if (read) - may_write = 1; - - if (read_ptr != write_ptr) begin - may_read = 1; - may_write = 1; - end - end - - gfx_skid_flow in_flow - ( - .clk, - .rst_n, - .stall(in_stall), - .in_ready(in.ready), - .in_valid(in.valid), - .out_ready(may_write), - .out_valid(write) - ); - - gfx_skid_flow out_flow - ( - .clk, - .rst_n, - .stall(out_stall), - .in_ready(read), - .in_valid(read_ok), - .out_ready(out.ready), - .out_valid(out.valid) - ); - - gfx_skid_buf #(WIDTH) in_skid - ( - .clk, - .in(in.data), - .out(write_data), - .stall(in_stall) - ); - - gfx_skid_buf #(WIDTH) out_skid - ( - .clk, - .in(read_data), - .out(out.data), - .stall(out_stall) - ); - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - read_ok <= 0; - read_ptr <= 0; - write_ptr <= 0; - full_if_eq <= 0; - end else begin - if (~out_stall) - read_ok <= read && may_read; - - if (do_read) - read_ptr <= read_ptr + 1; - - if (do_write) - write_ptr <= write_ptr + 1; - - if (do_read & ~do_write) - full_if_eq <= 0; - else if (~do_read & do_write) - full_if_eq <= 1; - end - - always_ff @(posedge clk) begin - if (~out_stall) - read_data <= fifo[read_ptr]; - - if (may_write) - fifo[write_ptr] <= write_data; - end - -endmodule diff --git a/platform/wavelet3d/gfx_fixed_dotadd.sv b/platform/wavelet3d/gfx_fixed_dotadd.sv deleted file mode 100644 index fdd5ffd..0000000 --- a/platform/wavelet3d/gfx_fixed_dotadd.sv +++ /dev/null @@ -1,55 +0,0 @@ -module gfx_fixed_dotadd -( - input logic clk, - - input gfx::fixed a0, - b0, - a1, - b1, - c, - input logic stall, - - output gfx::fixed q -); - - import gfx::*; - - fixed q0, 
a1_hold, b1_hold; - - gfx_fixed_muladd muladd_0 - ( - .clk, - .a(a0), - .b(b0), - .c, - .q(q0), - .stall - ); - - gfx_pipes #(.WIDTH($bits(fixed)), .DEPTH(FIXED_MULADD_DEPTH)) a_pipes - ( - .clk, - .in(a1), - .out(a1_hold), - .stall - ); - - gfx_pipes #(.WIDTH($bits(fixed)), .DEPTH(FIXED_MULADD_DEPTH)) b_pipes - ( - .clk, - .in(b1), - .out(b1_hold), - .stall - ); - - gfx_fixed_muladd muladd_1 - ( - .clk, - .a(a1_hold), - .b(b1_hold), - .c(q0), - .q, - .stall - ); - -endmodule diff --git a/platform/wavelet3d/gfx_fixed_muladd.sv b/platform/wavelet3d/gfx_fixed_muladd.sv deleted file mode 100644 index 22b7247..0000000 --- a/platform/wavelet3d/gfx_fixed_muladd.sv +++ /dev/null @@ -1,77 +0,0 @@ -module gfx_fixed_muladd -( - input logic clk, - - input gfx::fixed a, - b, - c, - input logic stall, - - output gfx::fixed q -); - - import gfx::*; - -`ifndef VERILATOR - logic[2 * $bits(fixed) - $bits(fixed_frac) - 1:0] q_ext; - - assign q = q_ext[$bits(fixed) - 1:0]; - - lpm_mult mult - ( - .aclr(0), - .clock(clk), - .clken(!stall), - - .sum({c, {`FIXED_FRAC{1'b0}}}), - .dataa(a), - .datab(b), - .result(q_ext) - ); - - defparam - mult.lpm_widtha = $bits(fixed), - mult.lpm_widthb = $bits(fixed), - mult.lpm_widths = $bits(fixed) + $bits(fixed_frac), - /* Esto es crucial. No está documentado en ningún lado (aparte de un - * comentario en r/fpga). Si lpm_widthp < lpm_widtha + lpm_widthb, - * entonces result contiene los lpm_widthp bits más significativos - * del producto, no los menos significativos como tendría sentido. 
- */ - mult.lpm_widthp = 2 * $bits(fixed) - $bits(fixed_frac), - mult.lpm_representation = "SIGNED", - mult.lpm_pipeline = FIXED_MULADD_DEPTH; -`else - logic[$bits(fixed) + $bits(fixed_frac) - 1:0] q_ext; - - fixed a_hold, b_hold, c_hold; - - assign q = q_ext[$bits(fixed) + $bits(fixed_frac) - 1:$bits(fixed_frac)] + c_hold; - assign q_ext = a_hold * b_hold; - - gfx_pipes #(.WIDTH($bits(a)), .DEPTH(FIXED_MULADD_DEPTH)) a_pipes - ( - .clk, - .in(a), - .out(a_hold), - .stall - ); - - gfx_pipes #(.WIDTH($bits(b)), .DEPTH(FIXED_MULADD_DEPTH)) b_pipes - ( - .clk, - .in(b), - .out(b_hold), - .stall - ); - - gfx_pipes #(.WIDTH($bits(c)), .DEPTH(FIXED_MULADD_DEPTH)) c_pipes - ( - .clk, - .in(c), - .out(c_hold), - .stall - ); -`endif - -endmodule diff --git a/platform/wavelet3d/gfx_front_back.sv b/platform/wavelet3d/gfx_front_back.sv deleted file mode 100644 index b768532..0000000 --- a/platform/wavelet3d/gfx_front_back.sv +++ /dev/null @@ -1,37 +0,0 @@ -interface gfx_front_back -import gfx::*;; - - struct - { - wave_exec wave; - fpint_op p0; - mem_op p1; - sfu_op p2; - group_op p3; - } execute; - - struct - { - logic valid; - group_id group; - } loop; - - shader_dispatch dispatch; - - modport front - ( - input loop, - - output execute, - dispatch - ); - - modport back - ( - input execute, - dispatch, - - output loop - ); - -endinterface diff --git a/platform/wavelet3d/gfx_isa.sv b/platform/wavelet3d/gfx_isa.sv deleted file mode 100644 index 7239478..0000000 --- a/platform/wavelet3d/gfx_isa.sv +++ /dev/null @@ -1,84 +0,0 @@ -package gfx_isa; - - typedef logic[3:0] sgpr_num; - typedef logic[2:0] vgpr_num; - - typedef logic signed[7:0] pc_offset; - - typedef union packed - { - sgpr_num sgpr; - - struct packed - { - logic[$bits(sgpr_num) - $bits(vgpr_num) - 1:0] reserved; - vgpr_num num; - } vgpr; - } xgpr_num; - - typedef struct packed - { - enum logic[1:0] - { - REGS_SVS = 2'b00, - REGS_SSS = 2'b01, - REGS_VVS = 2'b10, - REGS_VVV = 2'b11 - } reg_mode; - - union packed - { - 
struct packed - { - logic b_is_imm; - - union packed - { - logic[12:0] imm; - - struct packed - { - logic from_consts; - logic[7:0] reserved; - xgpr_num r; - } read; - } b; - - xgpr_num ra, - rd; - } rr; - } dst_src; - - logic reg_rev; - - union packed - { - struct packed - { - enum logic[4:0] - { - INSN_FPINT_MOV = 0, - INSN_FPINT_FMUL = 1, - INSN_FPINT_IMUL = 2, - INSN_FPINT_FADD = 3, - INSN_FPINT_RES4 = 4, - INSN_FPINT_FMAX = 5, - INSN_FPINT_RES6 = 6, - INSN_FPINT_FMIN = 7, - INSN_FPINT_RES8 = 8, - INSN_FPINT_FCVT = 9, - INSN_FPINT_RES[10:31] - } op; - } fpint; - } by_class; - - enum logic[1:0] - { - INSN_FPINT = 0, - INSN_MEM = 1, - INSN_SFU = 2, - INSN_GROUP = 3 - } insn_class; - } insn_word; - -endpackage diff --git a/platform/wavelet3d/gfx_pipes.sv b/platform/wavelet3d/gfx_pipes.sv deleted file mode 100644 index 2fa875a..0000000 --- a/platform/wavelet3d/gfx_pipes.sv +++ /dev/null @@ -1,24 +0,0 @@ -module gfx_pipes -#(int WIDTH=0, int DEPTH=0) -( - input logic clk, - - input logic[WIDTH - 1:0] in, - input logic stall, - - output logic[WIDTH - 1:0] out -); - - logic[WIDTH - 1:0] pipes[DEPTH]; - - assign out = pipes[DEPTH - 1]; - - always_ff @(posedge clk) - if (~stall) begin - pipes[0] <= in; - - for (integer i = 1; i < DEPTH; ++i) - pipes[i] <= pipes[i - 1]; - end - -endmodule diff --git a/platform/wavelet3d/gfx_pkg.sv b/platform/wavelet3d/gfx_pkg.sv deleted file mode 100644 index 7072967..0000000 --- a/platform/wavelet3d/gfx_pkg.sv +++ /dev/null @@ -1,271 +0,0 @@ -package gfx; - - typedef logic[31:0] word; - - typedef word uword; - typedef logic signed[$bits(word) - 1:0] sword; - typedef logic[$bits(word) / 2 - 1:0] uhword; - typedef logic signed[$bits(word) / 2 - 1:0] shword; - typedef logic[2 * $bits(word) - 1:0] udword; - typedef logic signed[2 * $bits(word) - 1:0] sdword; - typedef logic signed[4 * $bits(word) - 1:0] qword; - typedef logic signed[8 * $bits(word) - 1:0] oword; - - localparam int SUBWORD_BITS = $clog2($bits(word)) - $clog2($bits(byte)); - 
localparam int BYTES_PER_WORD = 1 << SUBWORD_BITS; - - typedef logic[$bits(word) - SUBWORD_BITS - 1:0] word_ptr; - typedef logic[$bits(word_ptr) - 1 - 1:0] dword_ptr; - typedef logic[$bits(word_ptr) - 2 - 1:0] qword_ptr; - typedef logic[$bits(word_ptr) - 3 - 1:0] oword_ptr; - - typedef logic[7:0] float_exp; - typedef logic[$bits(word) - $bits(float_exp) - 2:0] float_mant; - typedef logic[$bits(float_mant):0] float_mant_full; // Incluye '1.' explícito - typedef logic[$bits(float_mant_full) + 1:0] float_mant_ext; // Considera overflow - - localparam float_exp FLOAT_EXP_BIAS = (1 << ($bits(float_exp) - 1)) - 1; - localparam float_exp FLOAT_EXP_MAX = {($bits(float_exp)){1'b1}}; - - function float_mant_full full_mant(float_mant in); - full_mant = {1'b1, in}; - endfunction - - function float_mant implicit_mant(float_mant_full in); - assert (in[$bits(in) - 1]); - implicit_mant = in[$bits(in) - 2:0]; - endfunction - - typedef struct packed - { - logic sign; - float_exp exp; - float_mant mant; - } float; - - /* Explicación de guard, round, sticky: - * https://drilian.com/2023/01/10/floating-point-numbers-and-rounding/ - */ - typedef struct packed - { - float normal; - logic slow, - zero, - guard, - round, - sticky; - } float_round; - - typedef struct packed - { - logic exp_max, - exp_min, - mant_zero; - } float_class; - - function float_class classify_float(float in); - classify_float.exp_max = &in.exp; - classify_float.exp_min = ~|in.exp; - classify_float.mant_zero = ~|in.mant; - endfunction - - function logic is_float_special(float_class in); - is_float_special = in.exp_max | (in.exp_min & ~in.mant_zero); - endfunction - - function float_mant_ext float_prepare_round(float in, float_class in_class); - float_prepare_round = {~in_class.exp_min, in.mant, 2'b00}; - endfunction - - typedef struct packed - { - logic setup_mul_float, - setup_unit_b, - mnorm_put_hi, - mnorm_put_lo, - mnorm_put_mul, - mnorm_zero_b, - mnorm_zero_flags, - minmax_abs, - minmax_swap, - minmax_zero_min, 
- minmax_copy_flags, - shiftr_int_signed, - addsub_copy_flags, - addsub_int_operand, - clz_force_nop, - shiftl_copy_flags, - round_copy_flags, - round_enable, - encode_enable, - writeback; - } fpint_op; - - typedef struct packed - { - logic todo; - } mem_op; - - typedef struct packed - { - logic todo; - } sfu_op; - - typedef struct packed - { - logic todo; - } group_op; - - // Q22.10 - typedef logic[9:0] fixed_frac; - typedef logic[$bits(word) - $bits(fixed_frac) - 1:0] fixed_int; - - typedef struct packed signed - { - fixed_int fint; // 'int' es una keyword - fixed_frac frac; - } fixed; - - typedef struct packed - { - fixed x, - y; - } fixed_xy; - - typedef struct packed - { - fixed a, - b, - c; - } vtx_fixed; - - typedef struct packed - { - fixed_xy a, - b, - c; - } vtx_xy; - - localparam int RASTER_BITS = 2; - localparam int RASTER_SUB_BITS = 4; - localparam int RASTER_SIZE = 1 << RASTER_BITS; - localparam int RASTER_COARSE_FRAGS = RASTER_SIZE * RASTER_SIZE; - - typedef logic[RASTER_BITS - 1:0] raster_index; - - // Caso RASTER_BITS = 2: -> 4,4,4,4 -> 8,8-> 16 - localparam int RASTER_OUT_CLZ_DEPTH = 3; - - // Asume RASTER_BITS == 2, hay que ajustarlo si cambia - typedef struct packed - { - // Esto ahorra muchos flops - // - // offsets[0] = inc * 0 = 0 - // offsets[1] = inc * 1 = raster2_times1 - // offsets[2] = inc * 2 = raster2_times1 << 1 - // offsets[3] = inc * 3 = raster2_times3 - fixed raster2_times1, - raster2_times3; - } raster_offsets; - - function fixed raster_idx(raster_offsets offsets, raster_index idx); - unique case (idx) - RASTER_BITS'(0): - return '0; - - RASTER_BITS'(1): - return offsets.raster2_times1; - - RASTER_BITS'(2): - return offsets.raster2_times1 << 1; - - RASTER_BITS'(3): - return offsets.raster2_times3; - endcase - endfunction - - function raster_offsets make_raster_offsets(fixed inc); - make_raster_offsets.raster2_times1 = inc; - make_raster_offsets.raster2_times3 = inc + (inc << 1); - endfunction - - typedef struct packed - { - 
raster_offsets x, - y; - } raster_offsets_xy; - - typedef struct packed - { - logic[RASTER_SUB_BITS - 1:0] num; - logic[$bits(fixed_frac) - RASTER_SUB_BITS - 1:0] prec; - } raster_sub; - - localparam int RASTER_COARSE_DIM_BITS = $bits(fixed) - $bits(raster_index) - $bits(raster_sub); - - typedef logic signed[RASTER_COARSE_DIM_BITS - 1:0] raster_coarse_dim; - - typedef struct packed - { - raster_coarse_dim x, - y; - } raster_coarse_xy; - - typedef struct packed signed - { - raster_coarse_dim coarse; - raster_index fine; - raster_sub sub; - } raster_prec; - - typedef struct packed - { - raster_prec x, - y; - } raster_prec_xy; - - // Definir el número de lanes a partir de las dimensiones del - // rasterizer es una decisión crucial, el diseño entero depende de esto - - localparam int SHADER_LANES = RASTER_COARSE_FRAGS; - - typedef logic[RASTER_SIZE - 1:0] lane_no; - typedef logic[SHADER_LANES - 1:0] lane_mask; - - typedef logic[5:0] group_id; - - localparam int REGFILE_STAGES = 3; - localparam int REG_READ_STAGES = 2 + REGFILE_STAGES + 1; - - typedef gfx_isa::sgpr_num sgpr_num; - typedef gfx_isa::vgpr_num vgpr_num; - typedef gfx_isa::xgpr_num xgpr_num; - typedef gfx_isa::pc_offset pc_offset; - - typedef struct packed - { - // No incluye p0 porque p0 no tiene señal ready - logic p1, - p2, - p3, - valid; - } shader_dispatch; - - typedef struct - { - group_id group; - xgpr_num dest; - logic dest_scalar; - } wave_exec; - - localparam int FIXED_MULADD_DEPTH = 5; - localparam int FIXED_DOTADD_DEPTH = 2 * FIXED_MULADD_DEPTH; - - localparam word BOOTROM_BASE = 32'h0010_0000; - - localparam int SCHED_BRAM_WORDS = 2048; // 8KiB - - typedef word irq_lines; - -endpackage diff --git a/platform/wavelet3d/gfx_pkts.sv b/platform/wavelet3d/gfx_pkts.sv deleted file mode 100644 index 41399ce..0000000 --- a/platform/wavelet3d/gfx_pkts.sv +++ /dev/null @@ -1,29 +0,0 @@ -interface gfx_pkts -#(parameter int WIDTH = $bits(gfx::word)); - - import gfx::*; - - logic tlast; - logic tready; - 
logic tvalid; - logic[WIDTH - 1:0] tdata; - - modport tx - ( - input tready, - - output tdata, - tlast, - tvalid - ); - - modport rx - ( - input tdata, - tlast, - tvalid, - - output tready - ); - -endinterface diff --git a/platform/wavelet3d/gfx_raster.sv b/platform/wavelet3d/gfx_raster.sv deleted file mode 100644 index a57a672..0000000 --- a/platform/wavelet3d/gfx_raster.sv +++ /dev/null @@ -1,930 +0,0 @@ -module gfx_raster -( - input logic clk, - rst_n, - - gfx_pkts.rx geometry, - - gfx_pkts.tx coverage -); - - import gfx::*; - - gfx_raster_bounds setup_bounds - ( - .clk, - .rst_n, - - .geometry, - - .edges_ref(bounds_edges_ref), - .edges_vtx(bounds_edges_vtx), - .edges_span(bounds_edges_span), - .edges_ready(bounds_edges_ready), - .edges_valid(bounds_edges_valid), - .edges_geom_id(bounds_edges_geom_id) - ); - - word bounds_edges_geom_id; - logic bounds_edges_ready, bounds_edges_valid; - vtx_xy bounds_edges_vtx; - fixed_xy bounds_edges_ref; - raster_prec_xy bounds_edges_span; - - gfx_raster_edges setup_edges - ( - .clk, - .rst_n, - - .bounds_ref(bounds_edges_ref), - .bounds_vtx(bounds_edges_vtx), - .bounds_span(bounds_edges_span), - .bounds_ready(bounds_edges_ready), - .bounds_valid(bounds_edges_valid), - .bounds_geom_id(bounds_edges_geom_id), - - .coarse_ref(edges_coarse_ref), - .coarse_base(edges_coarse_base), - .coarse_span(edges_coarse_span), - .coarse_ready(edges_coarse_ready), - .coarse_valid(edges_coarse_valid), - .coarse_geom_id(edges_coarse_geom_id), - .coarse_offsets(edges_coarse_offsets) - ); - - word edges_coarse_geom_id; - fixed edges_coarse_base; - logic edges_coarse_ready, edges_coarse_valid; - fixed_xy edges_coarse_ref; - raster_prec_xy edges_coarse_span; - raster_offsets_xy edges_coarse_offsets; - - gfx_raster_coarse coarse - ( - .clk, - .rst_n, - - .edges_ref(edges_coarse_ref), - .edges_base(edges_coarse_base), - .edges_span(edges_coarse_span), - .edges_ready(edges_coarse_ready), - .edges_valid(edges_coarse_valid), - 
.edges_geom_id(edges_coarse_geom_id), - .edges_offsets(edges_coarse_offsets), - - .fine_ref(coarse_fine_ref), - .fine_ready(coarse_fine_ready), - .fine_valid(coarse_fine_valid), - .fine_corner(coarse_fine_corner), - .fine_geom_id(coarse_fine_geom_id), - .fine_offsets(coarse_fine_offsets) - ); - - word coarse_fine_geom_id; - fixed coarse_fine_corner; - logic coarse_fine_ready, coarse_fine_valid; - fixed_xy coarse_fine_ref; - raster_offsets_xy coarse_fine_offsets; - - gfx_raster_fine fine - ( - .clk, - .rst_n, - - .coarse_ref(coarse_fine_ref), - .coarse_ready(coarse_fine_ready), - .coarse_valid(coarse_fine_valid), - .coarse_corner(coarse_fine_corner), - .coarse_geom_id(coarse_fine_geom_id), - .coarse_offsets(coarse_fine_offsets), - - .coverage - ); - -endmodule - -module gfx_raster_bounds -( - input logic clk, - rst_n, - - gfx_pkts.rx geometry, - - input logic edges_ready, - output logic edges_valid, - output gfx::word edges_geom_id, - output gfx::fixed_xy edges_ref, - output gfx::raster_prec_xy edges_span, - output gfx::vtx_xy edges_vtx -); - - import gfx::*; - - enum int unsigned - { - IN_GEOM_ID, - IN_DIM_X, - IN_DIM_Y - } in_state; - - enum int unsigned - { - VTX_A, - VTX_B, - VTX_C - } vtx_state; - - logic a_lt_b, a_lt_c, b_lt_c, edges_handshake, geom_complete, geom_last, - geom_recv, in_vtx, next_dim, new_vtx; - - logic end_new_dim, end_valid, vtx_valid, lt_new_dim, lt_valid, minmax_new_dim, minmax_valid; - - fixed geom_data; - vtx_fixed dim_vtx, dim_vtx_x, dim_vtx_y; - raster_prec max, min; - - assign geom_recv = geometry.tready & geometry.tvalid; - assign edges_handshake = edges_valid & edges_ready; - - assign edges_vtx.a.x = dim_vtx_x.a; - assign edges_vtx.a.y = dim_vtx_y.a; - assign edges_vtx.b.x = dim_vtx_x.b; - assign edges_vtx.b.y = dim_vtx_y.b; - assign edges_vtx.c.x = dim_vtx_x.c; - assign edges_vtx.c.y = dim_vtx_y.c; - - assign geometry.tready = edges_handshake | ~geom_complete; - - always_comb begin - unique case (vtx_state) - VTX_C: next_dim = 
geom_recv; - default: next_dim = 0; - endcase - - unique case (in_state) - IN_DIM_Y: geom_last = next_dim; - default: geom_last = 0; - endcase - end - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - in_state <= IN_GEOM_ID; - vtx_state <= VTX_A; - - in_vtx <= 0; - new_vtx <= 0; - geom_complete <= 0; - - lt_valid <= 0; - end_valid <= 0; - vtx_valid <= 0; - edges_valid <= 0; - minmax_valid <= 0; - - lt_new_dim <= 0; - end_new_dim <= 0; - minmax_new_dim <= 0; - - edges_geom_id <= 'x; - end else begin - end_valid <= 0; - vtx_valid <= end_valid; - lt_valid <= vtx_valid; - minmax_valid <= lt_valid; - - if (~edges_valid | edges_ready) - edges_valid <= minmax_valid; - - geom_complete <= (geom_complete | geom_last) & ~edges_handshake; - - unique case (in_state) - IN_GEOM_ID: - if (geom_recv) begin - in_state <= IN_DIM_X; - - in_vtx <= 1; - edges_geom_id <= geometry.tdata; - end - - IN_DIM_X: - if (next_dim) - in_state <= IN_DIM_Y; - - IN_DIM_Y: - if (next_dim) begin - in_state <= IN_GEOM_ID; - - in_vtx <= 0; - end_valid <= 1; - end - endcase - - new_vtx <= 0; - - lt_new_dim <= 0; - minmax_new_dim <= lt_new_dim; - end_new_dim <= minmax_new_dim; - - unique case (vtx_state) - VTX_A: begin - if (in_vtx & geom_recv) begin - new_vtx <= 1; - vtx_state <= VTX_B; - end - - if (new_vtx) begin - dim_vtx.c <= geom_data; - lt_new_dim <= 1; - end - end - - VTX_B: begin - if (geom_recv) begin - new_vtx <= 1; - vtx_state <= VTX_C; - end - - if (new_vtx) - dim_vtx.a <= geom_data; - end - - VTX_C: begin - if (geom_recv) begin - new_vtx <= 1; - vtx_state <= VTX_A; - end - - if (new_vtx) - dim_vtx.b <= geom_data; - end - endcase - - if (in_state == IN_DIM_Y & next_dim) - assert (geometry.tlast); - end - - always_ff @(posedge clk) begin - geom_data <= geometry.tdata; - - a_lt_b <= $signed(dim_vtx.a) < $signed(dim_vtx.b); - a_lt_c <= $signed(dim_vtx.a) < $signed(dim_vtx.c); - b_lt_c <= $signed(dim_vtx.b) < $signed(dim_vtx.c); - - // Realmente no son 'x' o 'y' hasta cuando 
edges_valid = 1 - if (lt_new_dim) begin - dim_vtx_y <= dim_vtx; - dim_vtx_x <= dim_vtx_y; - end - - if (a_lt_b) begin - min <= a_lt_c ? dim_vtx_y.a : dim_vtx_y.c; - max <= b_lt_c ? dim_vtx_y.c : dim_vtx_y.b; - end else begin - min <= b_lt_c ? dim_vtx_y.b : dim_vtx_y.c; - max <= a_lt_c ? dim_vtx_y.c : dim_vtx_y.a; - end - - {min.fine, min.sub} <= '0; - {max.fine, max.sub} <= '0; - - if (end_new_dim) begin - edges_ref.y <= min; - edges_ref.x <= edges_ref.y; - - edges_span.y <= max - min; - edges_span.x <= edges_span.y; - end - end - -endmodule - -module gfx_raster_edges -( - input logic clk, - rst_n, - - input logic bounds_valid, - input gfx::word bounds_geom_id, - input gfx::fixed_xy bounds_ref, - input gfx::raster_prec_xy bounds_span, - input gfx::vtx_xy bounds_vtx, - output logic bounds_ready, - - input logic coarse_ready, - output logic coarse_valid, - output gfx::word coarse_geom_id, - output gfx::fixed_xy coarse_ref, - output gfx::raster_prec_xy coarse_span, - output gfx::fixed coarse_base, - output gfx::raster_offsets_xy coarse_offsets -); - - import gfx::*; - - enum int unsigned - { - EDGE_AB, - EDGE_BC, - EDGE_CA, - // EDGE_CA cumple doble función como OFFSETS_AB - OFFSETS_BC, - OFFSETS_CA, - OUT - } state; - - struct - { - fixed_xy cur, - delay1, - delay2; - } inc; - - logic coarse_handshake, coarse_stall, offsets_flow; - fixed_xy delta, p, q; - - // - 2 porque coarse valid va al final - logic[FIXED_DOTADD_DEPTH - 2:0] dotadd_valid; - - assign coarse_stall = coarse_valid & ~coarse_ready; - assign coarse_handshake = coarse_valid & coarse_ready; - - gfx_fixed_dotadd edge_base - ( - .clk, - .c(0), - .q(coarse_base), - .a0(delta.x), - .b0(inc.cur.x), - .a1(delta.y), - .b1(inc.cur.y), - .stall(coarse_stall) - ); - - always_comb - unique case (state) - OUT: offsets_flow = coarse_handshake; - default: offsets_flow = 1; - endcase - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - state <= EDGE_AB; - - p <= 'x; - q <= 'x; - coarse_ref <= 'x; - 
coarse_geom_id <= 'x; - - bounds_ready <= 0; - coarse_valid <= 0; - - for (int i = 0; i < $bits(dotadd_valid) - 1; ++i) - dotadd_valid[i] <= 0; - end else begin - for (int i = 1; i < $bits(dotadd_valid); ++i) - dotadd_valid[i] <= dotadd_valid[i - 1]; - - if (~coarse_stall) - coarse_valid <= dotadd_valid[$bits(dotadd_valid) - 1]; - - bounds_ready <= 0; - dotadd_valid[0] <= 0; - - unique case (state) - EDGE_AB: begin - if (bounds_valid) - state <= EDGE_BC; - - coarse_ref <= bounds_ref; - coarse_span <= bounds_span; - coarse_geom_id <= bounds_geom_id; - - p <= bounds_vtx.a; - q <= bounds_vtx.b; - end - - EDGE_BC: begin - state <= EDGE_CA; - bounds_ready <= 1; - - p <= bounds_vtx.b; - q <= bounds_vtx.c; - end - - EDGE_CA: begin - state <= OFFSETS_BC; - - p <= bounds_vtx.c; - q <= bounds_vtx.a; - - // Esto ocurre justamente en un momento en que ab, bc, ca - // quedan todos en sus lugares correctos en la pipeline - dotadd_valid[0] <= 1; - end - - OFFSETS_BC: - state <= OFFSETS_CA; - - OFFSETS_CA: - state <= OUT; - - OUT: - if (coarse_handshake) - state <= EDGE_AB; - endcase - end - - always_ff @(posedge clk) begin - delta.x <= coarse_ref.x - q.x; - delta.y <= coarse_ref.y - q.y; - - inc.cur.x <= p.y - q.y; - inc.cur.y <= q.x - p.x; - - //TODO: top-left rule - if (offsets_flow) begin - inc.delay1 <= inc.cur; - inc.delay2 <= inc.delay1; - - coarse_offsets.x <= make_raster_offsets(inc.delay2.x); - coarse_offsets.y <= make_raster_offsets(inc.delay2.y); - end - end - -endmodule - -module gfx_raster_coarse -( - input logic clk, - rst_n, - - input logic edges_valid, - input gfx::word edges_geom_id, - input gfx::fixed_xy edges_ref, - input gfx::raster_prec_xy edges_span, - input gfx::fixed edges_base, - input gfx::raster_offsets_xy edges_offsets, - output logic edges_ready, - - input logic fine_ready, - output logic fine_valid, - output gfx::word fine_geom_id, - output gfx::fixed_xy fine_ref, - output gfx::fixed fine_corner, - output gfx::raster_offsets_xy fine_offsets -); - - 
import gfx::*; - - enum int unsigned - { - SETUP, - TEST_AB, - TEST_BC, - TEST_CA, - OUT - } state; - - struct - { - fixed cur, - next, - prev; - } corner, edge_fn, vertical; - - struct - { - raster_offsets_xy cur, - next, - prev; - } offsets; - - logic edges_recv, end_block, end_x, end_y, first_run, - mask, mask_reset, new_geom, test_flow, out_flow; - - fixed edge_test, reference_x, vertical_inc; - fixed_xy max_offset, min_offset, test_offset; - raster_coarse_xy stride; - raster_coarse_dim width; - raster_offsets_xy next_offsets; - - function fixed coarse_offset(raster_offsets offsets); - return raster_idx(offsets, RASTER_BITS'(1)) << RASTER_BITS; - endfunction - - assign end_x = stride.x == '0; - assign end_y = stride.y == '0; - assign end_block = end_x & end_y; - - assign edge_test = edge_fn.cur + test_offset.x + test_offset.y; - assign vertical_inc = vertical.cur + coarse_offset(offsets.cur.y); - - assign fine_corner = corner.cur; - assign fine_offsets = offsets.cur; // Vuelve a cur luego de 3 ciclos - - assign min_offset.x = raster_idx(next_offsets.x, RASTER_BITS'(0)); - assign min_offset.y = raster_idx(next_offsets.y, RASTER_BITS'(0)); - assign max_offset.x = raster_idx(next_offsets.x, RASTER_BITS'(RASTER_SIZE - 1)); - assign max_offset.y = raster_idx(next_offsets.y, RASTER_BITS'(RASTER_SIZE - 1)); - assign next_offsets = edges_recv ? 
edges_offsets : offsets.next; - - always_comb begin - unique case (state) - SETUP: new_geom = 1; - default: new_geom = 0; - endcase - - unique case (state) - TEST_AB: mask_reset = 1; - default: mask_reset = 0; - endcase - - unique case (state) - SETUP: edges_ready = 1; - default: edges_ready = 0; - endcase - - unique case (state) - SETUP: - edges_recv = 1; - - TEST_AB, TEST_BC: - edges_recv = first_run; - - default: - edges_recv = 0; - endcase - - unique case (state) - OUT: fine_valid = mask; - default: fine_valid = 0; - endcase - - unique case (state) - OUT: begin - out_flow = ~mask | fine_ready; - test_flow = 0; - end - - default: begin - out_flow = 0; - test_flow = 1; - end - endcase - end - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - state <= SETUP; - first_run <= 1; - end else - unique case (state) - SETUP: - if (edges_valid) - state <= TEST_AB; - - TEST_AB: - state <= TEST_BC; - - TEST_BC: - state <= TEST_CA; - - TEST_CA: - state <= OUT; - - OUT: begin - first_run <= end_block; - if (out_flow) - state <= end_block ? 
SETUP : TEST_AB; - end - endcase - - always_ff @(posedge clk) begin - if (new_geom) begin - width <= edges_span.x.coarse; - stride.x <= edges_span.x.coarse; - stride.y <= edges_span.y.coarse; - reference_x <= edges_ref.x; - - fine_ref <= edges_ref; - fine_geom_id <= edges_geom_id; - end - - if (out_flow) begin - stride.x <= stride.x - 1; - fine_ref.x.fint <= fine_ref.x.fint + ($bits(fixed_int))'(RASTER_SIZE); - - if (end_x) begin - fine_ref.x <= reference_x; - fine_ref.y.fint <= fine_ref.y.fint + ($bits(fixed_int))'(RASTER_SIZE); - - stride.x <= width; - stride.y <= stride.y - 1; - end - end - - if (test_flow) begin - offsets.cur <= next_offsets; - offsets.next <= offsets.prev; - offsets.prev <= offsets.cur; - - vertical.cur <= vertical.next; - vertical.next <= vertical.prev; - vertical.prev <= vertical.cur; - - edge_fn.cur <= edge_fn.next; - edge_fn.next <= edge_fn.prev; - edge_fn.prev <= edge_fn.cur + coarse_offset(offsets.cur.x); - - if (end_x) begin - edge_fn.prev <= vertical_inc; - vertical.prev <= vertical_inc; - end - - corner.cur <= corner.next; - corner.next <= corner.prev; - corner.prev <= edge_fn.cur; - - if (coarse_offset(next_offsets.x) >= 'sd0) - test_offset.x <= max_offset.x; - else - test_offset.x <= min_offset.x; - - if (coarse_offset(next_offsets.y) >= 'sd0) - test_offset.y <= max_offset.y; - else - test_offset.y <= min_offset.y; - - mask <= (mask | mask_reset) & 1/*(edge_test >= 'sd0)*/; - end - - if (edges_recv) begin - edge_fn.cur <= edges_base; - vertical.cur <= edges_base; - end - end - -endmodule - -module gfx_raster_fine -( - input logic clk, - rst_n, - - input logic coarse_valid, - input gfx::word coarse_geom_id, - input gfx::fixed_xy coarse_ref, - input gfx::fixed coarse_corner, - input gfx::raster_offsets_xy coarse_offsets, - output logic coarse_ready, - - gfx_pkts.tx coverage -); - - import gfx::*; - - enum int unsigned - { - IN_C, - IN_A, - IN_B, - IN_MASK - } in_state; - - enum int unsigned - { - OUT_ACCEPT, - OUT_GEOM_ID, - OUT_POS, 
- OUT_MASK, - OUT_BARY_C, - OUT_BARY_A, - OUT_BARY_B - } out_state; - - struct - { - fixed cur, - next, - prev; - } corner; - - struct - { - raster_offsets_xy cur, - next, - prev; - } offsets; - - logic begin_bary, hold_block, in_valid, mask_in_clean, - mask_in_reset, new_block, out_last; - - word geom_id; - fixed bary_coord; - lane_no lane, lane_ctz, lane_hold; - fixed_xy block_ref; - lane_mask mask_in, mask, mask_ctz; - raster_index lane_x, lane_y; - logic[$bits(lane_ctz):0] ctz_count; - - function shword ref_half(raster_prec dim); - return dim.coarse[$bits(shword) - 1:0]; - endfunction - - assign lane_ctz = ctz_count[$bits(lane_ctz) - 1:0]; - assign in_valid = mask_in_clean & |mask_in; - assign out_last = ~|mask; - assign {lane_y, lane_x} = lane; - - // **IMPORTANTE**: Esto va a fallar a partir de RASTER_BITS >= 3, - // ya que la fsm asume que ctz termina en 3 ciclos o menos - - gfx_ctz #(RASTER_COARSE_FRAGS) ctz - ( - .clk, - .value(mask_ctz), - .ctz(ctz_count) - ); - - always_comb begin - unique case (out_state) - OUT_ACCEPT: new_block = 1; - default: new_block = 0; - endcase - - unique case (out_state) - OUT_ACCEPT: mask_ctz = mask_in; - default: mask_ctz = mask; - endcase - - unique case (out_state) - OUT_ACCEPT: coverage.tvalid = 0; - default: coverage.tvalid = 1; - endcase - - unique case (out_state) - OUT_MASK, OUT_BARY_B: - begin_bary = coverage.tready; - - default: - begin_bary = 0; - endcase - - unique case (out_state) - OUT_BARY_B: coverage.tlast = out_last; - default: coverage.tlast = 0; - endcase - - unique case (out_state) - OUT_GEOM_ID: - coverage.tdata = geom_id; - - OUT_POS: - coverage.tdata = {ref_half(coarse_ref.y), ref_half(block_ref.x)}; - - OUT_MASK: - coverage.tdata = {{($bits(word) - $bits(mask)){1'b0}}, mask}; - - OUT_BARY_C, OUT_BARY_A, OUT_BARY_B: - coverage.tdata = bary_coord; - - default: - coverage.tdata = 'x; - endcase - - unique case (out_state) - OUT_MASK: - lane = lane_ctz; - - default: - lane = lane_hold; - endcase - - unique 
case (in_state) - IN_C: coarse_ready = new_block; - default: coarse_ready = 0; - endcase - - unique case (in_state) - IN_C: hold_block = new_block; - IN_A: hold_block = 1; - IN_B: hold_block = 1; - IN_MASK: hold_block = 0; - endcase - - unique case (in_state) - IN_C: mask_in_reset = 1; - default: mask_in_reset = 0; - endcase - - unique case (in_state) - IN_MASK: mask_in_clean = 1; - default: mask_in_clean = 0; - endcase - end - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - in_state <= IN_C; - out_state <= OUT_ACCEPT; - end else begin - unique case (in_state) - IN_C: - if (coarse_valid & new_block) - in_state <= IN_A; - - IN_A: - in_state <= IN_B; - - IN_B: - in_state <= IN_MASK; - - IN_MASK: - in_state <= IN_C; - endcase - - unique case (out_state) - OUT_ACCEPT: - if (in_valid) - out_state <= OUT_GEOM_ID; - - OUT_GEOM_ID: - if (coverage.tready) - out_state <= OUT_POS; - - OUT_POS: - if (coverage.tready) - out_state <= OUT_MASK; - - OUT_MASK: - if (coverage.tready) - out_state <= OUT_BARY_C; - - OUT_BARY_C: - if (coverage.tready) - out_state <= OUT_BARY_A; - - OUT_BARY_A: - if (coverage.tready) - out_state <= OUT_BARY_B; - - OUT_BARY_B: - if (coverage.tready) - out_state <= out_last ? OUT_ACCEPT : OUT_BARY_C; - endcase - end - - always_ff @(posedge clk) begin - // Prueba paralela de signos, esto hace el heavy lifting de fine raster - // Nótese que muchos sumadores serán eliminados en síntesis - for (int i = 0; i < RASTER_SIZE; ++i) - for (int j = 0; j < RASTER_SIZE; ++j) - mask_in[i * RASTER_SIZE + j] <= - (mask_in[i * RASTER_SIZE + j] | mask_in_reset) - & (coarse_corner - + raster_idx(coarse_offsets.y, RASTER_BITS'(i)) - + raster_idx(coarse_offsets.x, RASTER_BITS'(j)) - >= 'sd0); - - // Recalculamos las coordenadas baricéntricas de cada fragmento que - // no haya sido descartado. La razón de esto es evitar almacenar y - // luego multiplexar las coordenadas de un bloque entero (48 words). 
- if (coverage.tready) - bary_coord <= corner.next - + raster_idx(offsets.next.y, RASTER_BITS'(lane_y)) - + raster_idx(offsets.next.x, RASTER_BITS'(lane_x)); - - if (new_block & mask_in_reset) begin - geom_id <= coarse_geom_id; - block_ref <= coarse_ref; - end - - // new_block = 0 => coverage.tvalid = 1 - if (new_block | coverage.tready) begin - corner.cur <= corner.next; - corner.next <= corner.prev; - corner.prev <= corner.cur; - - offsets.cur <= offsets.next; - offsets.next <= offsets.prev; - offsets.prev <= offsets.cur; - end - - if (hold_block) begin - // Para prev en vez de cur para que los primeros valores queden en - // cur justamente al llegar a OUT_BARY_C - corner.prev <= coarse_corner; - offsets.prev <= coarse_offsets; - end - - if (new_block) - mask <= mask_in; - - if (begin_bary) begin - mask <= mask & (mask - 1); - lane_hold <= lane_ctz; - end - end - -endmodule diff --git a/platform/wavelet3d/gfx_regfile_io.sv b/platform/wavelet3d/gfx_regfile_io.sv deleted file mode 100644 index 2459049..0000000 --- a/platform/wavelet3d/gfx_regfile_io.sv +++ /dev/null @@ -1,106 +0,0 @@ -interface gfx_regfile_io; - - import gfx::*; - - struct - { - group_id group; - sgpr_num a_sgpr, - b_sgpr; - vgpr_num a_vgpr, - b_vgpr; - logic[12:0] b_imm; - logic a_scalar, - b_scalar, - b_is_imm, - b_is_const, - scalar_rev; - } op; - - struct - { - logic write; - group_id group; - sgpr_num sgpr; - word data; - } sgpr_write; - - struct - { - lane_mask mask; - group_id group; - vgpr_num vgpr; - word data[SHADER_LANES]; - } vgpr_write; - - word a[SHADER_LANES], b[SHADER_LANES], sgpr_write_data, vgpr_write_data[SHADER_LANES]; - logic mask_wb_write, pc_wb_write; - word_ptr pc_back, pc_front, pc_wb; - group_id mask_back_group, mask_wb_group, pc_back_group, pc_front_group, pc_wb_group; - lane_mask mask_back, mask_wb; - - modport ab - ( - input a, - b - ); - - modport read - ( - output op - ); - - modport bind_ - ( - input pc_front, - - output pc_front_group - ); - - modport wb - ( - input 
pc_back, - mask_back, - - output sgpr_write, - vgpr_write, - - pc_back_group, - mask_back_group, - - pc_wb, - pc_wb_group, - pc_wb_write, - - mask_wb, - mask_wb_group, - mask_wb_write - ); - - modport regs - ( - input op, - sgpr_write, - vgpr_write, - - pc_back_group, - pc_front_group, - mask_back_group, - - pc_wb, - pc_wb_group, - pc_wb_write, - - mask_wb, - mask_wb_group, - mask_wb_write, - - output a, - b, - - pc_back, - pc_front, - mask_back - ); - -endinterface diff --git a/platform/wavelet3d/gfx_rst_sync.sv b/platform/wavelet3d/gfx_rst_sync.sv deleted file mode 100644 index 2a8ea3b..0000000 --- a/platform/wavelet3d/gfx_rst_sync.sv +++ /dev/null @@ -1,13 +0,0 @@ -//FIXME: peligro -module gfx_rst_sync -( - input logic clk, - rst_n, - - output logic srst_n -); - - always_ff @(posedge clk or negedge rst_n) - srst_n <= ~rst_n ? 0 : 1; - -endmodule diff --git a/platform/wavelet3d/gfx_sched.sv b/platform/wavelet3d/gfx_sched.sv deleted file mode 100644 index b8b6b7e..0000000 --- a/platform/wavelet3d/gfx_sched.sv +++ /dev/null @@ -1,139 +0,0 @@ -module gfx_sched -import gfx::*; -( - input logic clk, - rst_n, - srst_n, - - gfx_axil.m axim, - - input irq_lines irq -); - - logic axi_ready, axi_valid, bram_ready, bram_read, bram_write, bram_write_next, - mem_instr, mem_la_read, mem_la_write, mem_ready, mem_valid, select_bram; - - word bram[SCHED_BRAM_WORDS]; - word axi_rdata, bram_rdata, mem_addr, mem_la_addr, mem_rdata, mem_wdata; - logic[$bits(word) / $bits(byte) - 1:0] mem_wstrb; - - logic[$clog2(SCHED_BRAM_WORDS) - 1:0] bram_addr; - - assign bram_addr = mem_addr[$bits(bram_addr) + SUBWORD_BITS - 1:SUBWORD_BITS]; - assign mem_ready = (axi_valid & axi_ready) | bram_ready; - assign mem_rdata = bram_ready ? 
bram_rdata : axi_rdata; - assign select_bram = ~|mem_la_addr[$bits(mem_la_addr) - 1:$bits(bram_addr) + SUBWORD_BITS]; - assign bram_write_next = mem_la_write & select_bram; - - defparam core.ENABLE_COUNTERS = 0; - defparam core.ENABLE_COUNTERS64 = 0; - defparam core.BARREL_SHIFTER = 1; - defparam core.COMPRESSED_ISA = 1; - defparam core.CATCH_MISALIGN = 0; - defparam core.CATCH_ILLINSN = 0; - defparam core.ENABLE_MUL = 1; - defparam core.ENABLE_DIV = 1; - defparam core.ENABLE_IRQ = 1; - defparam core.ENABLE_IRQ_QREGS = 0; - defparam core.ENABLE_IRQ_TIMER = 0; - defparam core.PROGADDR_RESET = BOOTROM_BASE; - - picorv32 core - ( - .clk, - .resetn(srst_n), - .trap(), - - .mem_valid, - .mem_instr, - .mem_ready, - - .mem_addr, - .mem_wdata, - .mem_wstrb, - .mem_rdata, - - .mem_la_read, - .mem_la_write, - .mem_la_addr, - .mem_la_wdata(), - .mem_la_wstrb(), - - .pcpi_valid(), - .pcpi_insn(), - .pcpi_rs1(), - .pcpi_rs2(), - .pcpi_wr(), - .pcpi_rd(), - .pcpi_wait(0), - .pcpi_ready(0), - - .irq, - .eoi(), - - .trace_valid(), - .trace_data() - ); - - picorv32_axi_adapter axi - ( - .clk, - .resetn(srst_n), - - .mem_axi_awvalid(axim.awvalid), - .mem_axi_awready(axim.awready), - .mem_axi_awaddr(axim.awaddr), - .mem_axi_awprot(), - - .mem_axi_wvalid(axim.wvalid), - .mem_axi_wready(axim.wready), - .mem_axi_wdata(axim.wdata), - .mem_axi_wstrb(), // Potenciales sorpresas - - .mem_axi_bvalid(axim.bvalid), - .mem_axi_bready(axim.bready), - - .mem_axi_arvalid(axim.arvalid), - .mem_axi_arready(axim.arready), - .mem_axi_araddr(axim.araddr), - .mem_axi_arprot(), - - .mem_axi_rvalid(axim.rvalid), - .mem_axi_rready(axim.rready), - .mem_axi_rdata(axim.rdata), - - .mem_valid(mem_valid & axi_valid), - .mem_instr, - .mem_ready(axi_ready), - .mem_addr, - .mem_wdata, - .mem_wstrb, - .mem_rdata(axi_rdata) - ); - - always_ff @(posedge clk) begin - if (bram_write) begin - for (int i = 0; i < $bits(mem_wstrb); ++i) - if (mem_wstrb[i]) - bram[bram_addr][i] <= mem_wdata[i]; - - bram_rdata <= 'x; - end 
else - bram_rdata <= bram[bram_addr]; - end - - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - axi_valid <= 0; - bram_read <= 0; - bram_ready <= 0; - bram_write <= 0; - end else begin - axi_valid <= ~select_bram | (axi_valid & ~axi_ready); - bram_read <= mem_la_read & select_bram; - bram_write <= bram_write_next; - bram_ready <= bram_read | bram_write_next; - end - -endmodule diff --git a/platform/wavelet3d/gfx_shader.sv b/platform/wavelet3d/gfx_shader.sv deleted file mode 100644 index 322ffb5..0000000 --- a/platform/wavelet3d/gfx_shader.sv +++ /dev/null @@ -1,77 +0,0 @@ -module gfx_shader -import gfx::*; -import gfx_shader_schedif_pkg::*; -( - input logic clk, - rst_n, - - gfx_axib.m insn_mem, - - gfx_axil.s sched -); - - axi4lite_intf #(.ADDR_WIDTH(GFX_SHADER_SCHEDIF_MIN_ADDR_WIDTH)) regblock(); - - gfx_axil2regblock axil2regblock - ( - .axis(sched), - .axim(regblock.master) - ); - - gfx_shader_schedif__in_t schedif_in; - gfx_shader_schedif__out_t schedif_out; - - gfx_front_back front_back(); - gfx_regfile_io regfile(); - gfx_shader_setup setup(); - - assign schedif_in.SETUP_CTRL.GPR_DONE.hwset = setup.sched.set_done.gpr; - assign schedif_in.SETUP_CTRL.MASK_DONE.hwset = setup.sched.set_done.mask; - assign schedif_in.SETUP_CTRL.SUBMIT_DONE.hwset = setup.sched.set_done.submit; - - assign setup.sched.write.pc = schedif_out.SETUP_SUBMIT.PC.value; - assign setup.sched.write.gpr = schedif_out.SETUP_CTRL.XGPR.value; - assign setup.sched.write.mask = schedif_out.SETUP_MASK.MASK.value; - assign setup.sched.write.group = schedif_out.SETUP_CTRL.GROUP.value; - assign setup.sched.write.pc_set = schedif_out.SETUP_SUBMIT.PC.swmod; - assign setup.sched.write.gpr_set = schedif_out.SETUP_GPR.VALUE.swmod; - assign setup.sched.write.mask_set = schedif_out.SETUP_MASK.MASK.swmod; - assign setup.sched.write.gpr_value = schedif_out.SETUP_GPR.VALUE.value; - - gfx_shader_front frontend - ( - .clk, - .rst_n, - .front(front_back.front), - .reg_bind(regfile.bind_), - 
.reg_read(regfile.read), - .fetch_mem(insn_mem), - .icache_flush(schedif_out.CORE.IFLUSH.value) - ); - - gfx_shader_back backend - ( - .clk, - .rst_n, - .back(front_back.back), - .setup(setup.core), - .reg_wb(regfile.wb), - .read_data(regfile.ab) - ); - - gfx_shader_regs regs - ( - .clk, - .io(regfile.regs) - ); - - gfx_shader_schedif schedif - ( - .clk, - .arst_n(rst_n), - .s_axil(regblock.slave), - .hwif_in(schedif_in), - .hwif_out(schedif_out) - ); - -endmodule diff --git a/platform/wavelet3d/gfx_shader_back.sv b/platform/wavelet3d/gfx_shader_back.sv deleted file mode 100644 index 4929192..0000000 --- a/platform/wavelet3d/gfx_shader_back.sv +++ /dev/null @@ -1,335 +0,0 @@ -module gfx_shader_back -import gfx::*; -( - input logic clk, - rst_n, - - gfx_front_back.back back, - - gfx_regfile_io.ab read_data, - gfx_regfile_io.wb reg_wb, - - gfx_shader_setup.core setup -); - - logic abort; - - gfx_wb out_wb(), p0_wb(), p1_wb(), p2_wb(), p3_wb(); - gfx_shake p1_shake(), p2_shake(), p3_shake(); - - gfx_shader_abort p0_abort - ( - .clk, - .p1(p1_shake.peek), - .p2(p2_shake.peek), - .p3(p3_shake.peek), - .abort - ); - - gfx_shader_fpint p0 - ( - .clk, - .rst_n, - .op(back.execute.p0), - .wb(p0_wb.tx), - .wave(back.execute.wave), - .abort, - .read_data, - .in_valid(back.dispatch.valid) - ); - - gfx_shader_mem p1 - ( - .clk, - .rst_n, - .op(back.execute.p1), - .wb(p1_wb.tx), - .wave(back.execute.wave), - .in_shake(p1_shake.rx), - .read_data - ); - - gfx_shader_sfu p2 - ( - .clk, - .rst_n, - .op(back.execute.p2), - .wb(p2_wb.tx), - .wave(back.execute.wave), - .in_shake(p2_shake.rx), - .read_data - ); - - gfx_shader_group p3 - ( - .clk, - .rst_n, - .op(back.execute.p3), - .wb(p3_wb.tx), - .wave(back.execute.wave), - .in_shake(p3_shake.rx), - .read_data - ); - - gfx_shader_writeback_arbiter4 writeback_arbiter - ( - .clk, - .rst_n, - .p0(p0_wb.rx), - .p1(p1_wb.rx), - .p2(p2_wb.rx), - .p3(p3_wb.rx), - .out(out_wb.tx) - ); - - gfx_shader_writeback writeback - ( - .clk, - .rst_n, - 
.wb(out_wb.rx), - .regs(reg_wb), - .setup, - .loop_group(back.loop.group), - .loop_valid(back.loop.valid) - ); - -endmodule - -module gfx_shader_abort -( - input logic clk, - - gfx_shake.peek p1, - p2, - p3, - - output logic abort -); - - always_ff @(posedge clk) - abort <= - (p1.valid & p1.ready) - | (p2.valid & p2.ready) - | (p3.valid & p3.ready); - -endmodule - -module gfx_shader_writeback_arbiter4 -( - input logic clk, - rst_n, - - gfx_wb.rx p0, - p1, - p2, - p3, - - gfx_wb.tx out -); - - assert property ( - @(posedge clk) - disable iff (~rst_n) - - (p0.ready & out.ready) - ); - - gfx_wb p0_p1(), p2_p3(); - - gfx_shader_writeback_arbiter2_prio arbiter_p0_p1 - ( - .clk, - .rst_n, - .a(p0), - .b(p1), - .out(p0_p1.tx) - ); - - gfx_shader_writeback_arbiter2_prio arbiter_p2_p3 - ( - .clk, - .rst_n, - .a(p2), - .b(p3), - .out(p2_p3.tx) - ); - - gfx_shader_writeback_arbiter2_prio arbiter_out - ( - .clk, - .rst_n, - .a(p0_p1.rx), - .b(p2_p3.tx), - .out - ); - -endmodule - -module gfx_shader_writeback_arbiter2_prio -( - input logic clk, - rst_n, - - gfx_wb.rx a, - b, - - gfx_wb.tx out -); - - //TODO - assign a.ready = out.ready; - assign b.ready = 0; - - assign out.dest = a.dest; - assign out.lanes = a.lanes; - assign out.group = a.group; - assign out.valid = a.valid; - assign out.scalar = a.scalar; - assign out.writeback = a.writeback; - - assign out.mask = a.mask; - assign out.mask_update = a.mask_update; - - assign out.pc_add = a.pc_add; - assign out.pc_inc = a.pc_inc; - assign out.pc_update = a.pc_update; - -endmodule - -module gfx_shader_writeback -import gfx::*; -( - input logic clk, - rst_n, - - gfx_wb.rx wb, - - gfx_regfile_io.wb regs, - - output logic loop_valid, - output group_id loop_group, - - gfx_shader_setup.core setup -); - - struct - { - group_id group; - word lanes[SHADER_LANES]; - pc_offset pc_add; - lane_mask mask; - vgpr_num vgpr; - logic pc_update, - mask_update, - vgpr_update; - } loop_hold[REGFILE_STAGES], loop_out; - - logic 
loop_valid_hold[REGFILE_STAGES], loop_out_valid, mask_wb, scalar_wb, - setup_gpr, setup_mask, setup_submit; - - assign wb.ready = 1; - - assign loop_out = loop_hold[REGFILE_STAGES - 1]; - assign loop_out_valid = loop_valid_hold[REGFILE_STAGES - 1]; - - assign loop_valid = loop_out_valid | setup_submit; - - assign regs.pc_back_group = wb.group; - assign regs.mask_back_group = wb.group; - - assign regs.pc_wb_write = (loop_out_valid & loop_out.pc_update) | setup_submit; - assign regs.mask_wb_write = mask_wb | setup_mask; - assign regs.sgpr_write.write = scalar_wb | setup_gpr; - - assign regs.vgpr_write.vgpr = loop_out.vgpr; - assign regs.vgpr_write.group = loop_out.group; - - assign mask_wb = loop_out_valid & loop_out.mask_update; - assign scalar_wb = wb.valid & wb.writeback & wb.scalar; - - always_comb begin - loop_group = setup.write.group; - regs.pc_wb = setup.write.pc; - regs.pc_wb_group = setup.write.group; - - if (loop_out_valid) begin - loop_group = loop_out.group; - regs.pc_wb = regs.pc_back + word_ptr'(loop_out.pc_add); - regs.pc_wb_group = loop_out.group; - end - - regs.mask_wb = setup.write.mask; - regs.mask_wb_group = setup.write.group; - - if (mask_wb) begin - regs.mask_wb = loop_out.mask; - regs.mask_wb_group = loop_out.group; - end - - regs.sgpr_write.data = setup.write.gpr_value; - regs.sgpr_write.sgpr = setup.write.gpr.sgpr; - regs.sgpr_write.group = setup.write.group; - - if (scalar_wb) begin - regs.sgpr_write.data = wb.lanes[0]; - regs.sgpr_write.sgpr = wb.dest.sgpr; - regs.sgpr_write.group = wb.group; - end - - for (int i = 0; i < SHADER_LANES; ++i) - regs.vgpr_write.data[i] = loop_out.lanes[i]; - - regs.vgpr_write.mask = regs.mask_back; - if (~loop_out_valid | ~loop_out.vgpr_update) - regs.vgpr_write.mask = '0; - end - - always_ff @(posedge clk) begin - // Blocking assignments por bug de verilator (ver for de lanes abajo) - - for (int i = REGFILE_STAGES - 1; i > 0; --i) - loop_hold[i] = loop_hold[i - 1]; - - loop_hold[0].mask = wb.mask; - 
loop_hold[0].vgpr = wb.dest.vgpr.num; - loop_hold[0].group = wb.group; - loop_hold[0].pc_add = wb.pc_add; - loop_hold[0].pc_update = wb.pc_update; - loop_hold[0].mask_update = wb.mask_update; - loop_hold[0].vgpr_update = wb.writeback & ~wb.scalar; - - // https://github.com/verilator/verilator/issues/4804 - for (int i = 0; i < SHADER_LANES; ++i) - loop_hold[0].lanes[i] = wb.lanes[i]; - - if (wb.pc_inc) - loop_hold[0].pc_add = pc_offset'(1); - end - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - setup_gpr <= 0; - setup_mask <= 0; - setup_submit <= 0; - - setup.set_done.gpr <= 0; - setup.set_done.mask <= 0; - setup.set_done.submit <= 0; - - for (int i = 0; i < $size(loop_valid_hold); ++i) - loop_valid_hold[i] <= 0; - end else begin - setup_gpr <= (setup_gpr & scalar_wb) | setup.write.gpr_set; - setup_mask <= (setup_mask & mask_wb) | setup.write.mask_set; - setup_submit <= (setup_submit & loop_out_valid) | setup.write.pc_set; - - setup.set_done.gpr <= setup_gpr & ~scalar_wb; - setup.set_done.mask <= setup_mask & ~mask_wb; - setup.set_done.submit <= setup_submit & ~loop_out_valid; - - loop_valid_hold[0] <= wb.valid; - for (int i = 1; i < REGFILE_STAGES; ++i) - loop_valid_hold[i] <= loop_valid_hold[i - 1]; - end - -endmodule diff --git a/platform/wavelet3d/gfx_shader_fpint.sv b/platform/wavelet3d/gfx_shader_fpint.sv deleted file mode 100644 index a418dcc..0000000 --- a/platform/wavelet3d/gfx_shader_fpint.sv +++ /dev/null @@ -1,932 +0,0 @@ -// -> 4,4,4,4,4,4,4,4 -> 8,8,8,8 -> 16,16 -> 32 -localparam int FPINT_CLZ_STAGES = 4; - -localparam bit[$clog2($bits(gfx::float_mant_ext)):0] FPINT_MAX_SHIFT - = 1 << $clog2($bits(gfx::float_mant_ext)); - -typedef logic[$clog2(FPINT_MAX_SHIFT):0] fpint_shift; - -/* Las 15 etapas son: - * - setup - * - mulclass - * - mnorm - * - minmax - * - expdiff - * - shiftr - * - addsub - * - clz0-clz3 - * - shiftl - * - round - * - rnorm - * - encode - */ - -typedef struct -{ - gfx::float a, - b, - a_mul, - b_mul; -} 
fpint_setup_mulclass; - -typedef struct -{ - gfx::float b; - gfx::float_exp exp; - gfx::float_class a_class, - b_class; - gfx::udword product; - logic sign, - overflow; -} fpint_mulclass_mnorm; - -typedef struct -{ - gfx::float a, - b; - gfx::float_class a_class, - b_class; - logic slow, - zero, - guard, - round, - sticky, - slow_in, - overflow; -} fpint_mnorm_minmax; - -typedef struct -{ - gfx::float max, - min; - gfx::float_class max_class, - min_class; - logic slow, - zero, - guard, - round, - sticky; -} fpint_minmax_expdiff; - -typedef struct -{ - gfx::float max, - min; - gfx::float_class max_class, - min_class; - fpint_shift exp_shift; - logic slow, - zero, - guard, - round, - sticky; -} fpint_expdiff_shiftr; - -typedef struct -{ - gfx::float max, - min; - gfx::float_class max_class, - min_class; - gfx::float_mant_ext max_mant, - min_mant, - sticky_mask; - logic slow, - zero, - guard, - round, - sticky, - int_sign; -} fpint_shiftr_addsub; - -typedef struct -{ - gfx::float max; - gfx::word add_sub; - logic slow, - zero, - guard, - round, - sticky; -} fpint_clz_hold; - -typedef fpint_clz_hold fpint_addsub_clz; - -typedef struct -{ - fpint_clz_hold hold; - fpint_shift shift; -} fpint_clz_shiftl; - -typedef struct -{ - gfx::float val; - logic slow, - zero, - guard, - round, - sticky, - overflow, - sticky_last; -} fpint_shiftl_round; - -typedef struct -{ - gfx::float val; - logic slow, - zero, - exp_step, - overflow; -} fpint_round_rnorm; - -typedef struct -{ - gfx::float val; - logic slow, - zero, - overflow; -} fpint_rnorm_encode; - -module gfx_shader_fpint -import gfx::*; -( - input logic clk, - rst_n, - - input fpint_op op, - input wave_exec wave, - input logic abort, - in_valid, - - gfx_regfile_io.ab read_data, - - gfx_wb.tx wb -); - - localparam int FPINT_STAGES = 7 + FPINT_CLZ_STAGES + 4; - - struct - { - fpint_op op; - wave_exec wave; - } stage[FPINT_STAGES]; - - logic stage_valid[FPINT_STAGES]; - - assign wb.dest = stage[FPINT_STAGES - 1].wave.dest; - 
assign wb.mask = 'x; - assign wb.group = stage[FPINT_STAGES - 1].wave.group; - assign wb.pc_add = 'x; - assign wb.pc_inc = 1; - assign wb.scalar = stage[FPINT_STAGES - 1].wave.dest_scalar; - assign wb.pc_update = wb.writeback; - assign wb.writeback = stage[FPINT_STAGES - 1].op.writeback; - assign wb.mask_update = 0; - - // Ojo: stage_valid[0], pero stage[0] no - assign stage_valid[0] = in_valid; - - genvar lane; - generate - for (lane = 0; lane < SHADER_LANES; ++lane) begin: lanes - gfx_shader_fpint_lane unit - ( - .clk(clk), - .a(read_data.a[lane]), - .b(read_data.b[lane]), - .q(wb.lanes[lane]), - .mul_float_0(op.setup_mul_float), - .unit_b_0(op.setup_unit_b), - .put_hi_2(stage[2 - 1].op.mnorm_put_hi), - .put_lo_2(stage[2 - 1].op.mnorm_put_lo), - .put_mul_2(stage[2 - 1].op.mnorm_put_mul), - .zero_b_2(stage[2 - 1].op.mnorm_zero_b), - .zero_flags_2(stage[2 - 1].op.mnorm_zero_flags), - .abs_3(stage[3 - 1].op.minmax_abs), - .swap_3(stage[3 - 1].op.minmax_swap), - .zero_min_3(stage[3 - 1].op.minmax_zero_min), - .copy_flags_3(stage[3 - 1].op.minmax_copy_flags), - .int_signed_5(stage[5 - 1].op.shiftr_int_signed), - .copy_flags_6(stage[6 - 1].op.addsub_copy_flags), - .int_operand_6(stage[6 - 1].op.addsub_int_operand), - .force_nop_7(stage[7 - 1].op.clz_force_nop), - .copy_flags_11(stage[11 - 1].op.shiftl_copy_flags), - .copy_flags_12(stage[12 - 1].op.round_copy_flags), - .enable_12(stage[12 - 1].op.round_enable), - .enable_14(stage[14 - 1].op.encode_enable) - ); - end - endgenerate - - always_ff @(posedge clk) begin - stage[0].op <= op; - stage[0].wave <= wave; - - for (int i = 1; i < FPINT_STAGES; ++i) - stage[i] <= stage[i - 1]; - end - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - for (int i = 1; i < FPINT_STAGES; ++i) - stage_valid[i] <= 0; - - wb.valid <= 0; - end else begin - for (int i = 1; i < FPINT_STAGES; ++i) - stage_valid[i] <= stage_valid[i - 1]; - - // Se levanta 1 ciclo luego que in_valid - stage_valid[2] <= stage_valid[1] & ~abort; - - 
wb.valid <= stage_valid[FPINT_STAGES - 1]; - end - -endmodule - -module gfx_shader_fpint_lane -import gfx::*; -( - input logic clk, - - input word a, - b, - - input logic mul_float_0, - unit_b_0, - put_hi_2, - put_lo_2, - put_mul_2, - zero_b_2, - zero_flags_2, - abs_3, - swap_3, - zero_min_3, - copy_flags_3, - int_signed_5, - copy_flags_6, - int_operand_6, - force_nop_7, - copy_flags_11, - copy_flags_12, - enable_12, - enable_14, - - output word q -); - - /* Notas de implementación para floating-point - * - * === PRODUCTO === - * - * Queremos calcular q = a * b. - * - * Donde a = (-1)^s * 1.m * 2^f, - * b = (-1)^t * 1.n * 2^g - * - * Entonces q = (-1)^(s + t) (1.m * 1.n) 2^(f + g) - * - * El producto es entre números >= 1.0 y < 2.0. En el peor caso: - * Mejor caso: 1.000... * 1.000... ~ 1.000... - * Peor caso: 1.999... * 1.999... ~ 3.999... = 2^1 * 1.999 - * - * Así que, si el producto es >= 2, hay que hacerle >> 1 a la mantisa - * y sumarle 1 al exponente para normalizar. - * - * - * === SUMA/RESTA === - * - * Queremos calcular q = a + b. Curiosamente, eso es más complicado que a * b. - * Hay que ajustar el exponente del menor entre a y b para que coincida - * con el del mayor (desnormalizando), realizar la operación y finalmente - * renormalizar. Se hace suma o resta dependiendo de relaciones de signos, - * no según la operación de entrada (eso último solo le hace xor al signo de b). - * Recordar aquí que IEEE 754 es una especie de signo-magnitud y no complemento. - * - * En el caso de una resta, el exponente normalizado puede ser mucho más - * pequeño que cualquiera de los exponentes de entrada. Necesitamos - * entonces de lǵoica CLZ (count leading zeros) para renormalizar. - * - * - * === CONVERSIÓN INTEGER->FP === - * - * Esto simplemente usa el mismo datapath de fadd, con el abs del entero - * como entrada como entrada de clz. El exponente de referencia se fija - * en 30 (aludiendo al segundo msb de un entero de 32 bits). 
A partir de - * ese punto es idéntico a un fadd, las etapas de clz se encargan de ajustar - * el exponente. - */ - - fpint_setup_mulclass setup_mulclass; - fpint_mulclass_mnorm mulclass_mnorm; - fpint_mnorm_minmax mnorm_minmax; - fpint_minmax_expdiff minmax_expdiff; - fpint_expdiff_shiftr expdiff_shiftr; - fpint_shiftr_addsub shiftr_addsub; - fpint_addsub_clz addsub_clz; - fpint_clz_shiftl clz_shiftl; - fpint_shiftl_round shiftl_round; - fpint_round_rnorm round_rnorm; - fpint_rnorm_encode rnorm_encode; - - gfx_shader_fpint_setup stage_0 - ( - .clk(clk), - .a(a), - .b(b), - .out(setup_mulclass), - .unit_b(unit_b_0), - .mul_float(mul_float_0) - ); - - gfx_shader_fpint_mulclass stage_1 - ( - .clk(clk), - .in(setup_mulclass), - .out(mulclass_mnorm) - ); - - gfx_shader_fpint_mnorm stage_2 - ( - .clk(clk), - .in(mulclass_mnorm), - .out(mnorm_minmax), - .put_hi(put_hi_2), - .put_lo(put_lo_2), - .put_mul(put_mul_2), - .zero_b(zero_b_2), - .zero_flags(zero_flags_2) - ); - - gfx_shader_fpint_minmax stage_3 - ( - .clk(clk), - .in(mnorm_minmax), - .out(minmax_expdiff), - .abs(abs_3), - .swap(swap_3), - .zero_min(zero_min_3), - .copy_flags(copy_flags_3) - ); - - gfx_shader_fpint_expdiff stage_4 - ( - .clk(clk), - .in(minmax_expdiff), - .out(expdiff_shiftr) - ); - - gfx_shader_fpint_shiftr stage_5 - ( - .clk(clk), - .in(expdiff_shiftr), - .out(shiftr_addsub), - .int_signed(int_signed_5) - ); - - gfx_shader_fpint_addsub stage_6 - ( - .clk(clk), - .in(shiftr_addsub), - .out(addsub_clz), - .copy_flags(copy_flags_6), - .int_operand(int_operand_6) - ); - - gfx_shader_fpint_clz stage_7_8_9_10 - ( - .clk(clk), - .in(addsub_clz), - .out(clz_shiftl), - .force_nop(force_nop_7) - ); - - gfx_shader_fpint_shiftl stage_11 - ( - .clk(clk), - .in(clz_shiftl), - .out(shiftl_round), - .copy_flags(copy_flags_11) - ); - - gfx_shader_fpint_round stage_12 - ( - .clk(clk), - .in(shiftl_round), - .out(round_rnorm), - .enable(enable_12), - .copy_flags(copy_flags_12) - ); - - gfx_shader_fpint_rnorm 
stage_13 - ( - .clk(clk), - .in(round_rnorm), - .out(rnorm_encode) - ); - - gfx_shader_fpint_encode stage_14 - ( - .clk(clk), - .q(q), - .in(rnorm_encode), - .enable(enable_14) - ); - -endmodule - -// Stage 0: argumentos de mul -module gfx_shader_fpint_setup -import gfx::*; -( - input logic clk, - - input word a, - b, - input logic mul_float, - unit_b, - - output fpint_setup_mulclass out -); - - always_ff @(posedge clk) begin - out.a <= a; - out.b <= b; - out.a_mul <= a; - out.b_mul <= b; - - /* Nótese que el orden es sign-exp-mant. Esto coloca el '1.' implícito - * en la posición correcta para multiplicar las mantisas. - */ - if (mul_float) begin - out.a_mul.exp <= 1; - out.b_mul.exp <= 1; - out.a_mul.sign <= 0; - out.b_mul.sign <= 0; - end - - if (unit_b) begin - out.b_mul.exp <= 0; - out.b_mul.mant <= 1; - out.b_mul.sign <= 0; - end - end - -endmodule - -// Stage 1: multiplicación de fp o enteros -module gfx_shader_fpint_mulclass -import gfx::*; -( - input logic clk, - - input fpint_setup_mulclass in, - - output fpint_mulclass_mnorm out -); - - always_ff @(posedge clk) begin - out.b <= in.b; - out.sign <= in.a.sign ^ in.b.sign; - out.a_class <= classify_float(in.a); - out.b_class <= classify_float(in.b); - out.product <= in.a_mul * in.b_mul; - {out.overflow, out.exp} <= {1'b0, in.a.exp} + {1'b0, in.b.exp} - {1'b0, FLOAT_EXP_BIAS}; - end - -endmodule - -// Stage 2: normalización -module gfx_shader_fpint_mnorm -import gfx::*; -( - input logic clk, - - input fpint_mulclass_mnorm in, - input logic put_hi, - put_lo, - put_mul, - zero_b, - zero_flags, - - output fpint_mnorm_minmax out -); - - word product_hi, product_lo; - logic guard, lo_msb, lo_reduce, round, slow_in_next; - float_mant_full hi; - logic[$bits(float_mant_full) - 3:0] lo; - - assign lo_msb = lo[$bits(lo) - 1]; - assign lo_reduce = |lo[$bits(lo) - 2:0]; - assign slow_in_next = is_float_special(in.a_class) | is_float_special(in.b_class); - assign {product_hi, product_lo} = in.product; - assign {hi, 
guard, round, lo} = in.product[2 * $bits(float_mant_full) - 1:0]; - - always_ff @(posedge clk) begin - if (put_mul) begin - out.slow <= slow_in_next | (in.overflow & ~in.a_class.exp_min & ~in.a_class.exp_min); - out.zero <= in.a_class.exp_min | in.b_class.exp_min; - end else begin - out.slow <= 0; - out.zero <= 0; - end - - out.a.sign <= in.sign; - out.overflow <= 0; - - if (hi[$bits(hi) - 1]) begin - out.guard <= guard; - out.round <= round; - out.sticky <= lo_msb | lo_reduce; - out.a.mant <= implicit_mant(hi); - {out.overflow, out.a.exp} <= {1'b0, in.exp} + 1; - end else begin - /* Bit antes de msb es necesariamente 1, ya que los msb de - * ambos multiplicandos son 1. Ver assert en implicit_mant(). - */ - out.guard <= round; - out.round <= lo_msb; - out.sticky <= lo_reduce; - - out.a.exp <= in.exp; - out.a.mant <= implicit_mant({hi[$bits(hi) - 2:0], guard}); - end - - unique case (1'b1) - put_mul: ; - - put_hi: - out.a <= product_hi; - - put_lo: - out.a <= product_lo; - endcase - - out.a_class <= in.a_class; - out.slow_in <= slow_in_next; - - if (zero_flags) begin - out.a_class <= classify_float(0); - out.slow_in <= 0; - end - - if (zero_b) begin - out.b <= 0; - out.b_class <= classify_float(0); - end else begin - out.b <= in.b; - out.b_class <= in.b_class; - end - end - -endmodule - -// Stage 3: ordenar tal que abs(max) >= abs(min) -module gfx_shader_fpint_minmax -import gfx::*; -( - input logic clk, - - input fpint_mnorm_minmax in, - input logic abs, - swap, - zero_min, - copy_flags, - - output fpint_minmax_expdiff out -); - - logic abs_b_gt_abs_a, b_gt_a; - - /* Wiki dice: - * - * A property of the single- and double-precision formats is that - * their encoding allows one to easily sort them without using - * floating-point hardware, as if the bits represented sign-magnitude - * integers, although it is unclear whether this was a design - * consideration (it seems noteworthy that the earlier IBM hexadecimal - * floating-point representation also had this 
property for normalized - * numbers). - */ - assign abs_b_gt_abs_a = {in.b.exp, in.b.mant} > {in.a.exp, in.a.mant}; - - always_comb begin - unique case ({in.b.sign, in.a.sign}) - 2'b00: b_gt_a = abs_b_gt_abs_a; - 2'b01: b_gt_a = 1; - 2'b10: b_gt_a = 0; - 2'b11: b_gt_a = abs_b_gt_abs_a; - endcase - - if (abs) - b_gt_a = abs_b_gt_abs_a; - end - - always_ff @(posedge clk) begin - if (b_gt_a ^ swap) begin - out.max <= in.b; - out.min <= in.a; - out.max_class <= in.b_class; - out.min_class <= in.a_class; - end else begin - out.max <= in.a; - out.min <= in.b; - out.max_class <= in.a_class; - out.min_class <= in.b_class; - end - - if (zero_min) begin - out.min <= 0; - out.min_class <= classify_float(0); - end - - out.guard <= in.guard; - out.round <= in.round; - out.sticky <= in.sticky; - - if (copy_flags) begin - out.slow <= in.slow | in.overflow; - out.zero <= in.zero; - end else begin - out.slow <= in.slow_in; - out.zero <= 0; - end - end - -endmodule - -// Stage 4: exp_shift amount -module gfx_shader_fpint_expdiff -import gfx::*; -( - input logic clk, - - input fpint_minmax_expdiff in, - - output fpint_expdiff_shiftr out -); - - float_exp exp_delta; - - assign exp_delta = in.max.exp - in.min.exp; - - always_ff @(posedge clk) begin - out.max <= in.max; - out.min <= in.min; - out.slow <= in.slow; - out.zero <= in.zero; - out.guard <= in.guard; - out.round <= in.round; - out.sticky <= in.sticky; - out.max_class <= in.max_class; - out.min_class <= in.min_class; - - out.exp_shift <= exp_delta[$bits(out.exp_shift) - 1:0]; - if (exp_delta > {{($bits(exp_delta) - $bits(FPINT_MAX_SHIFT)){1'b0}}, FPINT_MAX_SHIFT}) - out.exp_shift <= FPINT_MAX_SHIFT; - end - -endmodule - -// Stage 5: shifts y abs(max) para enteros con signo -module gfx_shader_fpint_shiftr -import gfx::*; -( - input logic clk, - - input fpint_expdiff_shiftr in, - input logic int_signed, - - output fpint_shiftr_addsub out -); - - always_ff @(posedge clk) begin - out.min <= in.min; - out.slow <= in.slow; - out.zero 
<= in.zero; - out.guard <= in.guard; - out.round <= in.round; - out.sticky <= in.sticky; - out.min_class <= in.min_class; - - out.max_mant <= float_prepare_round(in.max, in.max_class); - out.min_mant <= float_prepare_round(in.min, in.min_class) >> in.exp_shift; - out.sticky_mask <= {($bits(out.min_mant)){1'b1}} << in.exp_shift; - - out.max <= in.max; - out.int_sign <= in.max[$bits(in.max) - 1]; - - if (int_signed & in.max[$bits(in.max) - 1]) - out.max <= -in.max; - end - -endmodule - -// Stage 6: suma de mantisas -module gfx_shader_fpint_addsub -import gfx::*; -( - input logic clk, - - input fpint_shiftr_addsub in, - input logic copy_flags, - int_operand, - - output fpint_addsub_clz out -); - - localparam int INT_SHIFT_REF = $bits(word) - 2; - - function word fp_add_sub_arg(float_mant_ext arg); - fp_add_sub_arg = {1'b0, arg, {($bits(fp_add_sub_arg) - $bits(arg) - 1){1'b0}}}; - endfunction - - always_ff @(posedge clk) begin - out.max <= in.max; - out.slow <= in.slow; - out.zero <= in.zero; - out.guard <= in.guard; - out.round <= in.round; - - if (int_operand) begin - out.max.exp <= FLOAT_EXP_BIAS + INT_SHIFT_REF[$bits(float_exp) - 1:0]; - out.max.sign <= in.int_sign; - end - - if (copy_flags) - out.sticky <= in.sticky; - else - out.sticky <= |(float_prepare_round(in.min, in.min_class) & ~in.sticky_mask); - - if (int_operand) - out.add_sub <= in.max; - else if (in.max.sign ^ in.min.sign) - out.add_sub <= fp_add_sub_arg(in.max_mant) - fp_add_sub_arg(in.min_mant); - else - out.add_sub <= fp_add_sub_arg(in.max_mant) + fp_add_sub_arg(in.min_mant); - end - -endmodule - -// Stages 7-10: encontrar el 1 más significativo -module gfx_shader_fpint_clz -import gfx::*; -( - input logic clk, - - input fpint_addsub_clz in, - input logic force_nop, - - output fpint_clz_shiftl out -); - - word clz_in; - fpint_clz_hold hold[FPINT_CLZ_STAGES]; - - assign out.hold = hold[FPINT_CLZ_STAGES - 1]; - - gfx_clz #($bits(word)) clz - ( - .clk(clk), - .clz(out.shift), - .value(clz_in) - ); - - 
always_comb begin - clz_in = in.add_sub; - if (force_nop) - clz_in[$bits(clz_in) - 1:$bits(clz_in) - 2] = 2'b01; - end - - always_ff @(posedge clk) begin - hold[0] <= in; - - for (int i = 1; i < FPINT_CLZ_STAGES; ++i) - hold[i] <= hold[i - 1]; - end - -endmodule - -// Stage 11: normalización -module gfx_shader_fpint_shiftl -import gfx::*; -( - input logic clk, - - input fpint_clz_shiftl in, - input logic copy_flags, - - output fpint_shiftl_round out -); - - localparam int CLZ_EXTEND_BITS = $bits(float_exp) - $bits(in.shift) + 1; - - word normalized; - - assign normalized = in.hold.add_sub << in.shift; - - always_ff @(posedge clk) begin - out.slow <= in.hold.slow; - out.zero <= in.hold.zero; - out.sticky <= in.hold.sticky; - out.val.sign <= in.hold.max.sign; - - {out.val.mant, out.guard, out.round, out.sticky_last} <= - normalized[$bits(normalized) - 2:$bits(normalized) - $bits(float_mant) - 4]; - - {out.overflow, out.val.exp} <= - {1'b0, in.hold.max.exp} - {{CLZ_EXTEND_BITS{1'b0}}, in.shift} + 1; - - if (in.shift[$bits(in.shift) - 1]) - out.zero <= 1; - - if (copy_flags) begin - out.guard <= in.hold.guard; - out.round <= in.hold.round; - out.overflow <= 0; - out.sticky_last <= 0; - end - end - -endmodule - -// Stage 12: redondeo -module gfx_shader_fpint_round -import gfx::*; -( - input logic clk, - - input fpint_shiftl_round in, - input logic copy_flags, - enable, - - output fpint_round_rnorm out -); - - always_ff @(posedge clk) begin - out.val <= in.val; - out.slow <= in.slow | (~copy_flags & in.overflow & ~in.zero); - out.zero <= in.zero; - out.exp_step <= 0; - - // Este es el modo de redondeo más usual: round to nearest, ties to even - if (enable & in.guard & (in.round | in.sticky | in.sticky_last | in.val.mant[0])) - {out.exp_step, out.val.mant} <= {1'b0, out.val.mant} + 1; - end - -endmodule - -// Stage 13: ajuste de exponente por redondeo -module gfx_shader_fpint_rnorm -import gfx::*; -( - input logic clk, - - input fpint_round_rnorm in, - - output 
fpint_rnorm_encode out -); - - always_ff @(posedge clk) begin - out.slow <= in.slow; - out.zero <= in.zero; - out.overflow <= 0; - out.val.mant <= in.val.mant; - out.val.sign <= in.val.sign; - - if (in.exp_step) - {out.overflow, out.val.exp} <= {1'b0, in.val.exp} + 1; - else - out.val.exp <= in.val.exp; - end - -endmodule - -// Stage 14: salida y codificación de ceros y NaNs -module gfx_shader_fpint_encode -import gfx::*; -( - input logic clk, - - input fpint_rnorm_encode in, - input logic enable, - - output float q -); - - always_ff @(posedge clk) begin - q <= in.val; - - if (enable) begin - if (&in.val.exp | in.slow | in.overflow) begin - q.exp <= FLOAT_EXP_MAX; - q.mant <= 1; - end else if (in.zero) begin - q.exp <= 0; - q.mant <= 0; - end - end - end - -endmodule diff --git a/platform/wavelet3d/gfx_shader_front.sv b/platform/wavelet3d/gfx_shader_front.sv deleted file mode 100644 index 52074fd..0000000 --- a/platform/wavelet3d/gfx_shader_front.sv +++ /dev/null @@ -1,746 +0,0 @@ -typedef struct -{ - logic valid, - retry; - gfx::group_id group; - gfx_isa::insn_word insn; -} front_wave; - -typedef struct -{ - gfx::xgpr_num dest; - logic dest_scalar; -} front_reg_passthru; - -typedef logic[4:0] icache_line_num; - -typedef logic[$bits(gfx::oword_ptr) - $bits(icache_line_num) - 1:0] icache_tag; - -typedef struct packed -{ - icache_tag tag; - icache_line_num line; -} icache_line_tag; - -typedef struct packed -{ - icache_line_tag line_tag; - logic[2:0] word_num; -} icache_ptr; - -module gfx_shader_front -import gfx::*; -( - input logic clk, - rst_n, - - gfx_axib.m fetch_mem, - - input logic icache_flush, - - gfx_regfile_io.read reg_read, - gfx_regfile_io.bind_ reg_bind, - - gfx_front_back.front front -); - - word fetch_insn, port_insn; - logic fetch_hit, p0_writeback; - front_wave bind_wave, dec_wave, port_dec_wave; - front_reg_passthru reg_passthru; - - assign front.execute.wave.dest = reg_passthru.dest; - assign front.execute.wave.dest_scalar = 
reg_passthru.dest_scalar; - - gfx_shader_bind bind_ - ( - .clk, - .rst_n, - .mem(fetch_mem), - .wave(bind_wave), - .regs(reg_bind), - .loop_valid(front.loop.valid), - .loop_group(front.loop.group), - .icache_flush - ); - - gfx_shader_read_regs reg_dec - ( - .clk, - .rst_n, - .in(bind_wave), - .out(dec_wave), - .read(reg_read), - .passthru(reg_passthru) - ); - - gfx_shader_decode_class class_dec - ( - .clk, - .rst_n, - .wave(dec_wave), - .out_group(front.execute.wave.group), - .port_wave(port_dec_wave), - .dispatch(front.dispatch), - .p0_writeback - ); - - gfx_shader_decode_fpint p0_dec - ( - .clk, - .op(front.execute.p0), - .insn(port_dec_wave.insn), - .writeback(p0_writeback) - ); - -endmodule - -module gfx_shader_bind -import gfx::*; -( - input logic clk, - rst_n, - - gfx_axib.m mem, - - input logic icache_flush, - - input logic loop_valid, - input group_id loop_group, - - gfx_regfile_io.bind_ regs, - - output front_wave wave -); - - localparam int ICACHE_STAGES = 6; - localparam int BIND_STAGES = REGFILE_STAGES + ICACHE_STAGES; - - gfx_beats #($bits(group_id)) runnable_in(), runnable_out(); - - logic ar_stall, request_ready, request_valid, valids[BIND_STAGES]; - group_id groups[BIND_STAGES]; - icache_line_tag araddr, request_addr; - - assign mem.bready = 0; - assign mem.wvalid = 0; - assign mem.awvalid = 0; - - assign mem.arlen = ($bits(mem.arlen))'($bits(oword) / $bits(word) - 1); - assign mem.araddr = {araddr, ($clog2($bits(oword)) - $clog2($bits(word)) + SUBWORD_BITS)'('0)}; - assign mem.arburst = 2'b01; // Incremental mode - - assign runnable_in.tx.data = loop_group; - assign runnable_in.tx.valid = loop_valid; - - assign regs.pc_front_group = runnable_out.rx.data; - assign runnable_out.rx.ready = 1; - - assign wave.group = groups[$size(groups) - 1]; - - gfx_skid_buf #($bits(araddr)) ar_skid - ( - .clk, - .in(request_addr), - .out(araddr), - .stall(ar_stall) - ); - - gfx_skid_flow ar_flow - ( - .clk, - .rst_n, - .stall(ar_stall), - .in_ready(request_ready), - 
.in_valid(request_valid), - .out_ready(mem.arready), - .out_valid(mem.arvalid) - ); - - //TODO: Podríamos quitar ~25 entries sin afectar throughput, latencia o correctitud - gfx_fifo #(.WIDTH($bits(group_id)), .DEPTH(1 << $bits(group_id))) runnable - ( - .clk, - .rst_n, - .in(runnable_in.rx), - .out(runnable_out.tx) - ); - - gfx_shader_bind_icache icache - ( - .clk, - .rst_n, - - .icache_flush, - .read_addr(regs.pc_front), - .read_valid(valids[REGFILE_STAGES - 1]), - - .request_addr, - .request_valid, - .request_ready, - - .fetch_data(mem.rdata), - .fetch_last(mem.rlast), - .fetch_valid(mem.rvalid), - .fetch_ready(mem.rready), - - .insn(wave.insn), - .insn_retry(wave.retry), - .insn_valid(wave.valid) - ); - - always_ff @(posedge clk) begin - groups[0] <= runnable_out.rx.data; - for (int i = 1; i < $size(groups); ++i) - groups[i] <= groups[i - 1]; - end - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) - for (int i = 0; i < $size(valids); ++i) - valids[i] <= 0; - else begin - valids[0] <= runnable_out.rx.valid; - for (int i = 1; i < $size(valids); ++i) - valids[i] <= valids[i - 1]; - end - -endmodule - -module gfx_shader_bind_icache -import gfx::*; -( - input logic clk, - rst_n, - - input logic icache_flush, - - input logic read_valid, - input icache_ptr read_addr, - - input logic fetch_last, - fetch_valid, - input word fetch_data, - output logic fetch_ready, - - input logic request_ready, - output logic request_valid, - output icache_line_tag request_addr, - - output logic insn_valid, - insn_retry, - output word insn -); - - // Dan Gisselquist limita a (1 << 3) bursts por defecto. 
- // Ver LGMAXBURST en axixbar.v - localparam int PENDING_FIFO_DEPTH = 8; - - enum int unsigned - { - FLUSH, - RUN - } state; - - struct - { - logic valid, - accessed, - hit; - icache_tag tag; - oword data; - } cache[1 << $bits(icache_line_num)], read, read_hold; - - gfx_beats #($bits(icache_line_tag)) pending_in(), pending_out(); - - logic accessed_write, accessed_write_enable, burst, fetch_done, hit_write, - in_flush, hit_commit, hit_write_enable, retry_4, retry_5, rollback, - tag_hit, valid_1, valid_2, valid_3, valid_4, valid_5, valid_write, - valid_write_enable; - - icache_ptr read_addr_1, read_addr_2, read_addr_3, read_addr_4, read_addr_5; - icache_tag tag_write; - icache_line_num accessed_write_line, flush_ptr, hit_write_line, valid_write_line; - icache_line_tag pending_pop; - - oword data_write; - word[1:0] data_5; - word[7:0] fetch_shift; - qword[1:0] data_3; - udword[1:0] data_4; - - assign data_3 = read.data; - assign tag_hit = read.tag == read_addr_3.line_tag.tag; - assign fetch_ready = ~fetch_done; - assign pending_pop = pending_out.rx.data; - - assign request_addr = read_addr_4.line_tag; - assign request_valid = burst & pending_in.tx.ready; - assign pending_in.tx.data = read_addr_4.line_tag; - assign pending_in.tx.valid = burst & request_ready; - assign pending_out.rx.ready = fetch_done & ~hit_commit & ~rollback; - - gfx_fifo #(.WIDTH($bits(icache_line_tag)), .DEPTH(PENDING_FIFO_DEPTH)) pending - ( - .clk, - .rst_n, - .in(pending_in.rx), - .out(pending_out.tx) - ); - - always_comb - unique case (state) - FLUSH: in_flush = 1; - RUN: in_flush = 0; - endcase - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - state <= FLUSH; - flush_ptr <= '0; - fetch_done <= 0; - - valid_1 <= 0; - valid_2 <= 0; - valid_3 <= 0; - valid_4 <= 0; - valid_5 <= 0; - - burst <= 0; - end else begin - unique case (state) - FLUSH: - if (~icache_flush & &flush_ptr) - state <= RUN; - - RUN: - if (icache_flush) - state <= FLUSH; - endcase - - flush_ptr <= flush_ptr + 
1; - if (icache_flush) - flush_ptr <= '0; - - if (fetch_done) - fetch_done <= hit_commit | ~pending_out.rx.valid | rollback; - else if (fetch_ready & fetch_valid) - fetch_done <= fetch_last; - - valid_1 <= read_valid; - valid_2 <= valid_1; - valid_3 <= valid_2; - valid_4 <= valid_3; - valid_5 <= valid_4; - - burst <= valid_3 & ~tag_hit & ~read.accessed & (~read.valid | read.hit); - end - - always_ff @(posedge clk) begin - tag_write <= pending_pop.tag; - data_write <= fetch_shift; - - valid_write <= 1; - valid_write_line <= pending_pop.line; - valid_write_enable <= fetch_done & ~hit_commit & pending_out.rx.valid & ~rollback; - - accessed_write <= 0; - accessed_write_enable <= 1; - - if (rollback) - accessed_write_line <= read_addr_5.line_tag.line; - else if (fetch_done & ~hit_commit & pending_out.rx.valid) - accessed_write_line <= pending_pop.line; - else begin - accessed_write <= 1; - accessed_write_line <= read_addr.line_tag.line; - accessed_write_enable <= read_valid; - end - - hit_write <= hit_commit; - if (hit_commit) begin - hit_write_line <= read_addr_4.line_tag.line; - hit_write_enable <= 1; - end else begin - hit_write_line <= pending_pop.line; - hit_write_enable <= fetch_done & pending_out.rx.valid & ~rollback; - end - - if (in_flush) begin - valid_write <= 0; - valid_write_line <= flush_ptr; - valid_write_enable <= 1; - - accessed_write <= 0; - accessed_write_line <= flush_ptr; - accessed_write_enable <= 1; - - hit_write <= 0; - hit_write_line <= flush_ptr; - hit_write_enable <= 1; - end - - if (valid_write_enable) begin - cache[valid_write_line].tag <= tag_write; - cache[valid_write_line].data <= data_write; - cache[valid_write_line].valid <= valid_write; - end - - if (accessed_write_enable) - cache[accessed_write_line].accessed <= accessed_write; - - if (hit_write_enable) - cache[hit_write_line].hit <= hit_write; - - read_addr_1 <= read_addr; - - read_hold <= cache[read_addr_1.line_tag.line]; - read_addr_2 <= read_addr_1; - - read <= read_hold; - 
read_addr_3 <= read_addr_2; - - data_4 <= data_3[read_addr_3.word_num[2]]; - retry_4 <= ~tag_hit | ~read.valid; - hit_commit <= valid_3 & tag_hit & read.valid; - read_addr_4 <= read_addr_3; - - data_5 <= data_4[read_addr_4.word_num[1]]; - retry_5 <= retry_4; - rollback <= burst & (~request_valid | ~pending_in.tx.valid); - read_addr_5 <= read_addr_4; - - insn <= data_5[read_addr_5.word_num[0]]; - insn_retry <= retry_5; - insn_valid <= valid_5; - - if (fetch_ready & fetch_valid) begin - fetch_shift[0] <= fetch_data; - for (int i = 1; i < $size(fetch_shift); ++i) - fetch_shift[i] <= fetch_shift[i - 1]; - end - end - -endmodule - -module gfx_shader_read_regs -import gfx::*; -import gfx_isa::*; -( - input logic clk, - rst_n, - - input front_wave in, - - gfx_regfile_io.read read, - - output front_wave out, - output front_reg_passthru passthru -); - - // + 1 por next-cycle de read.op - localparam int PASSTHRU_DEPTH = REG_READ_STAGES + 1 - 2; - localparam int HOLD_DEPTH = PASSTHRU_DEPTH - 2; - - logic reg_rev; - logic valid[HOLD_DEPTH]; - front_wave out_hold[HOLD_DEPTH]; - front_reg_passthru passthru_hold[PASSTHRU_DEPTH]; - - assign passthru = passthru_hold[$size(passthru_hold) - 1]; - - assign reg_rev = in.insn.reg_rev; - - always_comb begin - out = out_hold[$size(out_hold) - 1]; - out.valid = valid[$size(valid) - 1]; - end - - always_ff @(posedge clk) begin - out_hold[0] <= in; - for (int i = 1; i < $size(out_hold); ++i) - out_hold[i] <= out_hold[i - 1]; - - passthru_hold[0].dest <= in.insn.dst_src.rr.rd; - unique case (in.insn.reg_mode) - REGS_SVS, REGS_SSS: - passthru_hold[0].dest_scalar <= 1; - - REGS_VVS, REGS_VVV: - passthru_hold[0].dest_scalar <= 0; - endcase - - for (int i = 1; i < $size(passthru_hold); ++i) - passthru_hold[i] <= passthru_hold[i - 1]; - - read.op.group <= in.group; - - read.op.b_imm <= in.insn.dst_src.rr.b.imm; - read.op.a_sgpr <= in.insn.dst_src.rr.ra.sgpr; - read.op.b_sgpr <= in.insn.dst_src.rr.b.read.r.sgpr; - read.op.a_vgpr <= 
in.insn.dst_src.rr.ra.vgpr.num; - read.op.b_vgpr <= in.insn.dst_src.rr.b.read.r.vgpr.num; - read.op.b_is_imm <= in.insn.dst_src.rr.b_is_imm; - read.op.b_is_const <= in.insn.dst_src.rr.b.read.from_consts; - read.op.scalar_rev <= reg_rev; - - unique case (in.insn.reg_mode) - REGS_SVS, REGS_VVS: begin - read.op.a_scalar <= reg_rev; - read.op.b_scalar <= ~reg_rev; - end - - REGS_SSS: begin - read.op.a_scalar <= 1; - read.op.b_scalar <= 1; - end - - REGS_VVV: begin - read.op.a_scalar <= 0; - read.op.b_scalar <= 0; - end - endcase - end - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) - for (int i = 0; i < HOLD_DEPTH; ++i) - valid[i] <= 0; - else begin - valid[0] <= in.valid; - - for (int i = 1; i < HOLD_DEPTH; ++i) - valid[i] <= valid[i - 1]; - end - -endmodule - -module gfx_shader_decode_class -import gfx::*; -import gfx_isa::*; -( - input logic clk, - rst_n, - - input front_wave wave, - output front_wave port_wave, - output group_id out_group, - - output shader_dispatch dispatch, - output logic p0_writeback -); - - logic is_fsu, is_mem, is_group, hold_valid, retry; - front_wave hold_wave; - - assign p0_writeback = ~(is_mem | is_fsu | is_group | retry); - - always_comb begin - port_wave = hold_wave; - port_wave.valid = hold_valid; - end - - always_ff @(posedge clk) begin - hold_wave <= wave; - out_group <= port_wave.group; - end - - always_ff @(posedge clk or negedge rst_n) - // Intencionalmente repetitivo - if (~rst_n) begin - is_fsu <= 0; - is_mem <= 0; - is_group <= 0; - - retry <= 0; - hold_valid <= 0; - - dispatch <= '0; - end else begin - is_fsu <= 0; - is_mem <= 0; - is_group <= 0; - - retry <= wave.retry; - hold_valid <= wave.valid; - - unique case (wave.insn.insn_class) - INSN_FPINT: ; // p0 no tiene ready - INSN_MEM: is_mem <= 1; - INSN_SFU: is_fsu <= 1; - INSN_GROUP: is_group <= 1; - - default: - {is_mem, is_fsu, is_group} <= 'x; - endcase - - dispatch.p1 <= is_mem; - dispatch.p2 <= is_fsu; - dispatch.p3 <= is_group; - - if (~hold_valid | retry) 
begin - dispatch.p1 <= 0; - dispatch.p2 <= 0; - dispatch.p3 <= 0; - end - - dispatch.valid <= hold_valid; - end - -endmodule - -module gfx_shader_decode_fpint -import gfx::*; -import gfx_isa::*; -( - input logic clk, - - input insn_word insn, - input logic writeback, - - output fpint_op op -); - - always_ff @(posedge clk) begin - unique case (insn.by_class.fpint.op) - INSN_FPINT_MOV: begin - op.setup_mul_float <= 0; - op.setup_unit_b <= 1; - op.mnorm_put_hi <= 0; - op.mnorm_put_lo <= 1; - op.mnorm_put_mul <= 0; - op.mnorm_zero_flags <= 1; - op.mnorm_zero_b <= 1; - op.minmax_abs <= 1; - op.minmax_swap <= 0; - op.minmax_zero_min <= 0; - op.minmax_copy_flags <= 1; - op.shiftr_int_signed <= 0; - op.addsub_int_operand <= 0; - op.addsub_copy_flags <= 1; - op.clz_force_nop <= 1; - op.shiftl_copy_flags <= 1; - op.round_copy_flags <= 1; - op.round_enable <= 1; - op.encode_enable <= 1; - end - - INSN_FPINT_FMUL: begin - op.setup_mul_float <= 1; - op.setup_unit_b <= 0; - op.mnorm_put_hi <= 0; - op.mnorm_put_lo <= 0; - op.mnorm_put_mul <= 1; - op.mnorm_zero_flags <= 0; - op.mnorm_zero_b <= 1; - op.minmax_abs <= 1; - op.minmax_swap <= 0; - op.minmax_zero_min <= 0; - op.minmax_copy_flags <= 1; - op.shiftr_int_signed <= 0; - op.addsub_int_operand <= 0; - op.addsub_copy_flags <= 1; - op.clz_force_nop <= 1; - op.shiftl_copy_flags <= 1; - op.round_copy_flags <= 1; - op.round_enable <= 1; - op.encode_enable <= 1; - end - - INSN_FPINT_IMUL: begin - op.setup_mul_float <= 0; - op.setup_unit_b <= 0; - op.mnorm_put_hi <= 0; - op.mnorm_put_lo <= 1; - op.mnorm_put_mul <= 0; - op.mnorm_zero_flags <= 1; - op.mnorm_zero_b <= 1; - op.minmax_abs <= 1; - op.minmax_swap <= 0; - op.minmax_zero_min <= 0; - op.minmax_copy_flags <= 1; - op.shiftr_int_signed <= 0; - op.addsub_int_operand <= 0; - op.addsub_copy_flags <= 1; - op.clz_force_nop <= 1; - op.shiftl_copy_flags <= 1; - op.round_copy_flags <= 1; - op.round_enable <= 0; - op.encode_enable <= 0; - end - - INSN_FPINT_FADD: begin - 
op.setup_mul_float <= 0; - op.setup_unit_b <= 1; - op.mnorm_put_hi <= 0; - op.mnorm_put_lo <= 1; - op.mnorm_put_mul <= 0; - op.mnorm_zero_flags <= 0; - op.mnorm_zero_b <= 0; - op.minmax_abs <= 1; - op.minmax_swap <= 0; - op.minmax_zero_min <= 0; - op.minmax_copy_flags <= 0; - op.shiftr_int_signed <= 0; - op.addsub_int_operand <= 0; - op.addsub_copy_flags <= 0; - op.clz_force_nop <= 0; - op.shiftl_copy_flags <= 0; - op.round_copy_flags <= 0; - op.round_enable <= 1; - op.encode_enable <= 1; - end - - INSN_FPINT_FMAX, INSN_FPINT_FMIN: begin - op.setup_mul_float <= 0; - op.setup_unit_b <= 1; - op.mnorm_put_hi <= 0; - op.mnorm_put_lo <= 1; - op.mnorm_put_mul <= 0; - op.mnorm_zero_flags <= 0; - op.mnorm_zero_b <= 0; - op.minmax_abs <= 0; - op.minmax_swap <= insn.by_class.fpint.op == INSN_FPINT_FMIN; - op.minmax_zero_min <= 1; - op.minmax_copy_flags <= 1; - op.shiftr_int_signed <= 0; - op.addsub_int_operand <= 0; - op.addsub_copy_flags <= 1; - op.clz_force_nop <= 1; - op.shiftl_copy_flags <= 1; - op.round_copy_flags <= 1; - op.round_enable <= 0; - op.encode_enable <= 0; - end - - INSN_FPINT_FCVT: begin - op.setup_mul_float <= 0; - op.setup_unit_b <= 1; - op.mnorm_put_hi <= 0; - op.mnorm_put_lo <= 1; - op.mnorm_put_mul <= 0; - op.mnorm_zero_flags <= 1; - op.mnorm_zero_b <= 1; - - op.minmax_abs <= 1; - op.minmax_swap <= 0; - op.minmax_zero_min <= 0; - op.minmax_copy_flags <= 0; - op.shiftr_int_signed <= 1; - op.addsub_int_operand <= 1; - op.addsub_copy_flags <= 1; - op.clz_force_nop <= 0; - op.shiftl_copy_flags <= 0; - op.round_copy_flags <= 0; - op.round_enable <= 1; - op.encode_enable <= 1; - end - - default: - op <= 'x; - endcase - - op.writeback <= writeback; - end - -endmodule diff --git a/platform/wavelet3d/gfx_shader_group.sv b/platform/wavelet3d/gfx_shader_group.sv deleted file mode 100644 index e668877..0000000 --- a/platform/wavelet3d/gfx_shader_group.sv +++ /dev/null @@ -1,17 +0,0 @@ -module gfx_shader_group -import gfx::*; -( - input logic clk, - rst_n, - - 
input group_op op, - input wave_exec wave, - - gfx_regfile_io.ab read_data, - - gfx_shake.rx in_shake, - - gfx_wb.tx wb -); - -endmodule diff --git a/platform/wavelet3d/gfx_shader_mem.sv b/platform/wavelet3d/gfx_shader_mem.sv deleted file mode 100644 index 403c9e4..0000000 --- a/platform/wavelet3d/gfx_shader_mem.sv +++ /dev/null @@ -1,17 +0,0 @@ -module gfx_shader_mem -import gfx::*; -( - input logic clk, - rst_n, - - input mem_op op, - input wave_exec wave, - - gfx_regfile_io.ab read_data, - - gfx_shake.rx in_shake, - - gfx_wb.tx wb -); - -endmodule diff --git a/platform/wavelet3d/gfx_shader_regs.sv b/platform/wavelet3d/gfx_shader_regs.sv deleted file mode 100644 index ef3a129..0000000 --- a/platform/wavelet3d/gfx_shader_regs.sv +++ /dev/null @@ -1,302 +0,0 @@ -module gfx_shader_regs -import gfx::*; -( - input logic clk, - - gfx_regfile_io.regs io -); - - // verilator tracing_off - - localparam PC_TABLE_PORTS = 2; - localparam MASK_TABLE_PORTS = 1; - - word hold_imm[REGFILE_STAGES], imm_out, read_a_data_sgpr, read_b_data_scalar, - read_b_data_sgpr, read_const, read_a_data_vgpr[SHADER_LANES], - read_b_data_vgpr[SHADER_LANES], sgpr_out_a, sgpr_out_b; - - group_id mask_read_groups[MASK_TABLE_PORTS], pc_read_groups[PC_TABLE_PORTS]; - word_ptr pc_read[PC_TABLE_PORTS]; - lane_mask mask_read[MASK_TABLE_PORTS]; - - logic a_scalar_out, b_is_const_out, b_is_imm_out, b_scalar_out, scalar_rev_out; - group_id hold_read_group_1, hold_read_group_2; - sgpr_num hold_read_a_sgpr; - vgpr_num hold_read_a_vgpr_1, hold_read_a_vgpr_2, hold_read_b_vgpr_1, hold_read_b_vgpr_2; - logic[REGFILE_STAGES - 1:0] hold_b_is_imm, hold_b_is_const; - logic[REGFILE_STAGES + 1 - 1:0] hold_scalar_rev; - logic[REGFILE_STAGES + 2 - 1:0] hold_a_scalar, hold_b_scalar; - - assign io.pc_back = pc_read[0]; - assign io.pc_front = pc_read[1]; - assign pc_read_groups[0] = io.pc_back_group; - assign pc_read_groups[1] = io.pc_front_group; - - assign io.mask_back = mask_read[0]; - assign pc_read_groups[0] = 
io.mask_back_group; - - assign imm_out = hold_imm[$size(hold_imm) - 1]; - assign a_scalar_out = hold_a_scalar[$bits(hold_a_scalar) - 1]; - assign b_scalar_out = hold_b_scalar[$bits(hold_b_scalar) - 1]; - assign b_is_imm_out = hold_b_is_imm[$bits(hold_b_is_imm) - 1]; - assign b_is_const_out = hold_b_is_const[$bits(hold_b_is_const) - 1]; - assign scalar_rev_out = hold_scalar_rev[$bits(hold_scalar_rev) - 1]; - - gfx_shader_table #(.DATA_WIDTH($bits(word_ptr)), .READ_PORTS(PC_TABLE_PORTS)) pc_table - ( - .clk, - .read(pc_read), - .write(io.pc_wb), - .read_groups(pc_read_groups), - .write_group(io.pc_wb_group), - .write_enable(io.pc_wb_write) - ); - - gfx_shader_table #(.DATA_WIDTH($bits(lane_mask)), .READ_PORTS(MASK_TABLE_PORTS)) mask_table - ( - .clk, - .read(mask_read), - .write(io.mask_wb), - .read_groups(mask_read_groups), - .write_group(io.mask_wb_group), - .write_enable(io.mask_wb_write) - ); - - gfx_shader_consts consts - ( - .clk, - .num(io.op.b_sgpr), - .value(read_const) - ); - - gfx_shader_regfile #($bits(group_id) + $bits(sgpr_num)) sgprs - ( - .clk, - - .read_a_num({hold_read_group_1, hold_read_a_sgpr}), - .read_b_num({io.op.group, io.op.b_sgpr}), - .read_a_data(read_a_data_sgpr), - .read_b_data(read_b_data_sgpr), - - .write(io.sgpr_write.write), - .write_num({io.sgpr_write.group, io.sgpr_write.sgpr}), - .write_data(io.sgpr_write.data) - ); - - generate - for (genvar i = 0; i < SHADER_LANES; ++i) begin: vgprs - gfx_shader_regfile #($bits(group_id) + $bits(vgpr_num)) vgprs - ( - .clk, - - .read_a_num({hold_read_group_2, hold_read_a_vgpr_2}), - .read_b_num({hold_read_group_2, hold_read_b_vgpr_2}), - .read_a_data(read_a_data_vgpr[i]), - .read_b_data(read_b_data_vgpr[i]), - - .write(io.vgpr_write.mask[i]), - .write_num({io.vgpr_write.group, io.vgpr_write.vgpr}), - .write_data(io.vgpr_write.data[i]) - ); - end - endgenerate - - always_ff @(posedge clk) begin - hold_imm[0] <= {{($bits(word) - $bits(io.op.b_imm)){1'b0}}, io.op.b_imm}; - hold_a_scalar[0] <= 
io.op.a_scalar; - hold_b_scalar[0] <= io.op.b_scalar; - hold_b_is_imm[0] <= io.op.b_is_imm; - hold_b_is_const[0] <= io.op.b_is_const; - hold_scalar_rev[0] <= io.op.scalar_rev; - - for (int i = 1; i < REGFILE_STAGES; ++i) begin - hold_imm[i] <= hold_imm[i - 1]; - hold_a_scalar[i] <= hold_a_scalar[i - 1]; - hold_b_scalar[i] <= hold_b_scalar[i - 1]; - hold_b_is_imm[i] <= hold_b_is_imm[i - 1]; - hold_b_is_const[i] <= hold_b_is_const[i - 1]; - hold_scalar_rev[i] <= hold_scalar_rev[i - 1]; - end - - for (int i = REGFILE_STAGES; i < REGFILE_STAGES + 2; ++i) begin - hold_a_scalar[i] <= hold_a_scalar[i - 1]; - hold_b_scalar[i] <= hold_b_scalar[i - 1]; - end - - hold_scalar_rev[REGFILE_STAGES] <= hold_scalar_rev[REGFILE_STAGES - 1]; - - hold_read_a_sgpr <= io.op.a_sgpr; - hold_read_group_1 <= io.op.group; - hold_read_group_2 <= hold_read_group_1; - - hold_read_a_vgpr_1 <= io.op.a_vgpr; - hold_read_a_vgpr_2 <= hold_read_a_vgpr_1; - - hold_read_b_vgpr_1 <= io.op.b_vgpr; - hold_read_b_vgpr_2 <= hold_read_b_vgpr_1; - - if (b_is_imm_out) - read_b_data_scalar <= imm_out; - else if (b_is_const_out) - read_b_data_scalar <= read_const; - else - read_b_data_scalar <= read_b_data_sgpr; - - if (scalar_rev_out) begin - sgpr_out_a <= read_b_data_scalar; - sgpr_out_b <= read_a_data_sgpr; - end else begin - sgpr_out_a <= read_a_data_sgpr; - sgpr_out_b <= read_b_data_scalar; - end - - for (int i = 0; i < SHADER_LANES; ++i) begin - io.a[i] <= a_scalar_out ? sgpr_out_a : read_a_data_vgpr[i]; - io.b[i] <= b_scalar_out ? 
sgpr_out_b : read_a_data_vgpr[i]; - end - end - -endmodule - -module gfx_shader_consts -import gfx::*; -( - input logic clk, - - input sgpr_num num, - output word value -); - - word hold_out, rom[1 << $bits(sgpr_num)]; - sgpr_num hold_in; - - always_ff @(posedge clk) begin - value <= hold_out; - hold_in <= num; - hold_out <= rom[hold_in]; - end - - initial begin - rom[0] = 'hffff_ffff; // -1 - rom[1] = 'h7fff_ffff; // 2^31 - 1, útil para abs de fp - rom[2] = 'h8000_0000; // 2^31, útil para neg de fp - rom[3] = 'h3f80_0000; // +1.0 - rom[4] = 'hbf80_0000; // -1.0 - end - -endmodule - -module gfx_shader_regfile -import gfx::*; -#(int DEPTH_LOG = 0) -( - input logic clk, - - input logic[DEPTH_LOG - 1:0] read_a_num, - read_b_num, - output word read_a_data, - read_b_data, - - input logic write, - input logic[DEPTH_LOG - 1:0] write_num, - input word write_data -); - - gfx_shader_regfile_port #(DEPTH_LOG) a - ( - .clk, - .write, - .read_num(read_a_num), - .read_data(read_a_data), - .write_num, - .write_data - ); - - gfx_shader_regfile_port #(DEPTH_LOG) b - ( - .clk, - .write, - .read_num(read_b_num), - .read_data(read_b_data), - .write_num, - .write_data - ); - -endmodule - -module gfx_shader_regfile_port -import gfx::*; -#(int DEPTH_LOG = 0) -( - input logic clk, - - input logic[DEPTH_LOG - 1:0] read_num, - output word read_data, - - input logic write, - input logic[DEPTH_LOG - 1:0] write_num, - input word write_data -); - - word file[1 << DEPTH_LOG], hold_read_data, hold_write_data; - logic hold_write; - logic[DEPTH_LOG - 1:0] hold_read_num, hold_write_num; - - // hold_write no necesita rst_n porque cualquier write inicial es inofensivo - - always_ff @(posedge clk) begin - hold_write <= write; - hold_read_num <= read_num; - hold_write_num <= write_num; - hold_write_data <= write_data; - - hold_read_data <= file[hold_read_num]; - if (hold_write) - file[hold_write_num] <= hold_write_data; - - read_data <= hold_read_data; - end - -endmodule - -module gfx_shader_table 
-import gfx::*; -#(int DATA_WIDTH = 0, - int READ_PORTS = 0) -( - input logic clk, - - input group_id write_group, - read_groups[READ_PORTS], - - input logic[DATA_WIDTH - 1:0] write, - input logic write_enable, - - output logic[DATA_WIDTH - 1:0] read[READ_PORTS] -); - - genvar i; - - generate - for (i = 0; i < READ_PORTS; ++i) begin: ports - logic write_enable_hold; - group_id read_group_hold, write_group_hold; - logic[DATA_WIDTH - 1:0] data[1 << $bits(group_id)], read_hold, write_hold; - - always_ff @(posedge clk) begin - write_hold <= write; - read_group_hold <= read_groups[i]; - write_group_hold <= write_group; - write_enable_hold <= write_enable; - - read_hold <= data[read_group_hold]; - - if (write_enable_hold) - data[write_group_hold] <= write_hold; - - read[i] <= read_hold; - end - end - endgenerate - -endmodule diff --git a/platform/wavelet3d/gfx_shader_schedif.rdl b/platform/wavelet3d/gfx_shader_schedif.rdl deleted file mode 100644 index c846da9..0000000 --- a/platform/wavelet3d/gfx_shader_schedif.rdl +++ /dev/null @@ -1,91 +0,0 @@ -addrmap gfx_shader_schedif { - name = "Scheduler<->core interface"; - - default hw = r; - default sw = w; - default regwidth = 32; - - reg { - name = "Shader core control register"; - - field { - desc = "Set this field to flush the instruction cache"; - - singlepulse; - } IFLUSH[0:0] = 0; - } CORE @ 0x00; - - reg { - name = "Wavefront setup control register"; - - default hw = na; - default sw = r; - default precedence = hw; - - field { - desc = "Wavefront group number"; - - hw = r; - sw = rw; - } GROUP[5:0]; - - field { - desc = "Destination SGPR number"; - - hw = r; - sw = rw; - } XGPR[11:8]; - - field { - desc = "PC table update done, group submitted"; - - rclr; - hwset; - } SUBMIT_DONE[16:16] = 0; - - field { - desc = "General-purpose register update done"; - - rclr; - hwset; - } GPR_DONE[17:17] = 0; - - field { - desc = "Lane mask update done"; - - rclr; - hwset; - } MASK_DONE[18:18] = 0; - } SETUP_CTRL @ 0x04; - - reg { - 
name = "SGPR/VGPR write register"; - - field { - desc = "Value to write"; - - swmod; - } VALUE[31:0]; - } SETUP_GPR @ 0x08; - - reg { - name = "Lane mask write register"; - - field { - desc = "Mask value to write"; - - swmod; - } MASK[15:0]; - } SETUP_MASK @ 0x0c; - - reg { - name = "Group submit register"; - - field { - desc = "Initial group program counter, submits group on write"; - - swmod; - } PC[31:2]; - } SETUP_SUBMIT @ 0x10; -}; - diff --git a/platform/wavelet3d/gfx_shader_setup.sv b/platform/wavelet3d/gfx_shader_setup.sv deleted file mode 100644 index f46fb66..0000000 --- a/platform/wavelet3d/gfx_shader_setup.sv +++ /dev/null @@ -1,37 +0,0 @@ -interface gfx_shader_setup -import gfx::*;; - - struct - { - group_id group; - word_ptr pc; - xgpr_num gpr; - word gpr_value; - lane_mask mask; - logic pc_set, - gpr_set, - mask_set; - } write; - - struct - { - logic gpr, - mask, - submit; - } set_done; - - modport core - ( - input write, - - output set_done - ); - - modport sched - ( - input set_done, - - output write - ); - -endinterface diff --git a/platform/wavelet3d/gfx_shader_sfu.sv b/platform/wavelet3d/gfx_shader_sfu.sv deleted file mode 100644 index d65e522..0000000 --- a/platform/wavelet3d/gfx_shader_sfu.sv +++ /dev/null @@ -1,17 +0,0 @@ -module gfx_shader_sfu -import gfx::*; -( - input logic clk, - rst_n, - - input sfu_op op, - input wave_exec wave, - - gfx_regfile_io.ab read_data, - - gfx_shake.rx in_shake, - - gfx_wb.tx wb -); - -endmodule diff --git a/platform/wavelet3d/gfx_shake.sv b/platform/wavelet3d/gfx_shake.sv deleted file mode 100644 index baae0c3..0000000 --- a/platform/wavelet3d/gfx_shake.sv +++ /dev/null @@ -1,24 +0,0 @@ -interface gfx_shake; - - logic ready; - logic valid; - - modport tx - ( - input ready, - output valid - ); - - modport rx - ( - input valid, - output ready - ); - - modport peek - ( - input ready, - valid - ); - -endinterface diff --git a/platform/wavelet3d/gfx_sim_debug.sv b/platform/wavelet3d/gfx_sim_debug.sv deleted file 
mode 100644 index 4b4622a..0000000 --- a/platform/wavelet3d/gfx_sim_debug.sv +++ /dev/null @@ -1,50 +0,0 @@ -module gfx_sim_debug -import gfx::*; -( - input logic clk, - rst_n, - - gfx_axil.s axis -); - - enum int unsigned - { - INPUT, - STALL - } state; - - assign axis.rvalid = 0; - assign axis.arready = 0; - assign axis.awready = 1; - - always_comb - unique case (state) - INPUT: begin - axis.wready = 1; - axis.bvalid = axis.wvalid; - end - - STALL: begin - axis.wready = 0; - axis.bvalid = 1; - end - endcase - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) - state <= INPUT; - else - unique case (state) - INPUT: - if (axis.wvalid) begin - $display("%c", axis.wdata[7:0]); - if (~axis.bready) - state <= STALL; - end - - STALL: - if (axis.bready) - state <= INPUT; - endcase - -endmodule diff --git a/platform/wavelet3d/gfx_skid_buf.sv b/platform/wavelet3d/gfx_skid_buf.sv deleted file mode 100644 index e3e5247..0000000 --- a/platform/wavelet3d/gfx_skid_buf.sv +++ /dev/null @@ -1,20 +0,0 @@ -module gfx_skid_buf -#(int WIDTH = 0) -( - input logic clk, - - input logic[WIDTH - 1:0] in, - input logic stall, - - output logic[WIDTH - 1:0] out -); - - logic[WIDTH - 1:0] skid; - - assign out = stall ? 
skid : in; - - always_ff @(posedge clk) - if (~stall) - skid <= in; - -endmodule diff --git a/platform/wavelet3d/gfx_skid_flow.sv b/platform/wavelet3d/gfx_skid_flow.sv deleted file mode 100644 index 7890ae3..0000000 --- a/platform/wavelet3d/gfx_skid_flow.sv +++ /dev/null @@ -1,31 +0,0 @@ -module gfx_skid_flow -( - input logic clk, - rst_n, - - input logic in_valid, - out_ready, - - output logic in_ready, - out_valid, - stall -); - - logic was_ready, was_valid; - - assign stall = ~in_ready; - assign in_ready = was_ready | ~was_valid; - assign out_valid = in_valid | stall; - - always_ff @(posedge clk or negedge rst_n) - if (~rst_n) begin - was_ready <= 0; - was_valid <= 0; - end else begin - was_ready <= out_ready; - - if (~stall) - was_valid <= in_valid; - end - -endmodule diff --git a/platform/wavelet3d/gfx_top.sv b/platform/wavelet3d/gfx_top.sv deleted file mode 100644 index 41ff7f4..0000000 --- a/platform/wavelet3d/gfx_top.sv +++ /dev/null @@ -1,160 +0,0 @@ -module gfx_top -import gfx::*; -( - input logic clk, - rst_n, - - input word a[SHADER_LANES], - b[SHADER_LANES], - input logic in_valid, - setup_mul_float, - setup_unit_b, - mnorm_put_hi, - mnorm_put_lo, - mnorm_put_mul, - mnorm_zero_b, - mnorm_zero_flags, - minmax_abs, - minmax_swap, - minmax_zero_min, - minmax_copy_flags, - shiftr_int_signed, - addsub_copy_flags, - addsub_int_operand, - clz_force_nop, - shiftl_copy_flags, - round_copy_flags, - round_enable, - encode_enable, - - output logic out_valid, - output word q[SHADER_LANES], - - input word geom_tdata, - input logic geom_tlast, - geom_tvalid, - output logic geom_tready, - - input logic raster_tready, - output logic raster_tlast, - raster_tvalid, - output word raster_tdata -); - - logic srst_n; - - gfx_wb fpint_wb(); - gfx_axib insn_mem(); - gfx_pkts geometry(), coverage(); - gfx_regfile_io fpint_io(); - gfx_axil bootrom_axi(), debug_axi(), sched_axi(), shader_0_axi(); - - assign q = fpint_wb.rx.lanes; - assign out_valid = fpint_wb.rx.valid; - - assign 
geometry.tx.tdata = geom_tdata; - assign geometry.tx.tlast = geom_tlast; - assign geometry.tx.tvalid = geom_tvalid; - assign geom_tready = geometry.tx.tready; - - assign raster_tdata = coverage.rx.tdata; - assign raster_tlast = coverage.rx.tlast; - assign raster_tvalid = coverage.rx.tvalid; - assign coverage.rx.tready = raster_tready; - - fpint_op op; - assign op.writeback = 1; - assign op.setup_mul_float = setup_mul_float; - assign op.setup_unit_b = setup_unit_b; - assign op.mnorm_put_hi = mnorm_put_hi; - assign op.mnorm_put_lo = mnorm_put_lo; - assign op.mnorm_put_mul = mnorm_put_mul; - assign op.mnorm_zero_b = mnorm_zero_b; - assign op.mnorm_zero_flags = mnorm_zero_flags; - assign op.minmax_abs = minmax_abs; - assign op.minmax_swap = minmax_swap; - assign op.minmax_zero_min = minmax_zero_min; - assign op.minmax_copy_flags = minmax_copy_flags; - assign op.shiftr_int_signed = shiftr_int_signed; - assign op.addsub_copy_flags = addsub_copy_flags; - assign op.addsub_int_operand = addsub_int_operand; - assign op.clz_force_nop = clz_force_nop; - assign op.shiftl_copy_flags = shiftl_copy_flags; - assign op.round_copy_flags = round_copy_flags; - assign op.round_enable = round_enable; - assign op.encode_enable = encode_enable; - - assign fpint_io.regs.a = a; - assign fpint_io.regs.b = b; - - gfx_rst_sync rst_sync - ( - .clk, - .rst_n, - .srst_n - ); - - gfx_shader_fpint fpint - ( - .clk, - .rst_n, - .op, - .wb(fpint_wb.tx), - .wave(), - .abort(0), - .in_valid, - .read_data(fpint_io.ab) - ); - - gfx_sched sched - ( - .clk, - .rst_n, - .srst_n, - .irq(0), - .axim(sched_axi.m) - ); - - gfx_bootrom bootrom - ( - .clk, - .rst_n, - .axis(bootrom_axi.s) - ); - - gfx_sim_debug debug - ( - .clk, - .rst_n, - .axis(debug_axi.s) - ); - - gfx_shader shader_0 - ( - .clk, - .rst_n, - .sched(shader_0_axi.s), - .insn_mem(insn_mem.m) - ); - - gfx_xbar_sched xbar - ( - .clk, - .srst_n, - - .sched(sched_axi.s), - - .debug(debug_axi.m), - .bootrom(bootrom_axi.m), - .shader_0(shader_0_axi.m) - 
); - - gfx_raster raster - ( - .clk, - .rst_n, - .geometry(geometry.rx), - .coverage(coverage.tx) - ); - -endmodule diff --git a/platform/wavelet3d/gfx_wb.sv b/platform/wavelet3d/gfx_wb.sv deleted file mode 100644 index 20c7c64..0000000 --- a/platform/wavelet3d/gfx_wb.sv +++ /dev/null @@ -1,51 +0,0 @@ -interface gfx_wb; - - import gfx::*; - - word lanes[SHADER_LANES]; - logic mask_update, pc_inc, pc_update, ready, scalar, valid, writeback; - group_id group; - xgpr_num dest; - lane_mask mask; - pc_offset pc_add; - - modport tx - ( - input ready, - - output dest, - group, - lanes, - valid, - scalar, - writeback, - - mask, - mask_update, - - pc_add, - pc_inc, - pc_update - ); - - modport rx - ( - input dest, - group, - lanes, - valid, - scalar, - writeback, - - mask, - mask_update, - - pc_add, - pc_inc, - pc_update, - - output ready - ); - - -endinterface diff --git a/platform/wavelet3d/gfx_xbar_sched.sv b/platform/wavelet3d/gfx_xbar_sched.sv deleted file mode 100644 index 95e4afb..0000000 --- a/platform/wavelet3d/gfx_xbar_sched.sv +++ /dev/null @@ -1,146 +0,0 @@ -module gfx_xbar_sched -import gfx::*; -( - input logic clk, - srst_n, - - gfx_axil.s sched, - - gfx_axil.m debug, - gfx_axil.m bootrom, - gfx_axil.m shader_0 -); - - localparam word BOOTROM_MASK = 32'hfff0_0000; - localparam word DEBUG_BASE = 32'h0020_0000; - localparam word DEBUG_MASK = 32'hfff0_0000; - localparam word SHADER_0_BASE = 32'h0100_0000; - localparam word SHADER_0_MASK = 32'hfff0_0000; - - defparam xbar.NM = 1; - defparam xbar.NS = 3; - defparam xbar.OPT_LOWPOWER = 0; - - defparam xbar.SLAVE_ADDR = { - SHADER_0_BASE, - DEBUG_BASE, - BOOTROM_BASE - }; - - defparam xbar.SLAVE_MASK = { - SHADER_0_MASK, - DEBUG_MASK, - BOOTROM_MASK - }; - - axilxbar xbar - ( - .S_AXI_ACLK(clk), - .S_AXI_ARESETN(srst_n), - - .S_AXI_AWVALID(sched.awvalid), - .S_AXI_AWREADY(sched.awready), - .S_AXI_AWADDR(sched.awaddr), - .S_AXI_AWPROT('0), - - .S_AXI_WVALID(sched.wvalid), - .S_AXI_WREADY(sched.wready), - 
.S_AXI_WDATA(sched.wdata), - .S_AXI_WSTRB('1), - - .S_AXI_BVALID(sched.bvalid), - .S_AXI_BREADY(sched.bready), - .S_AXI_BRESP(), - - .S_AXI_ARVALID(sched.arvalid), - .S_AXI_ARREADY(sched.arready), - .S_AXI_ARADDR(sched.araddr), - .S_AXI_ARPROT('0), - - .S_AXI_RVALID(sched.rvalid), - .S_AXI_RREADY(sched.rready), - .S_AXI_RDATA(sched.rdata), - .S_AXI_RRESP(), - - .M_AXI_AWADDR({ - shader_0.awaddr, - debug.awaddr, - bootrom.awaddr - }), - .M_AXI_AWPROT(), - .M_AXI_AWVALID({ - shader_0.awvalid, - debug.awvalid, - bootrom.awvalid - }), - .M_AXI_AWREADY({ - shader_0.awready, - debug.awready, - bootrom.awready - }), - - .M_AXI_WDATA({ - shader_0.wdata, - debug.wdata, - bootrom.wdata - }), - .M_AXI_WSTRB(), - .M_AXI_WVALID({ - shader_0.wvalid, - debug.wvalid, - bootrom.wvalid - }), - .M_AXI_WREADY({ - shader_0.wready, - debug.wready, - bootrom.wready - }), - - .M_AXI_BRESP('0), - .M_AXI_BVALID({ - shader_0.bvalid, - debug.bvalid, - bootrom.bvalid - }), - .M_AXI_BREADY({ - shader_0.bready, - debug.bready, - bootrom.bready - }), - - .M_AXI_ARADDR({ - shader_0.araddr, - debug.araddr, - bootrom.araddr - }), - .M_AXI_ARPROT(), - .M_AXI_ARVALID({ - shader_0.arvalid, - debug.arvalid, - bootrom.arvalid - }), - .M_AXI_ARREADY({ - shader_0.arready, - debug.arready, - bootrom.arready - }), - - .M_AXI_RDATA({ - shader_0.rdata, - debug.rdata, - bootrom.rdata - }), - .M_AXI_RRESP('0), - .M_AXI_RVALID({ - shader_0.rvalid, - debug.rvalid, - bootrom.rvalid - }), - .M_AXI_RREADY({ - shader_0.rready, - debug.rready, - bootrom.rready - }) - ); - -endmodule diff --git a/platform/wavelet3d/mod.mk b/platform/wavelet3d/mod.mk index 153f9c7..16c6cfc 100644 --- a/platform/wavelet3d/mod.mk +++ b/platform/wavelet3d/mod.mk @@ -1,21 +1,10 @@ -cores := gfx_shader_schedif - define core - $(this)/deps := axixbar fp_unit gfx_shader_schedif picorv32 + $(this)/deps := gfx - $(this)/rtl_top := gfx_top + $(this)/rtl_top := w3d_top $(this)/rtl_dirs := . 
- $(this)/rtl_files := gfx_isa.sv gfx_pkg.sv gfx_top.sv + $(this)/rtl_files := w3d_top.sv $(this)/vl_main := main.cpp $(this)/vl_pkgconfig := sdl2 endef - -define core/gfx_shader_schedif - $(this)/hooks := regblock - - $(this)/regblock_rdl := gfx_shader_schedif.rdl - $(this)/regblock_top := gfx_shader_schedif - $(this)/regblock_args := --default-reset arst_n - $(this)/regblock_cpuif := axi4-lite -endef diff --git a/platform/wavelet3d/w3d_top.sv b/platform/wavelet3d/w3d_top.sv new file mode 100644 index 0000000..34ecb52 --- /dev/null +++ b/platform/wavelet3d/w3d_top.sv @@ -0,0 +1,160 @@ +module w3d_top +import gfx::*; +( + input logic clk, + rst_n, + + input word a[SHADER_LANES], + b[SHADER_LANES], + input logic in_valid, + setup_mul_float, + setup_unit_b, + mnorm_put_hi, + mnorm_put_lo, + mnorm_put_mul, + mnorm_zero_b, + mnorm_zero_flags, + minmax_abs, + minmax_swap, + minmax_zero_min, + minmax_copy_flags, + shiftr_int_signed, + addsub_copy_flags, + addsub_int_operand, + clz_force_nop, + shiftl_copy_flags, + round_copy_flags, + round_enable, + encode_enable, + + output logic out_valid, + output word q[SHADER_LANES], + + input word geom_tdata, + input logic geom_tlast, + geom_tvalid, + output logic geom_tready, + + input logic raster_tready, + output logic raster_tlast, + raster_tvalid, + output word raster_tdata +); + + logic srst_n; + + gfx_wb fpint_wb(); + gfx_axib insn_mem(); + gfx_pkts geometry(), coverage(); + gfx_regfile_io fpint_io(); + gfx_axil bootrom_axi(), debug_axi(), sched_axi(), shader_0_axi(); + + assign q = fpint_wb.rx.lanes; + assign out_valid = fpint_wb.rx.valid; + + assign geometry.tx.tdata = geom_tdata; + assign geometry.tx.tlast = geom_tlast; + assign geometry.tx.tvalid = geom_tvalid; + assign geom_tready = geometry.tx.tready; + + assign raster_tdata = coverage.rx.tdata; + assign raster_tlast = coverage.rx.tlast; + assign raster_tvalid = coverage.rx.tvalid; + assign coverage.rx.tready = raster_tready; + + fpint_op op; + assign op.writeback = 1; 
+ assign op.setup_mul_float = setup_mul_float; + assign op.setup_unit_b = setup_unit_b; + assign op.mnorm_put_hi = mnorm_put_hi; + assign op.mnorm_put_lo = mnorm_put_lo; + assign op.mnorm_put_mul = mnorm_put_mul; + assign op.mnorm_zero_b = mnorm_zero_b; + assign op.mnorm_zero_flags = mnorm_zero_flags; + assign op.minmax_abs = minmax_abs; + assign op.minmax_swap = minmax_swap; + assign op.minmax_zero_min = minmax_zero_min; + assign op.minmax_copy_flags = minmax_copy_flags; + assign op.shiftr_int_signed = shiftr_int_signed; + assign op.addsub_copy_flags = addsub_copy_flags; + assign op.addsub_int_operand = addsub_int_operand; + assign op.clz_force_nop = clz_force_nop; + assign op.shiftl_copy_flags = shiftl_copy_flags; + assign op.round_copy_flags = round_copy_flags; + assign op.round_enable = round_enable; + assign op.encode_enable = encode_enable; + + assign fpint_io.regs.a = a; + assign fpint_io.regs.b = b; + + gfx_rst_sync rst_sync + ( + .clk, + .rst_n, + .srst_n + ); + + gfx_shader_fpint fpint + ( + .clk, + .rst_n, + .op, + .wb(fpint_wb.tx), + .wave(), + .abort(0), + .in_valid, + .read_data(fpint_io.ab) + ); + + gfx_sched sched + ( + .clk, + .rst_n, + .srst_n, + .irq(0), + .axim(sched_axi.m) + ); + + gfx_bootrom bootrom + ( + .clk, + .rst_n, + .axis(bootrom_axi.s) + ); + + gfx_sim_debug debug + ( + .clk, + .rst_n, + .axis(debug_axi.s) + ); + + gfx_shader shader_0 + ( + .clk, + .rst_n, + .sched(shader_0_axi.s), + .insn_mem(insn_mem.m) + ); + + gfx_xbar_sched xbar + ( + .clk, + .srst_n, + + .sched(sched_axi.s), + + .debug(debug_axi.m), + .bootrom(bootrom_axi.m), + .shader_0(shader_0_axi.m) + ); + + gfx_raster raster + ( + .clk, + .rst_n, + .geometry(geometry.rx), + .coverage(coverage.tx) + ); + +endmodule diff --git a/rtl/gfx/gfx_axib.sv b/rtl/gfx/gfx_axib.sv new file mode 100644 index 0000000..7b3cbdc --- /dev/null +++ b/rtl/gfx/gfx_axib.sv @@ -0,0 +1,81 @@ +// AXI4 con burst +interface gfx_axib; + + import gfx::word; + + logic awvalid, + awready; + logic[7:0] 
awlen; + logic[1:0] awburst; + word awaddr; + + logic wlast; + logic wvalid; + logic wready; + word wdata; + + logic bvalid; + logic bready; + + logic arvalid, + arready; + logic[7:0] arlen; + logic[1:0] arburst; + word araddr; + + logic rlast; + logic rvalid; + logic rready; + word rdata; + + modport m + ( + input awready, + wready, + bvalid, + arready, + rlast, + rvalid, + rdata, + + output awlen, + awburst, + awvalid, + awaddr, + wlast, + wvalid, + wdata, + bready, + arlen, + arburst, + arvalid, + araddr, + rready + ); + + modport s + ( + input awlen, + awburst, + awvalid, + awaddr, + wlast, + wvalid, + wdata, + bready, + arlen, + arburst, + arvalid, + araddr, + rready, + + output awready, + wready, + bvalid, + arready, + rlast, + rvalid, + rdata + ); + +endinterface diff --git a/rtl/gfx/gfx_axil.sv b/rtl/gfx/gfx_axil.sv new file mode 100644 index 0000000..c254e26 --- /dev/null +++ b/rtl/gfx/gfx_axil.sv @@ -0,0 +1,61 @@ +// AXI4-Lite, sin wstrb ni axprot +interface gfx_axil; + import gfx::*; + + logic awvalid; + logic awready; + word awaddr; + + logic wvalid; + logic wready; + word wdata; + + logic bvalid; + logic bready; + + logic arvalid; + logic arready; + word araddr; + + logic rvalid; + logic rready; + word rdata; + + modport m + ( + input awready, + wready, + bvalid, + arready, + rvalid, + rdata, + + output awvalid, + awaddr, + wvalid, + wdata, + bready, + arvalid, + araddr, + rready + ); + + modport s + ( + input awvalid, + awaddr, + wvalid, + wdata, + bready, + arvalid, + araddr, + rready, + + output awready, + wready, + bvalid, + arready, + rvalid, + rdata + ); +endinterface diff --git a/rtl/gfx/gfx_axil2regblock.sv b/rtl/gfx/gfx_axil2regblock.sv new file mode 100644 index 0000000..2449b05 --- /dev/null +++ b/rtl/gfx/gfx_axil2regblock.sv @@ -0,0 +1,30 @@ +module gfx_axil2regblock +( + gfx_axil.s axis, + axi4lite_intf.master axim +); + + assign axis.rdata = axim.RDATA; + assign axis.rvalid = axim.RVALID; + assign axis.bvalid = axim.BVALID; + assign 
axis.wready = axim.WREADY; + assign axis.arready = axim.ARREADY; + assign axis.awready = axim.AWREADY; + + assign axim.AWVALID = axis.awvalid; + assign axim.AWADDR = axis.awaddr[$bits(axim.AWADDR) - 1:0]; + assign axim.AWPROT = '0; + + assign axim.WVALID = axis.wvalid; + assign axim.WDATA = axis.wdata; + assign axim.WSTRB = '1; + + assign axim.BREADY = axis.bready; + + assign axim.ARVALID = axis.arvalid; + assign axim.ARADDR = axis.araddr[$bits(axim.ARADDR) - 1:0]; + assign axim.ARPROT = '0; + + assign axim.RREADY = axis.rready; + +endmodule diff --git a/rtl/gfx/gfx_beats.sv b/rtl/gfx/gfx_beats.sv new file mode 100644 index 0000000..fcbb091 --- /dev/null +++ b/rtl/gfx/gfx_beats.sv @@ -0,0 +1,29 @@ +interface gfx_beats +#(int WIDTH = $bits(gfx::word)); + + logic[WIDTH - 1:0] data; + logic ready; + logic valid; + + modport tx + ( + input ready, + output data, + valid + ); + + modport rx + ( + input data, + valid, + output ready + ); + + modport peek + ( + input data, + ready, + valid + ); + +endinterface diff --git a/rtl/gfx/gfx_bootrom.sv b/rtl/gfx/gfx_bootrom.sv new file mode 100644 index 0000000..2c4581e --- /dev/null +++ b/rtl/gfx/gfx_bootrom.sv @@ -0,0 +1,66 @@ +module gfx_bootrom +import gfx::*; +( + input logic clk, + rst_n, + + gfx_axil.s axis +); + + localparam ROM_WORDS_LOG = 8; + + enum int unsigned + { + WAIT, + READ, + RDATA, + READY + } state; + + word read, rom[1 << ROM_WORDS_LOG]; + logic[ROM_WORDS_LOG - 1:0] read_addr; + + assign axis.bvalid = 0; + assign axis.wready = 0; + assign axis.awready = 0; + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + state <= WAIT; + axis.rvalid <= 0; + axis.arready <= 0; + end else begin + axis.arready <= 0; + + unique case (state) + WAIT: + if (axis.arvalid & ~axis.arready) + state <= READ; + + READ: + state <= RDATA; + + RDATA: begin + state <= READY; + axis.rvalid <= 1; + end + + READY: + if (axis.rready) begin + state <= WAIT; + axis.rvalid <= 0; + axis.arready <= 1; + end + endcase + end + + 
always_ff @(posedge clk) begin + read <= rom[read_addr]; + read_addr <= axis.araddr[$bits(read_addr) + SUBWORD_BITS - 1:SUBWORD_BITS]; + axis.rdata <= read; + end + + initial + $readmemh("gfx_bootrom.hex", rom); + +endmodule diff --git a/rtl/gfx/gfx_clz.sv b/rtl/gfx/gfx_clz.sv new file mode 100644 index 0000000..8d6f100 --- /dev/null +++ b/rtl/gfx/gfx_clz.sv @@ -0,0 +1,68 @@ +/* Implementación en árbol de count leading zeros (CLZ). + * WIDTH debe ser una potencia de 2. + */ +module gfx_clz +#(int WIDTH = 0) +( + input logic clk, + + input logic[WIDTH - 1:0] value, + output logic[$clog2(WIDTH):0] clz +); + + genvar i; + generate + if (WIDTH <= 1) begin + always_ff @(posedge clk) + clz <= !value; + end else if (WIDTH == 2) begin + always_ff @(posedge clk) + unique case (value) + 2'b00: clz <= 2'b10; + 2'b01: clz <= 2'b01; + 2'b10: clz <= 2'b00; + 2'b11: clz <= 2'b00; + endcase + end else if (WIDTH == 4) begin + // Eficiente en FPGAs con 4-LUTs + always_ff @(posedge clk) + if (value[3]) + clz <= 3'b000; + else if (value[2]) + clz <= 3'b001; + else if (value[1]) + clz <= 3'b010; + else if (value[0]) + clz <= 3'b011; + else + clz <= 3'b100; + end else begin + logic msb_right; + logic[$clog2(WIDTH) - 1:0] clz_left, clz_right; + logic[$clog2(WIDTH) - 2:0] tail_right; + + assign {msb_right, tail_right} = clz_right; + + gfx_clz #(WIDTH / 2) left + ( + .clk(clk), + .clz(clz_left), + .value(value[WIDTH - 1:WIDTH / 2]) + ); + + gfx_clz #(WIDTH / 2) right + ( + .clk(clk), + .clz(clz_right), + .value(value[WIDTH / 2 - 1:0]) + ); + + always_ff @(posedge clk) + if (clz_left[$clog2(WIDTH) - 1]) + clz <= {msb_right, ~msb_right, tail_right}; + else + clz <= {1'b0, clz_left}; + end + endgenerate + +endmodule diff --git a/rtl/gfx/gfx_ctz.sv b/rtl/gfx/gfx_ctz.sv new file mode 100644 index 0000000..2713f8a --- /dev/null +++ b/rtl/gfx/gfx_ctz.sv @@ -0,0 +1,18 @@ +// Count trailing zeros (ctz), clz al revés +module gfx_ctz +#(int WIDTH = 0) +( + input logic clk, + + input logic[WIDTH - 
1:0] value, + output logic[$clog2(WIDTH):0] ctz +); + + gfx_clz #(WIDTH) clz + ( + .clk, + .value({<<{value}}), + .clz(ctz) + ); + +endmodule diff --git a/rtl/gfx/gfx_fifo.sv b/rtl/gfx/gfx_fifo.sv new file mode 100644 index 0000000..7174e4d --- /dev/null +++ b/rtl/gfx/gfx_fifo.sv @@ -0,0 +1,102 @@ +module gfx_fifo +#(int WIDTH = 0, + int DEPTH = 0) +( + input logic clk, + rst_n, + + gfx_beats.rx in, + gfx_beats.tx out +); + + logic do_read, do_write, full_if_eq, in_stall, out_stall, + may_read, may_write, read, read_ok, write; + + logic[WIDTH - 1:0] fifo[DEPTH], read_data, write_data; + logic[$clog2(DEPTH) - 1:0] read_ptr, write_ptr; + + assign do_read = read & may_read; + assign do_write = write & may_write; + + always_comb begin + may_read = full_if_eq; + may_write = !full_if_eq; + + if (read) + may_write = 1; + + if (read_ptr != write_ptr) begin + may_read = 1; + may_write = 1; + end + end + + gfx_skid_flow in_flow + ( + .clk, + .rst_n, + .stall(in_stall), + .in_ready(in.ready), + .in_valid(in.valid), + .out_ready(may_write), + .out_valid(write) + ); + + gfx_skid_flow out_flow + ( + .clk, + .rst_n, + .stall(out_stall), + .in_ready(read), + .in_valid(read_ok), + .out_ready(out.ready), + .out_valid(out.valid) + ); + + gfx_skid_buf #(WIDTH) in_skid + ( + .clk, + .in(in.data), + .out(write_data), + .stall(in_stall) + ); + + gfx_skid_buf #(WIDTH) out_skid + ( + .clk, + .in(read_data), + .out(out.data), + .stall(out_stall) + ); + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + read_ok <= 0; + read_ptr <= 0; + write_ptr <= 0; + full_if_eq <= 0; + end else begin + if (~out_stall) + read_ok <= read && may_read; + + if (do_read) + read_ptr <= read_ptr + 1; + + if (do_write) + write_ptr <= write_ptr + 1; + + if (do_read & ~do_write) + full_if_eq <= 0; + else if (~do_read & do_write) + full_if_eq <= 1; + end + + always_ff @(posedge clk) begin + if (~out_stall) + read_data <= fifo[read_ptr]; + + if (may_write) + fifo[write_ptr] <= write_data; + end + 
+endmodule diff --git a/rtl/gfx/gfx_fixed_dotadd.sv b/rtl/gfx/gfx_fixed_dotadd.sv new file mode 100644 index 0000000..fdd5ffd --- /dev/null +++ b/rtl/gfx/gfx_fixed_dotadd.sv @@ -0,0 +1,55 @@ +module gfx_fixed_dotadd +( + input logic clk, + + input gfx::fixed a0, + b0, + a1, + b1, + c, + input logic stall, + + output gfx::fixed q +); + + import gfx::*; + + fixed q0, a1_hold, b1_hold; + + gfx_fixed_muladd muladd_0 + ( + .clk, + .a(a0), + .b(b0), + .c, + .q(q0), + .stall + ); + + gfx_pipes #(.WIDTH($bits(fixed)), .DEPTH(FIXED_MULADD_DEPTH)) a_pipes + ( + .clk, + .in(a1), + .out(a1_hold), + .stall + ); + + gfx_pipes #(.WIDTH($bits(fixed)), .DEPTH(FIXED_MULADD_DEPTH)) b_pipes + ( + .clk, + .in(b1), + .out(b1_hold), + .stall + ); + + gfx_fixed_muladd muladd_1 + ( + .clk, + .a(a1_hold), + .b(b1_hold), + .c(q0), + .q, + .stall + ); + +endmodule diff --git a/rtl/gfx/gfx_fixed_muladd.sv b/rtl/gfx/gfx_fixed_muladd.sv new file mode 100644 index 0000000..22b7247 --- /dev/null +++ b/rtl/gfx/gfx_fixed_muladd.sv @@ -0,0 +1,77 @@ +module gfx_fixed_muladd +( + input logic clk, + + input gfx::fixed a, + b, + c, + input logic stall, + + output gfx::fixed q +); + + import gfx::*; + +`ifndef VERILATOR + logic[2 * $bits(fixed) - $bits(fixed_frac) - 1:0] q_ext; + + assign q = q_ext[$bits(fixed) - 1:0]; + + lpm_mult mult + ( + .aclr(0), + .clock(clk), + .clken(!stall), + + .sum({c, {`FIXED_FRAC{1'b0}}}), + .dataa(a), + .datab(b), + .result(q_ext) + ); + + defparam + mult.lpm_widtha = $bits(fixed), + mult.lpm_widthb = $bits(fixed), + mult.lpm_widths = $bits(fixed) + $bits(fixed_frac), + /* Esto es crucial. No está documentado en ningún lado (aparte de un + * comentario en r/fpga). Si lpm_widthp < lpm_widtha + lpm_widthb, + * entonces result contiene los lpm_widthp bits más significativos + * del producto, no los menos significativos como tendría sentido. 
// Signal bundle between the shader front end and the shader back end.
//
// The doubled ';' on the header is intentional: the first one terminates the
// package import, the second one terminates the (port-less) interface header.
interface gfx_front_back
import gfx::*;;

	// Per-pipe dispatch flags plus the common valid flag (front -> back).
	shader_dispatch dispatch;

	// Wave descriptor and one operation word per execution pipe (front -> back).
	struct
	{
		wave_exec wave;
		fpint_op  p0;
		mem_op    p1;
		sfu_op    p2;
		group_op  p3;
	} execute;

	// Group identifier handed back by the back end (back -> front).
	struct
	{
		logic    valid;
		group_id group;
	} loop;

	// Back-end view: consumes execute/dispatch, produces loop.
	modport back
	(
		input  execute,
		input  dispatch,
		output loop
	);

	// Front-end view: consumes loop, produces execute/dispatch.
	modport front
	(
		input  loop,
		output execute,
		output dispatch
	);

endinterface
// Stallable delay line: `out` is `in` delayed by DEPTH enabled clock cycles.
//
// Parameters:
//   WIDTH - width in bits of the delayed value.
//   DEPTH - number of register stages.
//
// Ports:
//   clk   - clock; the stages have no reset.
//   in    - value entering the first stage.
//   stall - when high, every stage keeps its current value.
//   out   - contents of the last stage.
module gfx_pipes
#(int WIDTH=0, int DEPTH=0)
(
	input  logic              clk,

	input  logic[WIDTH - 1:0] in,
	input  logic              stall,

	output logic[WIDTH - 1:0] out
);

	logic[WIDTH - 1:0] stages[DEPTH];

	assign out = stages[DEPTH - 1];

	always_ff @(posedge clk)
		if (!stall) begin
			// Non-blocking assignments: every stage samples its predecessor's
			// old value, so the iteration direction is irrelevant.
			for (int n = DEPTH - 1; n > 0; --n)
				stages[n] <= stages[n - 1];

			stages[0] <= in;
		end

endmodule
1:0] dword_ptr; + typedef logic[$bits(word_ptr) - 2 - 1:0] qword_ptr; + typedef logic[$bits(word_ptr) - 3 - 1:0] oword_ptr; + + typedef logic[7:0] float_exp; + typedef logic[$bits(word) - $bits(float_exp) - 2:0] float_mant; + typedef logic[$bits(float_mant):0] float_mant_full; // Incluye '1.' explícito + typedef logic[$bits(float_mant_full) + 1:0] float_mant_ext; // Considera overflow + + localparam float_exp FLOAT_EXP_BIAS = (1 << ($bits(float_exp) - 1)) - 1; + localparam float_exp FLOAT_EXP_MAX = {($bits(float_exp)){1'b1}}; + + function float_mant_full full_mant(float_mant in); + full_mant = {1'b1, in}; + endfunction + + function float_mant implicit_mant(float_mant_full in); + assert (in[$bits(in) - 1]); + implicit_mant = in[$bits(in) - 2:0]; + endfunction + + typedef struct packed + { + logic sign; + float_exp exp; + float_mant mant; + } float; + + /* Explicación de guard, round, sticky: + * https://drilian.com/2023/01/10/floating-point-numbers-and-rounding/ + */ + typedef struct packed + { + float normal; + logic slow, + zero, + guard, + round, + sticky; + } float_round; + + typedef struct packed + { + logic exp_max, + exp_min, + mant_zero; + } float_class; + + function float_class classify_float(float in); + classify_float.exp_max = &in.exp; + classify_float.exp_min = ~|in.exp; + classify_float.mant_zero = ~|in.mant; + endfunction + + function logic is_float_special(float_class in); + is_float_special = in.exp_max | (in.exp_min & ~in.mant_zero); + endfunction + + function float_mant_ext float_prepare_round(float in, float_class in_class); + float_prepare_round = {~in_class.exp_min, in.mant, 2'b00}; + endfunction + + typedef struct packed + { + logic setup_mul_float, + setup_unit_b, + mnorm_put_hi, + mnorm_put_lo, + mnorm_put_mul, + mnorm_zero_b, + mnorm_zero_flags, + minmax_abs, + minmax_swap, + minmax_zero_min, + minmax_copy_flags, + shiftr_int_signed, + addsub_copy_flags, + addsub_int_operand, + clz_force_nop, + shiftl_copy_flags, + round_copy_flags, + 
round_enable, + encode_enable, + writeback; + } fpint_op; + + typedef struct packed + { + logic todo; + } mem_op; + + typedef struct packed + { + logic todo; + } sfu_op; + + typedef struct packed + { + logic todo; + } group_op; + + // Q22.10 + typedef logic[9:0] fixed_frac; + typedef logic[$bits(word) - $bits(fixed_frac) - 1:0] fixed_int; + + typedef struct packed signed + { + fixed_int fint; // 'int' es una keyword + fixed_frac frac; + } fixed; + + typedef struct packed + { + fixed x, + y; + } fixed_xy; + + typedef struct packed + { + fixed a, + b, + c; + } vtx_fixed; + + typedef struct packed + { + fixed_xy a, + b, + c; + } vtx_xy; + + localparam int RASTER_BITS = 2; + localparam int RASTER_SUB_BITS = 4; + localparam int RASTER_SIZE = 1 << RASTER_BITS; + localparam int RASTER_COARSE_FRAGS = RASTER_SIZE * RASTER_SIZE; + + typedef logic[RASTER_BITS - 1:0] raster_index; + + // Caso RASTER_BITS = 2: -> 4,4,4,4 -> 8,8-> 16 + localparam int RASTER_OUT_CLZ_DEPTH = 3; + + // Asume RASTER_BITS == 2, hay que ajustarlo si cambia + typedef struct packed + { + // Esto ahorra muchos flops + // + // offsets[0] = inc * 0 = 0 + // offsets[1] = inc * 1 = raster2_times1 + // offsets[2] = inc * 2 = raster2_times1 << 1 + // offsets[3] = inc * 3 = raster2_times3 + fixed raster2_times1, + raster2_times3; + } raster_offsets; + + function fixed raster_idx(raster_offsets offsets, raster_index idx); + unique case (idx) + RASTER_BITS'(0): + return '0; + + RASTER_BITS'(1): + return offsets.raster2_times1; + + RASTER_BITS'(2): + return offsets.raster2_times1 << 1; + + RASTER_BITS'(3): + return offsets.raster2_times3; + endcase + endfunction + + function raster_offsets make_raster_offsets(fixed inc); + make_raster_offsets.raster2_times1 = inc; + make_raster_offsets.raster2_times3 = inc + (inc << 1); + endfunction + + typedef struct packed + { + raster_offsets x, + y; + } raster_offsets_xy; + + typedef struct packed + { + logic[RASTER_SUB_BITS - 1:0] num; + logic[$bits(fixed_frac) - 
RASTER_SUB_BITS - 1:0] prec; + } raster_sub; + + localparam int RASTER_COARSE_DIM_BITS = $bits(fixed) - $bits(raster_index) - $bits(raster_sub); + + typedef logic signed[RASTER_COARSE_DIM_BITS - 1:0] raster_coarse_dim; + + typedef struct packed + { + raster_coarse_dim x, + y; + } raster_coarse_xy; + + typedef struct packed signed + { + raster_coarse_dim coarse; + raster_index fine; + raster_sub sub; + } raster_prec; + + typedef struct packed + { + raster_prec x, + y; + } raster_prec_xy; + + // Definir el número de lanes a partir de las dimensiones del + // rasterizer es una decisión crucial, el diseño entero depende de esto + + localparam int SHADER_LANES = RASTER_COARSE_FRAGS; + + typedef logic[RASTER_SIZE - 1:0] lane_no; + typedef logic[SHADER_LANES - 1:0] lane_mask; + + typedef logic[5:0] group_id; + + localparam int REGFILE_STAGES = 3; + localparam int REG_READ_STAGES = 2 + REGFILE_STAGES + 1; + + typedef gfx_isa::sgpr_num sgpr_num; + typedef gfx_isa::vgpr_num vgpr_num; + typedef gfx_isa::xgpr_num xgpr_num; + typedef gfx_isa::pc_offset pc_offset; + + typedef struct packed + { + // No incluye p0 porque p0 no tiene señal ready + logic p1, + p2, + p3, + valid; + } shader_dispatch; + + typedef struct + { + group_id group; + xgpr_num dest; + logic dest_scalar; + } wave_exec; + + localparam int FIXED_MULADD_DEPTH = 5; + localparam int FIXED_DOTADD_DEPTH = 2 * FIXED_MULADD_DEPTH; + + localparam word BOOTROM_BASE = 32'h0010_0000; + + localparam int SCHED_BRAM_WORDS = 2048; // 8KiB + + typedef word irq_lines; + +endpackage diff --git a/rtl/gfx/gfx_pkts.sv b/rtl/gfx/gfx_pkts.sv new file mode 100644 index 0000000..41399ce --- /dev/null +++ b/rtl/gfx/gfx_pkts.sv @@ -0,0 +1,29 @@ +interface gfx_pkts +#(parameter int WIDTH = $bits(gfx::word)); + + import gfx::*; + + logic tlast; + logic tready; + logic tvalid; + logic[WIDTH - 1:0] tdata; + + modport tx + ( + input tready, + + output tdata, + tlast, + tvalid + ); + + modport rx + ( + input tdata, + tlast, + tvalid, + + output 
tready + ); + +endinterface diff --git a/rtl/gfx/gfx_raster.sv b/rtl/gfx/gfx_raster.sv new file mode 100644 index 0000000..a57a672 --- /dev/null +++ b/rtl/gfx/gfx_raster.sv @@ -0,0 +1,930 @@ +module gfx_raster +( + input logic clk, + rst_n, + + gfx_pkts.rx geometry, + + gfx_pkts.tx coverage +); + + import gfx::*; + + gfx_raster_bounds setup_bounds + ( + .clk, + .rst_n, + + .geometry, + + .edges_ref(bounds_edges_ref), + .edges_vtx(bounds_edges_vtx), + .edges_span(bounds_edges_span), + .edges_ready(bounds_edges_ready), + .edges_valid(bounds_edges_valid), + .edges_geom_id(bounds_edges_geom_id) + ); + + word bounds_edges_geom_id; + logic bounds_edges_ready, bounds_edges_valid; + vtx_xy bounds_edges_vtx; + fixed_xy bounds_edges_ref; + raster_prec_xy bounds_edges_span; + + gfx_raster_edges setup_edges + ( + .clk, + .rst_n, + + .bounds_ref(bounds_edges_ref), + .bounds_vtx(bounds_edges_vtx), + .bounds_span(bounds_edges_span), + .bounds_ready(bounds_edges_ready), + .bounds_valid(bounds_edges_valid), + .bounds_geom_id(bounds_edges_geom_id), + + .coarse_ref(edges_coarse_ref), + .coarse_base(edges_coarse_base), + .coarse_span(edges_coarse_span), + .coarse_ready(edges_coarse_ready), + .coarse_valid(edges_coarse_valid), + .coarse_geom_id(edges_coarse_geom_id), + .coarse_offsets(edges_coarse_offsets) + ); + + word edges_coarse_geom_id; + fixed edges_coarse_base; + logic edges_coarse_ready, edges_coarse_valid; + fixed_xy edges_coarse_ref; + raster_prec_xy edges_coarse_span; + raster_offsets_xy edges_coarse_offsets; + + gfx_raster_coarse coarse + ( + .clk, + .rst_n, + + .edges_ref(edges_coarse_ref), + .edges_base(edges_coarse_base), + .edges_span(edges_coarse_span), + .edges_ready(edges_coarse_ready), + .edges_valid(edges_coarse_valid), + .edges_geom_id(edges_coarse_geom_id), + .edges_offsets(edges_coarse_offsets), + + .fine_ref(coarse_fine_ref), + .fine_ready(coarse_fine_ready), + .fine_valid(coarse_fine_valid), + .fine_corner(coarse_fine_corner), + 
.fine_geom_id(coarse_fine_geom_id), + .fine_offsets(coarse_fine_offsets) + ); + + word coarse_fine_geom_id; + fixed coarse_fine_corner; + logic coarse_fine_ready, coarse_fine_valid; + fixed_xy coarse_fine_ref; + raster_offsets_xy coarse_fine_offsets; + + gfx_raster_fine fine + ( + .clk, + .rst_n, + + .coarse_ref(coarse_fine_ref), + .coarse_ready(coarse_fine_ready), + .coarse_valid(coarse_fine_valid), + .coarse_corner(coarse_fine_corner), + .coarse_geom_id(coarse_fine_geom_id), + .coarse_offsets(coarse_fine_offsets), + + .coverage + ); + +endmodule + +module gfx_raster_bounds +( + input logic clk, + rst_n, + + gfx_pkts.rx geometry, + + input logic edges_ready, + output logic edges_valid, + output gfx::word edges_geom_id, + output gfx::fixed_xy edges_ref, + output gfx::raster_prec_xy edges_span, + output gfx::vtx_xy edges_vtx +); + + import gfx::*; + + enum int unsigned + { + IN_GEOM_ID, + IN_DIM_X, + IN_DIM_Y + } in_state; + + enum int unsigned + { + VTX_A, + VTX_B, + VTX_C + } vtx_state; + + logic a_lt_b, a_lt_c, b_lt_c, edges_handshake, geom_complete, geom_last, + geom_recv, in_vtx, next_dim, new_vtx; + + logic end_new_dim, end_valid, vtx_valid, lt_new_dim, lt_valid, minmax_new_dim, minmax_valid; + + fixed geom_data; + vtx_fixed dim_vtx, dim_vtx_x, dim_vtx_y; + raster_prec max, min; + + assign geom_recv = geometry.tready & geometry.tvalid; + assign edges_handshake = edges_valid & edges_ready; + + assign edges_vtx.a.x = dim_vtx_x.a; + assign edges_vtx.a.y = dim_vtx_y.a; + assign edges_vtx.b.x = dim_vtx_x.b; + assign edges_vtx.b.y = dim_vtx_y.b; + assign edges_vtx.c.x = dim_vtx_x.c; + assign edges_vtx.c.y = dim_vtx_y.c; + + assign geometry.tready = edges_handshake | ~geom_complete; + + always_comb begin + unique case (vtx_state) + VTX_C: next_dim = geom_recv; + default: next_dim = 0; + endcase + + unique case (in_state) + IN_DIM_Y: geom_last = next_dim; + default: geom_last = 0; + endcase + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + 
in_state <= IN_GEOM_ID; + vtx_state <= VTX_A; + + in_vtx <= 0; + new_vtx <= 0; + geom_complete <= 0; + + lt_valid <= 0; + end_valid <= 0; + vtx_valid <= 0; + edges_valid <= 0; + minmax_valid <= 0; + + lt_new_dim <= 0; + end_new_dim <= 0; + minmax_new_dim <= 0; + + edges_geom_id <= 'x; + end else begin + end_valid <= 0; + vtx_valid <= end_valid; + lt_valid <= vtx_valid; + minmax_valid <= lt_valid; + + if (~edges_valid | edges_ready) + edges_valid <= minmax_valid; + + geom_complete <= (geom_complete | geom_last) & ~edges_handshake; + + unique case (in_state) + IN_GEOM_ID: + if (geom_recv) begin + in_state <= IN_DIM_X; + + in_vtx <= 1; + edges_geom_id <= geometry.tdata; + end + + IN_DIM_X: + if (next_dim) + in_state <= IN_DIM_Y; + + IN_DIM_Y: + if (next_dim) begin + in_state <= IN_GEOM_ID; + + in_vtx <= 0; + end_valid <= 1; + end + endcase + + new_vtx <= 0; + + lt_new_dim <= 0; + minmax_new_dim <= lt_new_dim; + end_new_dim <= minmax_new_dim; + + unique case (vtx_state) + VTX_A: begin + if (in_vtx & geom_recv) begin + new_vtx <= 1; + vtx_state <= VTX_B; + end + + if (new_vtx) begin + dim_vtx.c <= geom_data; + lt_new_dim <= 1; + end + end + + VTX_B: begin + if (geom_recv) begin + new_vtx <= 1; + vtx_state <= VTX_C; + end + + if (new_vtx) + dim_vtx.a <= geom_data; + end + + VTX_C: begin + if (geom_recv) begin + new_vtx <= 1; + vtx_state <= VTX_A; + end + + if (new_vtx) + dim_vtx.b <= geom_data; + end + endcase + + if (in_state == IN_DIM_Y & next_dim) + assert (geometry.tlast); + end + + always_ff @(posedge clk) begin + geom_data <= geometry.tdata; + + a_lt_b <= $signed(dim_vtx.a) < $signed(dim_vtx.b); + a_lt_c <= $signed(dim_vtx.a) < $signed(dim_vtx.c); + b_lt_c <= $signed(dim_vtx.b) < $signed(dim_vtx.c); + + // Realmente no son 'x' o 'y' hasta cuando edges_valid = 1 + if (lt_new_dim) begin + dim_vtx_y <= dim_vtx; + dim_vtx_x <= dim_vtx_y; + end + + if (a_lt_b) begin + min <= a_lt_c ? dim_vtx_y.a : dim_vtx_y.c; + max <= b_lt_c ? 
// Edge setup stage of the rasterizer.
//
// For each triangle received from the bounds stage it walks the three edges
// (AB, BC, CA), feeding for each one
//     delta = coarse_ref - q      and      inc = (p.y - q.y, q.x - p.x)
// into a pipelined dot product (gfx_fixed_dotadd) whose result is presented
// as coarse_base, together with per-edge increment tables in coarse_offsets.
//
// Ports (bounds_* side):
//   bounds_valid / bounds_ready - handshake with the bounds stage; ready is
//                                 pulsed from the EDGE_BC state.
//   bounds_geom_id, bounds_ref, bounds_span, bounds_vtx - triangle description.
// Ports (coarse_* side):
//   coarse_valid / coarse_ready - handshake with the coarse stage.
//   coarse_geom_id, coarse_ref, coarse_span - copies latched in EDGE_AB.
//   coarse_base    - dot-product output.
//   coarse_offsets - increment tables built from the delayed `inc` values.
module gfx_raster_edges
(
	input  logic                  clk,
	                              rst_n,

	input  logic                  bounds_valid,
	input  gfx::word              bounds_geom_id,
	input  gfx::fixed_xy          bounds_ref,
	input  gfx::raster_prec_xy    bounds_span,
	input  gfx::vtx_xy            bounds_vtx,
	output logic                  bounds_ready,

	input  logic                  coarse_ready,
	output logic                  coarse_valid,
	output gfx::word              coarse_geom_id,
	output gfx::fixed_xy          coarse_ref,
	output gfx::raster_prec_xy    coarse_span,
	output gfx::fixed             coarse_base,
	output gfx::raster_offsets_xy coarse_offsets
);

	import gfx::*;

	enum int unsigned
	{
		EDGE_AB,
		EDGE_BC,
		EDGE_CA,
		// EDGE_CA doubles as OFFSETS_AB
		OFFSETS_BC,
		OFFSETS_CA,
		OUT
	} state;

	// Edge increments plus two delayed copies that feed coarse_offsets.
	struct
	{
		fixed_xy cur,
		         delay1,
		         delay2;
	} inc;

	logic coarse_handshake, coarse_stall, offsets_flow;
	fixed_xy delta, p, q;

	// "- 2" because coarse_valid itself is the final stage of the valid chain
	logic[FIXED_DOTADD_DEPTH - 2:0] dotadd_valid;

	assign coarse_stall = coarse_valid & ~coarse_ready;
	assign coarse_handshake = coarse_valid & coarse_ready;

	gfx_fixed_dotadd edge_base
	(
		.clk,
		.c(0),
		.q(coarse_base),
		.a0(delta.x),
		.b0(inc.cur.x),
		.a1(delta.y),
		.b1(inc.cur.y),
		.stall(coarse_stall)
	);

	// The offsets delay line runs freely except in OUT, where it only moves
	// when the coarse stage accepts the current output.
	always_comb
		unique case (state)
			OUT:     offsets_flow = coarse_handshake;
			default: offsets_flow = 1;
		endcase

	always_ff @(posedge clk or negedge rst_n)
		if (~rst_n) begin
			state <= EDGE_AB;

			p <= 'x;
			q <= 'x;
			coarse_ref <= 'x;
			coarse_geom_id <= 'x;

			bounds_ready <= 0;
			coarse_valid <= 0;

			// Clear the WHOLE valid chain. The previous loop stopped one bit
			// short, leaving the last stage (the one sampled by coarse_valid)
			// without a reset value, so a spurious coarse_valid could be
			// produced right after reset.
			dotadd_valid <= '0;
		end else begin
			for (int i = 1; i < $bits(dotadd_valid); ++i)
				dotadd_valid[i] <= dotadd_valid[i - 1];

			if (~coarse_stall)
				coarse_valid <= dotadd_valid[$bits(dotadd_valid) - 1];

			bounds_ready <= 0;
			dotadd_valid[0] <= 0;

			unique case (state)
				EDGE_AB: begin
					if (bounds_valid)
						state <= EDGE_BC;

					coarse_ref <= bounds_ref;
					coarse_span <= bounds_span;
					coarse_geom_id <= bounds_geom_id;

					p <= bounds_vtx.a;
					q <= bounds_vtx.b;
				end

				EDGE_BC: begin
					state <= EDGE_CA;
					bounds_ready <= 1;

					p <= bounds_vtx.b;
					q <= bounds_vtx.c;
				end

				EDGE_CA: begin
					state <= OFFSETS_BC;

					p <= bounds_vtx.c;
					q <= bounds_vtx.a;

					// This happens exactly when ab, bc and ca are all sitting
					// in their correct places in the pipeline
					dotadd_valid[0] <= 1;
				end

				OFFSETS_BC:
					state <= OFFSETS_CA;

				OFFSETS_CA:
					state <= OUT;

				OUT:
					if (coarse_handshake)
						state <= EDGE_AB;
			endcase
		end

	always_ff @(posedge clk) begin
		delta.x <= coarse_ref.x - q.x;
		delta.y <= coarse_ref.y - q.y;

		inc.cur.x <= p.y - q.y;
		inc.cur.y <= q.x - p.x;

		//TODO: top-left rule
		if (offsets_flow) begin
			inc.delay1 <= inc.cur;
			inc.delay2 <= inc.delay1;

			coarse_offsets.x <= make_raster_offsets(inc.delay2.x);
			coarse_offsets.y <= make_raster_offsets(inc.delay2.y);
		end
	end

endmodule
struct + { + raster_offsets_xy cur, + next, + prev; + } offsets; + + logic edges_recv, end_block, end_x, end_y, first_run, + mask, mask_reset, new_geom, test_flow, out_flow; + + fixed edge_test, reference_x, vertical_inc; + fixed_xy max_offset, min_offset, test_offset; + raster_coarse_xy stride; + raster_coarse_dim width; + raster_offsets_xy next_offsets; + + function fixed coarse_offset(raster_offsets offsets); + return raster_idx(offsets, RASTER_BITS'(1)) << RASTER_BITS; + endfunction + + assign end_x = stride.x == '0; + assign end_y = stride.y == '0; + assign end_block = end_x & end_y; + + assign edge_test = edge_fn.cur + test_offset.x + test_offset.y; + assign vertical_inc = vertical.cur + coarse_offset(offsets.cur.y); + + assign fine_corner = corner.cur; + assign fine_offsets = offsets.cur; // Vuelve a cur luego de 3 ciclos + + assign min_offset.x = raster_idx(next_offsets.x, RASTER_BITS'(0)); + assign min_offset.y = raster_idx(next_offsets.y, RASTER_BITS'(0)); + assign max_offset.x = raster_idx(next_offsets.x, RASTER_BITS'(RASTER_SIZE - 1)); + assign max_offset.y = raster_idx(next_offsets.y, RASTER_BITS'(RASTER_SIZE - 1)); + assign next_offsets = edges_recv ? 
edges_offsets : offsets.next; + + always_comb begin + unique case (state) + SETUP: new_geom = 1; + default: new_geom = 0; + endcase + + unique case (state) + TEST_AB: mask_reset = 1; + default: mask_reset = 0; + endcase + + unique case (state) + SETUP: edges_ready = 1; + default: edges_ready = 0; + endcase + + unique case (state) + SETUP: + edges_recv = 1; + + TEST_AB, TEST_BC: + edges_recv = first_run; + + default: + edges_recv = 0; + endcase + + unique case (state) + OUT: fine_valid = mask; + default: fine_valid = 0; + endcase + + unique case (state) + OUT: begin + out_flow = ~mask | fine_ready; + test_flow = 0; + end + + default: begin + out_flow = 0; + test_flow = 1; + end + endcase + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + state <= SETUP; + first_run <= 1; + end else + unique case (state) + SETUP: + if (edges_valid) + state <= TEST_AB; + + TEST_AB: + state <= TEST_BC; + + TEST_BC: + state <= TEST_CA; + + TEST_CA: + state <= OUT; + + OUT: begin + first_run <= end_block; + if (out_flow) + state <= end_block ? 
// Fine rasterization stage.
//
// Input side (in_state): accepts one block from the coarse stage as three
// consecutive (corner, offsets) samples -- IN_C, IN_A, IN_B -- and ANDs the
// per-fragment sign tests of the three edge functions into mask_in. In
// IN_MASK the accumulated mask is complete.
//
// Output side (out_state): for a block with a non-empty mask it emits on
// `coverage`, one word per state:
//     geom_id, position, mask, then for every set mask bit: C, A, B
// where C/A/B are the edge-function values recomputed for that lane. tlast is
// raised on the B word of the last covered lane.
//
// Ports:
//   clk, rst_n      - clock and asynchronous, active-low reset.
//   coarse_valid / coarse_ready - handshake with the coarse stage.
//   coarse_geom_id, coarse_ref  - block identity, latched into geom_id/block_ref.
//   coarse_corner, coarse_offsets - edge-function value at the block corner
//                                   and its per-fragment increments.
//   coverage        - outgoing packet stream.
module gfx_raster_fine
(
	input  logic                  clk,
	                              rst_n,

	input  logic                  coarse_valid,
	input  gfx::word              coarse_geom_id,
	input  gfx::fixed_xy          coarse_ref,
	input  gfx::fixed             coarse_corner,
	input  gfx::raster_offsets_xy coarse_offsets,
	output logic                  coarse_ready,

	gfx_pkts.tx                   coverage
);

	import gfx::*;

	enum int unsigned
	{
		IN_C,
		IN_A,
		IN_B,
		IN_MASK
	} in_state;

	enum int unsigned
	{
		OUT_ACCEPT,
		OUT_GEOM_ID,
		OUT_POS,
		OUT_MASK,
		OUT_BARY_C,
		OUT_BARY_A,
		OUT_BARY_B
	} out_state;

	// Three-entry rotating stores, one entry per triangle edge.
	struct
	{
		fixed cur,
		      next,
		      prev;
	} corner;

	struct
	{
		raster_offsets_xy cur,
		                  next,
		                  prev;
	} offsets;

	logic begin_bary, hold_block, in_valid, mask_in_clean,
	      mask_in_reset, new_block, out_last;

	word geom_id;
	fixed bary_coord;
	lane_no lane, lane_ctz, lane_hold;
	fixed_xy block_ref;
	lane_mask mask_in, mask, mask_ctz;
	raster_index lane_x, lane_y;
	logic[$bits(lane_ctz):0] ctz_count;

	// Low half-word of the coarse part of a coordinate.
	function shword ref_half(raster_prec dim);
		return dim.coarse[$bits(shword) - 1:0];
	endfunction

	assign lane_ctz = ctz_count[$bits(lane_ctz) - 1:0];
	assign in_valid = mask_in_clean & |mask_in;
	assign out_last = ~|mask;
	assign {lane_y, lane_x} = lane;

	// **IMPORTANT**: this will break for RASTER_BITS >= 3, because the FSM
	// assumes that ctz finishes in 3 cycles or fewer

	gfx_ctz #(RASTER_COARSE_FRAGS) ctz
	(
		.clk,
		.value(mask_ctz),
		.ctz(ctz_count)
	);

	always_comb begin
		unique case (out_state)
			OUT_ACCEPT: new_block = 1;
			default:    new_block = 0;
		endcase

		unique case (out_state)
			OUT_ACCEPT: mask_ctz = mask_in;
			default:    mask_ctz = mask;
		endcase

		unique case (out_state)
			OUT_ACCEPT: coverage.tvalid = 0;
			default:    coverage.tvalid = 1;
		endcase

		unique case (out_state)
			OUT_MASK, OUT_BARY_B:
				begin_bary = coverage.tready;

			default:
				begin_bary = 0;
		endcase

		unique case (out_state)
			OUT_BARY_B: coverage.tlast = out_last;
			default:    coverage.tlast = 0;
		endcase

		unique case (out_state)
			OUT_GEOM_ID:
				coverage.tdata = geom_id;

			// Both halves come from block_ref, the copy latched when the
			// block was accepted. coarse_ref is a live input that the coarse
			// stage has already advanced (row wrap or next geometry) by the
			// time this word is sent, so it must not be used here.
			OUT_POS:
				coverage.tdata = {ref_half(block_ref.y), ref_half(block_ref.x)};

			OUT_MASK:
				coverage.tdata = {{($bits(word) - $bits(mask)){1'b0}}, mask};

			OUT_BARY_C, OUT_BARY_A, OUT_BARY_B:
				coverage.tdata = bary_coord;

			default:
				coverage.tdata = 'x;
		endcase

		unique case (out_state)
			OUT_MASK:
				lane = lane_ctz;

			default:
				lane = lane_hold;
		endcase

		unique case (in_state)
			IN_C:    coarse_ready = new_block;
			default: coarse_ready = 0;
		endcase

		unique case (in_state)
			IN_C:    hold_block = new_block;
			IN_A:    hold_block = 1;
			IN_B:    hold_block = 1;
			IN_MASK: hold_block = 0;
		endcase

		unique case (in_state)
			IN_C:    mask_in_reset = 1;
			default: mask_in_reset = 0;
		endcase

		unique case (in_state)
			IN_MASK: mask_in_clean = 1;
			default: mask_in_clean = 0;
		endcase
	end

	always_ff @(posedge clk or negedge rst_n)
		if (~rst_n) begin
			in_state <= IN_C;
			out_state <= OUT_ACCEPT;
		end else begin
			unique case (in_state)
				IN_C:
					if (coarse_valid & new_block)
						in_state <= IN_A;

				IN_A:
					in_state <= IN_B;

				IN_B:
					in_state <= IN_MASK;

				IN_MASK:
					in_state <= IN_C;
			endcase

			unique case (out_state)
				OUT_ACCEPT:
					if (in_valid)
						out_state <= OUT_GEOM_ID;

				OUT_GEOM_ID:
					if (coverage.tready)
						out_state <= OUT_POS;

				OUT_POS:
					if (coverage.tready)
						out_state <= OUT_MASK;

				OUT_MASK:
					if (coverage.tready)
						out_state <= OUT_BARY_C;

				OUT_BARY_C:
					if (coverage.tready)
						out_state <= OUT_BARY_A;

				OUT_BARY_A:
					if (coverage.tready)
						out_state <= OUT_BARY_B;

				OUT_BARY_B:
					if (coverage.tready)
						out_state <= out_last ? OUT_ACCEPT : OUT_BARY_C;
			endcase
		end

	always_ff @(posedge clk) begin
		// Parallel sign test; this does the heavy lifting of fine raster.
		// Note that many of the adders will be removed during synthesis.
		for (int i = 0; i < RASTER_SIZE; ++i)
			for (int j = 0; j < RASTER_SIZE; ++j)
				mask_in[i * RASTER_SIZE + j] <=
					(mask_in[i * RASTER_SIZE + j] | mask_in_reset)
					& (coarse_corner
					   + raster_idx(coarse_offsets.y, RASTER_BITS'(i))
					   + raster_idx(coarse_offsets.x, RASTER_BITS'(j))
					   >= 'sd0);

		// The barycentric coordinates of every fragment that was not
		// discarded are recomputed here. The point is to avoid storing and
		// then multiplexing the coordinates of a whole block (48 words).
		if (coverage.tready)
			bary_coord <= corner.next
			            + raster_idx(offsets.next.y, RASTER_BITS'(lane_y))
			            + raster_idx(offsets.next.x, RASTER_BITS'(lane_x));

		if (new_block & mask_in_reset) begin
			geom_id <= coarse_geom_id;
			block_ref <= coarse_ref;
		end

		// new_block = 0 => coverage.tvalid = 1
		if (new_block | coverage.tready) begin
			corner.cur <= corner.next;
			corner.next <= corner.prev;
			corner.prev <= corner.cur;

			offsets.cur <= offsets.next;
			offsets.next <= offsets.prev;
			offsets.prev <= offsets.cur;
		end

		if (hold_block) begin
			// Loaded into prev rather than cur so that the first values end
			// up in cur exactly when OUT_BARY_C is reached
			corner.prev <= coarse_corner;
			offsets.prev <= coarse_offsets;
		end

		if (new_block)
			mask <= mask_in;

		if (begin_bary) begin
			// Clear the lowest set bit: that lane is the one being emitted.
			mask <= mask & (mask - 1);
			lane_hold <= lane_ctz;
		end
	end

endmodule
// Scheduler CPU subsystem: a PicoRV32 core with a local word-addressed BRAM
// and an AXI-Lite master port for everything outside that BRAM.
//
// Address decode: an access whose look-ahead address (mem_la_addr) has all
// bits above the BRAM index range equal to zero selects the BRAM; any other
// address is forwarded through picorv32_axi_adapter to `axim`.
//
// Ports:
//   clk    - clock.
//   rst_n  - asynchronous, active-low reset for the local bus control flops.
//   srst_n - active-low reset fed to the core and to the AXI adapter.
//   axim   - AXI-Lite master port.
//   irq    - interrupt request lines into the core.
module gfx_sched
import gfx::*;
(
	input  logic     clk,
	                 rst_n,
	                 srst_n,

	gfx_axil.m       axim,

	input  irq_lines irq
);

	// verilator tracing_off

	localparam int BYTE_BITS = $bits(byte);

	logic axi_ready, axi_valid, bram_ready, bram_read, bram_write, bram_write_next,
	      mem_instr, mem_la_read, mem_la_write, mem_ready, mem_valid, select_bram;

	word bram[SCHED_BRAM_WORDS];
	word axi_rdata, bram_rdata, mem_addr, mem_la_addr, mem_rdata, mem_wdata;
	logic[$bits(word) / $bits(byte) - 1:0] mem_wstrb;

	logic[$clog2(SCHED_BRAM_WORDS) - 1:0] bram_addr;

	assign bram_addr = mem_addr[$bits(bram_addr) + SUBWORD_BITS - 1:SUBWORD_BITS];
	assign mem_ready = (axi_valid & axi_ready) | bram_ready;
	assign mem_rdata = bram_ready ? bram_rdata : axi_rdata;
	assign select_bram = ~|mem_la_addr[$bits(mem_la_addr) - 1:$bits(bram_addr) + SUBWORD_BITS];
	assign bram_write_next = mem_la_write & select_bram;

	defparam core.ENABLE_COUNTERS = 0;
	defparam core.ENABLE_COUNTERS64 = 0;
	defparam core.BARREL_SHIFTER = 1;
	defparam core.COMPRESSED_ISA = 1;
	defparam core.CATCH_MISALIGN = 0;
	defparam core.CATCH_ILLINSN = 0;
	defparam core.ENABLE_MUL = 1;
	defparam core.ENABLE_DIV = 1;
	defparam core.ENABLE_IRQ = 1;
	defparam core.ENABLE_IRQ_QREGS = 0;
	defparam core.ENABLE_IRQ_TIMER = 0;
	defparam core.PROGADDR_RESET = BOOTROM_BASE;

	picorv32 core
	(
		.clk,
		.resetn(srst_n),
		.trap(),

		.mem_valid,
		.mem_instr,
		.mem_ready,

		.mem_addr,
		.mem_wdata,
		.mem_wstrb,
		.mem_rdata,

		.mem_la_read,
		.mem_la_write,
		.mem_la_addr,
		.mem_la_wdata(),
		.mem_la_wstrb(),

		.pcpi_valid(),
		.pcpi_insn(),
		.pcpi_rs1(),
		.pcpi_rs2(),
		.pcpi_wr(),
		.pcpi_rd(),
		.pcpi_wait(0),
		.pcpi_ready(0),

		.irq,
		.eoi(),

		.trace_valid(),
		.trace_data()
	);

	picorv32_axi_adapter axi
	(
		.clk,
		.resetn(srst_n),

		.mem_axi_awvalid(axim.awvalid),
		.mem_axi_awready(axim.awready),
		.mem_axi_awaddr(axim.awaddr),
		.mem_axi_awprot(),

		.mem_axi_wvalid(axim.wvalid),
		.mem_axi_wready(axim.wready),
		.mem_axi_wdata(axim.wdata),
		.mem_axi_wstrb(), // Potential surprises: write strobes are dropped on the AXI side

		.mem_axi_bvalid(axim.bvalid),
		.mem_axi_bready(axim.bready),

		.mem_axi_arvalid(axim.arvalid),
		.mem_axi_arready(axim.arready),
		.mem_axi_araddr(axim.araddr),
		.mem_axi_arprot(),

		.mem_axi_rvalid(axim.rvalid),
		.mem_axi_rready(axim.rready),
		.mem_axi_rdata(axim.rdata),

		.mem_valid(mem_valid & axi_valid),
		.mem_instr,
		.mem_ready(axi_ready),
		.mem_addr,
		.mem_wdata,
		.mem_wstrb,
		.mem_rdata(axi_rdata)
	);

	// Local BRAM with byte write enables.
	always_ff @(posedge clk) begin
		if (bram_write) begin
			// mem_wstrb carries one strobe per BYTE lane, so strobe i must
			// gate byte i of the word. Indexing the word with `i` directly
			// only ever updated bits 0..3 and silently dropped the rest of
			// every store.
			for (int i = 0; i < $bits(mem_wstrb); ++i)
				if (mem_wstrb[i])
					bram[bram_addr][BYTE_BITS * i +: BYTE_BITS]
						<= mem_wdata[BYTE_BITS * i +: BYTE_BITS];

			// Read data is not meaningful during a write cycle.
			bram_rdata <= 'x;
		end else
			bram_rdata <= bram[bram_addr];
	end


	// Bus control, driven from the core's look-ahead interface.
	always_ff @(posedge clk or negedge rst_n)
		if (~rst_n) begin
			axi_valid <= 0;
			bram_read <= 0;
			bram_ready <= 0;
			bram_write <= 0;
		end else begin
			// Route to AXI when the BRAM is not selected, and keep the
			// request up until the adapter acknowledges it.
			axi_valid <= ~select_bram | (axi_valid & ~axi_ready);
			bram_read <= mem_la_read & select_bram;
			bram_write <= bram_write_next;
			// Writes are acknowledged one cycle after the look-ahead, reads
			// one cycle later so that bram_rdata has been loaded.
			bram_ready <= bram_read | bram_write_next;
		end

endmodule
.fetch_mem(insn_mem), + .icache_flush(schedif_out.CORE.IFLUSH.value) + ); + + gfx_shader_back backend + ( + .clk, + .rst_n, + .back(front_back.back), + .setup(setup.core), + .reg_wb(regfile.wb), + .read_data(regfile.ab) + ); + + gfx_shader_regs regs + ( + .clk, + .io(regfile.regs) + ); + + gfx_shader_schedif schedif + ( + .clk, + .arst_n(rst_n), + .s_axil(regblock.slave), + .hwif_in(schedif_in), + .hwif_out(schedif_out) + ); + +endmodule diff --git a/rtl/gfx/gfx_shader_back.sv b/rtl/gfx/gfx_shader_back.sv new file mode 100644 index 0000000..4929192 --- /dev/null +++ b/rtl/gfx/gfx_shader_back.sv @@ -0,0 +1,335 @@ +module gfx_shader_back +import gfx::*; +( + input logic clk, + rst_n, + + gfx_front_back.back back, + + gfx_regfile_io.ab read_data, + gfx_regfile_io.wb reg_wb, + + gfx_shader_setup.core setup +); + + logic abort; + + gfx_wb out_wb(), p0_wb(), p1_wb(), p2_wb(), p3_wb(); + gfx_shake p1_shake(), p2_shake(), p3_shake(); + + gfx_shader_abort p0_abort + ( + .clk, + .p1(p1_shake.peek), + .p2(p2_shake.peek), + .p3(p3_shake.peek), + .abort + ); + + gfx_shader_fpint p0 + ( + .clk, + .rst_n, + .op(back.execute.p0), + .wb(p0_wb.tx), + .wave(back.execute.wave), + .abort, + .read_data, + .in_valid(back.dispatch.valid) + ); + + gfx_shader_mem p1 + ( + .clk, + .rst_n, + .op(back.execute.p1), + .wb(p1_wb.tx), + .wave(back.execute.wave), + .in_shake(p1_shake.rx), + .read_data + ); + + gfx_shader_sfu p2 + ( + .clk, + .rst_n, + .op(back.execute.p2), + .wb(p2_wb.tx), + .wave(back.execute.wave), + .in_shake(p2_shake.rx), + .read_data + ); + + gfx_shader_group p3 + ( + .clk, + .rst_n, + .op(back.execute.p3), + .wb(p3_wb.tx), + .wave(back.execute.wave), + .in_shake(p3_shake.rx), + .read_data + ); + + gfx_shader_writeback_arbiter4 writeback_arbiter + ( + .clk, + .rst_n, + .p0(p0_wb.rx), + .p1(p1_wb.rx), + .p2(p2_wb.rx), + .p3(p3_wb.rx), + .out(out_wb.tx) + ); + + gfx_shader_writeback writeback + ( + .clk, + .rst_n, + .wb(out_wb.rx), + .regs(reg_wb), + .setup, + 
.loop_group(back.loop.group), + .loop_valid(back.loop.valid) + ); + +endmodule + +module gfx_shader_abort +( + input logic clk, + + gfx_shake.peek p1, + p2, + p3, + + output logic abort +); + + always_ff @(posedge clk) + abort <= + (p1.valid & p1.ready) + | (p2.valid & p2.ready) + | (p3.valid & p3.ready); + +endmodule + +module gfx_shader_writeback_arbiter4 +( + input logic clk, + rst_n, + + gfx_wb.rx p0, + p1, + p2, + p3, + + gfx_wb.tx out +); + + assert property ( + @(posedge clk) + disable iff (~rst_n) + + (p0.ready & out.ready) + ); + + gfx_wb p0_p1(), p2_p3(); + + gfx_shader_writeback_arbiter2_prio arbiter_p0_p1 + ( + .clk, + .rst_n, + .a(p0), + .b(p1), + .out(p0_p1.tx) + ); + + gfx_shader_writeback_arbiter2_prio arbiter_p2_p3 + ( + .clk, + .rst_n, + .a(p2), + .b(p3), + .out(p2_p3.tx) + ); + + gfx_shader_writeback_arbiter2_prio arbiter_out + ( + .clk, + .rst_n, + .a(p0_p1.rx), + .b(p2_p3.rx), // consume the rx side; arbiter_p2_p3 already drives p2_p3.tx + .out + ); + +endmodule + +module gfx_shader_writeback_arbiter2_prio +( + input logic clk, + rst_n, + + gfx_wb.rx a, + b, + + gfx_wb.tx out +); + + //TODO: real priority arbitration; for now port a is forwarded to out and port b is never accepted + assign a.ready = out.ready; + assign b.ready = 0; + + assign out.dest = a.dest; + assign out.lanes = a.lanes; + assign out.group = a.group; + assign out.valid = a.valid; + assign out.scalar = a.scalar; + assign out.writeback = a.writeback; + + assign out.mask = a.mask; + assign out.mask_update = a.mask_update; + + assign out.pc_add = a.pc_add; + assign out.pc_inc = a.pc_inc; + assign out.pc_update = a.pc_update; + +endmodule + +module gfx_shader_writeback +import gfx::*; +( + input logic clk, + rst_n, + + gfx_wb.rx wb, + + gfx_regfile_io.wb regs, + + output logic loop_valid, + output group_id loop_group, + + gfx_shader_setup.core setup +); + + struct + { + group_id group; + word lanes[SHADER_LANES]; + pc_offset pc_add; + lane_mask mask; + vgpr_num vgpr; + logic pc_update, + mask_update, + vgpr_update; + } loop_hold[REGFILE_STAGES], loop_out; + + logic loop_valid_hold[REGFILE_STAGES], loop_out_valid, mask_wb,
scalar_wb, + setup_gpr, setup_mask, setup_submit; + + assign wb.ready = 1; + + assign loop_out = loop_hold[REGFILE_STAGES - 1]; + assign loop_out_valid = loop_valid_hold[REGFILE_STAGES - 1]; + + assign loop_valid = loop_out_valid | setup_submit; + + assign regs.pc_back_group = wb.group; + assign regs.mask_back_group = wb.group; + + assign regs.pc_wb_write = (loop_out_valid & loop_out.pc_update) | setup_submit; + assign regs.mask_wb_write = mask_wb | setup_mask; + assign regs.sgpr_write.write = scalar_wb | setup_gpr; + + assign regs.vgpr_write.vgpr = loop_out.vgpr; + assign regs.vgpr_write.group = loop_out.group; + + assign mask_wb = loop_out_valid & loop_out.mask_update; + assign scalar_wb = wb.valid & wb.writeback & wb.scalar; + + always_comb begin + loop_group = setup.write.group; + regs.pc_wb = setup.write.pc; + regs.pc_wb_group = setup.write.group; + + if (loop_out_valid) begin + loop_group = loop_out.group; + regs.pc_wb = regs.pc_back + word_ptr'(loop_out.pc_add); + regs.pc_wb_group = loop_out.group; + end + + regs.mask_wb = setup.write.mask; + regs.mask_wb_group = setup.write.group; + + if (mask_wb) begin + regs.mask_wb = loop_out.mask; + regs.mask_wb_group = loop_out.group; + end + + regs.sgpr_write.data = setup.write.gpr_value; + regs.sgpr_write.sgpr = setup.write.gpr.sgpr; + regs.sgpr_write.group = setup.write.group; + + if (scalar_wb) begin + regs.sgpr_write.data = wb.lanes[0]; + regs.sgpr_write.sgpr = wb.dest.sgpr; + regs.sgpr_write.group = wb.group; + end + + for (int i = 0; i < SHADER_LANES; ++i) + regs.vgpr_write.data[i] = loop_out.lanes[i]; + + regs.vgpr_write.mask = regs.mask_back; + if (~loop_out_valid | ~loop_out.vgpr_update) + regs.vgpr_write.mask = '0; + end + + always_ff @(posedge clk) begin + // Blocking assignments por bug de verilator (ver for de lanes abajo) + + for (int i = REGFILE_STAGES - 1; i > 0; --i) + loop_hold[i] = loop_hold[i - 1]; + + loop_hold[0].mask = wb.mask; + loop_hold[0].vgpr = wb.dest.vgpr.num; + loop_hold[0].group = 
wb.group; + loop_hold[0].pc_add = wb.pc_add; + loop_hold[0].pc_update = wb.pc_update; + loop_hold[0].mask_update = wb.mask_update; + loop_hold[0].vgpr_update = wb.writeback & ~wb.scalar; + + // https://github.com/verilator/verilator/issues/4804 + for (int i = 0; i < SHADER_LANES; ++i) + loop_hold[0].lanes[i] = wb.lanes[i]; + + if (wb.pc_inc) + loop_hold[0].pc_add = pc_offset'(1); + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + setup_gpr <= 0; + setup_mask <= 0; + setup_submit <= 0; + + setup.set_done.gpr <= 0; + setup.set_done.mask <= 0; + setup.set_done.submit <= 0; + + for (int i = 0; i < $size(loop_valid_hold); ++i) + loop_valid_hold[i] <= 0; + end else begin + setup_gpr <= (setup_gpr & scalar_wb) | setup.write.gpr_set; + setup_mask <= (setup_mask & mask_wb) | setup.write.mask_set; + setup_submit <= (setup_submit & loop_out_valid) | setup.write.pc_set; + + setup.set_done.gpr <= setup_gpr & ~scalar_wb; + setup.set_done.mask <= setup_mask & ~mask_wb; + setup.set_done.submit <= setup_submit & ~loop_out_valid; + + loop_valid_hold[0] <= wb.valid; + for (int i = 1; i < REGFILE_STAGES; ++i) + loop_valid_hold[i] <= loop_valid_hold[i - 1]; + end + +endmodule diff --git a/rtl/gfx/gfx_shader_fpint.sv b/rtl/gfx/gfx_shader_fpint.sv new file mode 100644 index 0000000..a418dcc --- /dev/null +++ b/rtl/gfx/gfx_shader_fpint.sv @@ -0,0 +1,932 @@ +// -> 4,4,4,4,4,4,4,4 -> 8,8,8,8 -> 16,16 -> 32 +localparam int FPINT_CLZ_STAGES = 4; + +localparam bit[$clog2($bits(gfx::float_mant_ext)):0] FPINT_MAX_SHIFT + = 1 << $clog2($bits(gfx::float_mant_ext)); + +typedef logic[$clog2(FPINT_MAX_SHIFT):0] fpint_shift; + +/* Las 15 etapas son: + * - setup + * - mulclass + * - mnorm + * - minmax + * - expdiff + * - shiftr + * - addsub + * - clz0-clz3 + * - shiftl + * - round + * - rnorm + * - encode + */ + +typedef struct +{ + gfx::float a, + b, + a_mul, + b_mul; +} fpint_setup_mulclass; + +typedef struct +{ + gfx::float b; + gfx::float_exp exp; + gfx::float_class a_class, 
+ b_class; + gfx::udword product; + logic sign, + overflow; +} fpint_mulclass_mnorm; + +typedef struct +{ + gfx::float a, + b; + gfx::float_class a_class, + b_class; + logic slow, + zero, + guard, + round, + sticky, + slow_in, + overflow; +} fpint_mnorm_minmax; + +typedef struct +{ + gfx::float max, + min; + gfx::float_class max_class, + min_class; + logic slow, + zero, + guard, + round, + sticky; +} fpint_minmax_expdiff; + +typedef struct +{ + gfx::float max, + min; + gfx::float_class max_class, + min_class; + fpint_shift exp_shift; + logic slow, + zero, + guard, + round, + sticky; +} fpint_expdiff_shiftr; + +typedef struct +{ + gfx::float max, + min; + gfx::float_class max_class, + min_class; + gfx::float_mant_ext max_mant, + min_mant, + sticky_mask; + logic slow, + zero, + guard, + round, + sticky, + int_sign; +} fpint_shiftr_addsub; + +typedef struct +{ + gfx::float max; + gfx::word add_sub; + logic slow, + zero, + guard, + round, + sticky; +} fpint_clz_hold; + +typedef fpint_clz_hold fpint_addsub_clz; + +typedef struct +{ + fpint_clz_hold hold; + fpint_shift shift; +} fpint_clz_shiftl; + +typedef struct +{ + gfx::float val; + logic slow, + zero, + guard, + round, + sticky, + overflow, + sticky_last; +} fpint_shiftl_round; + +typedef struct +{ + gfx::float val; + logic slow, + zero, + exp_step, + overflow; +} fpint_round_rnorm; + +typedef struct +{ + gfx::float val; + logic slow, + zero, + overflow; +} fpint_rnorm_encode; + +module gfx_shader_fpint +import gfx::*; +( + input logic clk, + rst_n, + + input fpint_op op, + input wave_exec wave, + input logic abort, + in_valid, + + gfx_regfile_io.ab read_data, + + gfx_wb.tx wb +); + + localparam int FPINT_STAGES = 7 + FPINT_CLZ_STAGES + 4; + + struct + { + fpint_op op; + wave_exec wave; + } stage[FPINT_STAGES]; + + logic stage_valid[FPINT_STAGES]; + + assign wb.dest = stage[FPINT_STAGES - 1].wave.dest; + assign wb.mask = 'x; + assign wb.group = stage[FPINT_STAGES - 1].wave.group; + assign wb.pc_add = 'x; + assign 
wb.pc_inc = 1; + assign wb.scalar = stage[FPINT_STAGES - 1].wave.dest_scalar; + assign wb.pc_update = wb.writeback; + assign wb.writeback = stage[FPINT_STAGES - 1].op.writeback; + assign wb.mask_update = 0; + + // Ojo: stage_valid[0], pero stage[0] no + assign stage_valid[0] = in_valid; + + genvar lane; + generate + for (lane = 0; lane < SHADER_LANES; ++lane) begin: lanes + gfx_shader_fpint_lane unit + ( + .clk(clk), + .a(read_data.a[lane]), + .b(read_data.b[lane]), + .q(wb.lanes[lane]), + .mul_float_0(op.setup_mul_float), + .unit_b_0(op.setup_unit_b), + .put_hi_2(stage[2 - 1].op.mnorm_put_hi), + .put_lo_2(stage[2 - 1].op.mnorm_put_lo), + .put_mul_2(stage[2 - 1].op.mnorm_put_mul), + .zero_b_2(stage[2 - 1].op.mnorm_zero_b), + .zero_flags_2(stage[2 - 1].op.mnorm_zero_flags), + .abs_3(stage[3 - 1].op.minmax_abs), + .swap_3(stage[3 - 1].op.minmax_swap), + .zero_min_3(stage[3 - 1].op.minmax_zero_min), + .copy_flags_3(stage[3 - 1].op.minmax_copy_flags), + .int_signed_5(stage[5 - 1].op.shiftr_int_signed), + .copy_flags_6(stage[6 - 1].op.addsub_copy_flags), + .int_operand_6(stage[6 - 1].op.addsub_int_operand), + .force_nop_7(stage[7 - 1].op.clz_force_nop), + .copy_flags_11(stage[11 - 1].op.shiftl_copy_flags), + .copy_flags_12(stage[12 - 1].op.round_copy_flags), + .enable_12(stage[12 - 1].op.round_enable), + .enable_14(stage[14 - 1].op.encode_enable) + ); + end + endgenerate + + always_ff @(posedge clk) begin + stage[0].op <= op; + stage[0].wave <= wave; + + for (int i = 1; i < FPINT_STAGES; ++i) + stage[i] <= stage[i - 1]; + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + for (int i = 1; i < FPINT_STAGES; ++i) + stage_valid[i] <= 0; + + wb.valid <= 0; + end else begin + for (int i = 1; i < FPINT_STAGES; ++i) + stage_valid[i] <= stage_valid[i - 1]; + + // Se levanta 1 ciclo luego que in_valid + stage_valid[2] <= stage_valid[1] & ~abort; + + wb.valid <= stage_valid[FPINT_STAGES - 1]; + end + +endmodule + +module gfx_shader_fpint_lane +import gfx::*; 
+( + input logic clk, + + input word a, + b, + + input logic mul_float_0, + unit_b_0, + put_hi_2, + put_lo_2, + put_mul_2, + zero_b_2, + zero_flags_2, + abs_3, + swap_3, + zero_min_3, + copy_flags_3, + int_signed_5, + copy_flags_6, + int_operand_6, + force_nop_7, + copy_flags_11, + copy_flags_12, + enable_12, + enable_14, + + output word q +); + + /* Implementation notes for floating-point + * + * === PRODUCT === + * + * We want to compute q = a * b. + * + * Where a = (-1)^s * 1.m * 2^f, + * b = (-1)^t * 1.n * 2^g + * + * Then q = (-1)^(s + t) (1.m * 1.n) 2^(f + g) + * + * The product is between numbers >= 1.0 and < 2.0. The extremes are: + * Best case: 1.000... * 1.000... ~ 1.000... + * Worst case: 1.999... * 1.999... ~ 3.999... = 2^1 * 1.999 + * + * So, if the product is >= 2, the mantissa must be shifted >> 1 + * and 1 added to the exponent in order to normalize. + * + * + * === ADD/SUBTRACT === + * + * We want to compute q = a + b. Curiously, that is more involved than a * b. + * The exponent of the smaller of a and b must be adjusted to match + * that of the larger (denormalizing), then the operation is done and finally + * the result is renormalized. Add or subtract is chosen from the sign relation, + * not from the input operation (the latter only xors the sign of b). + * Keep in mind that IEEE 754 is a kind of sign-magnitude, not two's complement. + * + * For a subtraction, the normalized exponent may be much + * smaller than either of the input exponents. We therefore + * need CLZ (count leading zeros) logic to renormalize. + * + * + * === INTEGER->FP CONVERSION === + * + * This simply reuses the fadd datapath, with the abs of the integer + * as the clz input. The reference exponent is fixed + * at 30 (alluding to the second msb of a 32-bit integer). From + * that point on it is identical to an fadd; the clz stages take care of adjusting + * the exponent.
+ */ + + fpint_setup_mulclass setup_mulclass; + fpint_mulclass_mnorm mulclass_mnorm; + fpint_mnorm_minmax mnorm_minmax; + fpint_minmax_expdiff minmax_expdiff; + fpint_expdiff_shiftr expdiff_shiftr; + fpint_shiftr_addsub shiftr_addsub; + fpint_addsub_clz addsub_clz; + fpint_clz_shiftl clz_shiftl; + fpint_shiftl_round shiftl_round; + fpint_round_rnorm round_rnorm; + fpint_rnorm_encode rnorm_encode; + + gfx_shader_fpint_setup stage_0 + ( + .clk(clk), + .a(a), + .b(b), + .out(setup_mulclass), + .unit_b(unit_b_0), + .mul_float(mul_float_0) + ); + + gfx_shader_fpint_mulclass stage_1 + ( + .clk(clk), + .in(setup_mulclass), + .out(mulclass_mnorm) + ); + + gfx_shader_fpint_mnorm stage_2 + ( + .clk(clk), + .in(mulclass_mnorm), + .out(mnorm_minmax), + .put_hi(put_hi_2), + .put_lo(put_lo_2), + .put_mul(put_mul_2), + .zero_b(zero_b_2), + .zero_flags(zero_flags_2) + ); + + gfx_shader_fpint_minmax stage_3 + ( + .clk(clk), + .in(mnorm_minmax), + .out(minmax_expdiff), + .abs(abs_3), + .swap(swap_3), + .zero_min(zero_min_3), + .copy_flags(copy_flags_3) + ); + + gfx_shader_fpint_expdiff stage_4 + ( + .clk(clk), + .in(minmax_expdiff), + .out(expdiff_shiftr) + ); + + gfx_shader_fpint_shiftr stage_5 + ( + .clk(clk), + .in(expdiff_shiftr), + .out(shiftr_addsub), + .int_signed(int_signed_5) + ); + + gfx_shader_fpint_addsub stage_6 + ( + .clk(clk), + .in(shiftr_addsub), + .out(addsub_clz), + .copy_flags(copy_flags_6), + .int_operand(int_operand_6) + ); + + gfx_shader_fpint_clz stage_7_8_9_10 + ( + .clk(clk), + .in(addsub_clz), + .out(clz_shiftl), + .force_nop(force_nop_7) + ); + + gfx_shader_fpint_shiftl stage_11 + ( + .clk(clk), + .in(clz_shiftl), + .out(shiftl_round), + .copy_flags(copy_flags_11) + ); + + gfx_shader_fpint_round stage_12 + ( + .clk(clk), + .in(shiftl_round), + .out(round_rnorm), + .enable(enable_12), + .copy_flags(copy_flags_12) + ); + + gfx_shader_fpint_rnorm stage_13 + ( + .clk(clk), + .in(round_rnorm), + .out(rnorm_encode) + ); + + gfx_shader_fpint_encode stage_14 + ( 
+ .clk(clk), + .q(q), + .in(rnorm_encode), + .enable(enable_14) + ); + +endmodule + +// Stage 0: argumentos de mul +module gfx_shader_fpint_setup +import gfx::*; +( + input logic clk, + + input word a, + b, + input logic mul_float, + unit_b, + + output fpint_setup_mulclass out +); + + always_ff @(posedge clk) begin + out.a <= a; + out.b <= b; + out.a_mul <= a; + out.b_mul <= b; + + /* Nótese que el orden es sign-exp-mant. Esto coloca el '1.' implícito + * en la posición correcta para multiplicar las mantisas. + */ + if (mul_float) begin + out.a_mul.exp <= 1; + out.b_mul.exp <= 1; + out.a_mul.sign <= 0; + out.b_mul.sign <= 0; + end + + if (unit_b) begin + out.b_mul.exp <= 0; + out.b_mul.mant <= 1; + out.b_mul.sign <= 0; + end + end + +endmodule + +// Stage 1: multiplicación de fp o enteros +module gfx_shader_fpint_mulclass +import gfx::*; +( + input logic clk, + + input fpint_setup_mulclass in, + + output fpint_mulclass_mnorm out +); + + always_ff @(posedge clk) begin + out.b <= in.b; + out.sign <= in.a.sign ^ in.b.sign; + out.a_class <= classify_float(in.a); + out.b_class <= classify_float(in.b); + out.product <= in.a_mul * in.b_mul; + {out.overflow, out.exp} <= {1'b0, in.a.exp} + {1'b0, in.b.exp} - {1'b0, FLOAT_EXP_BIAS}; + end + +endmodule + +// Stage 2: normalización +module gfx_shader_fpint_mnorm +import gfx::*; +( + input logic clk, + + input fpint_mulclass_mnorm in, + input logic put_hi, + put_lo, + put_mul, + zero_b, + zero_flags, + + output fpint_mnorm_minmax out +); + + word product_hi, product_lo; + logic guard, lo_msb, lo_reduce, round, slow_in_next; + float_mant_full hi; + logic[$bits(float_mant_full) - 3:0] lo; + + assign lo_msb = lo[$bits(lo) - 1]; + assign lo_reduce = |lo[$bits(lo) - 2:0]; + assign slow_in_next = is_float_special(in.a_class) | is_float_special(in.b_class); + assign {product_hi, product_lo} = in.product; + assign {hi, guard, round, lo} = in.product[2 * $bits(float_mant_full) - 1:0]; + + always_ff @(posedge clk) begin + if (put_mul) 
begin + out.slow <= slow_in_next | (in.overflow & ~in.a_class.exp_min & ~in.b_class.exp_min); // exponent overflow only counts when neither operand has exp_min (see out.zero) + out.zero <= in.a_class.exp_min | in.b_class.exp_min; + end else begin + out.slow <= 0; + out.zero <= 0; + end + + out.a.sign <= in.sign; + out.overflow <= 0; + + if (hi[$bits(hi) - 1]) begin + out.guard <= guard; + out.round <= round; + out.sticky <= lo_msb | lo_reduce; + out.a.mant <= implicit_mant(hi); + {out.overflow, out.a.exp} <= {1'b0, in.exp} + 1; + end else begin + /* The bit just below the msb is necessarily 1, since the msbs of + * both multiplicands are 1. See the assert in implicit_mant(). + */ + out.guard <= round; + out.round <= lo_msb; + out.sticky <= lo_reduce; + + out.a.exp <= in.exp; + out.a.mant <= implicit_mant({hi[$bits(hi) - 2:0], guard}); + end + + unique case (1'b1) + put_mul: ; + + put_hi: + out.a <= product_hi; + + put_lo: + out.a <= product_lo; + endcase + + out.a_class <= in.a_class; + out.slow_in <= slow_in_next; + + if (zero_flags) begin + out.a_class <= classify_float(0); + out.slow_in <= 0; + end + + if (zero_b) begin + out.b <= 0; + out.b_class <= classify_float(0); + end else begin + out.b <= in.b; + out.b_class <= in.b_class; + end + end + +endmodule + +// Stage 3: order the operands so that abs(max) >= abs(min) +module gfx_shader_fpint_minmax +import gfx::*; +( + input logic clk, + + input fpint_mnorm_minmax in, + input logic abs, + swap, + zero_min, + copy_flags, + + output fpint_minmax_expdiff out +); + + logic abs_b_gt_abs_a, b_gt_a; + + /* Wikipedia says: + * + * A property of the single- and double-precision formats is that + * their encoding allows one to easily sort them without using + * floating-point hardware, as if the bits represented sign-magnitude + * integers, although it is unclear whether this was a design + * consideration (it seems noteworthy that the earlier IBM hexadecimal + * floating-point representation also had this property for normalized + * numbers).
+ */ + assign abs_b_gt_abs_a = {in.b.exp, in.b.mant} > {in.a.exp, in.a.mant}; + + always_comb begin + unique case ({in.b.sign, in.a.sign}) + 2'b00: b_gt_a = abs_b_gt_abs_a; + 2'b01: b_gt_a = 1; + 2'b10: b_gt_a = 0; + 2'b11: b_gt_a = abs_b_gt_abs_a; + endcase + + if (abs) + b_gt_a = abs_b_gt_abs_a; + end + + always_ff @(posedge clk) begin + if (b_gt_a ^ swap) begin + out.max <= in.b; + out.min <= in.a; + out.max_class <= in.b_class; + out.min_class <= in.a_class; + end else begin + out.max <= in.a; + out.min <= in.b; + out.max_class <= in.a_class; + out.min_class <= in.b_class; + end + + if (zero_min) begin + out.min <= 0; + out.min_class <= classify_float(0); + end + + out.guard <= in.guard; + out.round <= in.round; + out.sticky <= in.sticky; + + if (copy_flags) begin + out.slow <= in.slow | in.overflow; + out.zero <= in.zero; + end else begin + out.slow <= in.slow_in; + out.zero <= 0; + end + end + +endmodule + +// Stage 4: exp_shift amount +module gfx_shader_fpint_expdiff +import gfx::*; +( + input logic clk, + + input fpint_minmax_expdiff in, + + output fpint_expdiff_shiftr out +); + + float_exp exp_delta; + + assign exp_delta = in.max.exp - in.min.exp; + + always_ff @(posedge clk) begin + out.max <= in.max; + out.min <= in.min; + out.slow <= in.slow; + out.zero <= in.zero; + out.guard <= in.guard; + out.round <= in.round; + out.sticky <= in.sticky; + out.max_class <= in.max_class; + out.min_class <= in.min_class; + + out.exp_shift <= exp_delta[$bits(out.exp_shift) - 1:0]; + if (exp_delta > {{($bits(exp_delta) - $bits(FPINT_MAX_SHIFT)){1'b0}}, FPINT_MAX_SHIFT}) + out.exp_shift <= FPINT_MAX_SHIFT; + end + +endmodule + +// Stage 5: shifts y abs(max) para enteros con signo +module gfx_shader_fpint_shiftr +import gfx::*; +( + input logic clk, + + input fpint_expdiff_shiftr in, + input logic int_signed, + + output fpint_shiftr_addsub out +); + + always_ff @(posedge clk) begin + out.min <= in.min; + out.slow <= in.slow; + out.zero <= in.zero; + out.guard <= in.guard; 
+ out.round <= in.round; + out.sticky <= in.sticky; + out.min_class <= in.min_class; + + out.max_mant <= float_prepare_round(in.max, in.max_class); + out.min_mant <= float_prepare_round(in.min, in.min_class) >> in.exp_shift; + out.sticky_mask <= {($bits(out.min_mant)){1'b1}} << in.exp_shift; + + out.max <= in.max; + out.int_sign <= in.max[$bits(in.max) - 1]; + + if (int_signed & in.max[$bits(in.max) - 1]) + out.max <= -in.max; + end + +endmodule + +// Stage 6: suma de mantisas +module gfx_shader_fpint_addsub +import gfx::*; +( + input logic clk, + + input fpint_shiftr_addsub in, + input logic copy_flags, + int_operand, + + output fpint_addsub_clz out +); + + localparam int INT_SHIFT_REF = $bits(word) - 2; + + function word fp_add_sub_arg(float_mant_ext arg); + fp_add_sub_arg = {1'b0, arg, {($bits(fp_add_sub_arg) - $bits(arg) - 1){1'b0}}}; + endfunction + + always_ff @(posedge clk) begin + out.max <= in.max; + out.slow <= in.slow; + out.zero <= in.zero; + out.guard <= in.guard; + out.round <= in.round; + + if (int_operand) begin + out.max.exp <= FLOAT_EXP_BIAS + INT_SHIFT_REF[$bits(float_exp) - 1:0]; + out.max.sign <= in.int_sign; + end + + if (copy_flags) + out.sticky <= in.sticky; + else + out.sticky <= |(float_prepare_round(in.min, in.min_class) & ~in.sticky_mask); + + if (int_operand) + out.add_sub <= in.max; + else if (in.max.sign ^ in.min.sign) + out.add_sub <= fp_add_sub_arg(in.max_mant) - fp_add_sub_arg(in.min_mant); + else + out.add_sub <= fp_add_sub_arg(in.max_mant) + fp_add_sub_arg(in.min_mant); + end + +endmodule + +// Stages 7-10: encontrar el 1 más significativo +module gfx_shader_fpint_clz +import gfx::*; +( + input logic clk, + + input fpint_addsub_clz in, + input logic force_nop, + + output fpint_clz_shiftl out +); + + word clz_in; + fpint_clz_hold hold[FPINT_CLZ_STAGES]; + + assign out.hold = hold[FPINT_CLZ_STAGES - 1]; + + gfx_clz #($bits(word)) clz + ( + .clk(clk), + .clz(out.shift), + .value(clz_in) + ); + + always_comb begin + clz_in = 
in.add_sub; + if (force_nop) + clz_in[$bits(clz_in) - 1:$bits(clz_in) - 2] = 2'b01; + end + + always_ff @(posedge clk) begin + hold[0] <= in; + + for (int i = 1; i < FPINT_CLZ_STAGES; ++i) + hold[i] <= hold[i - 1]; + end + +endmodule + +// Stage 11: normalization +module gfx_shader_fpint_shiftl +import gfx::*; +( + input logic clk, + + input fpint_clz_shiftl in, + input logic copy_flags, + + output fpint_shiftl_round out +); + + localparam int CLZ_EXTEND_BITS = $bits(float_exp) - $bits(in.shift) + 1; + + word normalized; + + assign normalized = in.hold.add_sub << in.shift; + + always_ff @(posedge clk) begin + out.slow <= in.hold.slow; + out.zero <= in.hold.zero; + out.sticky <= in.hold.sticky; + out.val.sign <= in.hold.max.sign; + + {out.val.mant, out.guard, out.round, out.sticky_last} <= + normalized[$bits(normalized) - 2:$bits(normalized) - $bits(float_mant) - 4]; + + {out.overflow, out.val.exp} <= + {1'b0, in.hold.max.exp} - {{CLZ_EXTEND_BITS{1'b0}}, in.shift} + 1; + + if (in.shift[$bits(in.shift) - 1]) + out.zero <= 1; + + if (copy_flags) begin + out.guard <= in.hold.guard; + out.round <= in.hold.round; + out.overflow <= 0; + out.sticky_last <= 0; + end + end + +endmodule + +// Stage 12: rounding +module gfx_shader_fpint_round +import gfx::*; +( + input logic clk, + + input fpint_shiftl_round in, + input logic copy_flags, + enable, + + output fpint_round_rnorm out +); + + always_ff @(posedge clk) begin + out.val <= in.val; + out.slow <= in.slow | (~copy_flags & in.overflow & ~in.zero); + out.zero <= in.zero; + out.exp_step <= 0; + + // This is the most common rounding mode: round to nearest, ties to even + if (enable & in.guard & (in.round | in.sticky | in.sticky_last | in.val.mant[0])) + {out.exp_step, out.val.mant} <= {1'b0, in.val.mant} + 1; // increment this cycle's input mantissa (not the stale registered output); the carry-out becomes exp_step + end + +endmodule + +// Stage 13: exponent adjustment after rounding +module gfx_shader_fpint_rnorm +import gfx::*; +( + input logic clk, + + input fpint_round_rnorm in, + + output fpint_rnorm_encode out +); + +
always_ff @(posedge clk) begin + out.slow <= in.slow; + out.zero <= in.zero; + out.overflow <= 0; + out.val.mant <= in.val.mant; + out.val.sign <= in.val.sign; + + if (in.exp_step) + {out.overflow, out.val.exp} <= {1'b0, in.val.exp} + 1; + else + out.val.exp <= in.val.exp; + end + +endmodule + +// Stage 14: salida y codificación de ceros y NaNs +module gfx_shader_fpint_encode +import gfx::*; +( + input logic clk, + + input fpint_rnorm_encode in, + input logic enable, + + output float q +); + + always_ff @(posedge clk) begin + q <= in.val; + + if (enable) begin + if (&in.val.exp | in.slow | in.overflow) begin + q.exp <= FLOAT_EXP_MAX; + q.mant <= 1; + end else if (in.zero) begin + q.exp <= 0; + q.mant <= 0; + end + end + end + +endmodule diff --git a/rtl/gfx/gfx_shader_front.sv b/rtl/gfx/gfx_shader_front.sv new file mode 100644 index 0000000..52074fd --- /dev/null +++ b/rtl/gfx/gfx_shader_front.sv @@ -0,0 +1,746 @@ +typedef struct +{ + logic valid, + retry; + gfx::group_id group; + gfx_isa::insn_word insn; +} front_wave; + +typedef struct +{ + gfx::xgpr_num dest; + logic dest_scalar; +} front_reg_passthru; + +typedef logic[4:0] icache_line_num; + +typedef logic[$bits(gfx::oword_ptr) - $bits(icache_line_num) - 1:0] icache_tag; + +typedef struct packed +{ + icache_tag tag; + icache_line_num line; +} icache_line_tag; + +typedef struct packed +{ + icache_line_tag line_tag; + logic[2:0] word_num; +} icache_ptr; + +module gfx_shader_front +import gfx::*; +( + input logic clk, + rst_n, + + gfx_axib.m fetch_mem, + + input logic icache_flush, + + gfx_regfile_io.read reg_read, + gfx_regfile_io.bind_ reg_bind, + + gfx_front_back.front front +); + + word fetch_insn, port_insn; + logic fetch_hit, p0_writeback; + front_wave bind_wave, dec_wave, port_dec_wave; + front_reg_passthru reg_passthru; + + assign front.execute.wave.dest = reg_passthru.dest; + assign front.execute.wave.dest_scalar = reg_passthru.dest_scalar; + + gfx_shader_bind bind_ + ( + .clk, + .rst_n, + 
.mem(fetch_mem), + .wave(bind_wave), + .regs(reg_bind), + .loop_valid(front.loop.valid), + .loop_group(front.loop.group), + .icache_flush + ); + + gfx_shader_read_regs reg_dec + ( + .clk, + .rst_n, + .in(bind_wave), + .out(dec_wave), + .read(reg_read), + .passthru(reg_passthru) + ); + + gfx_shader_decode_class class_dec + ( + .clk, + .rst_n, + .wave(dec_wave), + .out_group(front.execute.wave.group), + .port_wave(port_dec_wave), + .dispatch(front.dispatch), + .p0_writeback + ); + + gfx_shader_decode_fpint p0_dec + ( + .clk, + .op(front.execute.p0), + .insn(port_dec_wave.insn), + .writeback(p0_writeback) + ); + +endmodule + +module gfx_shader_bind +import gfx::*; +( + input logic clk, + rst_n, + + gfx_axib.m mem, + + input logic icache_flush, + + input logic loop_valid, + input group_id loop_group, + + gfx_regfile_io.bind_ regs, + + output front_wave wave +); + + localparam int ICACHE_STAGES = 6; + localparam int BIND_STAGES = REGFILE_STAGES + ICACHE_STAGES; + + gfx_beats #($bits(group_id)) runnable_in(), runnable_out(); + + logic ar_stall, request_ready, request_valid, valids[BIND_STAGES]; + group_id groups[BIND_STAGES]; + icache_line_tag araddr, request_addr; + + assign mem.bready = 0; + assign mem.wvalid = 0; + assign mem.awvalid = 0; + + assign mem.arlen = ($bits(mem.arlen))'($bits(oword) / $bits(word) - 1); + assign mem.araddr = {araddr, ($clog2($bits(oword)) - $clog2($bits(word)) + SUBWORD_BITS)'('0)}; + assign mem.arburst = 2'b01; // Incremental mode + + assign runnable_in.tx.data = loop_group; + assign runnable_in.tx.valid = loop_valid; + + assign regs.pc_front_group = runnable_out.rx.data; + assign runnable_out.rx.ready = 1; + + assign wave.group = groups[$size(groups) - 1]; + + gfx_skid_buf #($bits(araddr)) ar_skid + ( + .clk, + .in(request_addr), + .out(araddr), + .stall(ar_stall) + ); + + gfx_skid_flow ar_flow + ( + .clk, + .rst_n, + .stall(ar_stall), + .in_ready(request_ready), + .in_valid(request_valid), + .out_ready(mem.arready), + 
.out_valid(mem.arvalid) + ); + + //TODO: We could drop ~25 entries without affecting throughput, latency or correctness + gfx_fifo #(.WIDTH($bits(group_id)), .DEPTH(1 << $bits(group_id))) runnable + ( + .clk, + .rst_n, + .in(runnable_in.rx), + .out(runnable_out.tx) + ); + + gfx_shader_bind_icache icache + ( + .clk, + .rst_n, + + .icache_flush, + .read_addr(regs.pc_front), + .read_valid(valids[REGFILE_STAGES - 1]), + + .request_addr, + .request_valid, + .request_ready, + + .fetch_data(mem.rdata), + .fetch_last(mem.rlast), + .fetch_valid(mem.rvalid), + .fetch_ready(mem.rready), + + .insn(wave.insn), + .insn_retry(wave.retry), + .insn_valid(wave.valid) + ); + + always_ff @(posedge clk) begin + groups[0] <= runnable_out.rx.data; + for (int i = 1; i < $size(groups); ++i) + groups[i] <= groups[i - 1]; + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) + for (int i = 0; i < $size(valids); ++i) + valids[i] <= 0; + else begin + valids[0] <= runnable_out.rx.valid; + for (int i = 1; i < $size(valids); ++i) + valids[i] <= valids[i - 1]; + end + +endmodule + +module gfx_shader_bind_icache +import gfx::*; +( + input logic clk, + rst_n, + + input logic icache_flush, + + input logic read_valid, + input icache_ptr read_addr, + + input logic fetch_last, + fetch_valid, + input word fetch_data, + output logic fetch_ready, + + input logic request_ready, + output logic request_valid, + output icache_line_tag request_addr, + + output logic insn_valid, + insn_retry, + output word insn +); + + // Dan Gisselquist limits to (1 << 3) bursts by default.
+ // See LGMAXBURST in axixbar.v + localparam int PENDING_FIFO_DEPTH = 8; + + enum int unsigned + { + FLUSH, + RUN + } state; + + struct + { + logic valid, + accessed, + hit; + icache_tag tag; + oword data; + } cache[1 << $bits(icache_line_num)], read, read_hold; + + gfx_beats #($bits(icache_line_tag)) pending_in(), pending_out(); + + logic accessed_write, accessed_write_enable, burst, fetch_done, hit_write, + in_flush, hit_commit, hit_write_enable, retry_4, retry_5, rollback, + tag_hit, valid_1, valid_2, valid_3, valid_4, valid_5, valid_write, + valid_write_enable; + + icache_ptr read_addr_1, read_addr_2, read_addr_3, read_addr_4, read_addr_5; + icache_tag tag_write; + icache_line_num accessed_write_line, flush_ptr, hit_write_line, valid_write_line; + icache_line_tag pending_pop; + + oword data_write; + word[1:0] data_5; + word[7:0] fetch_shift; + qword[1:0] data_3; + udword[1:0] data_4; + + assign data_3 = read.data; + assign tag_hit = read.tag == read_addr_3.line_tag.tag; + assign fetch_ready = ~fetch_done; + assign pending_pop = pending_out.rx.data; + + assign request_addr = read_addr_4.line_tag; + assign request_valid = burst & pending_in.tx.ready; + assign pending_in.tx.data = read_addr_4.line_tag; + assign pending_in.tx.valid = burst & request_ready; + assign pending_out.rx.ready = fetch_done & ~hit_commit & ~rollback; + + gfx_fifo #(.WIDTH($bits(icache_line_tag)), .DEPTH(PENDING_FIFO_DEPTH)) pending + ( + .clk, + .rst_n, + .in(pending_in.rx), + .out(pending_out.tx) + ); + + always_comb + unique case (state) + FLUSH: in_flush = 1; + RUN: in_flush = 0; + endcase + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + state <= FLUSH; + flush_ptr <= '0; + fetch_done <= 0; + + valid_1 <= 0; + valid_2 <= 0; + valid_3 <= 0; + valid_4 <= 0; + valid_5 <= 0; + + burst <= 0; + end else begin + unique case (state) + FLUSH: + if (~icache_flush & &flush_ptr) + state <= RUN; + + RUN: + if (icache_flush) + state <= FLUSH; + endcase + + flush_ptr <= flush_ptr +
1; + if (icache_flush) + flush_ptr <= '0; + + if (fetch_done) + fetch_done <= hit_commit | ~pending_out.rx.valid | rollback; + else if (fetch_ready & fetch_valid) + fetch_done <= fetch_last; + + valid_1 <= read_valid; + valid_2 <= valid_1; + valid_3 <= valid_2; + valid_4 <= valid_3; + valid_5 <= valid_4; + + burst <= valid_3 & ~tag_hit & ~read.accessed & (~read.valid | read.hit); + end + + always_ff @(posedge clk) begin + tag_write <= pending_pop.tag; + data_write <= fetch_shift; + + valid_write <= 1; + valid_write_line <= pending_pop.line; + valid_write_enable <= fetch_done & ~hit_commit & pending_out.rx.valid & ~rollback; + + accessed_write <= 0; + accessed_write_enable <= 1; + + if (rollback) + accessed_write_line <= read_addr_5.line_tag.line; + else if (fetch_done & ~hit_commit & pending_out.rx.valid) + accessed_write_line <= pending_pop.line; + else begin + accessed_write <= 1; + accessed_write_line <= read_addr.line_tag.line; + accessed_write_enable <= read_valid; + end + + hit_write <= hit_commit; + if (hit_commit) begin + hit_write_line <= read_addr_4.line_tag.line; + hit_write_enable <= 1; + end else begin + hit_write_line <= pending_pop.line; + hit_write_enable <= fetch_done & pending_out.rx.valid & ~rollback; + end + + if (in_flush) begin + valid_write <= 0; + valid_write_line <= flush_ptr; + valid_write_enable <= 1; + + accessed_write <= 0; + accessed_write_line <= flush_ptr; + accessed_write_enable <= 1; + + hit_write <= 0; + hit_write_line <= flush_ptr; + hit_write_enable <= 1; + end + + if (valid_write_enable) begin + cache[valid_write_line].tag <= tag_write; + cache[valid_write_line].data <= data_write; + cache[valid_write_line].valid <= valid_write; + end + + if (accessed_write_enable) + cache[accessed_write_line].accessed <= accessed_write; + + if (hit_write_enable) + cache[hit_write_line].hit <= hit_write; + + read_addr_1 <= read_addr; + + read_hold <= cache[read_addr_1.line_tag.line]; + read_addr_2 <= read_addr_1; + + read <= read_hold; + 
read_addr_3 <= read_addr_2; + + data_4 <= data_3[read_addr_3.word_num[2]]; + retry_4 <= ~tag_hit | ~read.valid; + hit_commit <= valid_3 & tag_hit & read.valid; + read_addr_4 <= read_addr_3; + + data_5 <= data_4[read_addr_4.word_num[1]]; + retry_5 <= retry_4; + rollback <= burst & (~request_valid | ~pending_in.tx.valid); + read_addr_5 <= read_addr_4; + + insn <= data_5[read_addr_5.word_num[0]]; + insn_retry <= retry_5; + insn_valid <= valid_5; + + if (fetch_ready & fetch_valid) begin + fetch_shift[0] <= fetch_data; + for (int i = 1; i < $size(fetch_shift); ++i) + fetch_shift[i] <= fetch_shift[i - 1]; + end + end + +endmodule + +module gfx_shader_read_regs +import gfx::*; +import gfx_isa::*; +( + input logic clk, + rst_n, + + input front_wave in, + + gfx_regfile_io.read read, + + output front_wave out, + output front_reg_passthru passthru +); + + // + 1 because read.op is registered (driven on the next cycle) + localparam int PASSTHRU_DEPTH = REG_READ_STAGES + 1 - 2; + localparam int HOLD_DEPTH = PASSTHRU_DEPTH - 2; + + logic reg_rev; + logic valid[HOLD_DEPTH]; + front_wave out_hold[HOLD_DEPTH]; + front_reg_passthru passthru_hold[PASSTHRU_DEPTH]; + + assign passthru = passthru_hold[$size(passthru_hold) - 1]; + + assign reg_rev = in.insn.reg_rev; + + always_comb begin + out = out_hold[$size(out_hold) - 1]; + out.valid = valid[$size(valid) - 1]; + end + + always_ff @(posedge clk) begin + out_hold[0] <= in; + for (int i = 1; i < $size(out_hold); ++i) + out_hold[i] <= out_hold[i - 1]; + + passthru_hold[0].dest <= in.insn.dst_src.rr.rd; + unique case (in.insn.reg_mode) + REGS_SVS, REGS_SSS: + passthru_hold[0].dest_scalar <= 1; + + REGS_VVS, REGS_VVV: + passthru_hold[0].dest_scalar <= 0; + endcase + + for (int i = 1; i < $size(passthru_hold); ++i) + passthru_hold[i] <= passthru_hold[i - 1]; + + read.op.group <= in.group; + + read.op.b_imm <= in.insn.dst_src.rr.b.imm; + read.op.a_sgpr <= in.insn.dst_src.rr.ra.sgpr; + read.op.b_sgpr <= in.insn.dst_src.rr.b.read.r.sgpr; + read.op.a_vgpr <=
in.insn.dst_src.rr.ra.vgpr.num; + read.op.b_vgpr <= in.insn.dst_src.rr.b.read.r.vgpr.num; + read.op.b_is_imm <= in.insn.dst_src.rr.b_is_imm; + read.op.b_is_const <= in.insn.dst_src.rr.b.read.from_consts; + read.op.scalar_rev <= reg_rev; + + unique case (in.insn.reg_mode) + REGS_SVS, REGS_VVS: begin + read.op.a_scalar <= reg_rev; + read.op.b_scalar <= ~reg_rev; + end + + REGS_SSS: begin + read.op.a_scalar <= 1; + read.op.b_scalar <= 1; + end + + REGS_VVV: begin + read.op.a_scalar <= 0; + read.op.b_scalar <= 0; + end + endcase + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) + for (int i = 0; i < HOLD_DEPTH; ++i) + valid[i] <= 0; + else begin + valid[0] <= in.valid; + + for (int i = 1; i < HOLD_DEPTH; ++i) + valid[i] <= valid[i - 1]; + end + +endmodule + +module gfx_shader_decode_class +import gfx::*; +import gfx_isa::*; +( + input logic clk, + rst_n, + + input front_wave wave, + output front_wave port_wave, + output group_id out_group, + + output shader_dispatch dispatch, + output logic p0_writeback +); + + logic is_fsu, is_mem, is_group, hold_valid, retry; + front_wave hold_wave; + + assign p0_writeback = ~(is_mem | is_fsu | is_group | retry); + + always_comb begin + port_wave = hold_wave; + port_wave.valid = hold_valid; + end + + always_ff @(posedge clk) begin + hold_wave <= wave; + out_group <= port_wave.group; + end + + always_ff @(posedge clk or negedge rst_n) + // Intentionally repetitive + if (~rst_n) begin + is_fsu <= 0; + is_mem <= 0; + is_group <= 0; + + retry <= 0; + hold_valid <= 0; + + dispatch <= '0; + end else begin + is_fsu <= 0; + is_mem <= 0; + is_group <= 0; + + retry <= wave.retry; + hold_valid <= wave.valid; + + unique case (wave.insn.insn_class) + INSN_FPINT: ; // p0 has no ready + INSN_MEM: is_mem <= 1; + INSN_SFU: is_fsu <= 1; + INSN_GROUP: is_group <= 1; + + default: + {is_mem, is_fsu, is_group} <= 'x; + endcase + + dispatch.p1 <= is_mem; + dispatch.p2 <= is_fsu; + dispatch.p3 <= is_group; + + if (~hold_valid | retry)
begin + dispatch.p1 <= 0; + dispatch.p2 <= 0; + dispatch.p3 <= 0; + end + + dispatch.valid <= hold_valid; + end + +endmodule + +module gfx_shader_decode_fpint +import gfx::*; +import gfx_isa::*; +( + input logic clk, + + input insn_word insn, + input logic writeback, + + output fpint_op op +); + + always_ff @(posedge clk) begin + unique case (insn.by_class.fpint.op) + INSN_FPINT_MOV: begin + op.setup_mul_float <= 0; + op.setup_unit_b <= 1; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 1; + op.mnorm_zero_b <= 1; + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 1; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 1; + op.shiftl_copy_flags <= 1; + op.round_copy_flags <= 1; + op.round_enable <= 1; + op.encode_enable <= 1; + end + + INSN_FPINT_FMUL: begin + op.setup_mul_float <= 1; + op.setup_unit_b <= 0; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 0; + op.mnorm_put_mul <= 1; + op.mnorm_zero_flags <= 0; + op.mnorm_zero_b <= 1; + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 1; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 1; + op.shiftl_copy_flags <= 1; + op.round_copy_flags <= 1; + op.round_enable <= 1; + op.encode_enable <= 1; + end + + INSN_FPINT_IMUL: begin + op.setup_mul_float <= 0; + op.setup_unit_b <= 0; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 1; + op.mnorm_zero_b <= 1; + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 1; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 1; + op.shiftl_copy_flags <= 1; + op.round_copy_flags <= 1; + op.round_enable <= 0; + op.encode_enable <= 0; + end + + INSN_FPINT_FADD: begin + 
op.setup_mul_float <= 0; + op.setup_unit_b <= 1; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 0; + op.mnorm_zero_b <= 0; + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 0; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 0; + op.clz_force_nop <= 0; + op.shiftl_copy_flags <= 0; + op.round_copy_flags <= 0; + op.round_enable <= 1; + op.encode_enable <= 1; + end + + INSN_FPINT_FMAX, INSN_FPINT_FMIN: begin + op.setup_mul_float <= 0; + op.setup_unit_b <= 1; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 0; + op.mnorm_zero_b <= 0; + op.minmax_abs <= 0; + op.minmax_swap <= insn.by_class.fpint.op == INSN_FPINT_FMIN; + op.minmax_zero_min <= 1; + op.minmax_copy_flags <= 1; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 1; + op.shiftl_copy_flags <= 1; + op.round_copy_flags <= 1; + op.round_enable <= 0; + op.encode_enable <= 0; + end + + INSN_FPINT_FCVT: begin + op.setup_mul_float <= 0; + op.setup_unit_b <= 1; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 1; + op.mnorm_zero_b <= 1; + + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 0; + op.shiftr_int_signed <= 1; + op.addsub_int_operand <= 1; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 0; + op.shiftl_copy_flags <= 0; + op.round_copy_flags <= 0; + op.round_enable <= 1; + op.encode_enable <= 1; + end + + default: + op <= 'x; + endcase + + op.writeback <= writeback; + end + +endmodule diff --git a/rtl/gfx/gfx_shader_group.sv b/rtl/gfx/gfx_shader_group.sv new file mode 100644 index 0000000..e668877 --- /dev/null +++ b/rtl/gfx/gfx_shader_group.sv @@ -0,0 +1,17 @@ +module gfx_shader_group +import gfx::*; +( + input logic clk, + rst_n, + + input group_op op, + input wave_exec 
wave, + + gfx_regfile_io.ab read_data, + + gfx_shake.rx in_shake, + + gfx_wb.tx wb +); + +endmodule diff --git a/rtl/gfx/gfx_shader_mem.sv b/rtl/gfx/gfx_shader_mem.sv new file mode 100644 index 0000000..403c9e4 --- /dev/null +++ b/rtl/gfx/gfx_shader_mem.sv @@ -0,0 +1,17 @@ +module gfx_shader_mem +import gfx::*; +( + input logic clk, + rst_n, + + input mem_op op, + input wave_exec wave, + + gfx_regfile_io.ab read_data, + + gfx_shake.rx in_shake, + + gfx_wb.tx wb +); + +endmodule diff --git a/rtl/gfx/gfx_shader_regs.sv b/rtl/gfx/gfx_shader_regs.sv new file mode 100644 index 0000000..ef3a129 --- /dev/null +++ b/rtl/gfx/gfx_shader_regs.sv @@ -0,0 +1,302 @@ +module gfx_shader_regs +import gfx::*; +( + input logic clk, + + gfx_regfile_io.regs io +); + + // verilator tracing_off + + localparam PC_TABLE_PORTS = 2; + localparam MASK_TABLE_PORTS = 1; + + word hold_imm[REGFILE_STAGES], imm_out, read_a_data_sgpr, read_b_data_scalar, + read_b_data_sgpr, read_const, read_a_data_vgpr[SHADER_LANES], + read_b_data_vgpr[SHADER_LANES], sgpr_out_a, sgpr_out_b; + + group_id mask_read_groups[MASK_TABLE_PORTS], pc_read_groups[PC_TABLE_PORTS]; + word_ptr pc_read[PC_TABLE_PORTS]; + lane_mask mask_read[MASK_TABLE_PORTS]; + + logic a_scalar_out, b_is_const_out, b_is_imm_out, b_scalar_out, scalar_rev_out; + group_id hold_read_group_1, hold_read_group_2; + sgpr_num hold_read_a_sgpr; + vgpr_num hold_read_a_vgpr_1, hold_read_a_vgpr_2, hold_read_b_vgpr_1, hold_read_b_vgpr_2; + logic[REGFILE_STAGES - 1:0] hold_b_is_imm, hold_b_is_const; + logic[REGFILE_STAGES + 1 - 1:0] hold_scalar_rev; + logic[REGFILE_STAGES + 2 - 1:0] hold_a_scalar, hold_b_scalar; + + assign io.pc_back = pc_read[0]; + assign io.pc_front = pc_read[1]; + assign pc_read_groups[0] = io.pc_back_group; + assign pc_read_groups[1] = io.pc_front_group; + + assign io.mask_back = mask_read[0]; + assign mask_read_groups[0] = io.mask_back_group; // read address of mask_table's single read port + + assign imm_out = hold_imm[$size(hold_imm) - 1]; + assign a_scalar_out =
hold_a_scalar[$bits(hold_a_scalar) - 1]; + assign b_scalar_out = hold_b_scalar[$bits(hold_b_scalar) - 1]; + assign b_is_imm_out = hold_b_is_imm[$bits(hold_b_is_imm) - 1]; + assign b_is_const_out = hold_b_is_const[$bits(hold_b_is_const) - 1]; + assign scalar_rev_out = hold_scalar_rev[$bits(hold_scalar_rev) - 1]; + + gfx_shader_table #(.DATA_WIDTH($bits(word_ptr)), .READ_PORTS(PC_TABLE_PORTS)) pc_table + ( + .clk, + .read(pc_read), + .write(io.pc_wb), + .read_groups(pc_read_groups), + .write_group(io.pc_wb_group), + .write_enable(io.pc_wb_write) + ); + + gfx_shader_table #(.DATA_WIDTH($bits(lane_mask)), .READ_PORTS(MASK_TABLE_PORTS)) mask_table + ( + .clk, + .read(mask_read), + .write(io.mask_wb), + .read_groups(mask_read_groups), + .write_group(io.mask_wb_group), + .write_enable(io.mask_wb_write) + ); + + gfx_shader_consts consts + ( + .clk, + .num(io.op.b_sgpr), + .value(read_const) + ); + + gfx_shader_regfile #($bits(group_id) + $bits(sgpr_num)) sgprs + ( + .clk, + + .read_a_num({hold_read_group_1, hold_read_a_sgpr}), + .read_b_num({io.op.group, io.op.b_sgpr}), + .read_a_data(read_a_data_sgpr), + .read_b_data(read_b_data_sgpr), + + .write(io.sgpr_write.write), + .write_num({io.sgpr_write.group, io.sgpr_write.sgpr}), + .write_data(io.sgpr_write.data) + ); + + generate + for (genvar i = 0; i < SHADER_LANES; ++i) begin: vgprs + gfx_shader_regfile #($bits(group_id) + $bits(vgpr_num)) vgprs + ( + .clk, + + .read_a_num({hold_read_group_2, hold_read_a_vgpr_2}), + .read_b_num({hold_read_group_2, hold_read_b_vgpr_2}), + .read_a_data(read_a_data_vgpr[i]), + .read_b_data(read_b_data_vgpr[i]), + + .write(io.vgpr_write.mask[i]), + .write_num({io.vgpr_write.group, io.vgpr_write.vgpr}), + .write_data(io.vgpr_write.data[i]) + ); + end + endgenerate + + always_ff @(posedge clk) begin + hold_imm[0] <= {{($bits(word) - $bits(io.op.b_imm)){1'b0}}, io.op.b_imm}; + hold_a_scalar[0] <= io.op.a_scalar; + hold_b_scalar[0] <= io.op.b_scalar; + hold_b_is_imm[0] <= io.op.b_is_imm; +
hold_b_is_const[0] <= io.op.b_is_const; + hold_scalar_rev[0] <= io.op.scalar_rev; + + for (int i = 1; i < REGFILE_STAGES; ++i) begin + hold_imm[i] <= hold_imm[i - 1]; + hold_a_scalar[i] <= hold_a_scalar[i - 1]; + hold_b_scalar[i] <= hold_b_scalar[i - 1]; + hold_b_is_imm[i] <= hold_b_is_imm[i - 1]; + hold_b_is_const[i] <= hold_b_is_const[i - 1]; + hold_scalar_rev[i] <= hold_scalar_rev[i - 1]; + end + + for (int i = REGFILE_STAGES; i < REGFILE_STAGES + 2; ++i) begin + hold_a_scalar[i] <= hold_a_scalar[i - 1]; + hold_b_scalar[i] <= hold_b_scalar[i - 1]; + end + + hold_scalar_rev[REGFILE_STAGES] <= hold_scalar_rev[REGFILE_STAGES - 1]; + + hold_read_a_sgpr <= io.op.a_sgpr; + hold_read_group_1 <= io.op.group; + hold_read_group_2 <= hold_read_group_1; + + hold_read_a_vgpr_1 <= io.op.a_vgpr; + hold_read_a_vgpr_2 <= hold_read_a_vgpr_1; + + hold_read_b_vgpr_1 <= io.op.b_vgpr; + hold_read_b_vgpr_2 <= hold_read_b_vgpr_1; + + if (b_is_imm_out) + read_b_data_scalar <= imm_out; + else if (b_is_const_out) + read_b_data_scalar <= read_const; + else + read_b_data_scalar <= read_b_data_sgpr; + + if (scalar_rev_out) begin + sgpr_out_a <= read_b_data_scalar; + sgpr_out_b <= read_a_data_sgpr; + end else begin + sgpr_out_a <= read_a_data_sgpr; + sgpr_out_b <= read_b_data_scalar; + end + + for (int i = 0; i < SHADER_LANES; ++i) begin + io.a[i] <= a_scalar_out ? sgpr_out_a : read_a_data_vgpr[i]; + io.b[i] <= b_scalar_out ?
sgpr_out_b : read_b_data_vgpr[i]; // operand B: scalar path or VGPR read port B + end + end + +endmodule + +module gfx_shader_consts +import gfx::*; +( + input logic clk, + + input sgpr_num num, + output word value +); + + word hold_out, rom[1 << $bits(sgpr_num)]; + sgpr_num hold_in; + + always_ff @(posedge clk) begin + value <= hold_out; + hold_in <= num; + hold_out <= rom[hold_in]; + end + + initial begin + rom[0] = 'hffff_ffff; // -1 + rom[1] = 'h7fff_ffff; // 2^31 - 1, useful for fp abs + rom[2] = 'h8000_0000; // 2^31, useful for fp neg + rom[3] = 'h3f80_0000; // +1.0 + rom[4] = 'hbf80_0000; // -1.0 + end + +endmodule + +module gfx_shader_regfile +import gfx::*; +#(int DEPTH_LOG = 0) +( + input logic clk, + + input logic[DEPTH_LOG - 1:0] read_a_num, + read_b_num, + output word read_a_data, + read_b_data, + + input logic write, + input logic[DEPTH_LOG - 1:0] write_num, + input word write_data +); + + gfx_shader_regfile_port #(DEPTH_LOG) a + ( + .clk, + .write, + .read_num(read_a_num), + .read_data(read_a_data), + .write_num, + .write_data + ); + + gfx_shader_regfile_port #(DEPTH_LOG) b + ( + .clk, + .write, + .read_num(read_b_num), + .read_data(read_b_data), + .write_num, + .write_data + ); + +endmodule + +module gfx_shader_regfile_port +import gfx::*; +#(int DEPTH_LOG = 0) +( + input logic clk, + + input logic[DEPTH_LOG - 1:0] read_num, + output word read_data, + + input logic write, + input logic[DEPTH_LOG - 1:0] write_num, + input word write_data +); + + word file[1 << DEPTH_LOG], hold_read_data, hold_write_data; + logic hold_write; + logic[DEPTH_LOG - 1:0] hold_read_num, hold_write_num; + + // hold_write does not need rst_n because any initial write is harmless + + always_ff @(posedge clk) begin + hold_write <= write; + hold_read_num <= read_num; + hold_write_num <= write_num; + hold_write_data <= write_data; + + hold_read_data <= file[hold_read_num]; + if (hold_write) + file[hold_write_num] <= hold_write_data; + + read_data <= hold_read_data; + end + +endmodule + +module gfx_shader_table
+import gfx::*; +#(int DATA_WIDTH = 0, + int READ_PORTS = 0) +( + input logic clk, + + input group_id write_group, + read_groups[READ_PORTS], + + input logic[DATA_WIDTH - 1:0] write, + input logic write_enable, + + output logic[DATA_WIDTH - 1:0] read[READ_PORTS] +); + + genvar i; + + generate + for (i = 0; i < READ_PORTS; ++i) begin: ports + logic write_enable_hold; + group_id read_group_hold, write_group_hold; + logic[DATA_WIDTH - 1:0] data[1 << $bits(group_id)], read_hold, write_hold; + + always_ff @(posedge clk) begin + write_hold <= write; + read_group_hold <= read_groups[i]; + write_group_hold <= write_group; + write_enable_hold <= write_enable; + + read_hold <= data[read_group_hold]; + + if (write_enable_hold) + data[write_group_hold] <= write_hold; + + read[i] <= read_hold; + end + end + endgenerate + +endmodule diff --git a/rtl/gfx/gfx_shader_schedif.rdl b/rtl/gfx/gfx_shader_schedif.rdl new file mode 100644 index 0000000..c846da9 --- /dev/null +++ b/rtl/gfx/gfx_shader_schedif.rdl @@ -0,0 +1,91 @@ +addrmap gfx_shader_schedif { + name = "Scheduler<->core interface"; + + default hw = r; + default sw = w; + default regwidth = 32; + + reg { + name = "Shader core control register"; + + field { + desc = "Set this field to flush the instruction cache"; + + singlepulse; + } IFLUSH[0:0] = 0; + } CORE @ 0x00; + + reg { + name = "Wavefront setup control register"; + + default hw = na; + default sw = r; + default precedence = hw; + + field { + desc = "Wavefront group number"; + + hw = r; + sw = rw; + } GROUP[5:0]; + + field { + desc = "Destination SGPR number"; + + hw = r; + sw = rw; + } XGPR[11:8]; + + field { + desc = "PC table update done, group submitted"; + + rclr; + hwset; + } SUBMIT_DONE[16:16] = 0; + + field { + desc = "General-purpose register update done"; + + rclr; + hwset; + } GPR_DONE[17:17] = 0; + + field { + desc = "Lane mask update done"; + + rclr; + hwset; + } MASK_DONE[18:18] = 0; + } SETUP_CTRL @ 0x04; + + reg { + name = "SGPR/VGPR write register"; + 
+ field { + desc = "Value to write"; + + swmod; + } VALUE[31:0]; + } SETUP_GPR @ 0x08; + + reg { + name = "Lane mask write register"; + + field { + desc = "Mask value to write"; + + swmod; + } MASK[15:0]; + } SETUP_MASK @ 0x0c; + + reg { + name = "Group submit register"; + + field { + desc = "Initial group program counter, submits group on write"; + + swmod; + } PC[31:2]; + } SETUP_SUBMIT @ 0x10; +}; + diff --git a/rtl/gfx/gfx_shader_setup.sv b/rtl/gfx/gfx_shader_setup.sv new file mode 100644 index 0000000..f46fb66 --- /dev/null +++ b/rtl/gfx/gfx_shader_setup.sv @@ -0,0 +1,37 @@ +interface gfx_shader_setup +import gfx::*;; + + struct + { + group_id group; + word_ptr pc; + xgpr_num gpr; + word gpr_value; + lane_mask mask; + logic pc_set, + gpr_set, + mask_set; + } write; + + struct + { + logic gpr, + mask, + submit; + } set_done; + + modport core + ( + input write, + + output set_done + ); + + modport sched + ( + input set_done, + + output write + ); + +endinterface diff --git a/rtl/gfx/gfx_shader_sfu.sv b/rtl/gfx/gfx_shader_sfu.sv new file mode 100644 index 0000000..d65e522 --- /dev/null +++ b/rtl/gfx/gfx_shader_sfu.sv @@ -0,0 +1,17 @@ +module gfx_shader_sfu +import gfx::*; +( + input logic clk, + rst_n, + + input sfu_op op, + input wave_exec wave, + + gfx_regfile_io.ab read_data, + + gfx_shake.rx in_shake, + + gfx_wb.tx wb +); + +endmodule diff --git a/rtl/gfx/gfx_shake.sv b/rtl/gfx/gfx_shake.sv new file mode 100644 index 0000000..baae0c3 --- /dev/null +++ b/rtl/gfx/gfx_shake.sv @@ -0,0 +1,24 @@ +interface gfx_shake; + + logic ready; + logic valid; + + modport tx + ( + input ready, + output valid + ); + + modport rx + ( + input valid, + output ready + ); + + modport peek + ( + input ready, + valid + ); + +endinterface diff --git a/rtl/gfx/gfx_sim_debug.sv b/rtl/gfx/gfx_sim_debug.sv new file mode 100644 index 0000000..4b4622a --- /dev/null +++ b/rtl/gfx/gfx_sim_debug.sv @@ -0,0 +1,50 @@ +module gfx_sim_debug +import gfx::*; +( + input logic clk, + rst_n, + + 
gfx_axil.s axis +); + + enum int unsigned + { + INPUT, + STALL + } state; + + assign axis.rvalid = 0; + assign axis.arready = 0; + assign axis.awready = 1; + + always_comb + unique case (state) + INPUT: begin + axis.wready = 1; + axis.bvalid = axis.wvalid; + end + + STALL: begin + axis.wready = 0; + axis.bvalid = 1; + end + endcase + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) + state <= INPUT; + else + unique case (state) + INPUT: + if (axis.wvalid) begin + $display("%c", axis.wdata[7:0]); + if (~axis.bready) + state <= STALL; + end + + STALL: + if (axis.bready) + state <= INPUT; + endcase + +endmodule diff --git a/rtl/gfx/gfx_skid_buf.sv b/rtl/gfx/gfx_skid_buf.sv new file mode 100644 index 0000000..e3e5247 --- /dev/null +++ b/rtl/gfx/gfx_skid_buf.sv @@ -0,0 +1,20 @@ +module gfx_skid_buf +#(int WIDTH = 0) +( + input logic clk, + + input logic[WIDTH - 1:0] in, + input logic stall, + + output logic[WIDTH - 1:0] out +); + + logic[WIDTH - 1:0] skid; + + assign out = stall ? skid : in; + + always_ff @(posedge clk) + if (~stall) + skid <= in; + +endmodule diff --git a/rtl/gfx/gfx_skid_flow.sv b/rtl/gfx/gfx_skid_flow.sv new file mode 100644 index 0000000..7890ae3 --- /dev/null +++ b/rtl/gfx/gfx_skid_flow.sv @@ -0,0 +1,31 @@ +module gfx_skid_flow +( + input logic clk, + rst_n, + + input logic in_valid, + out_ready, + + output logic in_ready, + out_valid, + stall +); + + logic was_ready, was_valid; + + assign stall = ~in_ready; + assign in_ready = was_ready | ~was_valid; + assign out_valid = in_valid | stall; + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + was_ready <= 0; + was_valid <= 0; + end else begin + was_ready <= out_ready; + + if (~stall) + was_valid <= in_valid; + end + +endmodule diff --git a/rtl/gfx/gfx_wb.sv b/rtl/gfx/gfx_wb.sv new file mode 100644 index 0000000..20c7c64 --- /dev/null +++ b/rtl/gfx/gfx_wb.sv @@ -0,0 +1,51 @@ +interface gfx_wb; + + import gfx::*; + + word lanes[SHADER_LANES]; + logic mask_update, pc_inc, 
pc_update, ready, scalar, valid, writeback; + group_id group; + xgpr_num dest; + lane_mask mask; + pc_offset pc_add; + + modport tx + ( + input ready, + + output dest, + group, + lanes, + valid, + scalar, + writeback, + + mask, + mask_update, + + pc_add, + pc_inc, + pc_update + ); + + modport rx + ( + input dest, + group, + lanes, + valid, + scalar, + writeback, + + mask, + mask_update, + + pc_add, + pc_inc, + pc_update, + + output ready + ); + + +endinterface diff --git a/rtl/gfx/gfx_xbar_sched.sv b/rtl/gfx/gfx_xbar_sched.sv new file mode 100644 index 0000000..95e4afb --- /dev/null +++ b/rtl/gfx/gfx_xbar_sched.sv @@ -0,0 +1,146 @@ +module gfx_xbar_sched +import gfx::*; +( + input logic clk, + srst_n, + + gfx_axil.s sched, + + gfx_axil.m debug, + gfx_axil.m bootrom, + gfx_axil.m shader_0 +); + + localparam word BOOTROM_MASK = 32'hfff0_0000; + localparam word DEBUG_BASE = 32'h0020_0000; + localparam word DEBUG_MASK = 32'hfff0_0000; + localparam word SHADER_0_BASE = 32'h0100_0000; + localparam word SHADER_0_MASK = 32'hfff0_0000; + + defparam xbar.NM = 1; + defparam xbar.NS = 3; + defparam xbar.OPT_LOWPOWER = 0; + + defparam xbar.SLAVE_ADDR = { + SHADER_0_BASE, + DEBUG_BASE, + BOOTROM_BASE + }; + + defparam xbar.SLAVE_MASK = { + SHADER_0_MASK, + DEBUG_MASK, + BOOTROM_MASK + }; + + axilxbar xbar + ( + .S_AXI_ACLK(clk), + .S_AXI_ARESETN(srst_n), + + .S_AXI_AWVALID(sched.awvalid), + .S_AXI_AWREADY(sched.awready), + .S_AXI_AWADDR(sched.awaddr), + .S_AXI_AWPROT('0), + + .S_AXI_WVALID(sched.wvalid), + .S_AXI_WREADY(sched.wready), + .S_AXI_WDATA(sched.wdata), + .S_AXI_WSTRB('1), + + .S_AXI_BVALID(sched.bvalid), + .S_AXI_BREADY(sched.bready), + .S_AXI_BRESP(), + + .S_AXI_ARVALID(sched.arvalid), + .S_AXI_ARREADY(sched.arready), + .S_AXI_ARADDR(sched.araddr), + .S_AXI_ARPROT('0), + + .S_AXI_RVALID(sched.rvalid), + .S_AXI_RREADY(sched.rready), + .S_AXI_RDATA(sched.rdata), + .S_AXI_RRESP(), + + .M_AXI_AWADDR({ + shader_0.awaddr, + debug.awaddr, + bootrom.awaddr + }), + 
.M_AXI_AWPROT(), + .M_AXI_AWVALID({ + shader_0.awvalid, + debug.awvalid, + bootrom.awvalid + }), + .M_AXI_AWREADY({ + shader_0.awready, + debug.awready, + bootrom.awready + }), + + .M_AXI_WDATA({ + shader_0.wdata, + debug.wdata, + bootrom.wdata + }), + .M_AXI_WSTRB(), + .M_AXI_WVALID({ + shader_0.wvalid, + debug.wvalid, + bootrom.wvalid + }), + .M_AXI_WREADY({ + shader_0.wready, + debug.wready, + bootrom.wready + }), + + .M_AXI_BRESP('0), + .M_AXI_BVALID({ + shader_0.bvalid, + debug.bvalid, + bootrom.bvalid + }), + .M_AXI_BREADY({ + shader_0.bready, + debug.bready, + bootrom.bready + }), + + .M_AXI_ARADDR({ + shader_0.araddr, + debug.araddr, + bootrom.araddr + }), + .M_AXI_ARPROT(), + .M_AXI_ARVALID({ + shader_0.arvalid, + debug.arvalid, + bootrom.arvalid + }), + .M_AXI_ARREADY({ + shader_0.arready, + debug.arready, + bootrom.arready + }), + + .M_AXI_RDATA({ + shader_0.rdata, + debug.rdata, + bootrom.rdata + }), + .M_AXI_RRESP('0), + .M_AXI_RVALID({ + shader_0.rvalid, + debug.rvalid, + bootrom.rvalid + }), + .M_AXI_RREADY({ + shader_0.rready, + debug.rready, + bootrom.rready + }) + ); + +endmodule diff --git a/rtl/gfx/mod.mk b/rtl/gfx/mod.mk new file mode 100644 index 0000000..7525276 --- /dev/null +++ b/rtl/gfx/mod.mk @@ -0,0 +1,18 @@ +cores := gfx_shader_schedif + +define core + $(this)/deps := axixbar gfx_shader_schedif picorv32 + + $(this)/rtl_top := gfx_top + $(this)/rtl_dirs := . 
+ $(this)/rtl_files := gfx_isa.sv gfx_pkg.sv +endef + +define core/gfx_shader_schedif + $(this)/hooks := regblock + + $(this)/regblock_rdl := gfx_shader_schedif.rdl + $(this)/regblock_top := gfx_shader_schedif + $(this)/regblock_args := --default-reset arst_n + $(this)/regblock_cpuif := axi4-lite +endef diff --git a/rtl/mod.mk b/rtl/mod.mk index 081d3a3..2fb0ffa 100644 --- a/rtl/mod.mk +++ b/rtl/mod.mk @@ -1,5 +1,5 @@ cores := config debounce intc -subdirs := cache core dma_axi32 fpu legacy_gfx perf picorv32 pkt_switch smp top wb2axip +subdirs := cache core dma_axi32 fpu gfx legacy_gfx perf picorv32 pkt_switch smp top wb2axip define core/config $(this)/rtl_include_dirs := . -- cgit v1.2.3