diff options
Diffstat (limited to 'rtl/gfx')
36 files changed, 5133 insertions, 0 deletions
diff --git a/rtl/gfx/gfx_axib.sv b/rtl/gfx/gfx_axib.sv new file mode 100644 index 0000000..7b3cbdc --- /dev/null +++ b/rtl/gfx/gfx_axib.sv @@ -0,0 +1,81 @@ +// AXI4 con burst +interface gfx_axib; + + import gfx::word; + + logic awvalid, + awready; + logic[7:0] awlen; + logic[1:0] awburst; + word awaddr; + + logic wlast; + logic wvalid; + logic wready; + word wdata; + + logic bvalid; + logic bready; + + logic arvalid, + arready; + logic[7:0] arlen; + logic[1:0] arburst; + word araddr; + + logic rlast; + logic rvalid; + logic rready; + word rdata; + + modport m + ( + input awready, + wready, + bvalid, + arready, + rlast, + rvalid, + rdata, + + output awlen, + awburst, + awvalid, + awaddr, + wlast, + wvalid, + wdata, + bready, + arlen, + arburst, + arvalid, + araddr, + rready + ); + + modport s + ( + input awlen, + awburst, + awvalid, + awaddr, + wlast, + wvalid, + wdata, + bready, + arlen, + arburst, + arvalid, + araddr, + rready, + + output awready, + wready, + bvalid, + arready, + rlast, + rvalid, + rdata + ); + +endinterface diff --git a/rtl/gfx/gfx_axil.sv b/rtl/gfx/gfx_axil.sv new file mode 100644 index 0000000..c254e26 --- /dev/null +++ b/rtl/gfx/gfx_axil.sv @@ -0,0 +1,61 @@ +// AXI4-Lite, sin wstrb ni axprot +interface gfx_axil; + import gfx::*; + + logic awvalid; + logic awready; + word awaddr; + + logic wvalid; + logic wready; + word wdata; + + logic bvalid; + logic bready; + + logic arvalid; + logic arready; + word araddr; + + logic rvalid; + logic rready; + word rdata; + + modport m + ( + input awready, + wready, + bvalid, + arready, + rvalid, + rdata, + + output awvalid, + awaddr, + wvalid, + wdata, + bready, + arvalid, + araddr, + rready + ); + + modport s + ( + input awvalid, + awaddr, + wvalid, + wdata, + bready, + arvalid, + araddr, + rready, + + output awready, + wready, + bvalid, + arready, + rvalid, + rdata + ); +endinterface diff --git a/rtl/gfx/gfx_axil2regblock.sv b/rtl/gfx/gfx_axil2regblock.sv new file mode 100644 index 0000000..2449b05 --- 
/dev/null +++ b/rtl/gfx/gfx_axil2regblock.sv @@ -0,0 +1,30 @@ +module gfx_axil2regblock +( + gfx_axil.s axis, + axi4lite_intf.master axim +); + + assign axis.rdata = axim.RDATA; + assign axis.rvalid = axim.RVALID; + assign axis.bvalid = axim.BVALID; + assign axis.wready = axim.WREADY; + assign axis.arready = axim.ARREADY; + assign axis.awready = axim.AWREADY; + + assign axim.AWVALID = axis.awvalid; + assign axim.AWADDR = axis.awaddr[$bits(axim.AWADDR) - 1:0]; + assign axim.AWPROT = '0; + + assign axim.WVALID = axis.wvalid; + assign axim.WDATA = axis.wdata; + assign axim.WSTRB = '1; + + assign axim.BREADY = axis.bready; + + assign axim.ARVALID = axis.arvalid; + assign axim.ARADDR = axis.araddr[$bits(axim.ARADDR) - 1:0]; + assign axim.ARPROT = '0; + + assign axim.RREADY = axis.rready; + +endmodule diff --git a/rtl/gfx/gfx_beats.sv b/rtl/gfx/gfx_beats.sv new file mode 100644 index 0000000..fcbb091 --- /dev/null +++ b/rtl/gfx/gfx_beats.sv @@ -0,0 +1,29 @@ +interface gfx_beats +#(int WIDTH = $bits(gfx::word)); + + logic[WIDTH - 1:0] data; + logic ready; + logic valid; + + modport tx + ( + input ready, + output data, + valid + ); + + modport rx + ( + input data, + valid, + output ready + ); + + modport peek + ( + input data, + ready, + valid + ); + +endinterface diff --git a/rtl/gfx/gfx_bootrom.sv b/rtl/gfx/gfx_bootrom.sv new file mode 100644 index 0000000..2c4581e --- /dev/null +++ b/rtl/gfx/gfx_bootrom.sv @@ -0,0 +1,66 @@ +module gfx_bootrom +import gfx::*; +( + input logic clk, + rst_n, + + gfx_axil.s axis +); + + localparam ROM_WORDS_LOG = 8; + + enum int unsigned + { + WAIT, + READ, + RDATA, + READY + } state; + + word read, rom[1 << ROM_WORDS_LOG]; + logic[ROM_WORDS_LOG - 1:0] read_addr; + + assign axis.bvalid = 0; + assign axis.wready = 0; + assign axis.awready = 0; + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + state <= WAIT; + axis.rvalid <= 0; + axis.arready <= 0; + end else begin + axis.arready <= 0; + + unique case (state) + WAIT: + if 
(axis.arvalid & ~axis.arready) + state <= READ; + + READ: + state <= RDATA; + + RDATA: begin + state <= READY; + axis.rvalid <= 1; + end + + READY: + if (axis.rready) begin + state <= WAIT; + axis.rvalid <= 0; + axis.arready <= 1; + end + endcase + end + + always_ff @(posedge clk) begin + read <= rom[read_addr]; + read_addr <= axis.araddr[$bits(read_addr) + SUBWORD_BITS - 1:SUBWORD_BITS]; + axis.rdata <= read; + end + + initial + $readmemh("gfx_bootrom.hex", rom); + +endmodule diff --git a/rtl/gfx/gfx_clz.sv b/rtl/gfx/gfx_clz.sv new file mode 100644 index 0000000..8d6f100 --- /dev/null +++ b/rtl/gfx/gfx_clz.sv @@ -0,0 +1,68 @@ +/* Implementación en árbol de count leading zeros (CLZ). + * WIDTH debe ser una potencia de 2. + */ +module gfx_clz +#(int WIDTH = 0) +( + input logic clk, + + input logic[WIDTH - 1:0] value, + output logic[$clog2(WIDTH):0] clz +); + + genvar i; + generate + if (WIDTH <= 1) begin + always_ff @(posedge clk) + clz <= !value; + end else if (WIDTH == 2) begin + always_ff @(posedge clk) + unique case (value) + 2'b00: clz <= 2'b10; + 2'b01: clz <= 2'b01; + 2'b10: clz <= 2'b00; + 2'b11: clz <= 2'b00; + endcase + end else if (WIDTH == 4) begin + // Eficiente en FPGAs con 4-LUTs + always_ff @(posedge clk) + if (value[3]) + clz <= 3'b000; + else if (value[2]) + clz <= 3'b001; + else if (value[1]) + clz <= 3'b010; + else if (value[0]) + clz <= 3'b011; + else + clz <= 3'b100; + end else begin + logic msb_right; + logic[$clog2(WIDTH) - 1:0] clz_left, clz_right; + logic[$clog2(WIDTH) - 2:0] tail_right; + + assign {msb_right, tail_right} = clz_right; + + gfx_clz #(WIDTH / 2) left + ( + .clk(clk), + .clz(clz_left), + .value(value[WIDTH - 1:WIDTH / 2]) + ); + + gfx_clz #(WIDTH / 2) right + ( + .clk(clk), + .clz(clz_right), + .value(value[WIDTH / 2 - 1:0]) + ); + + always_ff @(posedge clk) + if (clz_left[$clog2(WIDTH) - 1]) + clz <= {msb_right, ~msb_right, tail_right}; + else + clz <= {1'b0, clz_left}; + end + endgenerate + +endmodule diff --git 
a/rtl/gfx/gfx_ctz.sv b/rtl/gfx/gfx_ctz.sv new file mode 100644 index 0000000..2713f8a --- /dev/null +++ b/rtl/gfx/gfx_ctz.sv @@ -0,0 +1,18 @@ +// Count trailing zeros (ctz), clz al revés +module gfx_ctz +#(int WIDTH = 0) +( + input logic clk, + + input logic[WIDTH - 1:0] value, + output logic[$clog2(WIDTH):0] ctz +); + + gfx_clz #(WIDTH) clz + ( + .clk, + .value({<<{value}}), + .clz(ctz) + ); + +endmodule diff --git a/rtl/gfx/gfx_fifo.sv b/rtl/gfx/gfx_fifo.sv new file mode 100644 index 0000000..7174e4d --- /dev/null +++ b/rtl/gfx/gfx_fifo.sv @@ -0,0 +1,102 @@ +module gfx_fifo +#(int WIDTH = 0, + int DEPTH = 0) +( + input logic clk, + rst_n, + + gfx_beats.rx in, + gfx_beats.tx out +); + + logic do_read, do_write, full_if_eq, in_stall, out_stall, + may_read, may_write, read, read_ok, write; + + logic[WIDTH - 1:0] fifo[DEPTH], read_data, write_data; + logic[$clog2(DEPTH) - 1:0] read_ptr, write_ptr; + + assign do_read = read & may_read; + assign do_write = write & may_write; + + always_comb begin + may_read = full_if_eq; + may_write = !full_if_eq; + + if (read) + may_write = 1; + + if (read_ptr != write_ptr) begin + may_read = 1; + may_write = 1; + end + end + + gfx_skid_flow in_flow + ( + .clk, + .rst_n, + .stall(in_stall), + .in_ready(in.ready), + .in_valid(in.valid), + .out_ready(may_write), + .out_valid(write) + ); + + gfx_skid_flow out_flow + ( + .clk, + .rst_n, + .stall(out_stall), + .in_ready(read), + .in_valid(read_ok), + .out_ready(out.ready), + .out_valid(out.valid) + ); + + gfx_skid_buf #(WIDTH) in_skid + ( + .clk, + .in(in.data), + .out(write_data), + .stall(in_stall) + ); + + gfx_skid_buf #(WIDTH) out_skid + ( + .clk, + .in(read_data), + .out(out.data), + .stall(out_stall) + ); + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + read_ok <= 0; + read_ptr <= 0; + write_ptr <= 0; + full_if_eq <= 0; + end else begin + if (~out_stall) + read_ok <= read && may_read; + + if (do_read) + read_ptr <= read_ptr + 1; + + if (do_write) + write_ptr <= 
write_ptr + 1; + + if (do_read & ~do_write) + full_if_eq <= 0; + else if (~do_read & do_write) + full_if_eq <= 1; + end + + always_ff @(posedge clk) begin + if (~out_stall) + read_data <= fifo[read_ptr]; + + if (may_write) + fifo[write_ptr] <= write_data; + end + +endmodule diff --git a/rtl/gfx/gfx_fixed_dotadd.sv b/rtl/gfx/gfx_fixed_dotadd.sv new file mode 100644 index 0000000..fdd5ffd --- /dev/null +++ b/rtl/gfx/gfx_fixed_dotadd.sv @@ -0,0 +1,55 @@ +module gfx_fixed_dotadd +( + input logic clk, + + input gfx::fixed a0, + b0, + a1, + b1, + c, + input logic stall, + + output gfx::fixed q +); + + import gfx::*; + + fixed q0, a1_hold, b1_hold; + + gfx_fixed_muladd muladd_0 + ( + .clk, + .a(a0), + .b(b0), + .c, + .q(q0), + .stall + ); + + gfx_pipes #(.WIDTH($bits(fixed)), .DEPTH(FIXED_MULADD_DEPTH)) a_pipes + ( + .clk, + .in(a1), + .out(a1_hold), + .stall + ); + + gfx_pipes #(.WIDTH($bits(fixed)), .DEPTH(FIXED_MULADD_DEPTH)) b_pipes + ( + .clk, + .in(b1), + .out(b1_hold), + .stall + ); + + gfx_fixed_muladd muladd_1 + ( + .clk, + .a(a1_hold), + .b(b1_hold), + .c(q0), + .q, + .stall + ); + +endmodule diff --git a/rtl/gfx/gfx_fixed_muladd.sv b/rtl/gfx/gfx_fixed_muladd.sv new file mode 100644 index 0000000..22b7247 --- /dev/null +++ b/rtl/gfx/gfx_fixed_muladd.sv @@ -0,0 +1,77 @@ +module gfx_fixed_muladd +( + input logic clk, + + input gfx::fixed a, + b, + c, + input logic stall, + + output gfx::fixed q +); + + import gfx::*; + +`ifndef VERILATOR + logic[2 * $bits(fixed) - $bits(fixed_frac) - 1:0] q_ext; + + assign q = q_ext[$bits(fixed) - 1:0]; + + lpm_mult mult + ( + .aclr(0), + .clock(clk), + .clken(!stall), + + .sum({c, {`FIXED_FRAC{1'b0}}}), + .dataa(a), + .datab(b), + .result(q_ext) + ); + + defparam + mult.lpm_widtha = $bits(fixed), + mult.lpm_widthb = $bits(fixed), + mult.lpm_widths = $bits(fixed) + $bits(fixed_frac), + /* Esto es crucial. No está documentado en ningún lado (aparte de un + * comentario en r/fpga). 
Si lpm_widthp < lpm_widtha + lpm_widthb, + * entonces result contiene los lpm_widthp bits más significativos + * del producto, no los menos significativos como tendría sentido. + */ + mult.lpm_widthp = 2 * $bits(fixed) - $bits(fixed_frac), + mult.lpm_representation = "SIGNED", + mult.lpm_pipeline = FIXED_MULADD_DEPTH; +`else + logic[$bits(fixed) + $bits(fixed_frac) - 1:0] q_ext; + + fixed a_hold, b_hold, c_hold; + + assign q = q_ext[$bits(fixed) + $bits(fixed_frac) - 1:$bits(fixed_frac)] + c_hold; + assign q_ext = a_hold * b_hold; + + gfx_pipes #(.WIDTH($bits(a)), .DEPTH(FIXED_MULADD_DEPTH)) a_pipes + ( + .clk, + .in(a), + .out(a_hold), + .stall + ); + + gfx_pipes #(.WIDTH($bits(b)), .DEPTH(FIXED_MULADD_DEPTH)) b_pipes + ( + .clk, + .in(b), + .out(b_hold), + .stall + ); + + gfx_pipes #(.WIDTH($bits(c)), .DEPTH(FIXED_MULADD_DEPTH)) c_pipes + ( + .clk, + .in(c), + .out(c_hold), + .stall + ); +`endif + +endmodule diff --git a/rtl/gfx/gfx_front_back.sv b/rtl/gfx/gfx_front_back.sv new file mode 100644 index 0000000..b768532 --- /dev/null +++ b/rtl/gfx/gfx_front_back.sv @@ -0,0 +1,37 @@ +interface gfx_front_back +import gfx::*;; + + struct + { + wave_exec wave; + fpint_op p0; + mem_op p1; + sfu_op p2; + group_op p3; + } execute; + + struct + { + logic valid; + group_id group; + } loop; + + shader_dispatch dispatch; + + modport front + ( + input loop, + + output execute, + dispatch + ); + + modport back + ( + input execute, + dispatch, + + output loop + ); + +endinterface diff --git a/rtl/gfx/gfx_isa.sv b/rtl/gfx/gfx_isa.sv new file mode 100644 index 0000000..7239478 --- /dev/null +++ b/rtl/gfx/gfx_isa.sv @@ -0,0 +1,84 @@ +package gfx_isa; + + typedef logic[3:0] sgpr_num; + typedef logic[2:0] vgpr_num; + + typedef logic signed[7:0] pc_offset; + + typedef union packed + { + sgpr_num sgpr; + + struct packed + { + logic[$bits(sgpr_num) - $bits(vgpr_num) - 1:0] reserved; + vgpr_num num; + } vgpr; + } xgpr_num; + + typedef struct packed + { + enum logic[1:0] + { + REGS_SVS = 
2'b00, + REGS_SSS = 2'b01, + REGS_VVS = 2'b10, + REGS_VVV = 2'b11 + } reg_mode; + + union packed + { + struct packed + { + logic b_is_imm; + + union packed + { + logic[12:0] imm; + + struct packed + { + logic from_consts; + logic[7:0] reserved; + xgpr_num r; + } read; + } b; + + xgpr_num ra, + rd; + } rr; + } dst_src; + + logic reg_rev; + + union packed + { + struct packed + { + enum logic[4:0] + { + INSN_FPINT_MOV = 0, + INSN_FPINT_FMUL = 1, + INSN_FPINT_IMUL = 2, + INSN_FPINT_FADD = 3, + INSN_FPINT_RES4 = 4, + INSN_FPINT_FMAX = 5, + INSN_FPINT_RES6 = 6, + INSN_FPINT_FMIN = 7, + INSN_FPINT_RES8 = 8, + INSN_FPINT_FCVT = 9, + INSN_FPINT_RES[10:31] + } op; + } fpint; + } by_class; + + enum logic[1:0] + { + INSN_FPINT = 0, + INSN_MEM = 1, + INSN_SFU = 2, + INSN_GROUP = 3 + } insn_class; + } insn_word; + +endpackage diff --git a/rtl/gfx/gfx_pipes.sv b/rtl/gfx/gfx_pipes.sv new file mode 100644 index 0000000..2fa875a --- /dev/null +++ b/rtl/gfx/gfx_pipes.sv @@ -0,0 +1,24 @@ +module gfx_pipes +#(int WIDTH=0, int DEPTH=0) +( + input logic clk, + + input logic[WIDTH - 1:0] in, + input logic stall, + + output logic[WIDTH - 1:0] out +); + + logic[WIDTH - 1:0] pipes[DEPTH]; + + assign out = pipes[DEPTH - 1]; + + always_ff @(posedge clk) + if (~stall) begin + pipes[0] <= in; + + for (integer i = 1; i < DEPTH; ++i) + pipes[i] <= pipes[i - 1]; + end + +endmodule diff --git a/rtl/gfx/gfx_pkg.sv b/rtl/gfx/gfx_pkg.sv new file mode 100644 index 0000000..7072967 --- /dev/null +++ b/rtl/gfx/gfx_pkg.sv @@ -0,0 +1,271 @@ +package gfx; + + typedef logic[31:0] word; + + typedef word uword; + typedef logic signed[$bits(word) - 1:0] sword; + typedef logic[$bits(word) / 2 - 1:0] uhword; + typedef logic signed[$bits(word) / 2 - 1:0] shword; + typedef logic[2 * $bits(word) - 1:0] udword; + typedef logic signed[2 * $bits(word) - 1:0] sdword; + typedef logic signed[4 * $bits(word) - 1:0] qword; + typedef logic signed[8 * $bits(word) - 1:0] oword; + + localparam int SUBWORD_BITS = 
$clog2($bits(word)) - $clog2($bits(byte)); + localparam int BYTES_PER_WORD = 1 << SUBWORD_BITS; + + typedef logic[$bits(word) - SUBWORD_BITS - 1:0] word_ptr; + typedef logic[$bits(word_ptr) - 1 - 1:0] dword_ptr; + typedef logic[$bits(word_ptr) - 2 - 1:0] qword_ptr; + typedef logic[$bits(word_ptr) - 3 - 1:0] oword_ptr; + + typedef logic[7:0] float_exp; + typedef logic[$bits(word) - $bits(float_exp) - 2:0] float_mant; + typedef logic[$bits(float_mant):0] float_mant_full; // Incluye '1.' explícito + typedef logic[$bits(float_mant_full) + 1:0] float_mant_ext; // Considera overflow + + localparam float_exp FLOAT_EXP_BIAS = (1 << ($bits(float_exp) - 1)) - 1; + localparam float_exp FLOAT_EXP_MAX = {($bits(float_exp)){1'b1}}; + + function float_mant_full full_mant(float_mant in); + full_mant = {1'b1, in}; + endfunction + + function float_mant implicit_mant(float_mant_full in); + assert (in[$bits(in) - 1]); + implicit_mant = in[$bits(in) - 2:0]; + endfunction + + typedef struct packed + { + logic sign; + float_exp exp; + float_mant mant; + } float; + + /* Explicación de guard, round, sticky: + * https://drilian.com/2023/01/10/floating-point-numbers-and-rounding/ + */ + typedef struct packed + { + float normal; + logic slow, + zero, + guard, + round, + sticky; + } float_round; + + typedef struct packed + { + logic exp_max, + exp_min, + mant_zero; + } float_class; + + function float_class classify_float(float in); + classify_float.exp_max = &in.exp; + classify_float.exp_min = ~|in.exp; + classify_float.mant_zero = ~|in.mant; + endfunction + + function logic is_float_special(float_class in); + is_float_special = in.exp_max | (in.exp_min & ~in.mant_zero); + endfunction + + function float_mant_ext float_prepare_round(float in, float_class in_class); + float_prepare_round = {~in_class.exp_min, in.mant, 2'b00}; + endfunction + + typedef struct packed + { + logic setup_mul_float, + setup_unit_b, + mnorm_put_hi, + mnorm_put_lo, + mnorm_put_mul, + mnorm_zero_b, + mnorm_zero_flags, + 
minmax_abs, + minmax_swap, + minmax_zero_min, + minmax_copy_flags, + shiftr_int_signed, + addsub_copy_flags, + addsub_int_operand, + clz_force_nop, + shiftl_copy_flags, + round_copy_flags, + round_enable, + encode_enable, + writeback; + } fpint_op; + + typedef struct packed + { + logic todo; + } mem_op; + + typedef struct packed + { + logic todo; + } sfu_op; + + typedef struct packed + { + logic todo; + } group_op; + + // Q22.10 + typedef logic[9:0] fixed_frac; + typedef logic[$bits(word) - $bits(fixed_frac) - 1:0] fixed_int; + + typedef struct packed signed + { + fixed_int fint; // 'int' es una keyword + fixed_frac frac; + } fixed; + + typedef struct packed + { + fixed x, + y; + } fixed_xy; + + typedef struct packed + { + fixed a, + b, + c; + } vtx_fixed; + + typedef struct packed + { + fixed_xy a, + b, + c; + } vtx_xy; + + localparam int RASTER_BITS = 2; + localparam int RASTER_SUB_BITS = 4; + localparam int RASTER_SIZE = 1 << RASTER_BITS; + localparam int RASTER_COARSE_FRAGS = RASTER_SIZE * RASTER_SIZE; + + typedef logic[RASTER_BITS - 1:0] raster_index; + + // Caso RASTER_BITS = 2: -> 4,4,4,4 -> 8,8-> 16 + localparam int RASTER_OUT_CLZ_DEPTH = 3; + + // Asume RASTER_BITS == 2, hay que ajustarlo si cambia + typedef struct packed + { + // Esto ahorra muchos flops + // + // offsets[0] = inc * 0 = 0 + // offsets[1] = inc * 1 = raster2_times1 + // offsets[2] = inc * 2 = raster2_times1 << 1 + // offsets[3] = inc * 3 = raster2_times3 + fixed raster2_times1, + raster2_times3; + } raster_offsets; + + function fixed raster_idx(raster_offsets offsets, raster_index idx); + unique case (idx) + RASTER_BITS'(0): + return '0; + + RASTER_BITS'(1): + return offsets.raster2_times1; + + RASTER_BITS'(2): + return offsets.raster2_times1 << 1; + + RASTER_BITS'(3): + return offsets.raster2_times3; + endcase + endfunction + + function raster_offsets make_raster_offsets(fixed inc); + make_raster_offsets.raster2_times1 = inc; + make_raster_offsets.raster2_times3 = inc + (inc << 1); + 
endfunction + + typedef struct packed + { + raster_offsets x, + y; + } raster_offsets_xy; + + typedef struct packed + { + logic[RASTER_SUB_BITS - 1:0] num; + logic[$bits(fixed_frac) - RASTER_SUB_BITS - 1:0] prec; + } raster_sub; + + localparam int RASTER_COARSE_DIM_BITS = $bits(fixed) - $bits(raster_index) - $bits(raster_sub); + + typedef logic signed[RASTER_COARSE_DIM_BITS - 1:0] raster_coarse_dim; + + typedef struct packed + { + raster_coarse_dim x, + y; + } raster_coarse_xy; + + typedef struct packed signed + { + raster_coarse_dim coarse; + raster_index fine; + raster_sub sub; + } raster_prec; + + typedef struct packed + { + raster_prec x, + y; + } raster_prec_xy; + + // Definir el número de lanes a partir de las dimensiones del + // rasterizer es una decisión crucial, el diseño entero depende de esto + + localparam int SHADER_LANES = RASTER_COARSE_FRAGS; + + typedef logic[RASTER_SIZE - 1:0] lane_no; + typedef logic[SHADER_LANES - 1:0] lane_mask; + + typedef logic[5:0] group_id; + + localparam int REGFILE_STAGES = 3; + localparam int REG_READ_STAGES = 2 + REGFILE_STAGES + 1; + + typedef gfx_isa::sgpr_num sgpr_num; + typedef gfx_isa::vgpr_num vgpr_num; + typedef gfx_isa::xgpr_num xgpr_num; + typedef gfx_isa::pc_offset pc_offset; + + typedef struct packed + { + // No incluye p0 porque p0 no tiene señal ready + logic p1, + p2, + p3, + valid; + } shader_dispatch; + + typedef struct + { + group_id group; + xgpr_num dest; + logic dest_scalar; + } wave_exec; + + localparam int FIXED_MULADD_DEPTH = 5; + localparam int FIXED_DOTADD_DEPTH = 2 * FIXED_MULADD_DEPTH; + + localparam word BOOTROM_BASE = 32'h0010_0000; + + localparam int SCHED_BRAM_WORDS = 2048; // 8KiB + + typedef word irq_lines; + +endpackage diff --git a/rtl/gfx/gfx_pkts.sv b/rtl/gfx/gfx_pkts.sv new file mode 100644 index 0000000..41399ce --- /dev/null +++ b/rtl/gfx/gfx_pkts.sv @@ -0,0 +1,29 @@ +interface gfx_pkts +#(parameter int WIDTH = $bits(gfx::word)); + + import gfx::*; + + logic tlast; + logic tready; 
+ logic tvalid; + logic[WIDTH - 1:0] tdata; + + modport tx + ( + input tready, + + output tdata, + tlast, + tvalid + ); + + modport rx + ( + input tdata, + tlast, + tvalid, + + output tready + ); + +endinterface diff --git a/rtl/gfx/gfx_raster.sv b/rtl/gfx/gfx_raster.sv new file mode 100644 index 0000000..a57a672 --- /dev/null +++ b/rtl/gfx/gfx_raster.sv @@ -0,0 +1,930 @@ +module gfx_raster +( + input logic clk, + rst_n, + + gfx_pkts.rx geometry, + + gfx_pkts.tx coverage +); + + import gfx::*; + + gfx_raster_bounds setup_bounds + ( + .clk, + .rst_n, + + .geometry, + + .edges_ref(bounds_edges_ref), + .edges_vtx(bounds_edges_vtx), + .edges_span(bounds_edges_span), + .edges_ready(bounds_edges_ready), + .edges_valid(bounds_edges_valid), + .edges_geom_id(bounds_edges_geom_id) + ); + + word bounds_edges_geom_id; + logic bounds_edges_ready, bounds_edges_valid; + vtx_xy bounds_edges_vtx; + fixed_xy bounds_edges_ref; + raster_prec_xy bounds_edges_span; + + gfx_raster_edges setup_edges + ( + .clk, + .rst_n, + + .bounds_ref(bounds_edges_ref), + .bounds_vtx(bounds_edges_vtx), + .bounds_span(bounds_edges_span), + .bounds_ready(bounds_edges_ready), + .bounds_valid(bounds_edges_valid), + .bounds_geom_id(bounds_edges_geom_id), + + .coarse_ref(edges_coarse_ref), + .coarse_base(edges_coarse_base), + .coarse_span(edges_coarse_span), + .coarse_ready(edges_coarse_ready), + .coarse_valid(edges_coarse_valid), + .coarse_geom_id(edges_coarse_geom_id), + .coarse_offsets(edges_coarse_offsets) + ); + + word edges_coarse_geom_id; + fixed edges_coarse_base; + logic edges_coarse_ready, edges_coarse_valid; + fixed_xy edges_coarse_ref; + raster_prec_xy edges_coarse_span; + raster_offsets_xy edges_coarse_offsets; + + gfx_raster_coarse coarse + ( + .clk, + .rst_n, + + .edges_ref(edges_coarse_ref), + .edges_base(edges_coarse_base), + .edges_span(edges_coarse_span), + .edges_ready(edges_coarse_ready), + .edges_valid(edges_coarse_valid), + .edges_geom_id(edges_coarse_geom_id), + 
.edges_offsets(edges_coarse_offsets), + + .fine_ref(coarse_fine_ref), + .fine_ready(coarse_fine_ready), + .fine_valid(coarse_fine_valid), + .fine_corner(coarse_fine_corner), + .fine_geom_id(coarse_fine_geom_id), + .fine_offsets(coarse_fine_offsets) + ); + + word coarse_fine_geom_id; + fixed coarse_fine_corner; + logic coarse_fine_ready, coarse_fine_valid; + fixed_xy coarse_fine_ref; + raster_offsets_xy coarse_fine_offsets; + + gfx_raster_fine fine + ( + .clk, + .rst_n, + + .coarse_ref(coarse_fine_ref), + .coarse_ready(coarse_fine_ready), + .coarse_valid(coarse_fine_valid), + .coarse_corner(coarse_fine_corner), + .coarse_geom_id(coarse_fine_geom_id), + .coarse_offsets(coarse_fine_offsets), + + .coverage + ); + +endmodule + +module gfx_raster_bounds +( + input logic clk, + rst_n, + + gfx_pkts.rx geometry, + + input logic edges_ready, + output logic edges_valid, + output gfx::word edges_geom_id, + output gfx::fixed_xy edges_ref, + output gfx::raster_prec_xy edges_span, + output gfx::vtx_xy edges_vtx +); + + import gfx::*; + + enum int unsigned + { + IN_GEOM_ID, + IN_DIM_X, + IN_DIM_Y + } in_state; + + enum int unsigned + { + VTX_A, + VTX_B, + VTX_C + } vtx_state; + + logic a_lt_b, a_lt_c, b_lt_c, edges_handshake, geom_complete, geom_last, + geom_recv, in_vtx, next_dim, new_vtx; + + logic end_new_dim, end_valid, vtx_valid, lt_new_dim, lt_valid, minmax_new_dim, minmax_valid; + + fixed geom_data; + vtx_fixed dim_vtx, dim_vtx_x, dim_vtx_y; + raster_prec max, min; + + assign geom_recv = geometry.tready & geometry.tvalid; + assign edges_handshake = edges_valid & edges_ready; + + assign edges_vtx.a.x = dim_vtx_x.a; + assign edges_vtx.a.y = dim_vtx_y.a; + assign edges_vtx.b.x = dim_vtx_x.b; + assign edges_vtx.b.y = dim_vtx_y.b; + assign edges_vtx.c.x = dim_vtx_x.c; + assign edges_vtx.c.y = dim_vtx_y.c; + + assign geometry.tready = edges_handshake | ~geom_complete; + + always_comb begin + unique case (vtx_state) + VTX_C: next_dim = geom_recv; + default: next_dim = 0; + endcase 
+ + unique case (in_state) + IN_DIM_Y: geom_last = next_dim; + default: geom_last = 0; + endcase + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + in_state <= IN_GEOM_ID; + vtx_state <= VTX_A; + + in_vtx <= 0; + new_vtx <= 0; + geom_complete <= 0; + + lt_valid <= 0; + end_valid <= 0; + vtx_valid <= 0; + edges_valid <= 0; + minmax_valid <= 0; + + lt_new_dim <= 0; + end_new_dim <= 0; + minmax_new_dim <= 0; + + edges_geom_id <= 'x; + end else begin + end_valid <= 0; + vtx_valid <= end_valid; + lt_valid <= vtx_valid; + minmax_valid <= lt_valid; + + if (~edges_valid | edges_ready) + edges_valid <= minmax_valid; + + geom_complete <= (geom_complete | geom_last) & ~edges_handshake; + + unique case (in_state) + IN_GEOM_ID: + if (geom_recv) begin + in_state <= IN_DIM_X; + + in_vtx <= 1; + edges_geom_id <= geometry.tdata; + end + + IN_DIM_X: + if (next_dim) + in_state <= IN_DIM_Y; + + IN_DIM_Y: + if (next_dim) begin + in_state <= IN_GEOM_ID; + + in_vtx <= 0; + end_valid <= 1; + end + endcase + + new_vtx <= 0; + + lt_new_dim <= 0; + minmax_new_dim <= lt_new_dim; + end_new_dim <= minmax_new_dim; + + unique case (vtx_state) + VTX_A: begin + if (in_vtx & geom_recv) begin + new_vtx <= 1; + vtx_state <= VTX_B; + end + + if (new_vtx) begin + dim_vtx.c <= geom_data; + lt_new_dim <= 1; + end + end + + VTX_B: begin + if (geom_recv) begin + new_vtx <= 1; + vtx_state <= VTX_C; + end + + if (new_vtx) + dim_vtx.a <= geom_data; + end + + VTX_C: begin + if (geom_recv) begin + new_vtx <= 1; + vtx_state <= VTX_A; + end + + if (new_vtx) + dim_vtx.b <= geom_data; + end + endcase + + if (in_state == IN_DIM_Y & next_dim) + assert (geometry.tlast); + end + + always_ff @(posedge clk) begin + geom_data <= geometry.tdata; + + a_lt_b <= $signed(dim_vtx.a) < $signed(dim_vtx.b); + a_lt_c <= $signed(dim_vtx.a) < $signed(dim_vtx.c); + b_lt_c <= $signed(dim_vtx.b) < $signed(dim_vtx.c); + + // Realmente no son 'x' o 'y' hasta cuando edges_valid = 1 + if (lt_new_dim) begin + dim_vtx_y 
<= dim_vtx; + dim_vtx_x <= dim_vtx_y; + end + + if (a_lt_b) begin + min <= a_lt_c ? dim_vtx_y.a : dim_vtx_y.c; + max <= b_lt_c ? dim_vtx_y.c : dim_vtx_y.b; + end else begin + min <= b_lt_c ? dim_vtx_y.b : dim_vtx_y.c; + max <= a_lt_c ? dim_vtx_y.c : dim_vtx_y.a; + end + + {min.fine, min.sub} <= '0; + {max.fine, max.sub} <= '0; + + if (end_new_dim) begin + edges_ref.y <= min; + edges_ref.x <= edges_ref.y; + + edges_span.y <= max - min; + edges_span.x <= edges_span.y; + end + end + +endmodule + +module gfx_raster_edges +( + input logic clk, + rst_n, + + input logic bounds_valid, + input gfx::word bounds_geom_id, + input gfx::fixed_xy bounds_ref, + input gfx::raster_prec_xy bounds_span, + input gfx::vtx_xy bounds_vtx, + output logic bounds_ready, + + input logic coarse_ready, + output logic coarse_valid, + output gfx::word coarse_geom_id, + output gfx::fixed_xy coarse_ref, + output gfx::raster_prec_xy coarse_span, + output gfx::fixed coarse_base, + output gfx::raster_offsets_xy coarse_offsets +); + + import gfx::*; + + enum int unsigned + { + EDGE_AB, + EDGE_BC, + EDGE_CA, + // EDGE_CA cumple doble función como OFFSETS_AB + OFFSETS_BC, + OFFSETS_CA, + OUT + } state; + + struct + { + fixed_xy cur, + delay1, + delay2; + } inc; + + logic coarse_handshake, coarse_stall, offsets_flow; + fixed_xy delta, p, q; + + // - 2 porque coarse valid va al final + logic[FIXED_DOTADD_DEPTH - 2:0] dotadd_valid; + + assign coarse_stall = coarse_valid & ~coarse_ready; + assign coarse_handshake = coarse_valid & coarse_ready; + + gfx_fixed_dotadd edge_base + ( + .clk, + .c(0), + .q(coarse_base), + .a0(delta.x), + .b0(inc.cur.x), + .a1(delta.y), + .b1(inc.cur.y), + .stall(coarse_stall) + ); + + always_comb + unique case (state) + OUT: offsets_flow = coarse_handshake; + default: offsets_flow = 1; + endcase + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + state <= EDGE_AB; + + p <= 'x; + q <= 'x; + coarse_ref <= 'x; + coarse_geom_id <= 'x; + + bounds_ready <= 0; + 
coarse_valid <= 0; + + for (int i = 0; i < $bits(dotadd_valid); ++i) // Clear every valid stage, the last one included + dotadd_valid[i] <= 0; + end else begin + for (int i = 1; i < $bits(dotadd_valid); ++i) + dotadd_valid[i] <= dotadd_valid[i - 1]; + + if (~coarse_stall) + coarse_valid <= dotadd_valid[$bits(dotadd_valid) - 1]; + + bounds_ready <= 0; + dotadd_valid[0] <= 0; + + unique case (state) + EDGE_AB: begin + if (bounds_valid) + state <= EDGE_BC; + + coarse_ref <= bounds_ref; + coarse_span <= bounds_span; + coarse_geom_id <= bounds_geom_id; + + p <= bounds_vtx.a; + q <= bounds_vtx.b; + end + + EDGE_BC: begin + state <= EDGE_CA; + bounds_ready <= 1; + + p <= bounds_vtx.b; + q <= bounds_vtx.c; + end + + EDGE_CA: begin + state <= OFFSETS_BC; + + p <= bounds_vtx.c; + q <= bounds_vtx.a; + + // This happens exactly at the moment when ab, bc, ca + // all end up in their correct places in the pipeline + dotadd_valid[0] <= 1; + end + + OFFSETS_BC: + state <= OFFSETS_CA; + + OFFSETS_CA: + state <= OUT; + + OUT: + if (coarse_handshake) + state <= EDGE_AB; + endcase + end + + always_ff @(posedge clk) begin + delta.x <= coarse_ref.x - q.x; + delta.y <= coarse_ref.y - q.y; + + inc.cur.x <= p.y - q.y; + inc.cur.y <= q.x - p.x; + + //TODO: top-left rule + if (offsets_flow) begin + inc.delay1 <= inc.cur; + inc.delay2 <= inc.delay1; + + coarse_offsets.x <= make_raster_offsets(inc.delay2.x); + coarse_offsets.y <= make_raster_offsets(inc.delay2.y); + end + end + +endmodule + +module gfx_raster_coarse +( + input logic clk, + rst_n, + + input logic edges_valid, + input gfx::word edges_geom_id, + input gfx::fixed_xy edges_ref, + input gfx::raster_prec_xy edges_span, + input gfx::fixed edges_base, + input gfx::raster_offsets_xy edges_offsets, + output logic edges_ready, + + input logic fine_ready, + output logic fine_valid, + output gfx::word fine_geom_id, + output gfx::fixed_xy fine_ref, + output gfx::fixed fine_corner, + output gfx::raster_offsets_xy fine_offsets +); + + import gfx::*; + + enum int unsigned + { + 
SETUP, + TEST_AB, + TEST_BC, + TEST_CA, + OUT + } state; + + struct + { + fixed cur, + next, + prev; + } corner, edge_fn, vertical; + + struct + { + raster_offsets_xy cur, + next, + prev; + } offsets; + + logic edges_recv, end_block, end_x, end_y, first_run, + mask, mask_reset, new_geom, test_flow, out_flow; + + fixed edge_test, reference_x, vertical_inc; + fixed_xy max_offset, min_offset, test_offset; + raster_coarse_xy stride; + raster_coarse_dim width; + raster_offsets_xy next_offsets; + + function fixed coarse_offset(raster_offsets offsets); + return raster_idx(offsets, RASTER_BITS'(1)) << RASTER_BITS; + endfunction + + assign end_x = stride.x == '0; + assign end_y = stride.y == '0; + assign end_block = end_x & end_y; + + assign edge_test = edge_fn.cur + test_offset.x + test_offset.y; + assign vertical_inc = vertical.cur + coarse_offset(offsets.cur.y); + + assign fine_corner = corner.cur; + assign fine_offsets = offsets.cur; // Vuelve a cur luego de 3 ciclos + + assign min_offset.x = raster_idx(next_offsets.x, RASTER_BITS'(0)); + assign min_offset.y = raster_idx(next_offsets.y, RASTER_BITS'(0)); + assign max_offset.x = raster_idx(next_offsets.x, RASTER_BITS'(RASTER_SIZE - 1)); + assign max_offset.y = raster_idx(next_offsets.y, RASTER_BITS'(RASTER_SIZE - 1)); + assign next_offsets = edges_recv ? 
edges_offsets : offsets.next; + + always_comb begin + unique case (state) + SETUP: new_geom = 1; + default: new_geom = 0; + endcase + + unique case (state) + TEST_AB: mask_reset = 1; + default: mask_reset = 0; + endcase + + unique case (state) + SETUP: edges_ready = 1; + default: edges_ready = 0; + endcase + + unique case (state) + SETUP: + edges_recv = 1; + + TEST_AB, TEST_BC: + edges_recv = first_run; + + default: + edges_recv = 0; + endcase + + unique case (state) + OUT: fine_valid = mask; + default: fine_valid = 0; + endcase + + unique case (state) + OUT: begin + out_flow = ~mask | fine_ready; + test_flow = 0; + end + + default: begin + out_flow = 0; + test_flow = 1; + end + endcase + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + state <= SETUP; + first_run <= 1; + end else + unique case (state) + SETUP: + if (edges_valid) + state <= TEST_AB; + + TEST_AB: + state <= TEST_BC; + + TEST_BC: + state <= TEST_CA; + + TEST_CA: + state <= OUT; + + OUT: begin + first_run <= end_block; + if (out_flow) + state <= end_block ? 
SETUP : TEST_AB; + end + endcase + + always_ff @(posedge clk) begin + if (new_geom) begin + width <= edges_span.x.coarse; + stride.x <= edges_span.x.coarse; + stride.y <= edges_span.y.coarse; + reference_x <= edges_ref.x; + + fine_ref <= edges_ref; + fine_geom_id <= edges_geom_id; + end + + if (out_flow) begin + stride.x <= stride.x - 1; + fine_ref.x.fint <= fine_ref.x.fint + ($bits(fixed_int))'(RASTER_SIZE); + + if (end_x) begin + fine_ref.x <= reference_x; + fine_ref.y.fint <= fine_ref.y.fint + ($bits(fixed_int))'(RASTER_SIZE); + + stride.x <= width; + stride.y <= stride.y - 1; + end + end + + if (test_flow) begin + offsets.cur <= next_offsets; + offsets.next <= offsets.prev; + offsets.prev <= offsets.cur; + + vertical.cur <= vertical.next; + vertical.next <= vertical.prev; + vertical.prev <= vertical.cur; + + edge_fn.cur <= edge_fn.next; + edge_fn.next <= edge_fn.prev; + edge_fn.prev <= edge_fn.cur + coarse_offset(offsets.cur.x); + + if (end_x) begin + edge_fn.prev <= vertical_inc; + vertical.prev <= vertical_inc; + end + + corner.cur <= corner.next; + corner.next <= corner.prev; + corner.prev <= edge_fn.cur; + + if (coarse_offset(next_offsets.x) >= 'sd0) + test_offset.x <= max_offset.x; + else + test_offset.x <= min_offset.x; + + if (coarse_offset(next_offsets.y) >= 'sd0) + test_offset.y <= max_offset.y; + else + test_offset.y <= min_offset.y; + + mask <= (mask | mask_reset) & 1/*(edge_test >= 'sd0)*/; + end + + if (edges_recv) begin + edge_fn.cur <= edges_base; + vertical.cur <= edges_base; + end + end + +endmodule + +module gfx_raster_fine +( + input logic clk, + rst_n, + + input logic coarse_valid, + input gfx::word coarse_geom_id, + input gfx::fixed_xy coarse_ref, + input gfx::fixed coarse_corner, + input gfx::raster_offsets_xy coarse_offsets, + output logic coarse_ready, + + gfx_pkts.tx coverage +); + + import gfx::*; + + enum int unsigned + { + IN_C, + IN_A, + IN_B, + IN_MASK + } in_state; + + enum int unsigned + { + OUT_ACCEPT, + OUT_GEOM_ID, + OUT_POS, 
OUT_MASK,
        OUT_BARY_C,
        OUT_BARY_A,
        OUT_BARY_B
    } out_state;

    // Three-entry rotating storage (cur/next/prev) for the per-edge corner
    // value received from the coarse stage.
    struct
    {
        fixed cur,
              next,
              prev;
    } corner;

    // Same rotating layout for the per-edge raster offsets.
    struct
    {
        raster_offsets_xy cur,
                          next,
                          prev;
    } offsets;

    logic begin_bary, hold_block, in_valid, mask_in_clean,
          mask_in_reset, new_block, out_last;

    word                     geom_id;
    fixed                    bary_coord;
    lane_no                  lane, lane_ctz, lane_hold;
    fixed_xy                 block_ref;
    lane_mask                mask_in, mask, mask_ctz;
    raster_index             lane_x, lane_y;
    logic[$bits(lane_ctz):0] ctz_count;

    // Returns the low $bits(shword) bits of the coarse part of a coordinate.
    function shword ref_half(raster_prec dim);
        return dim.coarse[$bits(shword) - 1:0];
    endfunction

    // ctz_count is one bit wider than a lane number; only the low bits are used.
    assign lane_ctz = ctz_count[$bits(lane_ctz) - 1:0];
    // A block is emitted only when its mask is final and has a covered lane.
    assign in_valid = mask_in_clean & |mask_in;
    // No lanes left to report.
    assign out_last = ~|mask;
    assign {lane_y, lane_x} = lane;

    // **IMPORTANT**: this will break once RASTER_BITS >= 3, because the
    // FSM assumes that ctz finishes in 3 cycles or fewer.

    gfx_ctz #(RASTER_COARSE_FRAGS) ctz
    (
        .clk,
        .value(mask_ctz),
        .ctz(ctz_count)
    );

    always_comb begin
        // A new block may be taken only while the output FSM is idle.
        unique case (out_state)
            OUT_ACCEPT: new_block = 1;
            default:    new_block = 0;
        endcase

        // While idle, ctz looks at the incoming mask; afterwards it follows
        // the working copy from which reported lanes are removed.
        unique case (out_state)
            OUT_ACCEPT: mask_ctz = mask_in;
            default:    mask_ctz = mask;
        endcase

        unique case (out_state)
            OUT_ACCEPT: coverage.tvalid = 0;
            default:    coverage.tvalid = 1;
        endcase

        // Start the barycentric triple of the next lane.
        unique case (out_state)
            OUT_MASK, OUT_BARY_B:
                begin_bary = coverage.tready;

            default:
                begin_bary = 0;
        endcase

        unique case (out_state)
            OUT_BARY_B: coverage.tlast = out_last;
            default:    coverage.tlast = 0;
        endcase

        unique case (out_state)
            OUT_GEOM_ID:
                coverage.tdata = geom_id;

            // Both halves must come from the latched block_ref: coarse_ref is
            // the live input and may already belong to the next block by the
            // time OUT_POS is reached.
            OUT_POS:
                coverage.tdata = {ref_half(block_ref.y), ref_half(block_ref.x)};

            OUT_MASK:
                coverage.tdata = {{($bits(word) - $bits(mask)){1'b0}}, mask};

            OUT_BARY_C, OUT_BARY_A, OUT_BARY_B:
                coverage.tdata = bary_coord;

            default:
                coverage.tdata = 'x;
        endcase

        // In OUT_MASK the first lane comes straight from ctz; later states
        // use the registered copy.
        unique case (out_state)
            OUT_MASK:
                lane = lane_ctz;

            default:
                lane = lane_hold;
        endcase

        unique
case (in_state) + IN_C: coarse_ready = new_block; + default: coarse_ready = 0; + endcase + + unique case (in_state) + IN_C: hold_block = new_block; + IN_A: hold_block = 1; + IN_B: hold_block = 1; + IN_MASK: hold_block = 0; + endcase + + unique case (in_state) + IN_C: mask_in_reset = 1; + default: mask_in_reset = 0; + endcase + + unique case (in_state) + IN_MASK: mask_in_clean = 1; + default: mask_in_clean = 0; + endcase + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + in_state <= IN_C; + out_state <= OUT_ACCEPT; + end else begin + unique case (in_state) + IN_C: + if (coarse_valid & new_block) + in_state <= IN_A; + + IN_A: + in_state <= IN_B; + + IN_B: + in_state <= IN_MASK; + + IN_MASK: + in_state <= IN_C; + endcase + + unique case (out_state) + OUT_ACCEPT: + if (in_valid) + out_state <= OUT_GEOM_ID; + + OUT_GEOM_ID: + if (coverage.tready) + out_state <= OUT_POS; + + OUT_POS: + if (coverage.tready) + out_state <= OUT_MASK; + + OUT_MASK: + if (coverage.tready) + out_state <= OUT_BARY_C; + + OUT_BARY_C: + if (coverage.tready) + out_state <= OUT_BARY_A; + + OUT_BARY_A: + if (coverage.tready) + out_state <= OUT_BARY_B; + + OUT_BARY_B: + if (coverage.tready) + out_state <= out_last ? OUT_ACCEPT : OUT_BARY_C; + endcase + end + + always_ff @(posedge clk) begin + // Prueba paralela de signos, esto hace el heavy lifting de fine raster + // Nótese que muchos sumadores serán eliminados en síntesis + for (int i = 0; i < RASTER_SIZE; ++i) + for (int j = 0; j < RASTER_SIZE; ++j) + mask_in[i * RASTER_SIZE + j] <= + (mask_in[i * RASTER_SIZE + j] | mask_in_reset) + & (coarse_corner + + raster_idx(coarse_offsets.y, RASTER_BITS'(i)) + + raster_idx(coarse_offsets.x, RASTER_BITS'(j)) + >= 'sd0); + + // Recalculamos las coordenadas baricéntricas de cada fragmento que + // no haya sido descartado. La razón de esto es evitar almacenar y + // luego multiplexar las coordenadas de un bloque entero (48 words). 
+ if (coverage.tready) + bary_coord <= corner.next + + raster_idx(offsets.next.y, RASTER_BITS'(lane_y)) + + raster_idx(offsets.next.x, RASTER_BITS'(lane_x)); + + if (new_block & mask_in_reset) begin + geom_id <= coarse_geom_id; + block_ref <= coarse_ref; + end + + // new_block = 0 => coverage.tvalid = 1 + if (new_block | coverage.tready) begin + corner.cur <= corner.next; + corner.next <= corner.prev; + corner.prev <= corner.cur; + + offsets.cur <= offsets.next; + offsets.next <= offsets.prev; + offsets.prev <= offsets.cur; + end + + if (hold_block) begin + // Para prev en vez de cur para que los primeros valores queden en + // cur justamente al llegar a OUT_BARY_C + corner.prev <= coarse_corner; + offsets.prev <= coarse_offsets; + end + + if (new_block) + mask <= mask_in; + + if (begin_bary) begin + mask <= mask & (mask - 1); + lane_hold <= lane_ctz; + end + end + +endmodule diff --git a/rtl/gfx/gfx_regfile_io.sv b/rtl/gfx/gfx_regfile_io.sv new file mode 100644 index 0000000..2459049 --- /dev/null +++ b/rtl/gfx/gfx_regfile_io.sv @@ -0,0 +1,106 @@ +interface gfx_regfile_io; + + import gfx::*; + + struct + { + group_id group; + sgpr_num a_sgpr, + b_sgpr; + vgpr_num a_vgpr, + b_vgpr; + logic[12:0] b_imm; + logic a_scalar, + b_scalar, + b_is_imm, + b_is_const, + scalar_rev; + } op; + + struct + { + logic write; + group_id group; + sgpr_num sgpr; + word data; + } sgpr_write; + + struct + { + lane_mask mask; + group_id group; + vgpr_num vgpr; + word data[SHADER_LANES]; + } vgpr_write; + + word a[SHADER_LANES], b[SHADER_LANES], sgpr_write_data, vgpr_write_data[SHADER_LANES]; + logic mask_wb_write, pc_wb_write; + word_ptr pc_back, pc_front, pc_wb; + group_id mask_back_group, mask_wb_group, pc_back_group, pc_front_group, pc_wb_group; + lane_mask mask_back, mask_wb; + + modport ab + ( + input a, + b + ); + + modport read + ( + output op + ); + + modport bind_ + ( + input pc_front, + + output pc_front_group + ); + + modport wb + ( + input pc_back, + mask_back, + + output 
sgpr_write, + vgpr_write, + + pc_back_group, + mask_back_group, + + pc_wb, + pc_wb_group, + pc_wb_write, + + mask_wb, + mask_wb_group, + mask_wb_write + ); + + modport regs + ( + input op, + sgpr_write, + vgpr_write, + + pc_back_group, + pc_front_group, + mask_back_group, + + pc_wb, + pc_wb_group, + pc_wb_write, + + mask_wb, + mask_wb_group, + mask_wb_write, + + output a, + b, + + pc_back, + pc_front, + mask_back + ); + +endinterface diff --git a/rtl/gfx/gfx_rst_sync.sv b/rtl/gfx/gfx_rst_sync.sv new file mode 100644 index 0000000..2a8ea3b --- /dev/null +++ b/rtl/gfx/gfx_rst_sync.sv @@ -0,0 +1,13 @@ +//FIXME: peligro +module gfx_rst_sync +( + input logic clk, + rst_n, + + output logic srst_n +); + + always_ff @(posedge clk or negedge rst_n) + srst_n <= ~rst_n ? 0 : 1; + +endmodule diff --git a/rtl/gfx/gfx_sched.sv b/rtl/gfx/gfx_sched.sv new file mode 100644 index 0000000..0ffaecd --- /dev/null +++ b/rtl/gfx/gfx_sched.sv @@ -0,0 +1,141 @@ +module gfx_sched +import gfx::*; +( + input logic clk, + rst_n, + srst_n, + + gfx_axil.m axim, + + input irq_lines irq +); + + // verilator tracing_off + + logic axi_ready, axi_valid, bram_ready, bram_read, bram_write, bram_write_next, + mem_instr, mem_la_read, mem_la_write, mem_ready, mem_valid, select_bram; + + word bram[SCHED_BRAM_WORDS]; + word axi_rdata, bram_rdata, mem_addr, mem_la_addr, mem_rdata, mem_wdata; + logic[$bits(word) / $bits(byte) - 1:0] mem_wstrb; + + logic[$clog2(SCHED_BRAM_WORDS) - 1:0] bram_addr; + + assign bram_addr = mem_addr[$bits(bram_addr) + SUBWORD_BITS - 1:SUBWORD_BITS]; + assign mem_ready = (axi_valid & axi_ready) | bram_ready; + assign mem_rdata = bram_ready ? 
bram_rdata : axi_rdata;
    // The BRAM is selected when every look-ahead address bit above the BRAM
    // window is zero.
    assign select_bram = ~|mem_la_addr[$bits(mem_la_addr) - 1:$bits(bram_addr) + SUBWORD_BITS];
    assign bram_write_next = mem_la_write & select_bram;

    defparam core.ENABLE_COUNTERS = 0;
    defparam core.ENABLE_COUNTERS64 = 0;
    defparam core.BARREL_SHIFTER = 1;
    defparam core.COMPRESSED_ISA = 1;
    defparam core.CATCH_MISALIGN = 0;
    defparam core.CATCH_ILLINSN = 0;
    defparam core.ENABLE_MUL = 1;
    defparam core.ENABLE_DIV = 1;
    defparam core.ENABLE_IRQ = 1;
    defparam core.ENABLE_IRQ_QREGS = 0;
    defparam core.ENABLE_IRQ_TIMER = 0;
    defparam core.PROGADDR_RESET = BOOTROM_BASE;

    picorv32 core
    (
        .clk,
        .resetn(srst_n),
        .trap(),

        .mem_valid,
        .mem_instr,
        .mem_ready,

        .mem_addr,
        .mem_wdata,
        .mem_wstrb,
        .mem_rdata,

        .mem_la_read,
        .mem_la_write,
        .mem_la_addr,
        .mem_la_wdata(),
        .mem_la_wstrb(),

        .pcpi_valid(),
        .pcpi_insn(),
        .pcpi_rs1(),
        .pcpi_rs2(),
        .pcpi_wr(),
        .pcpi_rd(),
        .pcpi_wait(0),
        .pcpi_ready(0),

        .irq,
        .eoi(),

        .trace_valid(),
        .trace_data()
    );

    picorv32_axi_adapter axi
    (
        .clk,
        .resetn(srst_n),

        .mem_axi_awvalid(axim.awvalid),
        .mem_axi_awready(axim.awready),
        .mem_axi_awaddr(axim.awaddr),
        .mem_axi_awprot(),

        .mem_axi_wvalid(axim.wvalid),
        .mem_axi_wready(axim.wready),
        .mem_axi_wdata(axim.wdata),
        .mem_axi_wstrb(), // Potential surprises: byte strobes are not forwarded

        .mem_axi_bvalid(axim.bvalid),
        .mem_axi_bready(axim.bready),

        .mem_axi_arvalid(axim.arvalid),
        .mem_axi_arready(axim.arready),
        .mem_axi_araddr(axim.araddr),
        .mem_axi_arprot(),

        .mem_axi_rvalid(axim.rvalid),
        .mem_axi_rready(axim.rready),
        .mem_axi_rdata(axim.rdata),

        // The adapter only sees requests that were not routed to the BRAM.
        .mem_valid(mem_valid & axi_valid),
        .mem_instr,
        .mem_ready(axi_ready),
        .mem_addr,
        .mem_wdata,
        .mem_wstrb,
        .mem_rdata(axi_rdata)
    );

    // BRAM port: byte-enabled write, otherwise registered read.
    always_ff @(posedge clk) begin
        if (bram_write) begin
            // mem_wstrb has one enable per byte lane, so each set bit must
            // update a whole byte of the word, not a single bit.
            for (int i = 0; i < $bits(mem_wstrb); ++i)
                if (mem_wstrb[i])
                    bram[bram_addr][i * $bits(byte) +: $bits(byte)]
                        <= mem_wdata[i * $bits(byte) +: $bits(byte)];

            bram_rdata <= 'x;
        end
else + bram_rdata <= bram[bram_addr]; + end + + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + axi_valid <= 0; + bram_read <= 0; + bram_ready <= 0; + bram_write <= 0; + end else begin + axi_valid <= ~select_bram | (axi_valid & ~axi_ready); + bram_read <= mem_la_read & select_bram; + bram_write <= bram_write_next; + bram_ready <= bram_read | bram_write_next; + end + +endmodule diff --git a/rtl/gfx/gfx_shader.sv b/rtl/gfx/gfx_shader.sv new file mode 100644 index 0000000..322ffb5 --- /dev/null +++ b/rtl/gfx/gfx_shader.sv @@ -0,0 +1,77 @@ +module gfx_shader +import gfx::*; +import gfx_shader_schedif_pkg::*; +( + input logic clk, + rst_n, + + gfx_axib.m insn_mem, + + gfx_axil.s sched +); + + axi4lite_intf #(.ADDR_WIDTH(GFX_SHADER_SCHEDIF_MIN_ADDR_WIDTH)) regblock(); + + gfx_axil2regblock axil2regblock + ( + .axis(sched), + .axim(regblock.master) + ); + + gfx_shader_schedif__in_t schedif_in; + gfx_shader_schedif__out_t schedif_out; + + gfx_front_back front_back(); + gfx_regfile_io regfile(); + gfx_shader_setup setup(); + + assign schedif_in.SETUP_CTRL.GPR_DONE.hwset = setup.sched.set_done.gpr; + assign schedif_in.SETUP_CTRL.MASK_DONE.hwset = setup.sched.set_done.mask; + assign schedif_in.SETUP_CTRL.SUBMIT_DONE.hwset = setup.sched.set_done.submit; + + assign setup.sched.write.pc = schedif_out.SETUP_SUBMIT.PC.value; + assign setup.sched.write.gpr = schedif_out.SETUP_CTRL.XGPR.value; + assign setup.sched.write.mask = schedif_out.SETUP_MASK.MASK.value; + assign setup.sched.write.group = schedif_out.SETUP_CTRL.GROUP.value; + assign setup.sched.write.pc_set = schedif_out.SETUP_SUBMIT.PC.swmod; + assign setup.sched.write.gpr_set = schedif_out.SETUP_GPR.VALUE.swmod; + assign setup.sched.write.mask_set = schedif_out.SETUP_MASK.MASK.swmod; + assign setup.sched.write.gpr_value = schedif_out.SETUP_GPR.VALUE.value; + + gfx_shader_front frontend + ( + .clk, + .rst_n, + .front(front_back.front), + .reg_bind(regfile.bind_), + .reg_read(regfile.read), + 
.fetch_mem(insn_mem), + .icache_flush(schedif_out.CORE.IFLUSH.value) + ); + + gfx_shader_back backend + ( + .clk, + .rst_n, + .back(front_back.back), + .setup(setup.core), + .reg_wb(regfile.wb), + .read_data(regfile.ab) + ); + + gfx_shader_regs regs + ( + .clk, + .io(regfile.regs) + ); + + gfx_shader_schedif schedif + ( + .clk, + .arst_n(rst_n), + .s_axil(regblock.slave), + .hwif_in(schedif_in), + .hwif_out(schedif_out) + ); + +endmodule diff --git a/rtl/gfx/gfx_shader_back.sv b/rtl/gfx/gfx_shader_back.sv new file mode 100644 index 0000000..4929192 --- /dev/null +++ b/rtl/gfx/gfx_shader_back.sv @@ -0,0 +1,335 @@ +module gfx_shader_back +import gfx::*; +( + input logic clk, + rst_n, + + gfx_front_back.back back, + + gfx_regfile_io.ab read_data, + gfx_regfile_io.wb reg_wb, + + gfx_shader_setup.core setup +); + + logic abort; + + gfx_wb out_wb(), p0_wb(), p1_wb(), p2_wb(), p3_wb(); + gfx_shake p1_shake(), p2_shake(), p3_shake(); + + gfx_shader_abort p0_abort + ( + .clk, + .p1(p1_shake.peek), + .p2(p2_shake.peek), + .p3(p3_shake.peek), + .abort + ); + + gfx_shader_fpint p0 + ( + .clk, + .rst_n, + .op(back.execute.p0), + .wb(p0_wb.tx), + .wave(back.execute.wave), + .abort, + .read_data, + .in_valid(back.dispatch.valid) + ); + + gfx_shader_mem p1 + ( + .clk, + .rst_n, + .op(back.execute.p1), + .wb(p1_wb.tx), + .wave(back.execute.wave), + .in_shake(p1_shake.rx), + .read_data + ); + + gfx_shader_sfu p2 + ( + .clk, + .rst_n, + .op(back.execute.p2), + .wb(p2_wb.tx), + .wave(back.execute.wave), + .in_shake(p2_shake.rx), + .read_data + ); + + gfx_shader_group p3 + ( + .clk, + .rst_n, + .op(back.execute.p3), + .wb(p3_wb.tx), + .wave(back.execute.wave), + .in_shake(p3_shake.rx), + .read_data + ); + + gfx_shader_writeback_arbiter4 writeback_arbiter + ( + .clk, + .rst_n, + .p0(p0_wb.rx), + .p1(p1_wb.rx), + .p2(p2_wb.rx), + .p3(p3_wb.rx), + .out(out_wb.tx) + ); + + gfx_shader_writeback writeback + ( + .clk, + .rst_n, + .wb(out_wb.rx), + .regs(reg_wb), + .setup, + 
.loop_group(back.loop.group),
        .loop_valid(back.loop.valid)
    );

endmodule

// Registers whether any of the p1/p2/p3 handshakes completed this cycle
// (valid and ready both high). The result is visible one clock later.
module gfx_shader_abort
(
    input  logic clk,

    gfx_shake.peek p1,
                   p2,
                   p3,

    output logic abort
);

    always_ff @(posedge clk)
        abort <=
            (p1.valid & p1.ready)
          | (p2.valid & p2.ready)
          | (p3.valid & p3.ready);

endmodule

// Four-input writeback arbiter built as a tree of two-input arbiters:
// (p0, p1) and (p2, p3) are merged first, then the two results.
module gfx_shader_writeback_arbiter4
(
    input  logic clk,
                 rst_n,

    gfx_wb.rx p0,
              p1,
              p2,
              p3,

    gfx_wb.tx out
);

    // Outside reset, p0 and the output are expected to be ready every cycle.
    assert property (
        @(posedge clk)
        disable iff (~rst_n)

        (p0.ready & out.ready)
    );

    gfx_wb p0_p1(), p2_p3();

    gfx_shader_writeback_arbiter2_prio arbiter_p0_p1
    (
        .clk,
        .rst_n,
        .a(p0),
        .b(p1),
        .out(p0_p1.tx)
    );

    gfx_shader_writeback_arbiter2_prio arbiter_p2_p3
    (
        .clk,
        .rst_n,
        .a(p2),
        .b(p3),
        .out(p2_p3.tx)
    );

    // Both inputs of the final arbiter are receive sides. The port is
    // declared gfx_wb.rx, so the connection must name the rx modport too;
    // p2_p3.tx is already driven by arbiter_p2_p3 above.
    gfx_shader_writeback_arbiter2_prio arbiter_out
    (
        .clk,
        .rst_n,
        .a(p0_p1.rx),
        .b(p2_p3.rx),
        .out
    );

endmodule

// Two-input writeback arbiter. Currently a pass-through of input `a`:
// input `b` is never accepted (b.ready is tied low) and its payload is
// ignored.
module gfx_shader_writeback_arbiter2_prio
(
    input  logic clk,
                 rst_n,

    gfx_wb.rx a,
              b,

    gfx_wb.tx out
);

    //TODO
    assign a.ready = out.ready;
    assign b.ready = 0;

    assign out.dest = a.dest;
    assign out.lanes = a.lanes;
    assign out.group = a.group;
    assign out.valid = a.valid;
    assign out.scalar = a.scalar;
    assign out.writeback = a.writeback;

    assign out.mask = a.mask;
    assign out.mask_update = a.mask_update;

    assign out.pc_add = a.pc_add;
    assign out.pc_inc = a.pc_inc;
    assign out.pc_update = a.pc_update;

endmodule

module gfx_shader_writeback
import gfx::*;
(
    input  logic    clk,
                    rst_n,

    gfx_wb.rx       wb,

    gfx_regfile_io.wb regs,

    output logic    loop_valid,
    output group_id loop_group,

    gfx_shader_setup.core setup
);

    struct
    {
        group_id  group;
        word      lanes[SHADER_LANES];
        pc_offset pc_add;
        lane_mask mask;
        vgpr_num  vgpr;
        logic     pc_update,
                  mask_update,
                  vgpr_update;
    } loop_hold[REGFILE_STAGES], loop_out;

    logic loop_valid_hold[REGFILE_STAGES], loop_out_valid, mask_wb,
scalar_wb, + setup_gpr, setup_mask, setup_submit; + + assign wb.ready = 1; + + assign loop_out = loop_hold[REGFILE_STAGES - 1]; + assign loop_out_valid = loop_valid_hold[REGFILE_STAGES - 1]; + + assign loop_valid = loop_out_valid | setup_submit; + + assign regs.pc_back_group = wb.group; + assign regs.mask_back_group = wb.group; + + assign regs.pc_wb_write = (loop_out_valid & loop_out.pc_update) | setup_submit; + assign regs.mask_wb_write = mask_wb | setup_mask; + assign regs.sgpr_write.write = scalar_wb | setup_gpr; + + assign regs.vgpr_write.vgpr = loop_out.vgpr; + assign regs.vgpr_write.group = loop_out.group; + + assign mask_wb = loop_out_valid & loop_out.mask_update; + assign scalar_wb = wb.valid & wb.writeback & wb.scalar; + + always_comb begin + loop_group = setup.write.group; + regs.pc_wb = setup.write.pc; + regs.pc_wb_group = setup.write.group; + + if (loop_out_valid) begin + loop_group = loop_out.group; + regs.pc_wb = regs.pc_back + word_ptr'(loop_out.pc_add); + regs.pc_wb_group = loop_out.group; + end + + regs.mask_wb = setup.write.mask; + regs.mask_wb_group = setup.write.group; + + if (mask_wb) begin + regs.mask_wb = loop_out.mask; + regs.mask_wb_group = loop_out.group; + end + + regs.sgpr_write.data = setup.write.gpr_value; + regs.sgpr_write.sgpr = setup.write.gpr.sgpr; + regs.sgpr_write.group = setup.write.group; + + if (scalar_wb) begin + regs.sgpr_write.data = wb.lanes[0]; + regs.sgpr_write.sgpr = wb.dest.sgpr; + regs.sgpr_write.group = wb.group; + end + + for (int i = 0; i < SHADER_LANES; ++i) + regs.vgpr_write.data[i] = loop_out.lanes[i]; + + regs.vgpr_write.mask = regs.mask_back; + if (~loop_out_valid | ~loop_out.vgpr_update) + regs.vgpr_write.mask = '0; + end + + always_ff @(posedge clk) begin + // Blocking assignments por bug de verilator (ver for de lanes abajo) + + for (int i = REGFILE_STAGES - 1; i > 0; --i) + loop_hold[i] = loop_hold[i - 1]; + + loop_hold[0].mask = wb.mask; + loop_hold[0].vgpr = wb.dest.vgpr.num; + loop_hold[0].group = 
wb.group; + loop_hold[0].pc_add = wb.pc_add; + loop_hold[0].pc_update = wb.pc_update; + loop_hold[0].mask_update = wb.mask_update; + loop_hold[0].vgpr_update = wb.writeback & ~wb.scalar; + + // https://github.com/verilator/verilator/issues/4804 + for (int i = 0; i < SHADER_LANES; ++i) + loop_hold[0].lanes[i] = wb.lanes[i]; + + if (wb.pc_inc) + loop_hold[0].pc_add = pc_offset'(1); + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + setup_gpr <= 0; + setup_mask <= 0; + setup_submit <= 0; + + setup.set_done.gpr <= 0; + setup.set_done.mask <= 0; + setup.set_done.submit <= 0; + + for (int i = 0; i < $size(loop_valid_hold); ++i) + loop_valid_hold[i] <= 0; + end else begin + setup_gpr <= (setup_gpr & scalar_wb) | setup.write.gpr_set; + setup_mask <= (setup_mask & mask_wb) | setup.write.mask_set; + setup_submit <= (setup_submit & loop_out_valid) | setup.write.pc_set; + + setup.set_done.gpr <= setup_gpr & ~scalar_wb; + setup.set_done.mask <= setup_mask & ~mask_wb; + setup.set_done.submit <= setup_submit & ~loop_out_valid; + + loop_valid_hold[0] <= wb.valid; + for (int i = 1; i < REGFILE_STAGES; ++i) + loop_valid_hold[i] <= loop_valid_hold[i - 1]; + end + +endmodule diff --git a/rtl/gfx/gfx_shader_fpint.sv b/rtl/gfx/gfx_shader_fpint.sv new file mode 100644 index 0000000..a418dcc --- /dev/null +++ b/rtl/gfx/gfx_shader_fpint.sv @@ -0,0 +1,932 @@ +// -> 4,4,4,4,4,4,4,4 -> 8,8,8,8 -> 16,16 -> 32 +localparam int FPINT_CLZ_STAGES = 4; + +localparam bit[$clog2($bits(gfx::float_mant_ext)):0] FPINT_MAX_SHIFT + = 1 << $clog2($bits(gfx::float_mant_ext)); + +typedef logic[$clog2(FPINT_MAX_SHIFT):0] fpint_shift; + +/* Las 15 etapas son: + * - setup + * - mulclass + * - mnorm + * - minmax + * - expdiff + * - shiftr + * - addsub + * - clz0-clz3 + * - shiftl + * - round + * - rnorm + * - encode + */ + +typedef struct +{ + gfx::float a, + b, + a_mul, + b_mul; +} fpint_setup_mulclass; + +typedef struct +{ + gfx::float b; + gfx::float_exp exp; + gfx::float_class a_class, 
+ b_class; + gfx::udword product; + logic sign, + overflow; +} fpint_mulclass_mnorm; + +typedef struct +{ + gfx::float a, + b; + gfx::float_class a_class, + b_class; + logic slow, + zero, + guard, + round, + sticky, + slow_in, + overflow; +} fpint_mnorm_minmax; + +typedef struct +{ + gfx::float max, + min; + gfx::float_class max_class, + min_class; + logic slow, + zero, + guard, + round, + sticky; +} fpint_minmax_expdiff; + +typedef struct +{ + gfx::float max, + min; + gfx::float_class max_class, + min_class; + fpint_shift exp_shift; + logic slow, + zero, + guard, + round, + sticky; +} fpint_expdiff_shiftr; + +typedef struct +{ + gfx::float max, + min; + gfx::float_class max_class, + min_class; + gfx::float_mant_ext max_mant, + min_mant, + sticky_mask; + logic slow, + zero, + guard, + round, + sticky, + int_sign; +} fpint_shiftr_addsub; + +typedef struct +{ + gfx::float max; + gfx::word add_sub; + logic slow, + zero, + guard, + round, + sticky; +} fpint_clz_hold; + +typedef fpint_clz_hold fpint_addsub_clz; + +typedef struct +{ + fpint_clz_hold hold; + fpint_shift shift; +} fpint_clz_shiftl; + +typedef struct +{ + gfx::float val; + logic slow, + zero, + guard, + round, + sticky, + overflow, + sticky_last; +} fpint_shiftl_round; + +typedef struct +{ + gfx::float val; + logic slow, + zero, + exp_step, + overflow; +} fpint_round_rnorm; + +typedef struct +{ + gfx::float val; + logic slow, + zero, + overflow; +} fpint_rnorm_encode; + +module gfx_shader_fpint +import gfx::*; +( + input logic clk, + rst_n, + + input fpint_op op, + input wave_exec wave, + input logic abort, + in_valid, + + gfx_regfile_io.ab read_data, + + gfx_wb.tx wb +); + + localparam int FPINT_STAGES = 7 + FPINT_CLZ_STAGES + 4; + + struct + { + fpint_op op; + wave_exec wave; + } stage[FPINT_STAGES]; + + logic stage_valid[FPINT_STAGES]; + + assign wb.dest = stage[FPINT_STAGES - 1].wave.dest; + assign wb.mask = 'x; + assign wb.group = stage[FPINT_STAGES - 1].wave.group; + assign wb.pc_add = 'x; + assign 
wb.pc_inc = 1; + assign wb.scalar = stage[FPINT_STAGES - 1].wave.dest_scalar; + assign wb.pc_update = wb.writeback; + assign wb.writeback = stage[FPINT_STAGES - 1].op.writeback; + assign wb.mask_update = 0; + + // Ojo: stage_valid[0], pero stage[0] no + assign stage_valid[0] = in_valid; + + genvar lane; + generate + for (lane = 0; lane < SHADER_LANES; ++lane) begin: lanes + gfx_shader_fpint_lane unit + ( + .clk(clk), + .a(read_data.a[lane]), + .b(read_data.b[lane]), + .q(wb.lanes[lane]), + .mul_float_0(op.setup_mul_float), + .unit_b_0(op.setup_unit_b), + .put_hi_2(stage[2 - 1].op.mnorm_put_hi), + .put_lo_2(stage[2 - 1].op.mnorm_put_lo), + .put_mul_2(stage[2 - 1].op.mnorm_put_mul), + .zero_b_2(stage[2 - 1].op.mnorm_zero_b), + .zero_flags_2(stage[2 - 1].op.mnorm_zero_flags), + .abs_3(stage[3 - 1].op.minmax_abs), + .swap_3(stage[3 - 1].op.minmax_swap), + .zero_min_3(stage[3 - 1].op.minmax_zero_min), + .copy_flags_3(stage[3 - 1].op.minmax_copy_flags), + .int_signed_5(stage[5 - 1].op.shiftr_int_signed), + .copy_flags_6(stage[6 - 1].op.addsub_copy_flags), + .int_operand_6(stage[6 - 1].op.addsub_int_operand), + .force_nop_7(stage[7 - 1].op.clz_force_nop), + .copy_flags_11(stage[11 - 1].op.shiftl_copy_flags), + .copy_flags_12(stage[12 - 1].op.round_copy_flags), + .enable_12(stage[12 - 1].op.round_enable), + .enable_14(stage[14 - 1].op.encode_enable) + ); + end + endgenerate + + always_ff @(posedge clk) begin + stage[0].op <= op; + stage[0].wave <= wave; + + for (int i = 1; i < FPINT_STAGES; ++i) + stage[i] <= stage[i - 1]; + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + for (int i = 1; i < FPINT_STAGES; ++i) + stage_valid[i] <= 0; + + wb.valid <= 0; + end else begin + for (int i = 1; i < FPINT_STAGES; ++i) + stage_valid[i] <= stage_valid[i - 1]; + + // Se levanta 1 ciclo luego que in_valid + stage_valid[2] <= stage_valid[1] & ~abort; + + wb.valid <= stage_valid[FPINT_STAGES - 1]; + end + +endmodule + +module gfx_shader_fpint_lane +import gfx::*; 
+( + input logic clk, + + input word a, + b, + + input logic mul_float_0, + unit_b_0, + put_hi_2, + put_lo_2, + put_mul_2, + zero_b_2, + zero_flags_2, + abs_3, + swap_3, + zero_min_3, + copy_flags_3, + int_signed_5, + copy_flags_6, + int_operand_6, + force_nop_7, + copy_flags_11, + copy_flags_12, + enable_12, + enable_14, + + output word q +); + + /* Notas de implementación para floating-point + * + * === PRODUCTO === + * + * Queremos calcular q = a * b. + * + * Donde a = (-1)^s * 1.m * 2^f, + * b = (-1)^t * 1.n * 2^g + * + * Entonces q = (-1)^(s + t) (1.m * 1.n) 2^(f + g) + * + * El producto es entre números >= 1.0 y < 2.0. En el peor caso: + * Mejor caso: 1.000... * 1.000... ~ 1.000... + * Peor caso: 1.999... * 1.999... ~ 3.999... = 2^1 * 1.999 + * + * Así que, si el producto es >= 2, hay que hacerle >> 1 a la mantisa + * y sumarle 1 al exponente para normalizar. + * + * + * === SUMA/RESTA === + * + * Queremos calcular q = a + b. Curiosamente, eso es más complicado que a * b. + * Hay que ajustar el exponente del menor entre a y b para que coincida + * con el del mayor (desnormalizando), realizar la operación y finalmente + * renormalizar. Se hace suma o resta dependiendo de relaciones de signos, + * no según la operación de entrada (eso último solo le hace xor al signo de b). + * Recordar aquí que IEEE 754 es una especie de signo-magnitud y no complemento. + * + * En el caso de una resta, el exponente normalizado puede ser mucho más + * pequeño que cualquiera de los exponentes de entrada. Necesitamos + * entonces de lǵoica CLZ (count leading zeros) para renormalizar. + * + * + * === CONVERSIÓN INTEGER->FP === + * + * Esto simplemente usa el mismo datapath de fadd, con el abs del entero + * como entrada como entrada de clz. El exponente de referencia se fija + * en 30 (aludiendo al segundo msb de un entero de 32 bits). A partir de + * ese punto es idéntico a un fadd, las etapas de clz se encargan de ajustar + * el exponente. 
+ */ + + fpint_setup_mulclass setup_mulclass; + fpint_mulclass_mnorm mulclass_mnorm; + fpint_mnorm_minmax mnorm_minmax; + fpint_minmax_expdiff minmax_expdiff; + fpint_expdiff_shiftr expdiff_shiftr; + fpint_shiftr_addsub shiftr_addsub; + fpint_addsub_clz addsub_clz; + fpint_clz_shiftl clz_shiftl; + fpint_shiftl_round shiftl_round; + fpint_round_rnorm round_rnorm; + fpint_rnorm_encode rnorm_encode; + + gfx_shader_fpint_setup stage_0 + ( + .clk(clk), + .a(a), + .b(b), + .out(setup_mulclass), + .unit_b(unit_b_0), + .mul_float(mul_float_0) + ); + + gfx_shader_fpint_mulclass stage_1 + ( + .clk(clk), + .in(setup_mulclass), + .out(mulclass_mnorm) + ); + + gfx_shader_fpint_mnorm stage_2 + ( + .clk(clk), + .in(mulclass_mnorm), + .out(mnorm_minmax), + .put_hi(put_hi_2), + .put_lo(put_lo_2), + .put_mul(put_mul_2), + .zero_b(zero_b_2), + .zero_flags(zero_flags_2) + ); + + gfx_shader_fpint_minmax stage_3 + ( + .clk(clk), + .in(mnorm_minmax), + .out(minmax_expdiff), + .abs(abs_3), + .swap(swap_3), + .zero_min(zero_min_3), + .copy_flags(copy_flags_3) + ); + + gfx_shader_fpint_expdiff stage_4 + ( + .clk(clk), + .in(minmax_expdiff), + .out(expdiff_shiftr) + ); + + gfx_shader_fpint_shiftr stage_5 + ( + .clk(clk), + .in(expdiff_shiftr), + .out(shiftr_addsub), + .int_signed(int_signed_5) + ); + + gfx_shader_fpint_addsub stage_6 + ( + .clk(clk), + .in(shiftr_addsub), + .out(addsub_clz), + .copy_flags(copy_flags_6), + .int_operand(int_operand_6) + ); + + gfx_shader_fpint_clz stage_7_8_9_10 + ( + .clk(clk), + .in(addsub_clz), + .out(clz_shiftl), + .force_nop(force_nop_7) + ); + + gfx_shader_fpint_shiftl stage_11 + ( + .clk(clk), + .in(clz_shiftl), + .out(shiftl_round), + .copy_flags(copy_flags_11) + ); + + gfx_shader_fpint_round stage_12 + ( + .clk(clk), + .in(shiftl_round), + .out(round_rnorm), + .enable(enable_12), + .copy_flags(copy_flags_12) + ); + + gfx_shader_fpint_rnorm stage_13 + ( + .clk(clk), + .in(round_rnorm), + .out(rnorm_encode) + ); + + gfx_shader_fpint_encode stage_14 + ( 
+ .clk(clk), + .q(q), + .in(rnorm_encode), + .enable(enable_14) + ); + +endmodule + +// Stage 0: argumentos de mul +module gfx_shader_fpint_setup +import gfx::*; +( + input logic clk, + + input word a, + b, + input logic mul_float, + unit_b, + + output fpint_setup_mulclass out +); + + always_ff @(posedge clk) begin + out.a <= a; + out.b <= b; + out.a_mul <= a; + out.b_mul <= b; + + /* Nótese que el orden es sign-exp-mant. Esto coloca el '1.' implícito + * en la posición correcta para multiplicar las mantisas. + */ + if (mul_float) begin + out.a_mul.exp <= 1; + out.b_mul.exp <= 1; + out.a_mul.sign <= 0; + out.b_mul.sign <= 0; + end + + if (unit_b) begin + out.b_mul.exp <= 0; + out.b_mul.mant <= 1; + out.b_mul.sign <= 0; + end + end + +endmodule + +// Stage 1: multiplicación de fp o enteros +module gfx_shader_fpint_mulclass +import gfx::*; +( + input logic clk, + + input fpint_setup_mulclass in, + + output fpint_mulclass_mnorm out +); + + always_ff @(posedge clk) begin + out.b <= in.b; + out.sign <= in.a.sign ^ in.b.sign; + out.a_class <= classify_float(in.a); + out.b_class <= classify_float(in.b); + out.product <= in.a_mul * in.b_mul; + {out.overflow, out.exp} <= {1'b0, in.a.exp} + {1'b0, in.b.exp} - {1'b0, FLOAT_EXP_BIAS}; + end + +endmodule + +// Stage 2: normalización +module gfx_shader_fpint_mnorm +import gfx::*; +( + input logic clk, + + input fpint_mulclass_mnorm in, + input logic put_hi, + put_lo, + put_mul, + zero_b, + zero_flags, + + output fpint_mnorm_minmax out +); + + word product_hi, product_lo; + logic guard, lo_msb, lo_reduce, round, slow_in_next; + float_mant_full hi; + logic[$bits(float_mant_full) - 3:0] lo; + + assign lo_msb = lo[$bits(lo) - 1]; + assign lo_reduce = |lo[$bits(lo) - 2:0]; + assign slow_in_next = is_float_special(in.a_class) | is_float_special(in.b_class); + assign {product_hi, product_lo} = in.product; + assign {hi, guard, round, lo} = in.product[2 * $bits(float_mant_full) - 1:0]; + + always_ff @(posedge clk) begin + if (put_mul) 
begin
            // Exponent overflow only forces the slow path when neither
            // operand has the minimum exponent; otherwise the product is
            // flagged as zero on the next line.
            out.slow <= slow_in_next | (in.overflow & ~in.a_class.exp_min & ~in.b_class.exp_min);
            out.zero <= in.a_class.exp_min | in.b_class.exp_min;
        end else begin
            out.slow <= 0;
            out.zero <= 0;
        end

        out.a.sign <= in.sign;
        out.overflow <= 0;

        if (hi[$bits(hi) - 1]) begin
            // Product is >= 2: keep the top bits and bump the exponent.
            out.guard <= guard;
            out.round <= round;
            out.sticky <= lo_msb | lo_reduce;
            out.a.mant <= implicit_mant(hi);
            {out.overflow, out.a.exp} <= {1'b0, in.exp} + 1;
        end else begin
            /* The bit just below the msb is necessarily 1, because the msb
             * of both multiplicands is 1. See the assert in implicit_mant().
             */
            out.guard <= round;
            out.round <= lo_msb;
            out.sticky <= lo_reduce;

            out.a.exp <= in.exp;
            out.a.mant <= implicit_mant({hi[$bits(hi) - 2:0], guard});
        end

        // put_hi/put_lo replace the float result above with one raw half of
        // the product; put_mul keeps it.
        unique case (1'b1)
            put_mul: ;

            put_hi:
                out.a <= product_hi;

            put_lo:
                out.a <= product_lo;
        endcase

        out.a_class <= in.a_class;
        out.slow_in <= slow_in_next;

        if (zero_flags) begin
            out.a_class <= classify_float(0);
            out.slow_in <= 0;
        end

        if (zero_b) begin
            out.b <= 0;
            out.b_class <= classify_float(0);
        end else begin
            out.b <= in.b;
            out.b_class <= in.b_class;
        end
    end

endmodule

// Stage 3: order the operands so that abs(max) >= abs(min)
module gfx_shader_fpint_minmax
import gfx::*;
(
    input  logic clk,

    input  fpint_mnorm_minmax in,
    input  logic abs,
                 swap,
                 zero_min,
                 copy_flags,

    output fpint_minmax_expdiff out
);

    logic abs_b_gt_abs_a, b_gt_a;

    /* Wikipedia says:
     *
     * A property of the single- and double-precision formats is that
     * their encoding allows one to easily sort them without using
     * floating-point hardware, as if the bits represented sign-magnitude
     * integers, although it is unclear whether this was a design
     * consideration (it seems noteworthy that the earlier IBM hexadecimal
     * floating-point representation also had this property for normalized
     * numbers).
+ */ + assign abs_b_gt_abs_a = {in.b.exp, in.b.mant} > {in.a.exp, in.a.mant}; + + always_comb begin + unique case ({in.b.sign, in.a.sign}) + 2'b00: b_gt_a = abs_b_gt_abs_a; + 2'b01: b_gt_a = 1; + 2'b10: b_gt_a = 0; + 2'b11: b_gt_a = abs_b_gt_abs_a; + endcase + + if (abs) + b_gt_a = abs_b_gt_abs_a; + end + + always_ff @(posedge clk) begin + if (b_gt_a ^ swap) begin + out.max <= in.b; + out.min <= in.a; + out.max_class <= in.b_class; + out.min_class <= in.a_class; + end else begin + out.max <= in.a; + out.min <= in.b; + out.max_class <= in.a_class; + out.min_class <= in.b_class; + end + + if (zero_min) begin + out.min <= 0; + out.min_class <= classify_float(0); + end + + out.guard <= in.guard; + out.round <= in.round; + out.sticky <= in.sticky; + + if (copy_flags) begin + out.slow <= in.slow | in.overflow; + out.zero <= in.zero; + end else begin + out.slow <= in.slow_in; + out.zero <= 0; + end + end + +endmodule + +// Stage 4: exp_shift amount +module gfx_shader_fpint_expdiff +import gfx::*; +( + input logic clk, + + input fpint_minmax_expdiff in, + + output fpint_expdiff_shiftr out +); + + float_exp exp_delta; + + assign exp_delta = in.max.exp - in.min.exp; + + always_ff @(posedge clk) begin + out.max <= in.max; + out.min <= in.min; + out.slow <= in.slow; + out.zero <= in.zero; + out.guard <= in.guard; + out.round <= in.round; + out.sticky <= in.sticky; + out.max_class <= in.max_class; + out.min_class <= in.min_class; + + out.exp_shift <= exp_delta[$bits(out.exp_shift) - 1:0]; + if (exp_delta > {{($bits(exp_delta) - $bits(FPINT_MAX_SHIFT)){1'b0}}, FPINT_MAX_SHIFT}) + out.exp_shift <= FPINT_MAX_SHIFT; + end + +endmodule + +// Stage 5: shifts y abs(max) para enteros con signo +module gfx_shader_fpint_shiftr +import gfx::*; +( + input logic clk, + + input fpint_expdiff_shiftr in, + input logic int_signed, + + output fpint_shiftr_addsub out +); + + always_ff @(posedge clk) begin + out.min <= in.min; + out.slow <= in.slow; + out.zero <= in.zero; + out.guard <= in.guard; 
+ out.round <= in.round; + out.sticky <= in.sticky; + out.min_class <= in.min_class; + + out.max_mant <= float_prepare_round(in.max, in.max_class); + out.min_mant <= float_prepare_round(in.min, in.min_class) >> in.exp_shift; + out.sticky_mask <= {($bits(out.min_mant)){1'b1}} << in.exp_shift; + + out.max <= in.max; + out.int_sign <= in.max[$bits(in.max) - 1]; + + if (int_signed & in.max[$bits(in.max) - 1]) + out.max <= -in.max; + end + +endmodule + +// Stage 6: suma de mantisas +module gfx_shader_fpint_addsub +import gfx::*; +( + input logic clk, + + input fpint_shiftr_addsub in, + input logic copy_flags, + int_operand, + + output fpint_addsub_clz out +); + + localparam int INT_SHIFT_REF = $bits(word) - 2; + + function word fp_add_sub_arg(float_mant_ext arg); + fp_add_sub_arg = {1'b0, arg, {($bits(fp_add_sub_arg) - $bits(arg) - 1){1'b0}}}; + endfunction + + always_ff @(posedge clk) begin + out.max <= in.max; + out.slow <= in.slow; + out.zero <= in.zero; + out.guard <= in.guard; + out.round <= in.round; + + if (int_operand) begin + out.max.exp <= FLOAT_EXP_BIAS + INT_SHIFT_REF[$bits(float_exp) - 1:0]; + out.max.sign <= in.int_sign; + end + + if (copy_flags) + out.sticky <= in.sticky; + else + out.sticky <= |(float_prepare_round(in.min, in.min_class) & ~in.sticky_mask); + + if (int_operand) + out.add_sub <= in.max; + else if (in.max.sign ^ in.min.sign) + out.add_sub <= fp_add_sub_arg(in.max_mant) - fp_add_sub_arg(in.min_mant); + else + out.add_sub <= fp_add_sub_arg(in.max_mant) + fp_add_sub_arg(in.min_mant); + end + +endmodule + +// Stages 7-10: encontrar el 1 más significativo +module gfx_shader_fpint_clz +import gfx::*; +( + input logic clk, + + input fpint_addsub_clz in, + input logic force_nop, + + output fpint_clz_shiftl out +); + + word clz_in; + fpint_clz_hold hold[FPINT_CLZ_STAGES]; + + assign out.hold = hold[FPINT_CLZ_STAGES - 1]; + + gfx_clz #($bits(word)) clz + ( + .clk(clk), + .clz(out.shift), + .value(clz_in) + ); + + always_comb begin + clz_in = 
in.add_sub; + if (force_nop) + clz_in[$bits(clz_in) - 1:$bits(clz_in) - 2] = 2'b01; + end + + always_ff @(posedge clk) begin + hold[0] <= in; + + for (int i = 1; i < FPINT_CLZ_STAGES; ++i) + hold[i] <= hold[i - 1]; + end + +endmodule + +// Stage 11: normalization +module gfx_shader_fpint_shiftl +import gfx::*; +( + input logic clk, + + input fpint_clz_shiftl in, + input logic copy_flags, + + output fpint_shiftl_round out +); + + localparam int CLZ_EXTEND_BITS = $bits(float_exp) - $bits(in.shift) + 1; + + word normalized; + + assign normalized = in.hold.add_sub << in.shift; + + always_ff @(posedge clk) begin + out.slow <= in.hold.slow; + out.zero <= in.hold.zero; + out.sticky <= in.hold.sticky; + out.val.sign <= in.hold.max.sign; + + {out.val.mant, out.guard, out.round, out.sticky_last} <= + normalized[$bits(normalized) - 2:$bits(normalized) - $bits(float_mant) - 4]; + + {out.overflow, out.val.exp} <= + {1'b0, in.hold.max.exp} - {{CLZ_EXTEND_BITS{1'b0}}, in.shift} + 1; + + if (in.shift[$bits(in.shift) - 1]) + out.zero <= 1; + + if (copy_flags) begin + out.guard <= in.hold.guard; + out.round <= in.hold.round; + out.overflow <= 0; + out.sticky_last <= 0; + end + end + +endmodule + +// Stage 12: rounding +module gfx_shader_fpint_round +import gfx::*; +( + input logic clk, + + input fpint_shiftl_round in, + input logic copy_flags, + enable, + + output fpint_round_rnorm out +); + + always_ff @(posedge clk) begin + out.val <= in.val; + out.slow <= in.slow | (~copy_flags & in.overflow & ~in.zero); + out.zero <= in.zero; + out.exp_step <= 0; + + // This is the most common rounding mode: round to nearest, ties to even. + // The increment must use the incoming mantissa (in.val.mant); reading out.val.mant here + // would sample the previous cycle's registered result instead of this cycle's operand. + // A carry out of the mantissa is captured in exp_step for the next stage. + if (enable & in.guard & (in.round | in.sticky | in.sticky_last | in.val.mant[0])) + {out.exp_step, out.val.mant} <= {1'b0, in.val.mant} + 1; + end + +endmodule + +// Stage 13: exponent adjustment after rounding +module gfx_shader_fpint_rnorm +import gfx::*; +( + input logic clk, + + input fpint_round_rnorm in, + + output fpint_rnorm_encode out +); + + 
always_ff @(posedge clk) begin + out.slow <= in.slow; + out.zero <= in.zero; + out.overflow <= 0; + out.val.mant <= in.val.mant; + out.val.sign <= in.val.sign; + + if (in.exp_step) + {out.overflow, out.val.exp} <= {1'b0, in.val.exp} + 1; + else + out.val.exp <= in.val.exp; + end + +endmodule + +// Stage 14: salida y codificación de ceros y NaNs +module gfx_shader_fpint_encode +import gfx::*; +( + input logic clk, + + input fpint_rnorm_encode in, + input logic enable, + + output float q +); + + always_ff @(posedge clk) begin + q <= in.val; + + if (enable) begin + if (&in.val.exp | in.slow | in.overflow) begin + q.exp <= FLOAT_EXP_MAX; + q.mant <= 1; + end else if (in.zero) begin + q.exp <= 0; + q.mant <= 0; + end + end + end + +endmodule diff --git a/rtl/gfx/gfx_shader_front.sv b/rtl/gfx/gfx_shader_front.sv new file mode 100644 index 0000000..52074fd --- /dev/null +++ b/rtl/gfx/gfx_shader_front.sv @@ -0,0 +1,746 @@ +typedef struct +{ + logic valid, + retry; + gfx::group_id group; + gfx_isa::insn_word insn; +} front_wave; + +typedef struct +{ + gfx::xgpr_num dest; + logic dest_scalar; +} front_reg_passthru; + +typedef logic[4:0] icache_line_num; + +typedef logic[$bits(gfx::oword_ptr) - $bits(icache_line_num) - 1:0] icache_tag; + +typedef struct packed +{ + icache_tag tag; + icache_line_num line; +} icache_line_tag; + +typedef struct packed +{ + icache_line_tag line_tag; + logic[2:0] word_num; +} icache_ptr; + +module gfx_shader_front +import gfx::*; +( + input logic clk, + rst_n, + + gfx_axib.m fetch_mem, + + input logic icache_flush, + + gfx_regfile_io.read reg_read, + gfx_regfile_io.bind_ reg_bind, + + gfx_front_back.front front +); + + word fetch_insn, port_insn; + logic fetch_hit, p0_writeback; + front_wave bind_wave, dec_wave, port_dec_wave; + front_reg_passthru reg_passthru; + + assign front.execute.wave.dest = reg_passthru.dest; + assign front.execute.wave.dest_scalar = reg_passthru.dest_scalar; + + gfx_shader_bind bind_ + ( + .clk, + .rst_n, + 
.mem(fetch_mem), + .wave(bind_wave), + .regs(reg_bind), + .loop_valid(front.loop.valid), + .loop_group(front.loop.group), + .icache_flush + ); + + gfx_shader_read_regs reg_dec + ( + .clk, + .rst_n, + .in(bind_wave), + .out(dec_wave), + .read(reg_read), + .passthru(reg_passthru) + ); + + gfx_shader_decode_class class_dec + ( + .clk, + .rst_n, + .wave(dec_wave), + .out_group(front.execute.wave.group), + .port_wave(port_dec_wave), + .dispatch(front.dispatch), + .p0_writeback + ); + + gfx_shader_decode_fpint p0_dec + ( + .clk, + .op(front.execute.p0), + .insn(port_dec_wave.insn), + .writeback(p0_writeback) + ); + +endmodule + +module gfx_shader_bind +import gfx::*; +( + input logic clk, + rst_n, + + gfx_axib.m mem, + + input logic icache_flush, + + input logic loop_valid, + input group_id loop_group, + + gfx_regfile_io.bind_ regs, + + output front_wave wave +); + + localparam int ICACHE_STAGES = 6; + localparam int BIND_STAGES = REGFILE_STAGES + ICACHE_STAGES; + + gfx_beats #($bits(group_id)) runnable_in(), runnable_out(); + + logic ar_stall, request_ready, request_valid, valids[BIND_STAGES]; + group_id groups[BIND_STAGES]; + icache_line_tag araddr, request_addr; + + assign mem.bready = 0; + assign mem.wvalid = 0; + assign mem.awvalid = 0; + + assign mem.arlen = ($bits(mem.arlen))'($bits(oword) / $bits(word) - 1); + assign mem.araddr = {araddr, ($clog2($bits(oword)) - $clog2($bits(word)) + SUBWORD_BITS)'('0)}; + assign mem.arburst = 2'b01; // Incremental mode + + assign runnable_in.tx.data = loop_group; + assign runnable_in.tx.valid = loop_valid; + + assign regs.pc_front_group = runnable_out.rx.data; + assign runnable_out.rx.ready = 1; + + assign wave.group = groups[$size(groups) - 1]; + + gfx_skid_buf #($bits(araddr)) ar_skid + ( + .clk, + .in(request_addr), + .out(araddr), + .stall(ar_stall) + ); + + gfx_skid_flow ar_flow + ( + .clk, + .rst_n, + .stall(ar_stall), + .in_ready(request_ready), + .in_valid(request_valid), + .out_ready(mem.arready), + 
.out_valid(mem.arvalid) + ); + + //TODO: Podríamos quitar ~25 entries sin afectar throughput, latencia o correctitud + gfx_fifo #(.WIDTH($bits(group_id)), .DEPTH(1 << $bits(group_id))) runnable + ( + .clk, + .rst_n, + .in(runnable_in.rx), + .out(runnable_out.tx) + ); + + gfx_shader_bind_icache icache + ( + .clk, + .rst_n, + + .icache_flush, + .read_addr(regs.pc_front), + .read_valid(valids[REGFILE_STAGES - 1]), + + .request_addr, + .request_valid, + .request_ready, + + .fetch_data(mem.rdata), + .fetch_last(mem.rlast), + .fetch_valid(mem.rvalid), + .fetch_ready(mem.rready), + + .insn(wave.insn), + .insn_retry(wave.retry), + .insn_valid(wave.valid) + ); + + always_ff @(posedge clk) begin + groups[0] <= runnable_out.rx.data; + for (int i = 1; i < $size(groups); ++i) + groups[i] <= groups[i - 1]; + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) + for (int i = 0; i < $size(valids); ++i) + valids[i] <= 0; + else begin + valids[0] <= runnable_out.rx.valid; + for (int i = 1; i < $size(valids); ++i) + valids[i] <= valids[i - 1]; + end + +endmodule + +module gfx_shader_bind_icache +import gfx::*; +( + input logic clk, + rst_n, + + input logic icache_flush, + + input logic read_valid, + input icache_ptr read_addr, + + input logic fetch_last, + fetch_valid, + input word fetch_data, + output logic fetch_ready, + + input logic request_ready, + output logic request_valid, + output icache_line_tag request_addr, + + output logic insn_valid, + insn_retry, + output word insn +); + + // Dan Gisselquist limita a (1 << 3) bursts por defecto. 
+ // Ver LGMAXBURST en axixbar.v + localparam int PENDING_FIFO_DEPTH = 8; + + enum int unsigned + { + FLUSH, + RUN + } state; + + struct + { + logic valid, + accessed, + hit; + icache_tag tag; + oword data; + } cache[1 << $bits(icache_line_num)], read, read_hold; + + gfx_beats #($bits(icache_line_tag)) pending_in(), pending_out(); + + logic accessed_write, accessed_write_enable, burst, fetch_done, hit_write, + in_flush, hit_commit, hit_write_enable, retry_4, retry_5, rollback, + tag_hit, valid_1, valid_2, valid_3, valid_4, valid_5, valid_write, + valid_write_enable; + + icache_ptr read_addr_1, read_addr_2, read_addr_3, read_addr_4, read_addr_5; + icache_tag tag_write; + icache_line_num accessed_write_line, flush_ptr, hit_write_line, valid_write_line; + icache_line_tag pending_pop; + + oword data_write; + word[1:0] data_5; + word[7:0] fetch_shift; + qword[1:0] data_3; + udword[1:0] data_4; + + assign data_3 = read.data; + assign tag_hit = read.tag == read_addr_3.line_tag.tag; + assign fetch_ready = ~fetch_done; + assign pending_pop = pending_out.rx.data; + + assign request_addr = read_addr_4.line_tag; + assign request_valid = burst & pending_in.tx.ready; + assign pending_in.tx.data = read_addr_4.line_tag; + assign pending_in.tx.valid = burst & request_ready; + assign pending_out.rx.ready = fetch_done & ~hit_commit & ~rollback; + + gfx_fifo #(.WIDTH($bits(icache_line_tag)), .DEPTH(PENDING_FIFO_DEPTH)) pending + ( + .clk, + .rst_n, + .in(pending_in.rx), + .out(pending_out.tx) + ); + + always_comb + unique case (state) + FLUSH: in_flush = 1; + RUN: in_flush = 0; + endcase + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + state <= FLUSH; + flush_ptr <= '0; + fetch_done <= 0; + + valid_1 <= 0; + valid_2 <= 0; + valid_3 <= 0; + valid_4 <= 0; + valid_5 <= 0; + + burst <= 0; + end else begin + unique case (state) + FLUSH: + if (~icache_flush & &flush_ptr) + state <= RUN; + + RUN: + if (icache_flush) + state <= FLUSH; + endcase + + flush_ptr <= flush_ptr + 
1; + if (icache_flush) + flush_ptr <= '0; + + if (fetch_done) + fetch_done <= hit_commit | ~pending_out.rx.valid | rollback; + else if (fetch_ready & fetch_valid) + fetch_done <= fetch_last; + + valid_1 <= read_valid; + valid_2 <= valid_1; + valid_3 <= valid_2; + valid_4 <= valid_3; + valid_5 <= valid_4; + + burst <= valid_3 & ~tag_hit & ~read.accessed & (~read.valid | read.hit); + end + + always_ff @(posedge clk) begin + tag_write <= pending_pop.tag; + data_write <= fetch_shift; + + valid_write <= 1; + valid_write_line <= pending_pop.line; + valid_write_enable <= fetch_done & ~hit_commit & pending_out.rx.valid & ~rollback; + + accessed_write <= 0; + accessed_write_enable <= 1; + + if (rollback) + accessed_write_line <= read_addr_5.line_tag.line; + else if (fetch_done & ~hit_commit & pending_out.rx.valid) + accessed_write_line <= pending_pop.line; + else begin + accessed_write <= 1; + accessed_write_line <= read_addr.line_tag.line; + accessed_write_enable <= read_valid; + end + + hit_write <= hit_commit; + if (hit_commit) begin + hit_write_line <= read_addr_4.line_tag.line; + hit_write_enable <= 1; + end else begin + hit_write_line <= pending_pop.line; + hit_write_enable <= fetch_done & pending_out.rx.valid & ~rollback; + end + + if (in_flush) begin + valid_write <= 0; + valid_write_line <= flush_ptr; + valid_write_enable <= 1; + + accessed_write <= 0; + accessed_write_line <= flush_ptr; + accessed_write_enable <= 1; + + hit_write <= 0; + hit_write_line <= flush_ptr; + hit_write_enable <= 1; + end + + if (valid_write_enable) begin + cache[valid_write_line].tag <= tag_write; + cache[valid_write_line].data <= data_write; + cache[valid_write_line].valid <= valid_write; + end + + if (accessed_write_enable) + cache[accessed_write_line].accessed <= accessed_write; + + if (hit_write_enable) + cache[hit_write_line].hit <= hit_write; + + read_addr_1 <= read_addr; + + read_hold <= cache[read_addr_1.line_tag.line]; + read_addr_2 <= read_addr_1; + + read <= read_hold; + 
read_addr_3 <= read_addr_2; + + data_4 <= data_3[read_addr_3.word_num[2]]; + retry_4 <= ~tag_hit | ~read.valid; + hit_commit <= valid_3 & tag_hit & read.valid; + read_addr_4 <= read_addr_3; + + data_5 <= data_4[read_addr_4.word_num[1]]; + retry_5 <= retry_4; + rollback <= burst & (~request_valid | ~pending_in.tx.valid); + read_addr_5 <= read_addr_4; + + insn <= data_5[read_addr_5.word_num[0]]; + insn_retry <= retry_5; + insn_valid <= valid_5; + + if (fetch_ready & fetch_valid) begin + fetch_shift[0] <= fetch_data; + for (int i = 1; i < $size(fetch_shift); ++i) + fetch_shift[i] <= fetch_shift[i - 1]; + end + end + +endmodule + +module gfx_shader_read_regs +import gfx::*; +import gfx_isa::*; +( + input logic clk, + rst_n, + + input front_wave in, + + gfx_regfile_io.read read, + + output front_wave out, + output front_reg_passthru passthru +); + + // + 1 por next-cycle de read.op + localparam int PASSTHRU_DEPTH = REG_READ_STAGES + 1 - 2; + localparam int HOLD_DEPTH = PASSTHRU_DEPTH - 2; + + logic reg_rev; + logic valid[HOLD_DEPTH]; + front_wave out_hold[HOLD_DEPTH]; + front_reg_passthru passthru_hold[PASSTHRU_DEPTH]; + + assign passthru = passthru_hold[$size(passthru_hold) - 1]; + + assign reg_rev = in.insn.reg_rev; + + always_comb begin + out = out_hold[$size(out_hold) - 1]; + out.valid = valid[$size(valid) - 1]; + end + + always_ff @(posedge clk) begin + out_hold[0] <= in; + for (int i = 1; i < $size(out_hold); ++i) + out_hold[i] <= out_hold[i - 1]; + + passthru_hold[0].dest <= in.insn.dst_src.rr.rd; + unique case (in.insn.reg_mode) + REGS_SVS, REGS_SSS: + passthru_hold[0].dest_scalar <= 1; + + REGS_VVS, REGS_VVV: + passthru_hold[0].dest_scalar <= 0; + endcase + + for (int i = 1; i < $size(passthru_hold); ++i) + passthru_hold[i] <= passthru_hold[i - 1]; + + read.op.group <= in.group; + + read.op.b_imm <= in.insn.dst_src.rr.b.imm; + read.op.a_sgpr <= in.insn.dst_src.rr.ra.sgpr; + read.op.b_sgpr <= in.insn.dst_src.rr.b.read.r.sgpr; + read.op.a_vgpr <= 
in.insn.dst_src.rr.ra.vgpr.num; + read.op.b_vgpr <= in.insn.dst_src.rr.b.read.r.vgpr.num; + read.op.b_is_imm <= in.insn.dst_src.rr.b_is_imm; + read.op.b_is_const <= in.insn.dst_src.rr.b.read.from_consts; + read.op.scalar_rev <= reg_rev; + + unique case (in.insn.reg_mode) + REGS_SVS, REGS_VVS: begin + read.op.a_scalar <= reg_rev; + read.op.b_scalar <= ~reg_rev; + end + + REGS_SSS: begin + read.op.a_scalar <= 1; + read.op.b_scalar <= 1; + end + + REGS_VVV: begin + read.op.a_scalar <= 0; + read.op.b_scalar <= 0; + end + endcase + end + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) + for (int i = 0; i < HOLD_DEPTH; ++i) + valid[i] <= 0; + else begin + valid[0] <= in.valid; + + for (int i = 1; i < HOLD_DEPTH; ++i) + valid[i] <= valid[i - 1]; + end + +endmodule + +module gfx_shader_decode_class +import gfx::*; +import gfx_isa::*; +( + input logic clk, + rst_n, + + input front_wave wave, + output front_wave port_wave, + output group_id out_group, + + output shader_dispatch dispatch, + output logic p0_writeback +); + + logic is_fsu, is_mem, is_group, hold_valid, retry; + front_wave hold_wave; + + assign p0_writeback = ~(is_mem | is_fsu | is_group | retry); + + always_comb begin + port_wave = hold_wave; + port_wave.valid = hold_valid; + end + + always_ff @(posedge clk) begin + hold_wave <= wave; + out_group <= port_wave.group; + end + + always_ff @(posedge clk or negedge rst_n) + // Intencionalmente repetitivo + if (~rst_n) begin + is_fsu <= 0; + is_mem <= 0; + is_group <= 0; + + retry <= 0; + hold_valid <= 0; + + dispatch <= '0; + end else begin + is_fsu <= 0; + is_mem <= 0; + is_group <= 0; + + retry <= wave.retry; + hold_valid <= wave.valid; + + unique case (wave.insn.insn_class) + INSN_FPINT: ; // p0 no tiene ready + INSN_MEM: is_mem <= 1; + INSN_SFU: is_fsu <= 1; + INSN_GROUP: is_group <= 1; + + default: + {is_mem, is_fsu, is_group} <= 'x; + endcase + + dispatch.p1 <= is_mem; + dispatch.p2 <= is_fsu; + dispatch.p3 <= is_group; + + if (~hold_valid | retry) 
begin + dispatch.p1 <= 0; + dispatch.p2 <= 0; + dispatch.p3 <= 0; + end + + dispatch.valid <= hold_valid; + end + +endmodule + +module gfx_shader_decode_fpint +import gfx::*; +import gfx_isa::*; +( + input logic clk, + + input insn_word insn, + input logic writeback, + + output fpint_op op +); + + always_ff @(posedge clk) begin + unique case (insn.by_class.fpint.op) + INSN_FPINT_MOV: begin + op.setup_mul_float <= 0; + op.setup_unit_b <= 1; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 1; + op.mnorm_zero_b <= 1; + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 1; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 1; + op.shiftl_copy_flags <= 1; + op.round_copy_flags <= 1; + op.round_enable <= 1; + op.encode_enable <= 1; + end + + INSN_FPINT_FMUL: begin + op.setup_mul_float <= 1; + op.setup_unit_b <= 0; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 0; + op.mnorm_put_mul <= 1; + op.mnorm_zero_flags <= 0; + op.mnorm_zero_b <= 1; + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 1; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 1; + op.shiftl_copy_flags <= 1; + op.round_copy_flags <= 1; + op.round_enable <= 1; + op.encode_enable <= 1; + end + + INSN_FPINT_IMUL: begin + op.setup_mul_float <= 0; + op.setup_unit_b <= 0; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 1; + op.mnorm_zero_b <= 1; + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 1; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 1; + op.shiftl_copy_flags <= 1; + op.round_copy_flags <= 1; + op.round_enable <= 0; + op.encode_enable <= 0; + end + + INSN_FPINT_FADD: begin + 
op.setup_mul_float <= 0; + op.setup_unit_b <= 1; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 0; + op.mnorm_zero_b <= 0; + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 0; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 0; + op.clz_force_nop <= 0; + op.shiftl_copy_flags <= 0; + op.round_copy_flags <= 0; + op.round_enable <= 1; + op.encode_enable <= 1; + end + + INSN_FPINT_FMAX, INSN_FPINT_FMIN: begin + op.setup_mul_float <= 0; + op.setup_unit_b <= 1; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 0; + op.mnorm_zero_b <= 0; + op.minmax_abs <= 0; + op.minmax_swap <= insn.by_class.fpint.op == INSN_FPINT_FMIN; + op.minmax_zero_min <= 1; + op.minmax_copy_flags <= 1; + op.shiftr_int_signed <= 0; + op.addsub_int_operand <= 0; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 1; + op.shiftl_copy_flags <= 1; + op.round_copy_flags <= 1; + op.round_enable <= 0; + op.encode_enable <= 0; + end + + INSN_FPINT_FCVT: begin + op.setup_mul_float <= 0; + op.setup_unit_b <= 1; + op.mnorm_put_hi <= 0; + op.mnorm_put_lo <= 1; + op.mnorm_put_mul <= 0; + op.mnorm_zero_flags <= 1; + op.mnorm_zero_b <= 1; + + op.minmax_abs <= 1; + op.minmax_swap <= 0; + op.minmax_zero_min <= 0; + op.minmax_copy_flags <= 0; + op.shiftr_int_signed <= 1; + op.addsub_int_operand <= 1; + op.addsub_copy_flags <= 1; + op.clz_force_nop <= 0; + op.shiftl_copy_flags <= 0; + op.round_copy_flags <= 0; + op.round_enable <= 1; + op.encode_enable <= 1; + end + + default: + op <= 'x; + endcase + + op.writeback <= writeback; + end + +endmodule diff --git a/rtl/gfx/gfx_shader_group.sv b/rtl/gfx/gfx_shader_group.sv new file mode 100644 index 0000000..e668877 --- /dev/null +++ b/rtl/gfx/gfx_shader_group.sv @@ -0,0 +1,17 @@ +module gfx_shader_group +import gfx::*; +( + input logic clk, + rst_n, + + input group_op op, + input wave_exec 
wave, + + gfx_regfile_io.ab read_data, + + gfx_shake.rx in_shake, + + gfx_wb.tx wb +); + +endmodule diff --git a/rtl/gfx/gfx_shader_mem.sv b/rtl/gfx/gfx_shader_mem.sv new file mode 100644 index 0000000..403c9e4 --- /dev/null +++ b/rtl/gfx/gfx_shader_mem.sv @@ -0,0 +1,17 @@ +module gfx_shader_mem +import gfx::*; +( + input logic clk, + rst_n, + + input mem_op op, + input wave_exec wave, + + gfx_regfile_io.ab read_data, + + gfx_shake.rx in_shake, + + gfx_wb.tx wb +); + +endmodule diff --git a/rtl/gfx/gfx_shader_regs.sv b/rtl/gfx/gfx_shader_regs.sv new file mode 100644 index 0000000..ef3a129 --- /dev/null +++ b/rtl/gfx/gfx_shader_regs.sv @@ -0,0 +1,302 @@ +module gfx_shader_regs +import gfx::*; +( + input logic clk, + + gfx_regfile_io.regs io +); + + // verilator tracing_off + + localparam PC_TABLE_PORTS = 2; + localparam MASK_TABLE_PORTS = 1; + + word hold_imm[REGFILE_STAGES], imm_out, read_a_data_sgpr, read_b_data_scalar, + read_b_data_sgpr, read_const, read_a_data_vgpr[SHADER_LANES], + read_b_data_vgpr[SHADER_LANES], sgpr_out_a, sgpr_out_b; + + group_id mask_read_groups[MASK_TABLE_PORTS], pc_read_groups[PC_TABLE_PORTS]; + word_ptr pc_read[PC_TABLE_PORTS]; + lane_mask mask_read[MASK_TABLE_PORTS]; + + logic a_scalar_out, b_is_const_out, b_is_imm_out, b_scalar_out, scalar_rev_out; + group_id hold_read_group_1, hold_read_group_2; + sgpr_num hold_read_a_sgpr; + vgpr_num hold_read_a_vgpr_1, hold_read_a_vgpr_2, hold_read_b_vgpr_1, hold_read_b_vgpr_2; + logic[REGFILE_STAGES - 1:0] hold_b_is_imm, hold_b_is_const; + logic[REGFILE_STAGES + 1 - 1:0] hold_scalar_rev; + logic[REGFILE_STAGES + 2 - 1:0] hold_a_scalar, hold_b_scalar; + + // PC table: read port 0 serves the back end, read port 1 serves the front end + assign io.pc_back = pc_read[0]; + assign io.pc_front = pc_read[1]; + assign pc_read_groups[0] = io.pc_back_group; + assign pc_read_groups[1] = io.pc_front_group; + + // Mask table: its single read port is indexed by mask_read_groups[0]. + // (Previously this drove pc_read_groups[0] a second time, leaving the mask index undriven.) + assign io.mask_back = mask_read[0]; + assign mask_read_groups[0] = io.mask_back_group; + + assign imm_out = hold_imm[$size(hold_imm) - 1]; + assign a_scalar_out = 
hold_a_scalar[$bits(hold_a_scalar) - 1]; + assign b_scalar_out = hold_b_scalar[$bits(hold_b_scalar) - 1]; + assign b_is_imm_out = hold_b_is_imm[$bits(hold_b_is_imm) - 1]; + assign b_is_const_out = hold_b_is_const[$bits(hold_b_is_const) - 1]; + assign scalar_rev_out = hold_scalar_rev[$bits(hold_scalar_rev) - 1]; + + gfx_shader_table #(.DATA_WIDTH($bits(word_ptr)), .READ_PORTS(PC_TABLE_PORTS)) pc_table + ( + .clk, + .read(pc_read), + .write(io.pc_wb), + .read_groups(pc_read_groups), + .write_group(io.pc_wb_group), + .write_enable(io.pc_wb_write) + ); + + gfx_shader_table #(.DATA_WIDTH($bits(lane_mask)), .READ_PORTS(MASK_TABLE_PORTS)) mask_table + ( + .clk, + .read(mask_read), + .write(io.mask_wb), + .read_groups(mask_read_groups), + .write_group(io.mask_wb_group), + .write_enable(io.mask_wb_write) + ); + + gfx_shader_consts consts + ( + .clk, + .num(io.op.b_sgpr), + .value(read_const) + ); + + gfx_shader_regfile #($bits(group_id) + $bits(sgpr_num)) sgprs + ( + .clk, + + .read_a_num({hold_read_group_1, hold_read_a_sgpr}), + .read_b_num({io.op.group, io.op.b_sgpr}), + .read_a_data(read_a_data_sgpr), + .read_b_data(read_b_data_sgpr), + + .write(io.sgpr_write.write), + .write_num({io.sgpr_write.group, io.sgpr_write.sgpr}), + .write_data(io.sgpr_write.data) + ); + + generate + for (genvar i = 0; i < SHADER_LANES; ++i) begin: vgprs + gfx_shader_regfile #($bits(group_id) + $bits(vgpr_num)) vgprs + ( + .clk, + + .read_a_num({hold_read_group_2, hold_read_a_vgpr_2}), + .read_b_num({hold_read_group_2, hold_read_b_vgpr_2}), + .read_a_data(read_a_data_vgpr[i]), + .read_b_data(read_b_data_vgpr[i]), + + .write(io.vgpr_write.mask[i]), + .write_num({io.vgpr_write.group, io.vgpr_write.vgpr}), + .write_data(io.vgpr_write.data[i]) + ); + end + endgenerate + + always_ff @(posedge clk) begin + hold_imm[0] <= {{($bits(word) - $bits(io.op.b_imm)){1'b0}}, io.op.b_imm}; + hold_a_scalar[0] <= io.op.a_scalar; + hold_b_scalar[0] <= io.op.b_scalar; + hold_b_is_imm[0] <= io.op.b_is_imm; + 
hold_b_is_const[0] <= io.op.b_is_const; + hold_scalar_rev[0] <= io.op.scalar_rev; + + for (int i = 1; i < REGFILE_STAGES; ++i) begin + hold_imm[i] <= hold_imm[i - 1]; + hold_a_scalar[i] <= hold_a_scalar[i - 1]; + hold_b_scalar[i] <= hold_b_scalar[i - 1]; + hold_b_is_imm[i] <= hold_b_is_imm[i - 1]; + hold_b_is_const[i] <= hold_b_is_const[i - 1]; + hold_scalar_rev[i] <= hold_scalar_rev[i - 1]; + end + + for (int i = REGFILE_STAGES; i < REGFILE_STAGES + 2; ++i) begin + hold_a_scalar[i] <= hold_a_scalar[i - 1]; + hold_b_scalar[i] <= hold_b_scalar[i - 1]; + end + + hold_scalar_rev[REGFILE_STAGES] <= hold_scalar_rev[REGFILE_STAGES - 1]; + + hold_read_a_sgpr <= io.op.a_sgpr; + hold_read_group_1 <= io.op.group; + hold_read_group_2 <= hold_read_group_1; + + hold_read_a_vgpr_1 <= io.op.a_vgpr; + hold_read_a_vgpr_2 <= hold_read_a_vgpr_1; + + hold_read_b_vgpr_1 <= io.op.b_vgpr; + hold_read_b_vgpr_2 <= hold_read_b_vgpr_1; + + if (b_is_imm_out) + read_b_data_scalar <= imm_out; + else if (b_is_const_out) + read_b_data_scalar <= read_const; + else + read_b_data_scalar <= read_b_data_sgpr; + + if (scalar_rev_out) begin + sgpr_out_a <= read_b_data_scalar; + sgpr_out_b <= read_a_data_sgpr; + end else begin + sgpr_out_a <= read_a_data_sgpr; + sgpr_out_b <= read_b_data_scalar; + end + + for (int i = 0; i < SHADER_LANES; ++i) begin + io.a[i] <= a_scalar_out ? sgpr_out_a : read_a_data_vgpr[i]; + io.b[i] <= b_scalar_out ? 
sgpr_out_b : read_b_data_vgpr[i]; // vector operand B comes from the B read port, not the A port + end + end + +endmodule + +module gfx_shader_consts +import gfx::*; +( + input logic clk, + + input sgpr_num num, + output word value +); + + word hold_out, rom[1 << $bits(sgpr_num)]; + sgpr_num hold_in; + + always_ff @(posedge clk) begin + value <= hold_out; + hold_in <= num; + hold_out <= rom[hold_in]; + end + + initial begin + rom[0] = 'hffff_ffff; // -1 + rom[1] = 'h7fff_ffff; // 2^31 - 1, useful for fp abs + rom[2] = 'h8000_0000; // 2^31, useful for fp neg + rom[3] = 'h3f80_0000; // +1.0 + rom[4] = 'hbf80_0000; // -1.0 + end + +endmodule + +module gfx_shader_regfile +import gfx::*; +#(int DEPTH_LOG = 0) +( + input logic clk, + + input logic[DEPTH_LOG - 1:0] read_a_num, + read_b_num, + output word read_a_data, + read_b_data, + + input logic write, + input logic[DEPTH_LOG - 1:0] write_num, + input word write_data +); + + gfx_shader_regfile_port #(DEPTH_LOG) a + ( + .clk, + .write, + .read_num(read_a_num), + .read_data(read_a_data), + .write_num, + .write_data + ); + + gfx_shader_regfile_port #(DEPTH_LOG) b + ( + .clk, + .write, + .read_num(read_b_num), + .read_data(read_b_data), + .write_num, + .write_data + ); + +endmodule + +module gfx_shader_regfile_port +import gfx::*; +#(int DEPTH_LOG = 0) +( + input logic clk, + + input logic[DEPTH_LOG - 1:0] read_num, + output word read_data, + + input logic write, + input logic[DEPTH_LOG - 1:0] write_num, + input word write_data +); + + word file[1 << DEPTH_LOG], hold_read_data, hold_write_data; + logic hold_write; + logic[DEPTH_LOG - 1:0] hold_read_num, hold_write_num; + + // hold_write does not need rst_n because any initial write is harmless + + always_ff @(posedge clk) begin + hold_write <= write; + hold_read_num <= read_num; + hold_write_num <= write_num; + hold_write_data <= write_data; + + hold_read_data <= file[hold_read_num]; + if (hold_write) + file[hold_write_num] <= hold_write_data; + + read_data <= hold_read_data; + end + +endmodule + +module gfx_shader_table 
+import gfx::*; +#(int DATA_WIDTH = 0, + int READ_PORTS = 0) +( + input logic clk, + + input group_id write_group, + read_groups[READ_PORTS], + + input logic[DATA_WIDTH - 1:0] write, + input logic write_enable, + + output logic[DATA_WIDTH - 1:0] read[READ_PORTS] +); + + genvar i; + + generate + for (i = 0; i < READ_PORTS; ++i) begin: ports + logic write_enable_hold; + group_id read_group_hold, write_group_hold; + logic[DATA_WIDTH - 1:0] data[1 << $bits(group_id)], read_hold, write_hold; + + always_ff @(posedge clk) begin + write_hold <= write; + read_group_hold <= read_groups[i]; + write_group_hold <= write_group; + write_enable_hold <= write_enable; + + read_hold <= data[read_group_hold]; + + if (write_enable_hold) + data[write_group_hold] <= write_hold; + + read[i] <= read_hold; + end + end + endgenerate + +endmodule diff --git a/rtl/gfx/gfx_shader_schedif.rdl b/rtl/gfx/gfx_shader_schedif.rdl new file mode 100644 index 0000000..c846da9 --- /dev/null +++ b/rtl/gfx/gfx_shader_schedif.rdl @@ -0,0 +1,91 @@ +addrmap gfx_shader_schedif { + name = "Scheduler<->core interface"; + + default hw = r; + default sw = w; + default regwidth = 32; + + reg { + name = "Shader core control register"; + + field { + desc = "Set this field to flush the instruction cache"; + + singlepulse; + } IFLUSH[0:0] = 0; + } CORE @ 0x00; + + reg { + name = "Wavefront setup control register"; + + default hw = na; + default sw = r; + default precedence = hw; + + field { + desc = "Wavefront group number"; + + hw = r; + sw = rw; + } GROUP[5:0]; + + field { + desc = "Destination SGPR number"; + + hw = r; + sw = rw; + } XGPR[11:8]; + + field { + desc = "PC table update done, group submitted"; + + rclr; + hwset; + } SUBMIT_DONE[16:16] = 0; + + field { + desc = "General-purpose register update done"; + + rclr; + hwset; + } GPR_DONE[17:17] = 0; + + field { + desc = "Lane mask update done"; + + rclr; + hwset; + } MASK_DONE[18:18] = 0; + } SETUP_CTRL @ 0x04; + + reg { + name = "SGPR/VGPR write register"; + 
+ field { /* swmod (used by VALUE, MASK and PC in the registers below) requests a hardware-visible strobe that asserts when software modifies the field */ + desc = "Value to write"; + + swmod; + } VALUE[31:0]; + } SETUP_GPR @ 0x08; + + reg { + name = "Lane mask write register"; + + field { + desc = "Mask value to write"; + + swmod; + } MASK[15:0]; + } SETUP_MASK @ 0x0c; + + reg { + name = "Group submit register"; + + field { + desc = "Initial group program counter, submits group on write"; + + swmod; + } PC[31:2]; + } SETUP_SUBMIT @ 0x10; +}; + diff --git a/rtl/gfx/gfx_shader_setup.sv b/rtl/gfx/gfx_shader_setup.sv new file mode 100644 index 0000000..f46fb66 --- /dev/null +++ b/rtl/gfx/gfx_shader_setup.sv @@ -0,0 +1,37 @@ +interface gfx_shader_setup +import gfx::*;; /* Two struct-typed signals, write and set_done, with one modport per side. The doubled ';' is intentional: the first ends the package import, the second ends the interface header (there is no port list). */ + + struct + { + group_id group; + word_ptr pc; + xgpr_num gpr; + word gpr_value; + lane_mask mask; + logic pc_set, + gpr_set, + mask_set; + } write; + + struct + { + logic gpr, + mask, + submit; + } set_done; + /* core reads write and drives set_done; sched is the exact reverse. */ + modport core + ( + input write, + + output set_done + ); + + modport sched + ( + input set_done, + + output write + ); + +endinterface diff --git a/rtl/gfx/gfx_shader_sfu.sv b/rtl/gfx/gfx_shader_sfu.sv new file mode 100644 index 0000000..d65e522 --- /dev/null +++ b/rtl/gfx/gfx_shader_sfu.sv @@ -0,0 +1,17 @@ +module gfx_shader_sfu +import gfx::*; +( + input logic clk, + rst_n, + + input sfu_op op, + input wave_exec wave, + + gfx_regfile_io.ab read_data, + + gfx_shake.rx in_shake, + + gfx_wb.tx wb +); + /* The module body is empty: only the port list is declared, so no output is driven here. */ +endmodule diff --git a/rtl/gfx/gfx_shake.sv b/rtl/gfx/gfx_shake.sv new file mode 100644 index 0000000..baae0c3 --- /dev/null +++ b/rtl/gfx/gfx_shake.sv @@ -0,0 +1,24 @@ +interface gfx_shake; + /* valid/ready signal pair: tx drives valid and samples ready, rx drives ready and samples valid, peek only observes both. */ + logic ready; + logic valid; + + modport tx + ( + input ready, + output valid + ); + + modport rx + ( + input valid, + output ready + ); + + modport peek + ( + input ready, + valid + ); + +endinterface diff --git a/rtl/gfx/gfx_sim_debug.sv b/rtl/gfx/gfx_sim_debug.sv new file mode 100644 index 0000000..4b4622a --- /dev/null +++ b/rtl/gfx/gfx_sim_debug.sv @@ -0,0 +1,50 @@ +module gfx_sim_debug /* Write sink on a gfx_axil slave port: the low byte of each accepted write-data beat is printed with $display. */ +import gfx::*; +( + input logic clk, + rst_n, + +
gfx_axil.s axis +); + + enum int unsigned + { + INPUT, + STALL + } state; + /* The read channel is never served (rvalid and arready are held at 0). Write addresses are always accepted (awready held at 1); awaddr and awvalid are not otherwise used. */ + assign axis.rvalid = 0; + assign axis.arready = 0; + assign axis.awready = 1; + /* INPUT: write data is accepted and the response is offered in the same cycle. STALL: no new data is accepted while the pending response waits for bready. */ + always_comb + unique case (state) + INPUT: begin + axis.wready = 1; + axis.bvalid = axis.wvalid; + end + + STALL: begin + axis.wready = 0; + axis.bvalid = 1; + end + endcase + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) + state <= INPUT; + else + unique case (state) + INPUT: + if (axis.wvalid) begin /* prints wdata[7:0] as a character; $display also emits a newline after it -- NOTE(review): confirm $write was not intended */ + $display("%c", axis.wdata[7:0]); + if (~axis.bready) + state <= STALL; + end + + STALL: + if (axis.bready) + state <= INPUT; + endcase + +endmodule diff --git a/rtl/gfx/gfx_skid_buf.sv b/rtl/gfx/gfx_skid_buf.sv new file mode 100644 index 0000000..e3e5247 --- /dev/null +++ b/rtl/gfx/gfx_skid_buf.sv @@ -0,0 +1,20 @@ +module gfx_skid_buf +#(int WIDTH = 0) +( + input logic clk, + + input logic[WIDTH - 1:0] in, + input logic stall, + + output logic[WIDTH - 1:0] out +); + /* out follows in combinationally while stall is low. skid samples in on every non-stalled clock edge, so while stall is high out presents the last value sampled before the stall. There is no reset. NOTE(review): stall is presumably driven by gfx_skid_flow -- confirm at the instantiation. */ + logic[WIDTH - 1:0] skid; + + assign out = stall ? skid : in; + + always_ff @(posedge clk) + if (~stall) + skid <= in; + +endmodule diff --git a/rtl/gfx/gfx_skid_flow.sv b/rtl/gfx/gfx_skid_flow.sv new file mode 100644 index 0000000..7890ae3 --- /dev/null +++ b/rtl/gfx/gfx_skid_flow.sv @@ -0,0 +1,31 @@ +module gfx_skid_flow +( + input logic clk, + rst_n, + + input logic in_valid, + out_ready, + + output logic in_ready, + out_valid, + stall +); + /* was_ready is out_ready delayed by one clock; was_valid follows in_valid but only updates while not stalled. stall (the inverse of in_ready) is therefore high exactly when a valid item was latched and out_ready was low on the previous clock. Both registers clear asynchronously when rst_n is low. */ + logic was_ready, was_valid; + + assign stall = ~in_ready; + assign in_ready = was_ready | ~was_valid; + assign out_valid = in_valid | stall; + + always_ff @(posedge clk or negedge rst_n) + if (~rst_n) begin + was_ready <= 0; + was_valid <= 0; + end else begin + was_ready <= out_ready; + + if (~stall) + was_valid <= in_valid; + end + +endmodule diff --git a/rtl/gfx/gfx_wb.sv b/rtl/gfx/gfx_wb.sv new file mode 100644 index 0000000..20c7c64 --- /dev/null +++ b/rtl/gfx/gfx_wb.sv @@ -0,0 +1,51 @@ +interface gfx_wb; + /* Signal bundle with a tx side and an rx side; per the modports further down, ready is the only signal that flows from rx to tx. */ + import gfx::*; + + word lanes[SHADER_LANES]; + logic mask_update, pc_inc, 
pc_update, ready, scalar, valid, writeback; + group_id group; + xgpr_num dest; + lane_mask mask; + pc_offset pc_add; + /* tx: every signal except ready is an output. */ + modport tx + ( + input ready, + + output dest, + group, + lanes, + valid, + scalar, + writeback, + + mask, + mask_update, + + pc_add, + pc_inc, + pc_update + ); + /* rx: mirror image of tx, driving only ready. */ + modport rx + ( + input dest, + group, + lanes, + valid, + scalar, + writeback, + + mask, + mask_update, + + pc_add, + pc_inc, + pc_update, + + output ready + ); + + +endinterface diff --git a/rtl/gfx/gfx_xbar_sched.sv b/rtl/gfx/gfx_xbar_sched.sv new file mode 100644 index 0000000..95e4afb --- /dev/null +++ b/rtl/gfx/gfx_xbar_sched.sv @@ -0,0 +1,146 @@ +module gfx_xbar_sched /* Wrapper around an axilxbar instance: sched is the single upstream port; bootrom, debug and shader_0 are the three downstream ports. */ +import gfx::*; +( + input logic clk, + srst_n, + + gfx_axil.s sched, + + gfx_axil.m debug, + gfx_axil.m bootrom, + gfx_axil.m shader_0 +); + /* Base/mask pairs used for address decoding. NOTE(review): BOOTROM_BASE is used in SLAVE_ADDR below but is not declared in this module; presumably it is provided by the gfx package import -- confirm. */ + localparam word BOOTROM_MASK = 32'hfff0_0000; + localparam word DEBUG_BASE = 32'h0020_0000; + localparam word DEBUG_MASK = 32'hfff0_0000; + localparam word SHADER_0_BASE = 32'h0100_0000; + localparam word SHADER_0_MASK = 32'hfff0_0000; + + defparam xbar.NM = 1; + defparam xbar.NS = 3; + defparam xbar.OPT_LOWPOWER = 0; + /* Per-port lists are written in the order shader_0, debug, bootrom; the same order is used by every M_AXI concatenation in the instance below. */ + defparam xbar.SLAVE_ADDR = { + SHADER_0_BASE, + DEBUG_BASE, + BOOTROM_BASE + }; + + defparam xbar.SLAVE_MASK = { + SHADER_0_MASK, + DEBUG_MASK, + BOOTROM_MASK + }; + + axilxbar xbar + ( + .S_AXI_ACLK(clk), + .S_AXI_ARESETN(srst_n), + /* gfx_axil has no prot, strobe or resp signals, so on the upstream side prot is tied to 0, every strobe bit to 1, and the resp outputs are left unconnected. */ + .S_AXI_AWVALID(sched.awvalid), + .S_AXI_AWREADY(sched.awready), + .S_AXI_AWADDR(sched.awaddr), + .S_AXI_AWPROT('0), + + .S_AXI_WVALID(sched.wvalid), + .S_AXI_WREADY(sched.wready), + .S_AXI_WDATA(sched.wdata), + .S_AXI_WSTRB('1), + + .S_AXI_BVALID(sched.bvalid), + .S_AXI_BREADY(sched.bready), + .S_AXI_BRESP(), + + .S_AXI_ARVALID(sched.arvalid), + .S_AXI_ARREADY(sched.arready), + .S_AXI_ARADDR(sched.araddr), + .S_AXI_ARPROT('0), + + .S_AXI_RVALID(sched.rvalid), + .S_AXI_RREADY(sched.rready), + .S_AXI_RDATA(sched.rdata), + .S_AXI_RRESP(), + + .M_AXI_AWADDR({ + shader_0.awaddr, + debug.awaddr, + bootrom.awaddr + }), + 
.M_AXI_AWPROT(), /* downstream prot and strobe outputs (AWPROT, WSTRB, ARPROT) are left unconnected */ + .M_AXI_AWVALID({ + shader_0.awvalid, + debug.awvalid, + bootrom.awvalid + }), + .M_AXI_AWREADY({ + shader_0.awready, + debug.awready, + bootrom.awready + }), + + .M_AXI_WDATA({ + shader_0.wdata, + debug.wdata, + bootrom.wdata + }), + .M_AXI_WSTRB(), + .M_AXI_WVALID({ + shader_0.wvalid, + debug.wvalid, + bootrom.wvalid + }), + .M_AXI_WREADY({ + shader_0.wready, + debug.wready, + bootrom.wready + }), + /* downstream response codes are tied to 0 (the AXI OKAY encoding) for all three ports; the connected interfaces carry no resp signal */ + .M_AXI_BRESP('0), + .M_AXI_BVALID({ + shader_0.bvalid, + debug.bvalid, + bootrom.bvalid + }), + .M_AXI_BREADY({ + shader_0.bready, + debug.bready, + bootrom.bready + }), + + .M_AXI_ARADDR({ + shader_0.araddr, + debug.araddr, + bootrom.araddr + }), + .M_AXI_ARPROT(), + .M_AXI_ARVALID({ + shader_0.arvalid, + debug.arvalid, + bootrom.arvalid + }), + .M_AXI_ARREADY({ + shader_0.arready, + debug.arready, + bootrom.arready + }), + + .M_AXI_RDATA({ + shader_0.rdata, + debug.rdata, + bootrom.rdata + }), + .M_AXI_RRESP('0), + .M_AXI_RVALID({ + shader_0.rvalid, + debug.rvalid, + bootrom.rvalid + }), + .M_AXI_RREADY({ + shader_0.rready, + debug.rready, + bootrom.rready + }) + ); + +endmodule diff --git a/rtl/gfx/mod.mk b/rtl/gfx/mod.mk new file mode 100644 index 0000000..7525276 --- /dev/null +++ b/rtl/gfx/mod.mk @@ -0,0 +1,18 @@ +cores := gfx_shader_schedif + +define core + $(this)/deps := axixbar gfx_shader_schedif picorv32 + + $(this)/rtl_top := gfx_top + $(this)/rtl_dirs := . + $(this)/rtl_files := gfx_isa.sv gfx_pkg.sv +endef + +define core/gfx_shader_schedif + $(this)/hooks := regblock + + $(this)/regblock_rdl := gfx_shader_schedif.rdl + $(this)/regblock_top := gfx_shader_schedif + $(this)/regblock_args := --default-reset arst_n + $(this)/regblock_cpuif := axi4-lite +endef |
