summaryrefslogtreecommitdiff
path: root/platform/wavelet3d/gfx_shader_front.sv
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--platform/wavelet3d/gfx_shader_front.sv718
1 files changed, 718 insertions, 0 deletions
diff --git a/platform/wavelet3d/gfx_shader_front.sv b/platform/wavelet3d/gfx_shader_front.sv
new file mode 100644
index 0000000..5ad0203
--- /dev/null
+++ b/platform/wavelet3d/gfx_shader_front.sv
@@ -0,0 +1,718 @@
+typedef struct
+{
+ logic valid,
+ retry;
+ gfx::group_id group;
+ gfx_isa::insn_word insn;
+} shader_front_wave;
+
+typedef logic[4:0] icache_line_num;
+
+typedef logic[$bits(gfx::oword_ptr) - $bits(icache_line_num) - 1:0] icache_tag;
+
+typedef struct packed
+{
+ icache_tag tag;
+ icache_line_num line;
+} icache_line_tag;
+
+typedef struct packed
+{
+ icache_line_tag line_tag;
+ logic[2:0] word_num;
+} icache_ptr;
+
+module gfx_shader_front
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ gfx_axib.m fetch_mem,
+
+ input logic icache_flush,
+
+ gfx_regfile_io.read reg_read,
+ gfx_regfile_io.bind_ reg_bind,
+
+ gfx_front_back.front front
+);
+
+ word fetch_insn, port_insn;
+ logic fetch_hit, p0_writeback;
+ shader_front_wave bind_wave, dec_wave, port_dec_wave;
+
+ gfx_shader_bind bind_
+ (
+ .clk,
+ .rst_n,
+ .mem(fetch_mem),
+ .wave(bind_wave),
+ .regs(reg_bind),
+ .loop_valid(front.loop.valid),
+ .loop_group(front.loop.group),
+ .icache_flush
+ );
+
+ gfx_shader_read_regs reg_dec
+ (
+ .clk,
+ .rst_n,
+ .in(bind_wave),
+ .out(dec_wave),
+ .read(reg_read)
+ );
+
+ gfx_shader_decode_class class_dec
+ (
+ .clk,
+ .rst_n,
+ .wave(dec_wave),
+ .out_group(front.execute.group),
+ .port_wave(port_dec_wave),
+ .dispatch(front.dispatch),
+ .p0_writeback
+ );
+
+ gfx_shader_decode_fpint p0_dec
+ (
+ .clk,
+ .op(front.execute.p0),
+ .insn(port_dec_wave.insn),
+ .writeback(p0_writeback)
+ );
+
+endmodule
+
+module gfx_shader_bind
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ gfx_axib.m mem,
+
+ input logic icache_flush,
+
+ input logic loop_valid,
+ input group_id loop_group,
+
+ gfx_regfile_io.bind_ regs,
+
+ output shader_front_wave wave
+);
+
+ localparam int ICACHE_STAGES = 6;
+ localparam int BIND_STAGES = REGFILE_STAGES + ICACHE_STAGES;
+
+ gfx_beats #($bits(group_id)) runnable_in(), runnable_out();
+
+ logic ar_stall, request_ready, request_valid, valids[BIND_STAGES];
+ group_id groups[BIND_STAGES];
+ icache_line_tag araddr, request_addr;
+
+ assign mem.bready = 0;
+ assign mem.wvalid = 0;
+ assign mem.awvalid = 0;
+
+ assign mem.arlen = ($bits(mem.arlen))'($bits(oword) / $bits(word) - 1);
+ assign mem.araddr = {araddr, ($clog2($bits(oword)) - $clog2($bits(word)) + SUBWORD_BITS)'('0)};
+ assign mem.arburst = 2'b01; // Incremental mode
+
+ assign runnable_in.tx.data = loop_group;
+ assign runnable_in.tx.valid = loop_valid;
+
+ assign regs.pc_front_group = runnable_out.rx.data;
+ assign runnable_out.rx.ready = 1;
+
+ assign wave.group = groups[$size(groups) - 1];
+
+ gfx_skid_buf #($bits(araddr)) ar_skid
+ (
+ .clk,
+ .in(request_addr),
+ .out(araddr),
+ .stall(ar_stall)
+ );
+
+ gfx_skid_flow ar_flow
+ (
+ .clk,
+ .rst_n,
+ .stall(ar_stall),
+ .in_ready(request_ready),
+ .in_valid(request_valid),
+ .out_ready(mem.arready),
+ .out_valid(mem.arvalid)
+ );
+
+ //TODO: PodrĂ­amos quitar ~25 entries sin afectar throughput, latencia o correctitud
+ gfx_fifo #(.WIDTH($bits(group_id)), .DEPTH(1 << $bits(group_id))) runnable
+ (
+ .clk,
+ .rst_n,
+ .in(runnable_in.rx),
+ .out(runnable_out.tx)
+ );
+
+ gfx_shader_bind_icache icache
+ (
+ .clk,
+ .rst_n,
+
+ .icache_flush,
+ .read_addr(regs.pc_front),
+ .read_valid(valids[REGFILE_STAGES - 1]),
+
+ .request_addr,
+ .request_valid,
+ .request_ready,
+
+ .fetch_data(mem.rdata),
+ .fetch_last(mem.rlast),
+ .fetch_valid(mem.rvalid),
+ .fetch_ready(mem.rready),
+
+ .insn(wave.insn),
+ .insn_retry(wave.retry),
+ .insn_valid(wave.valid)
+ );
+
+ always_ff @(posedge clk) begin
+ groups[0] <= runnable_out.rx.data;
+ for (int i = 1; i < $size(groups); ++i)
+ groups[i] <= groups[i - 1];
+ end
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n)
+ for (int i = 0; i < $size(valids); ++i)
+ valids[i] <= 0;
+ else begin
+ valids[0] <= runnable_out.rx.valid;
+ for (int i = 1; i < $size(valids); ++i)
+ valids[i] <= valids[i - 1];
+ end
+
+endmodule
+
+module gfx_shader_bind_icache
+import gfx::*;
+(
+ input logic clk,
+ rst_n,
+
+ input logic icache_flush,
+
+ input logic read_valid,
+ input icache_ptr read_addr,
+
+ input logic fetch_last,
+ fetch_valid,
+ input word fetch_data,
+ output logic fetch_ready,
+
+ input logic request_ready,
+ output logic request_valid,
+ output icache_line_tag request_addr,
+
+ output logic insn_valid,
+ insn_retry,
+ output word insn
+);
+
+ // Dan Gisselquist limita a (1 << 3) bursts por defecto.
+ // Ver LGMAXBURST en axixbar.v
+ localparam int PENDING_FIFO_DEPTH = 8;
+
+ enum int unsigned
+ {
+ FLUSH,
+ RUN
+ } state;
+
+ struct
+ {
+ logic valid,
+ accessed,
+ hit;
+ icache_tag tag;
+ oword data;
+ } cache[1 << $bits(icache_line_num)], read, read_hold;
+
+ gfx_beats #($bits(icache_line_tag)) pending_in(), pending_out();
+
+ logic accessed_write, accessed_write_enable, burst, fetch_done, hit_write,
+ in_flush, hit_commit, hit_write_enable, retry_4, retry_5, rollback,
+ tag_hit, valid_1, valid_2, valid_3, valid_4, valid_5, valid_write,
+ valid_write_enable;
+
+ icache_ptr read_addr_1, read_addr_2, read_addr_3, read_addr_4, read_addr_5;
+ icache_tag tag_write;
+ icache_line_num accessed_write_line, flush_ptr, hit_write_line, valid_write_line;
+ icache_line_tag pending_pop;
+
+ oword data_write;
+ word[1:0] data_5;
+ word[7:0] fetch_shift;
+ qword[1:0] data_3;
+ udword[1:0] data_4;
+
+ assign data_3 = read.data;
+ assign tag_hit = read.tag == read_addr_3.line_tag.tag;
+ assign fetch_ready = ~fetch_done;
+ assign pending_pop = pending_out.rx.data;
+
+ assign request_addr = read_addr_4.line_tag;
+ assign request_valid = burst & pending_in.tx.ready;
+ assign pending_in.tx.data = read_addr_4.line_tag;
+ assign pending_in.tx.valid = burst & request_ready;
+ assign pending_out.rx.ready = fetch_done & ~hit_commit & ~rollback;
+
+ gfx_fifo #(.WIDTH($bits(icache_line_tag)), .DEPTH(PENDING_FIFO_DEPTH)) pending
+ (
+ .clk,
+ .rst_n,
+ .in(pending_in.rx),
+ .out(pending_out.tx)
+ );
+
+ always_comb
+ unique case (state)
+ FLUSH: in_flush = 1;
+ RUN: in_flush = 0;
+ endcase
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n) begin
+ state <= FLUSH;
+ flush_ptr <= '0;
+ fetch_done <= 0;
+
+ valid_1 <= 0;
+ valid_2 <= 0;
+ valid_3 <= 0;
+ valid_4 <= 0;
+ valid_5 <= 0;
+
+ burst <= 0;
+ end else begin
+ unique case (state)
+ FLUSH:
+ if (~icache_flush & &flush_ptr)
+ state <= RUN;
+
+ RUN:
+ if (icache_flush)
+ state <= FLUSH;
+ endcase
+
+ flush_ptr <= flush_ptr + 1;
+ if (icache_flush)
+ flush_ptr <= '0;
+
+ if (fetch_done)
+ fetch_done <= hit_commit | ~pending_out.rx.valid | rollback;
+ else if (fetch_ready & fetch_valid)
+ fetch_done <= fetch_last;
+
+ valid_1 <= read_valid;
+ valid_2 <= valid_1;
+ valid_3 <= valid_2;
+ valid_4 <= valid_3;
+ valid_5 <= valid_4;
+
+ burst <= valid_3 & ~tag_hit & ~read.accessed & (~read.valid | read.hit);
+ end
+
+ always_ff @(posedge clk) begin
+ tag_write <= pending_pop.tag;
+ data_write <= fetch_shift;
+
+ valid_write <= 1;
+ valid_write_line <= pending_pop.line;
+ valid_write_enable <= fetch_done & ~hit_commit & pending_out.rx.valid & ~rollback;
+
+ accessed_write <= 0;
+ accessed_write_enable <= 1;
+
+ if (rollback)
+ accessed_write_line <= read_addr_5.line_tag.line;
+ else if (fetch_done & ~hit_commit & pending_out.rx.valid)
+ accessed_write_line <= pending_pop.line;
+ else begin
+ accessed_write <= 1;
+ accessed_write_line <= read_addr.line_tag.line;
+ accessed_write_enable <= read_valid;
+ end
+
+ hit_write <= hit_commit;
+ if (hit_commit) begin
+ hit_write_line <= read_addr_4.line_tag.line;
+ hit_write_enable <= 1;
+ end else begin
+ hit_write_line <= pending_pop.line;
+ hit_write_enable <= fetch_done & pending_out.rx.valid & ~rollback;
+ end
+
+ if (in_flush) begin
+ valid_write <= 0;
+ valid_write_line <= flush_ptr;
+ valid_write_enable <= 1;
+
+ accessed_write <= 0;
+ accessed_write_line <= flush_ptr;
+ accessed_write_enable <= 1;
+
+ hit_write <= 0;
+ hit_write_line <= flush_ptr;
+ hit_write_enable <= 1;
+ end
+
+ if (valid_write_enable) begin
+ cache[valid_write_line].tag <= tag_write;
+ cache[valid_write_line].data <= data_write;
+ cache[valid_write_line].valid <= valid_write;
+ end
+
+ if (accessed_write_enable)
+ cache[accessed_write_line].accessed <= accessed_write;
+
+ if (hit_write_enable)
+ cache[hit_write_line].hit <= hit_write;
+
+ read_addr_1 <= read_addr;
+
+ read_hold <= cache[read_addr_1.line_tag.line];
+ read_addr_2 <= read_addr_1;
+
+ read <= read_hold;
+ read_addr_3 <= read_addr_2;
+
+ data_4 <= data_3[read_addr_3.word_num[2]];
+ retry_4 <= ~tag_hit | ~read.valid;
+ hit_commit <= valid_3 & tag_hit & read.valid;
+ read_addr_4 <= read_addr_3;
+
+ data_5 <= data_4[read_addr_4.word_num[1]];
+ retry_5 <= retry_4;
+ rollback <= burst & (~request_valid | ~pending_in.tx.valid);
+ read_addr_5 <= read_addr_4;
+
+ insn <= data_5[read_addr_5.word_num[0]];
+ insn_retry <= retry_5;
+ insn_valid <= valid_5;
+
+ if (fetch_ready & fetch_valid) begin
+ fetch_shift[0] <= fetch_data;
+ for (int i = 1; i < $size(fetch_shift); ++i)
+ fetch_shift[i] <= fetch_shift[i - 1];
+ end
+ end
+
+endmodule
+
+module gfx_shader_read_regs
+import gfx::*;
+import gfx_isa::*;
+(
+ input logic clk,
+ rst_n,
+
+ input shader_front_wave in,
+
+ gfx_regfile_io.read read,
+
+ output shader_front_wave out
+);
+
+ localparam int HOLD_DEPTH = REG_READ_STAGES + 1 - 2;
+
+ logic reg_rev;
+ logic hold_valid[HOLD_DEPTH];
+ shader_front_wave hold[HOLD_DEPTH];
+
+ assign reg_rev = in.insn.reg_rev;
+
+ always_comb begin
+ out = hold[$size(hold) - 1];
+ out.valid = hold_valid[$size(hold_valid) - 1];
+ end
+
+ always_ff @(posedge clk) begin
+ hold[0] <= in;
+
+ for (int i = 1; i < HOLD_DEPTH; ++i)
+ hold[i] <= hold[i - 1];
+
+ read.op.group <= in.group;
+
+ read.op.b_imm <= in.insn.dst_src.rr.b.imm;
+ read.op.a_sgpr <= in.insn.dst_src.rr.ra.sgpr;
+ read.op.b_sgpr <= in.insn.dst_src.rr.b.read.r.sgpr;
+ read.op.a_vgpr <= in.insn.dst_src.rr.ra.vgpr.num;
+ read.op.b_vgpr <= in.insn.dst_src.rr.b.read.r.vgpr.num;
+ read.op.b_is_imm <= in.insn.dst_src.rr.b_is_imm;
+ read.op.b_is_const <= in.insn.dst_src.rr.b.read.from_consts;
+ read.op.scalar_rev <= reg_rev;
+
+ unique case (in.insn.reg_mode)
+ REGS_SVS, REGS_VVS: begin
+ read.op.a_scalar <= reg_rev;
+ read.op.b_scalar <= ~reg_rev;
+ end
+
+ REGS_SSS: begin
+ read.op.a_scalar <= 1;
+ read.op.b_scalar <= 1;
+ end
+
+ REGS_VVV: begin
+ read.op.a_scalar <= 0;
+ read.op.b_scalar <= 0;
+ end
+ endcase
+ end
+
+ always_ff @(posedge clk or negedge rst_n)
+ if (~rst_n)
+ for (int i = 1; i < HOLD_DEPTH; ++i)
+ hold_valid[i] <= 0;
+ else begin
+ hold_valid[0] <= in.valid;
+
+ for (int i = 1; i < HOLD_DEPTH; ++i)
+ hold_valid[i] <= hold_valid[i - 1];
+ end
+
+endmodule
+
+module gfx_shader_decode_class
+import gfx::*;
+import gfx_isa::*;
+(
+ input logic clk,
+ rst_n,
+
+ input shader_front_wave wave,
+ output shader_front_wave port_wave,
+ output group_id out_group,
+
+ output shader_dispatch dispatch,
+ output logic p0_writeback
+);
+
+ logic is_fsu, is_mem, is_group, hold_valid, retry;
+ shader_front_wave hold_wave;
+
+ assign p0_writeback = ~(is_mem | is_fsu | is_group | retry);
+
+ always_comb begin
+ port_wave = hold_wave;
+ port_wave.valid = hold_valid;
+ end
+
+ always_ff @(posedge clk) begin
+ hold_wave <= wave;
+ out_group <= port_wave.group;
+ end
+
+ always_ff @(posedge clk or negedge rst_n)
+ // Intencionalmente repetitivo
+ if (~rst_n) begin
+ is_fsu <= 0;
+ is_mem <= 0;
+ is_group <= 0;
+
+ retry <= 0;
+ hold_valid <= 0;
+
+ dispatch <= '0;
+ end else begin
+ is_fsu <= 0;
+ is_mem <= 0;
+ is_group <= 0;
+
+ retry <= wave.retry;
+ hold_valid <= wave.valid;
+
+ unique case (wave.insn.insn_class)
+ INSN_FPINT: ; // p0 no tiene ready
+ INSN_MEM: is_mem <= 1;
+ INSN_SFU: is_fsu <= 1;
+ INSN_GROUP: is_group <= 1;
+
+ default:
+ {is_mem, is_fsu, is_group} <= 'x;
+ endcase
+
+ dispatch.p1 <= is_mem;
+ dispatch.p2 <= is_fsu;
+ dispatch.p3 <= is_group;
+
+ if (~hold_valid | retry) begin
+ dispatch.p1 <= 0;
+ dispatch.p2 <= 0;
+ dispatch.p3 <= 0;
+ end
+
+ dispatch.valid <= hold_valid;
+ end
+
+endmodule
+
+module gfx_shader_decode_fpint
+import gfx::*;
+import gfx_isa::*;
+(
+ input logic clk,
+
+ input insn_word insn,
+ input logic writeback,
+
+ output fpint_op op
+);
+
+ always_ff @(posedge clk) begin
+ unique case (insn.by_class.fpint.op)
+ INSN_FPINT_MOV: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 1;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 1;
+ op.mnorm_zero_b <= 1;
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 1;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 1;
+ op.shiftl_copy_flags <= 1;
+ op.round_copy_flags <= 1;
+ op.round_enable <= 1;
+ op.encode_enable <= 1;
+ end
+
+ INSN_FPINT_FMUL: begin
+ op.setup_mul_float <= 1;
+ op.setup_unit_b <= 0;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 0;
+ op.mnorm_put_mul <= 1;
+ op.mnorm_zero_flags <= 0;
+ op.mnorm_zero_b <= 1;
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 1;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 1;
+ op.shiftl_copy_flags <= 1;
+ op.round_copy_flags <= 1;
+ op.round_enable <= 1;
+ op.encode_enable <= 1;
+ end
+
+ INSN_FPINT_IMUL: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 0;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 1;
+ op.mnorm_zero_b <= 1;
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 1;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 1;
+ op.shiftl_copy_flags <= 1;
+ op.round_copy_flags <= 1;
+ op.round_enable <= 0;
+ op.encode_enable <= 0;
+ end
+
+ INSN_FPINT_FADD: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 1;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 0;
+ op.mnorm_zero_b <= 0;
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 0;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 0;
+ op.clz_force_nop <= 0;
+ op.shiftl_copy_flags <= 0;
+ op.round_copy_flags <= 0;
+ op.round_enable <= 1;
+ op.encode_enable <= 1;
+ end
+
+ INSN_FPINT_FMAX, INSN_FPINT_FMIN: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 1;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 0;
+ op.mnorm_zero_b <= 0;
+ op.minmax_abs <= 0;
+ op.minmax_swap <= insn.by_class.fpint.op == INSN_FPINT_FMIN;
+ op.minmax_zero_min <= 1;
+ op.minmax_copy_flags <= 1;
+ op.shiftr_int_signed <= 0;
+ op.addsub_int_operand <= 0;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 1;
+ op.shiftl_copy_flags <= 1;
+ op.round_copy_flags <= 1;
+ op.round_enable <= 0;
+ op.encode_enable <= 0;
+ end
+
+ INSN_FPINT_FCVT: begin
+ op.setup_mul_float <= 0;
+ op.setup_unit_b <= 1;
+ op.mnorm_put_hi <= 0;
+ op.mnorm_put_lo <= 1;
+ op.mnorm_put_mul <= 0;
+ op.mnorm_zero_flags <= 1;
+ op.mnorm_zero_b <= 1;
+
+ op.minmax_abs <= 1;
+ op.minmax_swap <= 0;
+ op.minmax_zero_min <= 0;
+ op.minmax_copy_flags <= 0;
+ op.shiftr_int_signed <= 1;
+ op.addsub_int_operand <= 1;
+ op.addsub_copy_flags <= 1;
+ op.clz_force_nop <= 0;
+ op.shiftl_copy_flags <= 0;
+ op.round_copy_flags <= 0;
+ op.round_enable <= 1;
+ op.encode_enable <= 1;
+ end
+
+ default:
+ op <= 'x;
+ endcase
+
+ op.writeback <= writeback;
+ end
+
+endmodule