From bbc29883f358249b65cf0dac78d06c3b8cc594c4 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Sun, 20 Nov 2022 20:17:50 +0100 Subject: [PATCH 1/3] [hardware] Handle WAW and WAR `vload` hazards in the `VLDU` Before this commit, all the hazards (RAW, WAR, WAW) are handled by the operand requesters that throttle access to source reg elements. Even if the hazard is a WAR/WAW, the suboptimal but efficient way to deal with it is to slow down the source reg fetch. If an instruction does not have source regs (for example, a load instruction), this cannot happen. Therefore, all the instructions that do not have vector source operands are stalled in the sequencer. Loads are super common, and stalling in the main sequencer means that all the instructions after the load are also stalled and cannot start their execution. Therefore, now they are processed, and the hazard check is done inside the VLDU. The write-back request is masked until there are no more hazards on that load instruction. --- hardware/src/ara.sv | 13 ++++++++++++- hardware/src/ara_sequencer.sv | 28 +++++++++++++++++++++++++--- hardware/src/vlsu/vldu.sv | 16 +++++++++++++++- hardware/src/vlsu/vlsu.sv | 6 ++++++ 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 0da855cf2..9139bb61b 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -139,6 +139,10 @@ module ara import ara_pkg::*; #( logic [NrVInsn-1:0][NrVInsn-1:0] global_hazard_table; // Ready for lane 0 (scalar operand fwd) logic pe_scalar_resp_ready; + // VLDU Hazard checking + vid_t vldu_commit_id; + logic vldu_commit_id_valid; + logic vldu_hazard; // Mask unit operands elen_t [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand; @@ -180,7 +184,11 @@ module ara import ara_pkg::*; #( // Interface with the address generator .addrgen_ack_i (addrgen_ack ), .addrgen_error_i (addrgen_error ), - .addrgen_error_vl_i (addrgen_error_vl ) + .addrgen_error_vl_i (addrgen_error_vl ), + // Interface 
with the VLDU for hazard handling + .vldu_commit_id_i (vldu_commit_id ), + .vldu_commit_id_valid_i(vldu_commit_id_valid ), + .vldu_hazard_o (vldu_hazard ) ); // Scalar move support @@ -347,6 +355,9 @@ module ara import ara_pkg::*; #( .addrgen_ack_o (addrgen_ack ), .addrgen_error_o (addrgen_error ), .addrgen_error_vl_o (addrgen_error_vl ), + .commit_id_o (vldu_commit_id ), + .commit_id_valid_o (vldu_commit_id_valid ), + .hazard_i (vldu_hazard ), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 348c01107..8355a97de 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -41,7 +41,11 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Interface with the Address Generation input logic addrgen_ack_i, input logic addrgen_error_i, - input vlen_t addrgen_error_vl_i + input vlen_t addrgen_error_vl_i, + // Interface with the VLDU to handle load WAW and WAR hazards + input vid_t vldu_commit_id_i, + input logic vldu_commit_id_valid_i, + output logic vldu_hazard_o ); /////////////////////////////////// @@ -261,6 +265,9 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i write_list_d = write_list_q; global_hazard_table_d = global_hazard_table_o; + // No hazard check requested + vldu_hazard_o = 1'b0; + // Maintain request pe_req_d = '0; pe_req_valid_d = 1'b0; @@ -370,10 +377,13 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i pe_req_d.hazard_vs1 | pe_req_d.hazard_vs2; // We only issue instructions that take no operands if they have no hazards. + // Exception to this rule: loads, as they are super common. WAW and WAR hazards + // on load instructions are handled in the VLDU. 
// Moreover, SLIDE instructions cannot be always chained // ToDo: optimize the case for vslide1down, vslide1up (wait 2 cycles, then chain) - if (!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && - |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} || + if ((!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && + |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} && + !(is_load(pe_req_d.op))) || (pe_req_d.op == VSLIDEUP && |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || (pe_req_d.op == VSLIDEDOWN && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) begin @@ -453,6 +463,18 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i end endcase + // Load-related hazards handling + // Loads are masters on the x-bar to write the in-lane VRF. Nevertheless, + // they can have WAR or WAW dependencies. When there is a load in the load + // unit, its hazard bit is always checked and cleared here as soon as the + // dependency does not exist anymore. Whenever the hazard bit is set, + // the load cannot issue requests. + // It's safe to pipeline vldu_hazard_o if the timing is tight. 
+ // (if so, add a sync signal) + if (vldu_commit_id_valid_i) begin + vldu_hazard_o = |global_hazard_table_o[vldu_commit_id_i]; + end + // Update the global hazard table for (int id = 0; id < NrVInsn; id++) global_hazard_table_d[id] &= vinsn_running_d; end : p_sequencer diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 51042ed8e..61b26623a 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -33,6 +33,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrVInsn-1:0] pe_vinsn_running_i, output logic pe_req_ready_o, output pe_resp_t pe_resp_o, + // Hazard handling to main sequencer + output vid_t commit_id_o, + output logic commit_id_valid_o, + input logic hazard_i, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, input logic axi_addrgen_req_valid_i, @@ -101,6 +105,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( logic vinsn_commit_valid; assign vinsn_commit = vinsn_queue_q.vinsn[vinsn_queue_q.commit_pnt]; assign vinsn_commit_valid = (vinsn_queue_q.commit_cnt != '0); + // To the main sequencer, for hazard checking + assign commit_id_valid_o = vinsn_commit_valid; + assign commit_id_o = vinsn_commit.id; always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin @@ -354,7 +361,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( ////////////////////////////////// for (int lane = 0; lane < NrLanes; lane++) begin: result_write - ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane]; + // Create a request only if there are no more hazards on vd (check vs1 since the info about + // hazard vd is also there) + ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane] && + !vinsn_commit.hazard_vs1; ldu_result_addr_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].addr; ldu_result_id_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].id; ldu_result_wdata_o[lane] = 
result_queue_q[result_queue_read_pnt_q][lane].wdata; @@ -415,6 +425,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d.commit_pnt].vtype.vsew); end + // Update the Vd hazard bit for the current instruction + // hazard_vs1, hazard_vs2, hazard_vm all contain the info about hazard_vd, so work on one of them (vs1) + if (commit_id_valid_o) vinsn_queue_d.vinsn[vinsn_queue_q.commit_pnt].hazard_vs1 &= {NrVInsn{hazard_i}}; + ////////////////////////////// // Accept new instruction // ////////////////////////////// diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index aa2e05283..448b53a87 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -44,6 +44,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( output logic addrgen_ack_o, output logic addrgen_error_o, output vlen_t addrgen_error_vl_o, + output vid_t commit_id_o, + output logic commit_id_valid_o, + input logic hazard_i, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ -172,6 +175,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_vinsn_running_i (pe_vinsn_running_i ), .pe_req_ready_o (pe_req_ready_o[OffsetLoad]), .pe_resp_o (pe_resp_o[OffsetLoad] ), + .commit_id_o (commit_id_o ), + .commit_id_valid_o (commit_id_valid_o ), + .hazard_i (hazard_i ), // Interface with the address generator .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), From 497a435ed9a69b659d6e4e66f43fe928cf206c91 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Mon, 5 Dec 2022 20:32:44 +0100 Subject: [PATCH 2/3] [hardware] Handle slide1x and widening hazards with a special protocol Slide1Up/Down were blocked in the main sequencer when they had specific hazards. Now, these hazards are handled downstream, waiting for 1 cycle of stall and then continuing with the usual protocol. 
WAW hazards for widening instructions are also handled better now, discriminating between real widening instructions and reductions. --- hardware/include/ara_pkg.sv | 5 + hardware/src/ara_dispatcher.sv | 80 +++++++++- hardware/src/ara_sequencer.sv | 7 +- hardware/src/lane/lane_sequencer.sv | 208 +++++++++++++------------ hardware/src/lane/operand_requester.sv | 89 +++++++---- 5 files changed, 252 insertions(+), 137 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index da119ddc1..54cfc8d09 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -306,6 +306,8 @@ package ara_pkg; logic wide_fp_imm; // Resizing of FP conversions resize_e cvt_resize; + // Widening and vslide1x instructions have different hazard stall policies + logic special_hazard; // Vector machine metadata vlen_t vl; @@ -403,6 +405,8 @@ package ara_pkg; logic wide_fp_imm; // Resizing of FP conversions resize_e cvt_resize; + // Widening and vslide1x instructions have different hazard stall policies + logic special_hazard; // Vector machine metadata vlen_t vl; @@ -905,6 +909,7 @@ package ara_pkg; logic scale_vl; // Rescale vl taking into account the new and old EEW resize_e cvt_resize; // Resizing of FP conversions + logic special_hazard; // Widening and vslide1x instructions have different hazard stall policies logic is_reduct; // Is this a reduction? 
diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index d505ca4d7..c1027f330 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -683,6 +683,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin ara_req_d.op = ara_pkg::VWREDSUM; @@ -692,6 +693,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1302,6 +1304,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt8; ara_req_d.eew_vs2 = eew_q[insn.varith_type.rs2]; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW64) || @@ -1312,6 +1315,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt8; ara_req_d.eew_vs2 = eew_q[insn.varith_type.rs2]; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW64) || @@ -1322,6 +1326,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt4; ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW32) || @@ -1331,6 +1336,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt4; ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion 
if (int'(vtype_q.vsew) < int'(EW32) || @@ -1340,6 +1346,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) @@ -1349,6 +1356,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) @@ -1396,6 +1404,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; @@ -1404,6 +1413,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; @@ -1412,6 +1422,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; @@ -1420,6 +1431,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; @@ 
-1429,6 +1441,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; @@ -1438,6 +1451,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; @@ -1447,6 +1461,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; @@ -1456,6 +1471,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; @@ -1464,6 +1480,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; @@ -1472,6 +1489,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; @@ -1480,6 +1498,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; 
ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; @@ -1510,6 +1529,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1569,6 +1589,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; // If stride > vl, the vslideup has no effects if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; @@ -1579,6 +1601,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; end 6'b010000: begin // VRXUNARY0 // vmv.s.x @@ -1627,6 +1651,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; @@ -1635,6 +1660,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; @@ -1643,6 +1669,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; 
ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; @@ -1651,6 +1678,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; @@ -1660,6 +1688,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; @@ -1669,6 +1698,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; @@ -1678,6 +1708,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; @@ -1687,6 +1718,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; @@ -1695,6 +1727,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111010: 
begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; @@ -1703,6 +1736,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; @@ -1711,6 +1745,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; @@ -1721,6 +1756,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; @@ -1731,6 +1767,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111110: begin // VWMACCUS ara_req_d.op = ara_pkg::VMACC; @@ -1741,6 +1778,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; @@ -1751,6 +1789,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1885,6 +1924,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000: 
begin // Widening VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1892,6 +1932,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01001: begin // Widening VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1899,6 +1940,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01010: begin // Widening VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1906,6 +1948,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01011: begin // Widening VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1913,6 +1956,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01100: begin // Widening VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1920,6 +1964,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01110: begin // Widening VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; 
@@ -1927,6 +1972,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01111: begin // Widening VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -2036,6 +2082,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VFWREDUSUM ara_req_d.op = ara_pkg::VFWREDUSUM; @@ -2045,7 +2093,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.cvt_resize = resize_e'(2'b00); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; @@ -2054,6 +2103,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VFWREDOSUM ara_req_d.op = ara_pkg::VFWREDOSUM; @@ -2063,7 +2114,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.cvt_resize = resize_e'(2'b00); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; @@ -2073,6 +2125,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 
ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; @@ -2082,6 +2136,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; @@ -2089,6 +2145,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; @@ -2098,6 +2156,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; @@ -2107,6 +2167,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; @@ -2116,6 +2178,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = 
CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; @@ -2125,6 +2189,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -2221,6 +2287,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; // If stride > vl, the vslideup has no effects if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; @@ -2228,9 +2296,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001111: begin // vfslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; - // Request will need reshuffling - ara_req_d.scale_vl = 1'b1; + ara_req_d.eew_vs2 = vtype_q.vsew; + // Request will need reshuffling + ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; end 6'b010000: begin // VRFUNARY0 // vmv.s.f diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 8355a97de..539e3d2b3 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -361,6 +361,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i fp_rm : ara_req_i.fp_rm, wide_fp_imm : ara_req_i.wide_fp_imm, cvt_resize : ara_req_i.cvt_resize, + special_hazard: ara_req_i.special_hazard, scale_vl : ara_req_i.scale_vl, vl : ara_req_i.vl, vstart : ara_req_i.vstart, @@ -384,8 +385,10 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i 
if ((!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} && !(is_load(pe_req_d.op))) || - (pe_req_d.op == VSLIDEUP && |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || - (pe_req_d.op == VSLIDEDOWN && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) + (pe_req_d.op == VSLIDEUP && !pe_req_d.use_scalar_op && + |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || + (pe_req_d.op == VSLIDEDOWN && !pe_req_d.use_scalar_op && + |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) begin ara_req_ready_o = 1'b0; pe_req_valid_d = 1'b0; diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 386b9823c..5e102877e 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -288,42 +288,44 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: unique case (pe_req.vfu) VFU_Alu: begin operand_request_i[AluA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - eew : pe_req.eew_vs1, + id : pe_req.id, + vs : pe_req.vs1, + eew : pe_req.eew_vs1, // If reductions and vl == 0, we must replace with neutral values - conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // In case of reduction, AluA opqueue will keep the scalar element - vl : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, - target_fu : ALU_SLDU, - default : '0 + vl : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 
1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, + target_fu : ALU_SLDU, + default : '0 }; operand_request_push[AluA] = pe_req.use_vs1; operand_request_i[AluB] = '{ - id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, // If reductions and vl == 0, we must replace with neutral values - conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, - target_fu : ALU_SLDU, - default : '0 + vl : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 
1'b1 : 0, + target_fu : ALU_SLDU, + default : '0 }; operand_request_push[AluB] = pe_req.use_vs2; @@ -346,66 +348,69 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: end VFU_MFpu: begin operand_request_i[MulFPUA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - eew : pe_req.eew_vs1, + id : pe_req.id, + vs : pe_req.vs1, + eew : pe_req.eew_vs1, // If reductions and vl == 0, we must replace with neutral values - conv : pe_req.conversion_vs1, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : pe_req.conversion_vs1, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default : '0 + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUA] = pe_req.use_vs1; operand_request_i[MulFPUB] = '{ - id : pe_req.id, - vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, - eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vd_op : pe_req.eew_vs2, + id : pe_req.id, + vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, + eew : pe_req.swap_vs2_vd_op ? 
pe_req.eew_vd_op : pe_req.eew_vs2, // If reductions and vl == 0, we must replace with neutral values - conv : pe_req.conversion_vs2, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : pe_req.conversion_vs2, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : (pe_req.swap_vs2_vd_op ? + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : (pe_req.swap_vs2_vd_op ? pe_req.hazard_vd : (pe_req.hazard_vs2 | pe_req.hazard_vd)), - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default: '0 + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUB] = pe_req.swap_vs2_vd_op ? pe_req.use_vd_op : pe_req.use_vs2; operand_request_i[MulFPUC] = '{ - id : pe_req.id, - vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, - eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, - conv : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, + id : pe_req.id, + vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, + eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, + conv : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. 
So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req.vtype, - hazard : pe_req.swap_vs2_vd_op ? + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req.vtype, + hazard : pe_req.swap_vs2_vd_op ? (pe_req.hazard_vs2 | pe_req.hazard_vd) : pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default : '0 + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUC] = pe_req.swap_vs2_vd_op ? pe_req.use_vs2 : pe_req.use_vd_op; @@ -447,17 +452,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Load indexed operand_request_i[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, - target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, - default : '0 + id : pe_req_i.id, + vs : pe_req_i.vs2, + eew : pe_req_i.eew_vs2, + conv : pe_req_i.conversion_vs2, + target_fu : MFPU_ADDRGEN, + special_hazard : pe_req.special_hazard, + vl : pe_req_i.vl / NrLanes, + scale_vl : pe_req_i.scale_vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req_i.vtype, + hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
@@ -503,17 +509,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Store indexed operand_request_i[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, - target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, - default : '0 + id : pe_req_i.id, + vs : pe_req_i.vs2, + eew : pe_req_i.eew_vs2, + conv : pe_req_i.conversion_vs2, + target_fu : MFPU_ADDRGEN, + special_hazard : pe_req.special_hazard, + vl : pe_req_i.vl / NrLanes, + scale_vl : pe_req_i.scale_vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req_i.vtype, + hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. @@ -524,16 +531,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VFU_SlideUnit: begin operand_request_i[SlideAddrGenA] = '{ - id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, - conv : pe_req.conversion_vs2, - target_fu: ALU_SLDU, - scale_vl : pe_req.scale_vl, - vtype : pe_req.vtype, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, - default : '0 + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, + conv : pe_req.conversion_vs2, + target_fu : ALU_SLDU, + special_hazard : pe_req.special_hazard, + scale_vl : pe_req.scale_vl, + vtype : pe_req.vtype, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, + default : '0 }; operand_request_push[SlideAddrGenA] = pe_req.use_vs2; diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 54590fbc3..5e124f2e8 100644 --- a/hardware/src/lane/operand_requester.sv +++ 
b/hardware/src/lane/operand_requester.sv @@ -245,15 +245,27 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // In case of a WAW with a previous instruction, // read once every two writes of the previous instruction logic is_widening; + // Does this instruction have a special hazard protocol? + logic special_hazard; // One-bit counters logic [NrVInsn-1:0] waw_hazard_counter; } requester_d, requester_q; + // Asserted if the SLDU requester is registering a new instruction + logic new_sldu_insn; + logic has_stalled_d, has_stalled_q; // Is there a hazard during this cycle? + // WAW with widening instructions are special: wait for 2 writes instead of 1 + // Slide1Up/Down with hazards should wait one cycle before being handled normally logic stall; - assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & - (~{NrVInsn{requester_q.is_widening}} | requester_q.waw_hazard_counter))); + assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & ((~{NrVInsn{requester_q.is_widening}} & + requester_q.special_hazard) | requester_q.waw_hazard_counter))) | + (~has_stalled_q & requester_q.special_hazard & |requester_q.hazard); + + // For every instruction, it signals if the requester has already stalled once + // This is needed for vslide1x stall handling + assign has_stalled_d = new_sldu_insn ? 1'b0 : (stall ? 1'b1 : has_stalled_q); // Did we get a grant? 
logic [NrBanks-1:0] operand_requester_gnt; @@ -269,6 +281,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( state_d = state_q; requester_d = requester_q; + new_sldu_insn = 1'b0; + // Make no requests to the VRF operand_payload[requester] = '0; for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester] = 1'b0; @@ -288,6 +302,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the request operand_request_ready_o[requester] = 1'b1; + // New slide unit instruction incoming + if (requester == (NrOperandQueues + VFU_SlideUnit)) + new_sldu_insn = 1'b1; + // Send a command to the operand queue operand_queue_cmd_o[requester] = '{ eew : operand_request_i[requester].eew, @@ -312,22 +330,24 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Store the request requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), + id : operand_request_i[requester].id, + addr : vaddr(operand_request_i[requester].vs, NrLanes) + + (operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), // For memory operations, the number of elements initially refers to the new EEW (vsew here), // but the requester must refer to the old EEW (eew here) // This reasoning cannot be applied also to widening instructions, which modify vsew // treating it as the EEW of vd - len : (operand_request_i[requester].scale_vl) ? - ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE, + len : (operand_request_i[requester].scale_vl) ? 
+ ((operand_request_i[requester].vl << + operand_request_i[requester].vtype.vsew) >> + operand_request_i[requester].eew) : + operand_request_i[requester].vl, + vew : operand_request_i[requester].eew, + hazard : operand_request_i[requester].hazard, + is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE && + operand_request_i[requester].special_hazard, + special_hazard : operand_request_i[requester].special_hazard, default: '0 }; // The length should be at least one after the rescaling @@ -381,6 +401,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the request operand_request_ready_o[requester] = 1'b1; + // New slide unit instruction incoming + if (requester == (NrOperandQueues + VFU_SlideUnit)) + new_sldu_insn = 1'b1; + // Send a command to the operand queue operand_queue_cmd_o[requester] = '{ eew : operand_request_i[requester].eew, @@ -401,18 +425,21 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Store the request requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - len : (operand_request_i[requester].scale_vl) ? - ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - default: '0 + id : operand_request_i[requester].id, + addr : vaddr(operand_request_i[requester].vs, NrLanes) + + (operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), + len : (operand_request_i[requester].scale_vl) ? 
+ ((operand_request_i[requester].vl << + operand_request_i[requester].vtype.vsew) >> + operand_request_i[requester].eew) : + operand_request_i[requester].vl, + vew : operand_request_i[requester].eew, + hazard : operand_request_i[requester].hazard, + is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE && + operand_request_i[requester].special_hazard, + special_hazard : operand_request_i[requester].special_hazard, + default : '0 }; // The length should be at least one after the rescaling if (requester_d.len == '0) @@ -428,11 +455,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - state_q <= IDLE; - requester_q <= '0; + state_q <= IDLE; + requester_q <= '0; + has_stalled_q <= 1'b0; end else begin - state_q <= state_d; - requester_q <= requester_d; + state_q <= state_d; + requester_q <= requester_d; + has_stalled_q <= has_stalled_d; end end end : gen_operand_requester From 6b229498bd4fdf117ebcd2c3c414b9661a3b7f50 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Fri, 20 Jan 2023 15:59:54 +0100 Subject: [PATCH 3/3] [CHANGELOG] Update Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index eca9a7d27..0bacde833 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -160,6 +160,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Generate data for `fmatmul` at compile time - SIMD multipliers are now power gated - Roll-back to Verilator v4.214 + - Handle WAW and WAR `vload` hazards in the `VLDU` + - Handle slide1x and widening hazards with a special protocol ## 2.2.0 - 2021-11-02