diff --git a/CHANGELOG.md b/CHANGELOG.md index eca9a7d27..0bacde833 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -160,6 +160,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Generate data for `fmatmul` at compile time - SIMD multipliers are now power gated - Roll-back to Verilator v4.214 + - Handle WAW and WAR `vload` hazards in the `VLDU` + - Handle slide1x and widening hazards with a special protocol ## 2.2.0 - 2021-11-02 diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index da119ddc1..54cfc8d09 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -306,6 +306,8 @@ package ara_pkg; logic wide_fp_imm; // Resizing of FP conversions resize_e cvt_resize; + // Widening and vslide1x instructions have different hazard stall policies + logic special_hazard; // Vector machine metadata vlen_t vl; @@ -403,6 +405,8 @@ package ara_pkg; logic wide_fp_imm; // Resizing of FP conversions resize_e cvt_resize; + // Widening and vslide1x instructions have different hazard stall policies + logic special_hazard; // Vector machine metadata vlen_t vl; @@ -905,6 +909,7 @@ package ara_pkg; logic scale_vl; // Rescale vl taking into account the new and old EEW resize_e cvt_resize; // Resizing of FP conversions + logic special_hazard; // Widening and vslide1x instructions have different hazard stall policies logic is_reduct; // Is this a reduction? 
diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 0da855cf2..9139bb61b 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -139,6 +139,10 @@ module ara import ara_pkg::*; #( logic [NrVInsn-1:0][NrVInsn-1:0] global_hazard_table; // Ready for lane 0 (scalar operand fwd) logic pe_scalar_resp_ready; + // VLDU Hazard checking + vid_t vldu_commit_id; + logic vldu_commit_id_valid; + logic vldu_hazard; // Mask unit operands elen_t [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand; @@ -180,7 +184,11 @@ module ara import ara_pkg::*; #( // Interface with the address generator .addrgen_ack_i (addrgen_ack ), .addrgen_error_i (addrgen_error ), - .addrgen_error_vl_i (addrgen_error_vl ) + .addrgen_error_vl_i (addrgen_error_vl ), + // Interface with the VLDU for hazard handling + .vldu_commit_id_i (vldu_commit_id ), + .vldu_commit_id_valid_i(vldu_commit_id_valid ), + .vldu_hazard_o (vldu_hazard ) ); // Scalar move support @@ -347,6 +355,9 @@ module ara import ara_pkg::*; #( .addrgen_ack_o (addrgen_ack ), .addrgen_error_o (addrgen_error ), .addrgen_error_vl_o (addrgen_error_vl ), + .commit_id_o (vldu_commit_id ), + .commit_id_valid_o (vldu_commit_id_valid ), + .hazard_i (vldu_hazard ), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index d505ca4d7..c1027f330 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -683,6 +683,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin ara_req_d.op = ara_pkg::VWREDSUM; @@ -692,6 +693,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + 
ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1302,6 +1304,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt8; ara_req_d.eew_vs2 = eew_q[insn.varith_type.rs2]; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW64) || @@ -1312,6 +1315,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt8; ara_req_d.eew_vs2 = eew_q[insn.varith_type.rs2]; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW64) || @@ -1322,6 +1326,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt4; ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW32) || @@ -1331,6 +1336,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt4; ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW32) || @@ -1340,6 +1346,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) @@ -1349,6 +1356,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) 
inside {LMUL_1_8}) @@ -1396,6 +1404,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; @@ -1404,6 +1413,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; @@ -1412,6 +1422,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; @@ -1420,6 +1431,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; @@ -1429,6 +1441,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; @@ -1438,6 +1451,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; @@ -1447,6 +1461,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 
ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; @@ -1456,6 +1471,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; @@ -1464,6 +1480,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; @@ -1472,6 +1489,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; @@ -1480,6 +1498,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; @@ -1510,6 +1529,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1569,6 +1589,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + 
ara_req_d.special_hazard = 1'b1; // If stride > vl, the vslideup has no effects if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; @@ -1579,6 +1601,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; end 6'b010000: begin // VRXUNARY0 // vmv.s.x @@ -1627,6 +1651,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; @@ -1635,6 +1660,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; @@ -1643,6 +1669,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; @@ -1651,6 +1678,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; @@ -1660,6 +1688,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + 
ara_req_d.special_hazard = 1'b1; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; @@ -1669,6 +1698,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; @@ -1678,6 +1708,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; @@ -1687,6 +1718,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; @@ -1695,6 +1727,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; @@ -1703,6 +1736,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; @@ -1711,6 +1745,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; @@ -1721,6 +1756,7 @@ module 
ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; @@ -1731,6 +1767,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111110: begin // VWMACCUS ara_req_d.op = ara_pkg::VMACC; @@ -1741,6 +1778,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; @@ -1751,6 +1789,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1885,6 +1924,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000: begin // Widening VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1892,6 +1932,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01001: begin // Widening VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1899,6 +1940,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01010: begin // Widening 
VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1906,6 +1948,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01011: begin // Widening VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1913,6 +1956,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01100: begin // Widening VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1920,6 +1964,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01110: begin // Widening VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1927,6 +1972,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01111: begin // Widening VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -2036,6 +2082,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VFWREDUSUM ara_req_d.op = 
ara_pkg::VFWREDUSUM; @@ -2045,7 +2093,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.cvt_resize = resize_e'(2'b00); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; @@ -2054,6 +2103,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VFWREDOSUM ara_req_d.op = ara_pkg::VFWREDOSUM; @@ -2063,7 +2114,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.cvt_resize = resize_e'(2'b00); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; @@ -2073,6 +2125,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; @@ -2082,6 +2136,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; @@ -2089,6 +2145,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; 
#( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; @@ -2098,6 +2156,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; @@ -2107,6 +2167,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; @@ -2116,6 +2178,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; @@ -2125,6 +2189,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -2221,6 +2287,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 
1'b1; // If stride > vl, the vslideup has no effects if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; @@ -2228,9 +2296,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001111: begin // vfslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; - // Request will need reshuffling - ara_req_d.scale_vl = 1'b1; + ara_req_d.eew_vs2 = vtype_q.vsew; + // Request will need reshuffling + ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; end 6'b010000: begin // VRFUNARY0 // vmv.s.f diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 348c01107..539e3d2b3 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -41,7 +41,11 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Interface with the Address Generation input logic addrgen_ack_i, input logic addrgen_error_i, - input vlen_t addrgen_error_vl_i + input vlen_t addrgen_error_vl_i, + // Interface with the VLDU to handle load WAW and WAR hazards + input vid_t vldu_commit_id_i, + input logic vldu_commit_id_valid_i, + output logic vldu_hazard_o ); /////////////////////////////////// @@ -261,6 +265,9 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i write_list_d = write_list_q; global_hazard_table_d = global_hazard_table_o; + // No hazard check requested + vldu_hazard_o = 1'b0; + // Maintain request pe_req_d = '0; pe_req_valid_d = 1'b0; @@ -354,6 +361,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i fp_rm : ara_req_i.fp_rm, wide_fp_imm : ara_req_i.wide_fp_imm, cvt_resize : ara_req_i.cvt_resize, + special_hazard: ara_req_i.special_hazard, scale_vl : ara_req_i.scale_vl, vl : ara_req_i.vl, vstart : ara_req_i.vstart, @@ -370,12 +378,17 @@ module ara_sequencer import 
ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i pe_req_d.hazard_vs1 | pe_req_d.hazard_vs2; // We only issue instructions that take no operands if they have no hazards. + // Exception to this rule: loads, as they are super common. WAW and WAR hazards + // on load instructions are handled in the VLDU. // Moreover, SLIDE instructions cannot be always chained // ToDo: optimize the case for vslide1down, vslide1up (wait 2 cycles, then chain) - if (!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && - |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} || - (pe_req_d.op == VSLIDEUP && |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || - (pe_req_d.op == VSLIDEDOWN && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) + if ((!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && + |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} && + !(is_load(pe_req_d.op))) || + (pe_req_d.op == VSLIDEUP && !pe_req_d.use_scalar_op && + |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || + (pe_req_d.op == VSLIDEDOWN && !pe_req_d.use_scalar_op && + |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) begin ara_req_ready_o = 1'b0; pe_req_valid_d = 1'b0; @@ -453,6 +466,18 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i end endcase + // Load-related hazards handling + // Loads are masters on the x-bar to write the in-lane VRF. Nevertheless, + // they can have WAR or WAW dependencies. When there is a load in the load + // unit, its hazard bit is always checked and cleared here as soon as the + // dependency does not exist anymore. Whenever the hazard bit is set, + // the load cannot issue requests. + // It's safe to pipeline vldu_hazard_o if the timing is tight. 
+ // (if so, add a sync signal) + if (vldu_commit_id_valid_i) begin + vldu_hazard_o = |global_hazard_table_o[vldu_commit_id_i]; + end + // Update the global hazard table for (int id = 0; id < NrVInsn; id++) global_hazard_table_d[id] &= vinsn_running_d; end : p_sequencer diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 386b9823c..5e102877e 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -288,42 +288,44 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: unique case (pe_req.vfu) VFU_Alu: begin operand_request_i[AluA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - eew : pe_req.eew_vs1, + id : pe_req.id, + vs : pe_req.vs1, + eew : pe_req.eew_vs1, // If reductions and vl == 0, we must replace with neutral values - conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // In case of reduction, AluA opqueue will keep the scalar element - vl : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, - target_fu : ALU_SLDU, - default : '0 + vl : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 
1'b1 : 0, + target_fu : ALU_SLDU, + default : '0 }; operand_request_push[AluA] = pe_req.use_vs1; operand_request_i[AluB] = '{ - id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, // If reductions and vl == 0, we must replace with neutral values - conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, - target_fu : ALU_SLDU, - default : '0 + vl : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 
1'b1 : 0, + target_fu : ALU_SLDU, + default : '0 }; operand_request_push[AluB] = pe_req.use_vs2; @@ -346,66 +348,69 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: end VFU_MFpu: begin operand_request_i[MulFPUA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - eew : pe_req.eew_vs1, + id : pe_req.id, + vs : pe_req.vs1, + eew : pe_req.eew_vs1, // If reductions and vl == 0, we must replace with neutral values - conv : pe_req.conversion_vs1, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : pe_req.conversion_vs1, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default : '0 + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUA] = pe_req.use_vs1; operand_request_i[MulFPUB] = '{ - id : pe_req.id, - vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, - eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vd_op : pe_req.eew_vs2, + id : pe_req.id, + vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, + eew : pe_req.swap_vs2_vd_op ? 
pe_req.eew_vd_op : pe_req.eew_vs2, // If reductions and vl == 0, we must replace with neutral values - conv : pe_req.conversion_vs2, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : pe_req.conversion_vs2, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : (pe_req.swap_vs2_vd_op ? + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : (pe_req.swap_vs2_vd_op ? pe_req.hazard_vd : (pe_req.hazard_vs2 | pe_req.hazard_vd)), - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default: '0 + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUB] = pe_req.swap_vs2_vd_op ? pe_req.use_vd_op : pe_req.use_vs2; operand_request_i[MulFPUC] = '{ - id : pe_req.id, - vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, - eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, - conv : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, + id : pe_req.id, + vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, + eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, + conv : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. 
So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req.vtype, - hazard : pe_req.swap_vs2_vd_op ? + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req.vtype, + hazard : pe_req.swap_vs2_vd_op ? (pe_req.hazard_vs2 | pe_req.hazard_vd) : pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default : '0 + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUC] = pe_req.swap_vs2_vd_op ? pe_req.use_vs2 : pe_req.use_vd_op; @@ -447,17 +452,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Load indexed operand_request_i[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, - target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, - default : '0 + id : pe_req_i.id, + vs : pe_req_i.vs2, + eew : pe_req_i.eew_vs2, + conv : pe_req_i.conversion_vs2, + target_fu : MFPU_ADDRGEN, + special_hazard : pe_req.special_hazard, + vl : pe_req_i.vl / NrLanes, + scale_vl : pe_req_i.scale_vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req_i.vtype, + hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
@@ -503,17 +509,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Store indexed operand_request_i[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, - target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, - default : '0 + id : pe_req_i.id, + vs : pe_req_i.vs2, + eew : pe_req_i.eew_vs2, + conv : pe_req_i.conversion_vs2, + target_fu : MFPU_ADDRGEN, + special_hazard : pe_req.special_hazard, + vl : pe_req_i.vl / NrLanes, + scale_vl : pe_req_i.scale_vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req_i.vtype, + hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. @@ -524,16 +531,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VFU_SlideUnit: begin operand_request_i[SlideAddrGenA] = '{ - id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, - conv : pe_req.conversion_vs2, - target_fu: ALU_SLDU, - scale_vl : pe_req.scale_vl, - vtype : pe_req.vtype, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, - default : '0 + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, + conv : pe_req.conversion_vs2, + target_fu : ALU_SLDU, + special_hazard : pe_req.special_hazard, + scale_vl : pe_req.scale_vl, + vtype : pe_req.vtype, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, + default : '0 }; operand_request_push[SlideAddrGenA] = pe_req.use_vs2; diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 54590fbc3..5e124f2e8 100644 --- a/hardware/src/lane/operand_requester.sv +++ 
b/hardware/src/lane/operand_requester.sv @@ -245,15 +245,27 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // In case of a WAW with a previous instruction, // read once every two writes of the previous instruction logic is_widening; + // Does this instruction have a special hazard protocol? + logic special_hazard; // One-bit counters logic [NrVInsn-1:0] waw_hazard_counter; } requester_d, requester_q; + // Asserted if the SLDU requester is registering a new instruction + logic new_sldu_insn; + logic has_stalled_d, has_stalled_q; // Is there a hazard during this cycle? + // WAW with widening instructions are special: wait for 2 writes instead of 1 + // Slide1Up/Down with hazards should wait one cycle before being handled normally logic stall; - assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & - (~{NrVInsn{requester_q.is_widening}} | requester_q.waw_hazard_counter))); + assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & ((~{NrVInsn{requester_q.is_widening}} & + requester_q.special_hazard) | requester_q.waw_hazard_counter))) | + (~has_stalled_q & requester_q.special_hazard & |requester_q.hazard); + // NOTE(review): in stall, the 1-bit special_hazard is ANDed with an NrVInsn-bit vector (zero-extended) - confirm + // For every instruction, it signals if the requester has already stalled once + // This is needed for vslide1x stall handling + assign has_stalled_d = new_sldu_insn ? 1'b0 : (stall ? 1'b1 : has_stalled_q); // Did we get a grant?
logic [NrBanks-1:0] operand_requester_gnt; @@ -269,6 +281,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( state_d = state_q; requester_d = requester_q; + new_sldu_insn = 1'b0; + // Make no requests to the VRF operand_payload[requester] = '0; for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester] = 1'b0; @@ -288,6 +302,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the request operand_request_ready_o[requester] = 1'b1; + // New slide unit instruction incoming (SLDU operands use the SlideAddrGenA requester) + if (requester == SlideAddrGenA) + new_sldu_insn = 1'b1; + // Send a command to the operand queue operand_queue_cmd_o[requester] = '{ eew : operand_request_i[requester].eew, @@ -312,22 +330,24 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Store the request requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), + id : operand_request_i[requester].id, + addr : vaddr(operand_request_i[requester].vs, NrLanes) + + (operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), // For memory operations, the number of elements initially refers to the new EEW (vsew here), // but the requester must refer to the old EEW (eew here) // This reasoning cannot be applied also to widening instructions, which modify vsew // treating it as the EEW of vd - len : (operand_request_i[requester].scale_vl) ? - ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE, + len : (operand_request_i[requester].scale_vl) ?
+ ((operand_request_i[requester].vl << + operand_request_i[requester].vtype.vsew) >> + operand_request_i[requester].eew) : + operand_request_i[requester].vl, + vew : operand_request_i[requester].eew, + hazard : operand_request_i[requester].hazard, + is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE && + operand_request_i[requester].special_hazard, + special_hazard : operand_request_i[requester].special_hazard, default: '0 }; // The length should be at least one after the rescaling @@ -381,6 +401,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the request operand_request_ready_o[requester] = 1'b1; + // New slide unit instruction incoming (SLDU operands use the SlideAddrGenA requester) + if (requester == SlideAddrGenA) + new_sldu_insn = 1'b1; + // Send a command to the operand queue operand_queue_cmd_o[requester] = '{ eew : operand_request_i[requester].eew, @@ -401,18 +425,21 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Store the request requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - len : (operand_request_i[requester].scale_vl) ? - ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - default: '0 + id : operand_request_i[requester].id, + addr : vaddr(operand_request_i[requester].vs, NrLanes) + + (operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), + len : (operand_request_i[requester].scale_vl) ?
+ ((operand_request_i[requester].vl << + operand_request_i[requester].vtype.vsew) >> + operand_request_i[requester].eew) : + operand_request_i[requester].vl, + vew : operand_request_i[requester].eew, + hazard : operand_request_i[requester].hazard, + is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE && + operand_request_i[requester].special_hazard, + special_hazard : operand_request_i[requester].special_hazard, + default : '0 }; // The length should be at least one after the rescaling if (requester_d.len == '0) @@ -428,11 +455,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - state_q <= IDLE; - requester_q <= '0; + state_q <= IDLE; + requester_q <= '0; + has_stalled_q <= 1'b0; end else begin - state_q <= state_d; - requester_q <= requester_d; + state_q <= state_d; + requester_q <= requester_d; + has_stalled_q <= has_stalled_d; end end end : gen_operand_requester diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 51042ed8e..61b26623a 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -33,6 +33,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrVInsn-1:0] pe_vinsn_running_i, output logic pe_req_ready_o, output pe_resp_t pe_resp_o, + // Hazard handling to main sequencer + output vid_t commit_id_o, + output logic commit_id_valid_o, + input logic hazard_i, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, input logic axi_addrgen_req_valid_i, @@ -101,6 +105,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( logic vinsn_commit_valid; assign vinsn_commit = vinsn_queue_q.vinsn[vinsn_queue_q.commit_pnt]; assign vinsn_commit_valid = (vinsn_queue_q.commit_cnt != '0); + // To the main sequencer, for hazard checking + assign commit_id_valid_o = vinsn_commit_valid; + assign commit_id_o = vinsn_commit.id; always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) 
begin @@ -354,7 +361,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( ////////////////////////////////// for (int lane = 0; lane < NrLanes; lane++) begin: result_write - ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane]; + // Create a request only if there are no more hazards on vd (check vs1 since the info about + // hazard vd is also there) + ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane] && + !vinsn_commit.hazard_vs1; ldu_result_addr_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].addr; ldu_result_id_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].id; ldu_result_wdata_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].wdata; @@ -415,6 +425,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d.commit_pnt].vtype.vsew); end + // Update the Vd hazard bit for the current instruction + // hazard_vs1, hazard_vs2, hazard_vm all contain the info about hazard_vd, so work on one of them (vs1) + if (commit_id_valid_o) vinsn_queue_d.vinsn[vinsn_queue_q.commit_pnt].hazard_vs1 &= {NrVInsn{hazard_i}}; + ////////////////////////////// // Accept new instruction // ////////////////////////////// diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index aa2e05283..448b53a87 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -44,6 +44,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( output logic addrgen_ack_o, output logic addrgen_error_o, output vlen_t addrgen_error_vl_o, + output vid_t commit_id_o, + output logic commit_id_valid_o, + input logic hazard_i, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ -172,6 +175,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_vinsn_running_i (pe_vinsn_running_i ), .pe_req_ready_o (pe_req_ready_o[OffsetLoad]), .pe_resp_o (pe_resp_o[OffsetLoad] ), + .commit_id_o (commit_id_o ), + .commit_id_valid_o (commit_id_valid_o ), + .hazard_i 
(hazard_i ), // Interface with the address generator .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ),