FPU: Improve zero result detection and simplify final states

This improves detection of results that are exactly zero in FINISH state by noting that on entry to FINISH state, if R is zero then X must also be zero, so no rounding needs to be done and no underflow exists. Therefore we can set rcls_op = RCLS_TZERO to test for zero and exit early if R = 0. The RCLS_TZERO test now tests the whole of R just in case.

The states that follow have been streamlined and simplified. In cases of underflow, we only need to take action before rounding in the UE=0 case (disabled underflow exception), where we need to denormalize before rounding. For enabled underflow cases we just use the existing NORMALIZE state, which lets us remove NORM_UFLOW state.

On entry to ROUNDING state, R can be zero or denorm only for round to integer instructions (fri*) or for disabled underflow exception cases.

Note that in case of underflow with UE=0, the exception is only actually signalled if there is loss of accuracy, i.e. if FPSCR[FI] will be set. This is now done at the end of ROUNDING state. For underflow with UE=1, we go from ROUNDING, ROUNDING_2 or ROUNDING_3 state to a new ROUND_UFLOW_EN state, which adjusts the exponent.

In the ROUNDING* states, we avoid shifting left to normalize a result with exponent <= -1022, because if we did we would then just need to denormalize again. This lets us get rid of DENORM state.

Finally, noticing that DO_FRSP_2 state does much the same as FINISH state lets us remove DO_FRSP_2 state and go to FINISH state from DO_FRSP.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2026-01-11 23:43:15 +00:00 · 2025-12-14 08:42:23 +11:00 · 2025-12-14 08:42:23 +11:00 · 1ad8848655
commit 1ad8848655
parent f8a11420ca
3 changed files with 66 additions and 72 deletions
--- a/fpu.vhdl
+++ b/fpu.vhdl
@ -51,7 +51,7 @@ architecture behaviour of fpu is
                     DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
                     DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                     DO_FCFID, DO_FCTI,
-                     DO_FRSP, DO_FRSP_2, DO_FRI,
+                     DO_FRSP, DO_FRI,
                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD,
                     DO_FRE,
                     DO_FSEL,
@ -72,9 +72,9 @@ architecture behaviour of fpu is
                     INT_SHIFT, INT_ROUND, INT_ISHIFT,
                     INT_FINAL, INT_CHECK, INT_OFLOW,
                     FINISH, NORMALIZE,
-                     ROUND_UFLOW, NORM_UFLOW, ROUND_OFLOW_DIS, ROUND_OFLOW_EN,
+                     ROUND_UFLOW_DIS, ROUND_UFLOW_EN,
                     ROUND_OFLOW_DIS, ROUND_OFLOW_EN,
                     ROUNDING, ROUND_INC, ROUNDING_2, ROUNDING_3,
                     DENORM,
                     RENORM_A, RENORM_B, RENORM_C,
                     RENORM_1, RENORM_2,
                     IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3,
@ -776,6 +776,9 @@ begin
                end if;
            else
                assert not (r.state /= IDLE and e_in.valid = '1') severity failure;
                assert not (rin.state = FINISH and rin.r = 64x"0" and rin.x = '1');
                assert not (rin.state = ROUNDING and rin.r(UNIT_BIT) = '0' and
                            not (rin.tiny = '1' or rin.zero_fri = '1'));
                r <= rin;
            end if;
        end if;
@ -1630,22 +1633,7 @@ begin
                set_r := '1';
                re_sel2 <= REXP2_B;
                re_set_result <= '1';
-                v.state := DO_FRSP_2;
+                v.state := FINISH;
            when DO_FRSP_2 =>
                -- r.shift = 0
                -- set shift to exponent - -126 (for ROUND_UFLOW state)
                rs_sel1 <= RSH1_B;
                rs_con2 <= RSCON2_MINEXP;
                rs_neg2 <= '1';
                set_x := '1';   -- uses r.r and r.shift
                if exp_tiny = '1' then
                    v.state := ROUND_UFLOW;
                elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
                    v.state := ROUND_OFLOW_DIS;
                else
                    v.state := ROUNDING;
                end if;
            when DO_FCTI =>
                -- instr bit 9: 1=dword 0=word
@ -2414,17 +2402,20 @@ begin
            when FINISH =>
                -- r.shift = 0
                -- set shift to new_exp - min_exp (N.B. rs_norm overrides this)
                -- assert that if r.r = 0 then r.x = 0 also
                rs_sel1 <= RSH1_NE;
                rs_con2 <= RSCON2_MINEXP;
                rs_neg2 <= '1';
                rcls_op <= RCLS_TZERO;
                if r.r(63 downto UNIT_BIT) /= std_ulogic_vector(to_unsigned(1, 64 - UNIT_BIT)) then
                    rs_norm <= '1';
                    v.state := NORMALIZE;
                else
                    set_x := '1';
                    set_xs := r.is_multiply;
-                    if exp_tiny = '1' then
+                    v.tiny := exp_tiny;
-                        v.state := ROUND_UFLOW;
+                    if exp_tiny = '1' and r.fpscr(FPSCR_UE) = '0' then
                        v.state := ROUND_UFLOW_DIS;
                    elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
                        v.state := ROUND_OFLOW_DIS;
                    else
@ -2445,51 +2436,25 @@ begin
                rs_neg2 <= '1';
                set_x := '1';
                set_xs := r.is_multiply;
-                if exp_tiny = '1' then
+                v.tiny := exp_tiny;
-                    v.state := ROUND_UFLOW;
+                if exp_tiny = '1' and r.fpscr(FPSCR_UE) = '0' then
                    v.state := ROUND_UFLOW_DIS;
                elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
                    v.state := ROUND_OFLOW_DIS;
                else
                    v.state := ROUNDING;
                end if;
-            when ROUND_UFLOW =>
+            when ROUND_UFLOW_DIS =>
                -- r.shift = - amount by which exponent underflows
-                v.tiny := '1';
+                -- disabled underflow exception case
                -- have to denormalize before rounding
                opsel_r <= RES_SHIFT;
                set_r := '0';
                if r.fpscr(FPSCR_UE) = '0' then
                    -- disabled underflow exception case
                    -- have to denormalize before rounding
                    set_r := '1';
                    re_sel2 <= REXP2_NE;
                    re_set_result <= '1';
                    set_x := '1';
                    v.state := ROUNDING;
                else
                    -- enabled underflow exception case
                    -- if denormalized, have to normalize before rounding
                    v.fpscr(FPSCR_UX) := '1';
                    re_sel1 <= REXP1_R;
                    re_con2 <= RECON2_BIAS;
                    re_set_result <= '1';
                    if r.r(UNIT_BIT) = '0' then
                        rs_norm <= '1';
                        v.state := NORM_UFLOW;
                    else
                        v.state := ROUNDING;
                    end if;
                end if;
            when NORM_UFLOW =>
                -- normalize for UE=1 underflow case
                -- r.shift = clz(r.r) - 7
                opsel_r <= RES_SHIFT;
                set_r := '1';
                re_sel2 <= REXP2_NE;
                re_set_result <= '1';
                set_x := '1';
                set_xs := r.is_multiply;
                v.state := ROUNDING;
            when ROUND_OFLOW_DIS =>
@ -2508,6 +2473,8 @@ begin
                arith_done := '1';
            when ROUNDING =>
                -- r.r can be zero or denorm here for fri* instructions,
                -- and for disabled underflow exception cases.
                opsel_mask <= '1';
                set_r := '1';
                round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign);
@ -2520,10 +2487,22 @@ begin
                    -- increment the LSB for the precision
                    v.state := ROUND_INC;
                elsif r.r(UNIT_BIT) = '0' then
-                    -- result after masking could be zero, or could be a
+                    -- Result after masking could be zero, or could be a
-                    -- denormalized result that needs to be renormalized
+                    -- denormalized result that needs to be renormalized,
-                    rs_norm <= '1';
+                    -- but only for fri* instructions and for disabled
                    -- underflow exception cases.
                    -- For fri* instructions, result_exp is 52.
                    -- For disabled underflow exception cases for DP operations,
                    -- result_exp is -1022 and there is no point renormalizing
                    -- since it will just get denormalized again, but we do need
                    -- to check for a zero result in a subsequent cycle
                    -- after R is masked.
                    if r.result_exp > to_signed(-1022, EXP_BITS) then
                        rs_norm <= '1';
                    end if;
                    v.state := ROUNDING_3;
                elsif r.tiny = '1' and r.fpscr(FPSCR_UE) = '1' then
                    v.state := ROUND_UFLOW_EN;
                elsif r.result_exp > max_exp then
                    v.state := ROUND_OFLOW_EN;
                else
@ -2531,9 +2510,9 @@ begin
                end if;
                if round(0) = '1' and r.zero_fri = '0' then
                    v.fpscr(FPSCR_XX) := '1';
-                    if r.tiny = '1' then
+                end if;
-                        v.fpscr(FPSCR_UX) := '1';
+                if round(0) = '1' and r.tiny = '1' then
-                    end if;
+                    v.fpscr(FPSCR_UX) := '1';
                end if;
            when ROUND_INC =>
@ -2544,18 +2523,30 @@ begin
            when ROUNDING_2 =>
                -- Check for overflow during rounding
                -- r.shift = 0
-                if r.r(UNIT_BIT + 1) = '1' or r.r(UNIT_BIT) = '0' then
+                if r.r(UNIT_BIT + 1) = '1' then
                    -- Do CLZ so we can renormalize the result
                    rs_norm <= '1';
                    v.state := ROUNDING_3;
                elsif r.r(UNIT_BIT) = '0' then
                    -- R is non-zero (we just incremented it)
                    -- If result_exp is -1022 here, don't normalize since
                    -- we would then need to denormalize again.
                    if r.result_exp > to_signed(-1022, EXP_BITS) then
                        rs_norm <= '1';
                    end if;
                    v.state := ROUNDING_3;
                elsif exp_huge = '1' then
                    v.state := ROUND_OFLOW_EN;
                elsif r.tiny = '1' and r.fpscr(FPSCR_UE) = '1' then
                    v.state := ROUND_UFLOW_EN;
                else
                    arith_done := '1';
                end if;
            when ROUNDING_3 =>
-                -- r.shift = clz(r.r) - 7
+                -- r.shift = clz(r.r) - 7 (or 0, or -7, if r.r is 0)
                -- Note clz may be done on the value before being masked
                -- to the result precision.
                opsel_r <= RES_SHIFT;
                set_r := '1';
                re_sel2 <= REXP2_NE;
@ -2572,20 +2563,12 @@ begin
                    v.state := ROUND_OFLOW_DIS;
                elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '1' then
                    v.state := ROUND_OFLOW_EN;
-                elsif new_exp < to_signed(-1022, EXP_BITS) then
+                elsif r.tiny = '1' and r.fpscr(FPSCR_UE) = '1' then
-                    v.state := DENORM;
+                    v.state := ROUND_UFLOW_EN;
                else
                    arith_done := '1';
                end if;
            when DENORM =>
                -- r.shift = result_exp - -1022
                opsel_r <= RES_SHIFT;
                set_r := '1';
                re_sel2 <= REXP2_NE;
                re_set_result <= '1';
                arith_done := '1';
            when ROUND_OFLOW_EN =>
                -- enabled overflow exception
                -- rounding and normalization has been done
@ -2596,6 +2579,15 @@ begin
                re_set_result <= '1';
                arith_done := '1';
            when ROUND_UFLOW_EN =>
                -- enabled underflow exception
                -- rounding and normalization has been done
                v.fpscr(FPSCR_UX) := '1';
                re_sel1 <= REXP1_R;
                re_con2 <= RECON2_BIAS;
                re_set_result <= '1';
                arith_done := '1';
            when DO_IDIVMOD =>
                opsel_a <= AIN_B;
                opsel_aabs <= '1';
@ -3196,7 +3188,7 @@ begin
                    when others =>
                end case;
            when RCLS_TZERO =>
-                if or (r.r(UNIT_BIT + 2 downto 0)) = '0' then
+                if or (r.r) = '0' then
                    v.result_class := ZERO;
                    arith_done := '1';
                end if;
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@ -1627,6 +1627,8 @@ struct fmavals {
 	  0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 },
 	{ 0x41efffffffe00000, 0xc1efffffffe00000, 0x43f0000000000000, FPS_RN_CEIL,
 	  0x41fffffffff00000, 0xc3ffffffffe00000, 0xc1fffffffff00000, 0x43ffffffffe00000 },
 	{ 0x3ff0000000000000, 0x000060fbffffefc1, 0x000060fbffffefc1, FPS_RN_NEAR,
 	  0x0000c1f7ffffdf82, 0x0000000000000000, 0x8000c1f7ffffdf82, 0x8000000000000000 },
 };
 int test23(long arg)
--- a/tests/test_fpu.bin
+++ b/tests/test_fpu.bin