Merge pull request #427 from paulusmack/fixes

Various FPU and warning fixes
2026-05-08 00:46:45 +00:00 · 2024-04-08 21:45:52 -07:00
parent 18911455c6 0ceace927c
commit 4b1e7c8d75
12 changed files with 108 additions and 37 deletions
--- a/countbits.vhdl
+++ b/countbits.vhdl
@@ -50,9 +50,11 @@ architecture behaviour of bit_counter is
 begin
    countzero_r: process(clk)
    begin
-        if rising_edge(clk) and stall = '0' then
-            inp_r <= inp;
-            sum_r <= sum;
+        if rising_edge(clk) then
+            if stall = '0' then
+                inp_r <= inp;
+                sum_r <= sum;
+            end if;
        end if;
    end process;

--- a/fetch1.vhdl
+++ b/fetch1.vhdl
@@ -102,9 +102,6 @@ architecture behaviour of fetch1 is
    signal itlb_pte : tlb_pte_t;
    signal itlb_hit : std_ulogic;

-    -- Privilege bit from PTE EAA field
-    signal eaa_priv  : std_ulogic;
-
    -- Simple hash for direct-mapped TLB index
    function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
        variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
@@ -155,7 +152,7 @@ begin
        attribute ram_style of btc_memory : signal is "block";

        signal btc_valids : std_ulogic_vector(BTC_SIZE - 1 downto 0);
-        attribute ram_style of btc_valids : signal is "distributed";
+        -- attribute ram_style of btc_valids : signal is "distributed";

        signal btc_wr : std_ulogic;
        signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0);
--- a/fpga/arty_a7.xdc
+++ b/fpga/arty_a7.xdc
@@ -171,15 +171,15 @@ set_property -dict { PACKAGE_PIN R15 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_po
 set_property -dict { PACKAGE_PIN P15 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io33 }];
 set_property -dict { PACKAGE_PIN R16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io34 }];
 set_property -dict { PACKAGE_PIN N16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io35 }];
-set_property -dict { PACKAGE_PIN N14 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io36 }];
-set_property -dict { PACKAGE_PIN U17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io37 }];
-set_property -dict { PACKAGE_PIN T18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io38 }];
-set_property -dict { PACKAGE_PIN R18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io39 }];
-set_property -dict { PACKAGE_PIN P18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io40 }];
-set_property -dict { PACKAGE_PIN N17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io41 }];
-set_property -dict { PACKAGE_PIN M17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io42 }]; # A
-set_property -dict { PACKAGE_PIN L18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io43 }]; # SCL
-set_property -dict { PACKAGE_PIN M18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io44 }]; # SDA
+#set_property -dict { PACKAGE_PIN N14 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io36 }];
+#set_property -dict { PACKAGE_PIN U17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io37 }];
+#set_property -dict { PACKAGE_PIN T18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io38 }];
+#set_property -dict { PACKAGE_PIN R18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io39 }];
+#set_property -dict { PACKAGE_PIN P18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io40 }];
+#set_property -dict { PACKAGE_PIN N17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io41 }];
+#set_property -dict { PACKAGE_PIN M17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io42 }]; # A
+#set_property -dict { PACKAGE_PIN L18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io43 }]; # SCL
+#set_property -dict { PACKAGE_PIN M18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io44 }]; # SDA
 #set_property -dict { PACKAGE_PIN C2  IOSTANDARD LVCMOS33 } [get_ports { shield_rst }];

 #set_property -dict { PACKAGE_PIN C1  IOSTANDARD LVCMOS33 } [get_ports { spi_hdr_ss }];
--- a/fpga/top-arty.vhdl
+++ b/fpga/top-arty.vhdl
@@ -206,6 +206,9 @@ architecture behaviour of toplevel is
    signal ddram_clk_p_vec : std_logic_vector(0 downto 0);
    signal ddram_clk_n_vec : std_logic_vector(0 downto 0);

+    signal uart1_rxd : std_ulogic;
+    signal uart1_txd : std_ulogic;
+
    -- Fixup various memory sizes based on generics
    function get_bram_size return natural is
    begin
@@ -266,8 +269,8 @@ begin
            uart0_rxd         => uart_main_rx,

 	    -- UART1 signals
-	    --uart1_txd         => uart_pmod_tx,
-	    --uart1_rxd         => uart_pmod_rx,
+            uart1_txd         => uart1_txd,
+            uart1_rxd         => uart1_rxd,

            -- SPI signals
            spi_flash_sck     => spi_sck,
@@ -302,7 +305,7 @@ begin
            wishbone_dma_out     => wb_sddma_out
            );

-    --uart_pmod_rts_n <= '0';
+    uart1_rxd <= '1';  -- UART1 has no pins here: hold the SoC's RX input at idle-high (txd is driven by the SoC)

    -- SPI Flash
    --
@@ -415,8 +418,9 @@ begin
                );

        -- Generate SoC reset
-        soc_rst_gen: process(system_clk)
+        soc_rst_gen: process(system_clk, ext_rst_n)
        begin
+            -- XXX why does this need to be an asynchronous reset?
            if ext_rst_n = '0' then
                soc_rst <= '1';
            elsif rising_edge(system_clk) then
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -953,7 +953,6 @@ begin
            v.denorm := '0';
            v.is_subtract := '0';
            v.add_bsmall := '0';
-            v.doing_ftdiv := "00";
            v.int_ovf := '0';
            v.div_close := '0';

@@ -1007,7 +1006,7 @@ begin
        elsif new_exp < min_exp then
            exp_tiny := '1';
        end if;
-	if is_X(new_exp) or is_X(min_exp) then
+	if is_X(new_exp) or is_X(max_exp) then
 	    exp_huge := 'X';
 	elsif new_exp > max_exp then
            exp_huge := '1';
@@ -1038,6 +1037,7 @@ begin

        v.update_fprf := '0';
        v.first := '0';
+        v.doing_ftdiv := "00";
        v.opsel_a := AIN_R;
        opsel_ainv <= '0';
        opsel_mask <= '0';
@@ -1147,8 +1147,10 @@ begin
                v.instr_done := '1';

            when DO_FTDIV =>
-                v.instr_done := '1';
                v.cr_result := "0000";
+                -- set result_exp to the exponent of B
+                re_sel2 <= REXP2_B;
+                re_set_result <= '1';
                if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
                    (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then
                    v.cr_result(2) := '1';
@@ -1157,6 +1159,7 @@ begin
                    r.b.class = NAN or r.b.class = ZERO or r.b.class = INFINITY or
                    (r.a.class = FINITE and r.a.exponent <= to_signed(-970, EXP_BITS)) then
                    v.cr_result(1) := '1';
+                    v.instr_done := '1';
                else
                    v.doing_ftdiv := "11";
                    v.first := '1';
@@ -1173,7 +1176,7 @@ begin
                end if;
                if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO
                    or r.b.negative = '1' or r.b.exponent <= to_signed(-970, EXP_BITS) then
-                    v.cr_result(1) := '0';
+                    v.cr_result(1) := '1';
                end if;

            when DO_FCMP =>
@@ -2148,6 +2151,9 @@ begin
                v.state := NORMALIZE;

            when FTDIV_1 =>
+                -- We go through this state up to two times; the first sees if
+                -- B.exponent is in the range [-1021,1020], and the second tests
+                -- whether B.exp - A.exp is in the range [-1022,1020].
                v.cr_result(1) := exp_tiny or exp_huge;
                -- set shift to a.exp
                rs_sel2 <= RSH2_A;
--- a/icache.vhdl
+++ b/icache.vhdl
@@ -403,12 +403,12 @@ begin
            variable snoop_addr : real_addr_t;
            variable next_raddr : real_addr_t;
        begin
-            replace_way := to_unsigned(0, WAY_BITS);
-            if NUM_WAYS > 1 then
-                -- Get victim way from plru
-                replace_way := plru_victim;
-            end if;
            if rising_edge(clk) then
+                replace_way := to_unsigned(0, WAY_BITS);
+                if NUM_WAYS > 1 then
+                    -- Get victim way from plru
+                    replace_way := plru_victim;
+                end if;
                -- Read tags using NIA for next cycle
                if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then
                    next_raddr := i_in.next_rpn & i_in.next_nia(MIN_LG_PGSZ - 1 downto 0);
@@ -649,6 +649,7 @@ begin
    begin
        if rising_edge(clk) then
            ev.icache_miss <= '0';
+            ev.itlb_miss_resolved <= '0';
            r.recv_valid <= '0';
 	    -- On reset, clear all valid bits to force misses
            if rst = '1' then
--- a/microwatt.core
+++ b/microwatt.core
@@ -62,14 +62,13 @@ filesets:
      - fpga/pp_soc_uart.vhd
      - fpga/pp_utilities.vhd
      - fpga/firmware.hex : {copyto : firmware.hex, file_type : user}
+      - nonrandom.vhdl
    file_type : vhdlSource-2008

  xilinx_specific:
    files:
      - xilinx-mult.vhdl : {file_type : vhdlSource-2008}
      - xilinx-mult-32s.vhdl : {file_type : vhdlSource-2008}
-      - fpga/fpga-random.vhdl : {file_type : vhdlSource-2008}
-      - fpga/fpga-random.xdc : {file_type : xdc}

  debug_xilinx:
    files:
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1665,6 +1665,65 @@ int fpu_test_25(void)
 	return 0;
 }

+struct ftvals {
+	unsigned long val_a;	/* bit pattern test26 loads into f5: ftdiv operand A and the ftsqrt operand */
+	unsigned long val_b;	/* bit pattern test26 loads into f6: ftdiv operand B */
+	int cr_ftdiv;	/* CR field value test26 expects from ftdiv */
+	int cr_ftsqrt;	/* CR field value test26 expects from ftsqrt */
+} ftvals[] = {
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0, 0 },
+	{ 0x0000000000000000, 0x3ff0000000000000, 0, 6 },
+	{ 0xfff0000000000000, 0x3ff0000000000000, 6, 6 },
+	{ 0x7ff1234560000000, 0x3ff0000000000000, 2, 2 },
+	{ 0x3ff0000000000000, 0xfff0000000000000, 6, 0 },
+	{ 0x3ff0000000000000, 0x8000000000000000, 6, 0 },
+	{ 0x3ff0000000000000, 0x7ff9234560000000, 2, 0 },
+	{ 0x3ff0000000000000, 0x0020000000000000, 0, 0 },
+	{ 0x3ff0000000000000, 0x0010000000000000, 2, 0 },
+	{ 0x3ff0000000000000, 0x0001000000000000, 6, 0 },
+	{ 0x3ff0000000000000, 0x7fb1234500000000, 0, 0 },
+	{ 0x3ff0000000000000, 0x7fc1234500000000, 2, 0 },
+	{ 0x3ff0000000000000, 0x7fd1234500000000, 2, 0 },
+	{ 0x3ff0000000000000, 0x7fe1234500000000, 2, 0 },
+	{ 0x6000000000000000, 0x2000000000000000, 2, 0 },
+	{ 0x5ff0000000000000, 0x2000000000000000, 2, 0 },
+	{ 0x5fe0000000000000, 0x2000000000000000, 0, 0 },
+	{ 0x2000000000000000, 0x5fc0000000000000, 0, 0 },
+	{ 0x2000000000000000, 0x5fd0000000000000, 2, 0 },
+	{ 0x0360000000000000, 0x4320000000000000, 0, 0 },
+	{ 0x0350000000000000, 0x4310000000000000, 2, 2 },
+	{ 0x0010000000000000, 0x3fd0000000000000, 2, 2 },
+	{ 0x0001000000000000, 0x3fd0000000000000, 2, 6 },
+	{ 0xbff0000000000000, 0x3ff0000000000000, 0, 2 },
+	{ 0x3fd0000000000000, 0x0001000000000000, 6, 0 },
+};
+
+int test26(long arg)
+{
+	long i;
+	int cr;
+	struct ftvals *vp = ftvals;
+	/* Run ftdiv/ftsqrt on each ftvals row: ftdiv sets CR5 (mfcr bits 11:8), ftsqrt CR4 (bits 15:12) */
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(ftvals) / sizeof(ftvals[0]); ++i, ++vp) {
+		asm("lfd 5,0(%1); lfd 6,8(%1); ftdiv 5,5,6; ftsqrt 4,5; mfcr %0" :
+		    "=r" (cr) : "b" (&vp->val_a) : "cr4", "cr5");
+		if (((cr >> 8) & 0xf) != vp->cr_ftdiv ||
+		    ((cr >> 12) & 0xf) != vp->cr_ftsqrt) {	/* CR fields are 4 bits; a wider mask would pull in CR3 */
+			print_hex(i, 2, " ");
+			print_hex(cr, 8, " ");
+			return i + 1;	/* nonzero: 1-based index of the first mismatching row */
+		}
+	}
+	return 0;
+}
+
+int fpu_test_26(void)	/* entry point registered with do_test() for the ftdiv/ftsqrt CR-result checks */
+{
+	enable_fp();
+	return trapit(0, test26);	/* hands test26 and argument 0 to trapit(); trapit is defined outside this hunk */
+}
+
 int fail = 0;

 void do_test(int num, int (*test)(void))
@@ -1715,6 +1774,7 @@ int main(void)
 	do_test(23, fpu_test_23);
 	do_test(24, fpu_test_24);
 	do_test(25, fpu_test_25);
+	do_test(26, fpu_test_26);

 	return fail;
 }
--- a/tests/test_fpu.bin
+++ b/tests/test_fpu.bin
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -23,3 +23,4 @@ test 22:PASS
 test 23:PASS
 test 24:PASS
 test 25:PASS
+test 26:PASS
--- a/xics.vhdl
+++ b/xics.vhdl
@@ -386,15 +386,14 @@ begin
    reg_write: process(clk)
        variable be_in  : std_ulogic_vector(31 downto 0);
    begin
-        -- Byteswapped input
-        be_in := bswap(wb_in.dat);
-
        if rising_edge(clk) then
            if rst = '1' then
                for i in 0 to SRC_NUM - 1 loop
                    xives(i) <= (pri => pri_masked);
                end loop;
            elsif wb_valid = '1' and wb_in.we = '1' then
+                -- Byteswapped input
+                be_in := bswap(wb_in.dat);
                if reg_is_xive then
                    -- TODO: When adding support for other bits, make sure to
                    -- properly implement wb_in.sel to allow partial writes.
--- a/xilinx-mult-32s.vhdl
+++ b/xilinx-mult-32s.vhdl
@@ -286,9 +286,11 @@ begin

    process(clk)
    begin
-        if rising_edge(clk) and stall = '0' then
-            m_out.valid <= m_in.valid;
-            product_lo <= m01_p(5 downto 0) & m00_p(16 downto 0);
+        if rising_edge(clk) then
+            if stall = '0' then
+                m_out.valid <= m_in.valid;
+                product_lo <= m01_p(5 downto 0) & m00_p(16 downto 0);
+            end if;
        end if;
    end process;