mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-05-04 15:26:12 +00:00
The goal is to have the icache fit in BRAM by latching the output into a register. In order to avoid timing issues, we need to give the BRAM a full cycle on reads, and thus we source the BRAM address directly from the NIA latched by fetch1. (Note: This will be problematic if/when we want to hash the address; we'll probably be better off having fetch1 latch a fully hashed address along with the normal one, so the icache can use the former to address the BRAM and pass the latter along.) One difficulty is that we cannot really stall the icache without adding more combo logic that would break the "one full cycle" BRAM model. This means that on stalls from decode, by the time we stall fetch1, it has already gone to the next address, which the icache is already latching. We work around this by having a "stash" buffer in fetch2 that will stash away the icache output on a stall, and override the output of the icache with the content of the stash buffer when unstalling. This requires a rewrite of the stop/step debug logic as well. We now do most of the hard work in fetch1, which makes more sense. Note: Vivado is still not inferring a built-in output register for the BRAMs. I don't want to add another cycle... I don't fully understand why it wouldn't be able to treat current_row as such, but clearly it won't. At least the timing seems good enough now for 100MHz, possibly more. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
258 lines
7.9 KiB
VHDL
258 lines
7.9 KiB
VHDL
library ieee;
|
|
use ieee.std_logic_1164.all;
|
|
use ieee.numeric_std.all;
|
|
|
|
library work;
|
|
use work.common.all;
|
|
use work.wishbone_types.all;
|
|
|
|
-- 64 bit direct mapped icache. All instructions are 4B aligned.
|
|
|
|
entity icache is
    generic (
        -- Number of 64-bit doublewords making up one cache line
        LINE_SIZE_DW : natural := 8;
        -- Number of lines in the cache
        NUM_LINES    : natural := 32
    );
    port (
        -- Clock and reset (reset is sampled on the rising clock edge)
        clk          : in  std_ulogic;
        rst          : in  std_ulogic;

        -- Fetch request coming in, instruction going out
        i_in         : in  Fetch1ToIcacheType;
        i_out        : out IcacheToFetch2Type;

        -- stall_out is high whenever the presented address is not a cache hit;
        -- flush_in prevents the current request from being latched as a hit
        stall_out    : out std_ulogic;
        flush_in     : in  std_ulogic;

        -- Wishbone master interface used to reload cache lines
        wishbone_out : out wishbone_master_out;
        wishbone_in  : in  wishbone_slave_out
    );
end entity icache;
|
|
|
|
architecture rtl of icache is
|
|
-- Integer base-2 logarithm, rounded down. Inputs of 0 and 1 both give 0.
function log2(i : natural) return integer is
    variable remaining : integer := i;
    variable bits      : integer := 0;
begin
    -- Count how many times the value can be halved before it reaches 1
    loop
        exit when remaining <= 1;
        remaining := remaining / 2;
        bits      := bits + 1;
    end loop;
    return bits;
end function;
|
|
|
|
-- Returns true when i is a positive power of two (1, 2, 4, 8, ...).
-- Zero and negative values return false, so a bad generic is reported by the
-- caller's assertion rather than by a range error inside to_unsigned().
function ispow2(i : integer) return boolean is
begin
    -- to_unsigned() only accepts naturals, so i - 1 must never go negative
    if i < 1 then
        return false;
    end if;
    -- A power of two has a single bit set and therefore shares no bit with i - 1
    return to_integer(to_unsigned(i, 32) and to_unsigned(i - 1, 32)) = 0;
end function;
|
|
|
|
-- Cache geometry derived from the generics
constant LINE_SIZE   : natural := LINE_SIZE_DW*8;                 -- bytes per cache line
constant OFFSET_BITS : natural := log2(LINE_SIZE);                -- address bits below the line index
constant INDEX_BITS  : natural := log2(NUM_LINES);                -- address bits selecting a line
constant TAG_BITS    : natural := 64 - OFFSET_BITS - INDEX_BITS;  -- remaining upper address bits

-- One full cache line of data, and the array holding every line
subtype cacheline_type is std_logic_vector((LINE_SIZE*8)-1 downto 0);
type cacheline_array is array(0 to NUM_LINES-1) of cacheline_type;

-- One address tag per cache line
subtype cacheline_tag_type is std_logic_vector(TAG_BITS-1 downto 0);
type cacheline_tag_array is array(0 to NUM_LINES-1) of cacheline_tag_type;

-- Storage. "cachelines" is meant to end up in block RAM (see the ram_style
-- attribute below); tags and valid bits are expected to land in LUTs.
signal cachelines : cacheline_array;
signal tags       : cacheline_tag_array;
signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0);
attribute ram_style : string;
attribute ram_style of cachelines : signal is "block";
attribute ram_decomp : string;
attribute ram_decomp of cachelines : signal is "power";

-- States of the cache reload state machine
type state_type is (IDLE, WAIT_ACK);

type reg_internal_type is record
    -- Hit path: line read from the cache array plus the request it belongs to
    hit_line    : cacheline_type;
    hit_nia     : std_ulogic_vector(63 downto 0);
    hit_smark   : std_ulogic;
    hit_valid   : std_ulogic;

    -- Miss path: reload state machine and its wishbone request
    state       : state_type;
    wb          : wishbone_master_out;
    store_index : integer range 0 to (NUM_LINES-1);
    store_mask  : std_ulogic_vector(LINE_SIZE_DW-1 downto 0);
end record;

signal r : reg_internal_type;

-- Combinational decode of the incoming request (driven by icache_comb)
signal req_index  : integer range 0 to NUM_LINES-1;
signal req_tag    : std_ulogic_vector(TAG_BITS-1 downto 0);
signal req_word   : integer range 0 to LINE_SIZE_DW*2-1;
signal req_is_hit : std_ulogic;
|
|
|
|
-- Extract the cache line index (which also selects the tag entry) from an address
function get_index(addr: std_ulogic_vector(63 downto 0)) return integer is
    constant INDEX_LSB : integer := OFFSET_BITS;
    constant INDEX_MSB : integer := OFFSET_BITS + INDEX_BITS - 1;
begin
    return to_integer(unsigned(addr(INDEX_MSB downto INDEX_LSB)));
end;
|
|
|
|
-- Extract the position of the 32-bit word inside its cache line from an address
-- (the two low address bits are dropped)
function get_word(addr: std_ulogic_vector(63 downto 0)) return integer is
    variable word_field : unsigned(OFFSET_BITS-3 downto 0);
begin
    word_field := unsigned(addr(OFFSET_BITS-1 downto 2));
    return to_integer(word_field);
end;
|
|
|
|
-- Select the 32-bit word number "word" out of a cache line; word 0 occupies
-- the least significant 32 bits of the line
function read_word(word: integer; data: cacheline_type) return std_ulogic_vector is
    constant WORD_LSB : integer := word * 32;
begin
    return data(WORD_LSB + 31 downto WORD_LSB);
end;
|
|
|
|
-- Extract the tag, i.e. every address bit above the line index, from an address
function get_tag(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
    constant TAG_LSB : integer := OFFSET_BITS + INDEX_BITS;
begin
    return addr(63 downto TAG_LSB);
end;
|
|
|
|
begin
|
|
-- Geometry sanity checks: the index/offset/tag split relies on both sizes
-- being powers of two
line_size_pow2_check : assert ispow2(LINE_SIZE)
    report "LINE_SIZE not power of 2" severity FAILURE;
num_lines_pow2_check : assert ispow2(NUM_LINES)
    report "NUM_LINES not power of 2" severity FAILURE;
|
|
|
|
icache_comb : process(all)
begin
    -- Split the incoming fetch address into line index, tag and word index
    req_index <= get_index(i_in.nia);
    req_tag   <= get_tag(i_in.nia);
    req_word  <= get_word(i_in.nia);

    -- The pending request hits only when the stored tag matches, and then
    -- only if that line is marked valid
    req_is_hit <= '0';
    if tags(req_index) = req_tag then
        req_is_hit <= tags_valid(req_index);
    end if;

    -- Drive the outputs from the latched hit state.
    --
    -- Note: the instruction is selected from the latched line by a mux, so
    -- this output does not come straight out of a register like the other
    -- pipeline stage outputs. The alternative would be to output the whole
    -- cache line, which is deliberately not done yet.
    i_out.nia       <= r.hit_nia;
    i_out.insn      <= read_word(get_word(r.hit_nia), r.hit_line);
    i_out.stop_mark <= r.hit_smark;
    i_out.valid     <= r.hit_valid;

    -- Stall on a miss. This has to stay consistent with the condition used
    -- by icache_hit to latch a new request.
    stall_out <= not req_is_hit;

    -- Wishbone request comes straight from the reload state machine registers
    wishbone_out <= r.wb;
end process;
|
|
|
|
icache_hit : process(clk)
    variable latch_request : boolean;
begin
    if rising_edge(clk) then
        -- A request is latched only when it is a hit and no flush is pending.
        --
        -- Note: keep this consistent with the way icache_comb generates
        -- stall_out.
        latch_request := (i_in.req = '1') and (req_is_hit = '1') and (flush_in = '0');

        if latch_request then
            -- Read the cache line (BRAM read port) and remember the request
            r.hit_line  <= cachelines(req_index);
            r.hit_nia   <= i_in.nia;
            r.hit_smark <= i_in.stop_mark;
            r.hit_valid <= '1';

            report "cache hit nia:" & to_hstring(i_in.nia) &
                " SM:" & std_ulogic'image(i_in.stop_mark) &
                " idx:" & integer'image(req_index) &
                " tag:" & to_hstring(req_tag);
        else
            -- Nothing valid this cycle: no request, a miss, or a flush
            -- (a flush always lands here because latch_request requires
            -- flush_in = '0')
            r.hit_valid <= '0';
        end if;
    end if;
end process;
|
|
|
|
-- Cache reload state machine.
--
-- On a miss in IDLE it invalidates the target line, records the new tag and
-- starts a wishbone read at the first doubleword of the line. In WAIT_ACK
-- each acknowledged doubleword is written into the line; after the last one
-- the line is marked valid and the machine returns to IDLE.
--
-- Reset takes priority over the state machine: while rst is asserted no
-- reload can be started, so the reset values below are never overridden by
-- a miss seen in the same cycle.
icache_miss : process(clk)
    variable store_dword : std_ulogic_vector(OFFSET_BITS-4 downto 0);
begin
    if rising_edge(clk) then
        if rst = '1' then
            -- Invalidate every line so the first accesses miss
            tags_valid   <= (others => '0');
            r.store_mask <= (others => '0');
            r.state      <= IDLE;
            r.wb.cyc     <= '0';
            r.wb.stb     <= '0';

            -- We only ever do reads on wishbone
            r.wb.dat <= (others => '0');
            r.wb.sel <= "11111111";
            r.wb.we  <= '0';
        else
            -- State machine
            case r.state is
            when IDLE =>
                -- We need to read a cache line
                if i_in.req = '1' and req_is_hit = '0' then

                    report "cache miss nia:" & to_hstring(i_in.nia) &
                        " SM:" & std_ulogic'image(i_in.stop_mark) &
                        " idx:" & integer'image(req_index) &
                        " tag:" & to_hstring(req_tag);

                    r.state       <= WAIT_ACK;
                    -- One-hot mask selecting the doubleword being loaded
                    r.store_mask  <= (0 => '1', others => '0');
                    r.store_index <= req_index;

                    -- Force misses while reloading that line
                    tags_valid(req_index) <= '0';
                    tags(req_index)       <= req_tag;

                    -- Prep for first dword read: address of the start of the line
                    r.wb.adr <= i_in.nia(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0');
                    r.wb.cyc <= '1';
                    r.wb.stb <= '1';
                end if;

            when WAIT_ACK =>
                if wishbone_in.ack = '1' then
                    -- Store the current dword in the cache line, at the
                    -- position selected by the one-hot store mask
                    for i in 0 to LINE_SIZE_DW-1 loop
                        if r.store_mask(i) = '1' then
                            cachelines(r.store_index)(63 + i*64 downto i*64) <= wishbone_in.dat;
                        end if;
                    end loop;

                    -- That was the last word ? We are done
                    if r.store_mask(LINE_SIZE_DW-1) = '1' then
                        r.state <= IDLE;
                        tags_valid(r.store_index) <= '1';
                        r.wb.cyc <= '0';
                        r.wb.stb <= '0';
                    else
                        -- Otherwise request the next doubleword of the line
                        store_dword := r.wb.adr(OFFSET_BITS-1 downto 3);
                        store_dword := std_ulogic_vector(unsigned(store_dword) + 1);
                        r.wb.adr(OFFSET_BITS-1 downto 3) <= store_dword;
                    end if;

                    -- Advance to next word
                    r.store_mask <= r.store_mask(LINE_SIZE_DW-2 downto 0) & '0';
                end if;
            end case;
        end if;
    end if;
end process;
|
|
end;
|