mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-04-30 13:52:20 +00:00
fetch/icache: Fit icache in BRAM
The goal is to have the icache fit in BRAM by latching the output into a register. In order to avoid timing issues, we need to give the BRAM a full cycle on reads, and thus we source the BRAM address directly from fetch1 latched NIA. (Note: This will be problematic if/when we want to hash the address; we'll probably be better off having fetch1 latch a fully hashed address along with the normal one, so the icache can use the former to address the BRAM and pass the latter along.) One difficulty is that we cannot really stall the icache without adding more combo logic that would break the "one full cycle" BRAM model. This means that on stalls from decode, by the time we stall fetch1, it has already gone to the next address, which the icache is already latching. We work around this by having a "stash" buffer in fetch2 that will stash away the icache output on a stall, and override the output of the icache with the content of the stash buffer when unstalling. This requires a rewrite of the stop/step debug logic as well. We now do most of the hard work in fetch1 which makes more sense. Note: Vivado is still not inferring a built-in output register for the BRAMs. I don't want to add another cycle... I don't fully understand why it wouldn't be able to treat current_row as such but clearly it won't. At least the timing seems good enough now for 100MHz, possibly more. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
This commit is contained in:
13
common.vhdl
13
common.vhdl
@@ -12,17 +12,16 @@ package common is
|
|||||||
carry: std_ulogic;
|
carry: std_ulogic;
|
||||||
end record;
|
end record;
|
||||||
|
|
||||||
type Fetch1ToFetch2Type is record
|
type Fetch1ToIcacheType is record
|
||||||
nia: std_ulogic_vector(63 downto 0);
|
|
||||||
end record;
|
|
||||||
|
|
||||||
type Fetch2ToIcacheType is record
|
|
||||||
req: std_ulogic;
|
req: std_ulogic;
|
||||||
addr: std_ulogic_vector(63 downto 0);
|
stop_mark: std_ulogic;
|
||||||
|
nia: std_ulogic_vector(63 downto 0);
|
||||||
end record;
|
end record;
|
||||||
|
|
||||||
type IcacheToFetch2Type is record
|
type IcacheToFetch2Type is record
|
||||||
ack: std_ulogic;
|
valid: std_ulogic;
|
||||||
|
stop_mark: std_ulogic;
|
||||||
|
nia: std_ulogic_vector(63 downto 0);
|
||||||
insn: std_ulogic_vector(31 downto 0);
|
insn: std_ulogic_vector(31 downto 0);
|
||||||
end record;
|
end record;
|
||||||
|
|
||||||
|
|||||||
46
core.vhdl
46
core.vhdl
@@ -33,11 +33,10 @@ end core;
|
|||||||
|
|
||||||
architecture behave of core is
|
architecture behave of core is
|
||||||
-- fetch signals
|
-- fetch signals
|
||||||
signal fetch1_to_fetch2: Fetch1ToFetch2Type;
|
|
||||||
signal fetch2_to_decode1: Fetch2ToDecode1Type;
|
signal fetch2_to_decode1: Fetch2ToDecode1Type;
|
||||||
|
|
||||||
-- icache signals
|
-- icache signals
|
||||||
signal fetch2_to_icache : Fetch2ToIcacheType;
|
signal fetch1_to_icache : Fetch1ToIcacheType;
|
||||||
signal icache_to_fetch2 : IcacheToFetch2Type;
|
signal icache_to_fetch2 : IcacheToFetch2Type;
|
||||||
|
|
||||||
-- decode signals
|
-- decode signals
|
||||||
@@ -74,8 +73,8 @@ architecture behave of core is
|
|||||||
|
|
||||||
-- local signals
|
-- local signals
|
||||||
signal fetch1_stall_in : std_ulogic;
|
signal fetch1_stall_in : std_ulogic;
|
||||||
|
signal icache_stall_out : std_ulogic;
|
||||||
signal fetch2_stall_in : std_ulogic;
|
signal fetch2_stall_in : std_ulogic;
|
||||||
signal fetch2_stall_out : std_ulogic;
|
|
||||||
signal decode1_stall_in : std_ulogic;
|
signal decode1_stall_in : std_ulogic;
|
||||||
signal decode2_stall_out : std_ulogic;
|
signal decode2_stall_out : std_ulogic;
|
||||||
|
|
||||||
@@ -107,27 +106,12 @@ begin
|
|||||||
rst => core_rst,
|
rst => core_rst,
|
||||||
stall_in => fetch1_stall_in,
|
stall_in => fetch1_stall_in,
|
||||||
flush_in => flush,
|
flush_in => flush,
|
||||||
e_in => execute1_to_fetch1,
|
|
||||||
f_out => fetch1_to_fetch2
|
|
||||||
);
|
|
||||||
|
|
||||||
fetch1_stall_in <= fetch2_stall_out or decode2_stall_out;
|
|
||||||
|
|
||||||
fetch2_0: entity work.fetch2
|
|
||||||
port map (
|
|
||||||
clk => clk,
|
|
||||||
rst => core_rst,
|
|
||||||
stall_in => fetch2_stall_in,
|
|
||||||
stall_out => fetch2_stall_out,
|
|
||||||
flush_in => flush,
|
|
||||||
i_in => icache_to_fetch2,
|
|
||||||
i_out => fetch2_to_icache,
|
|
||||||
stop_in => dbg_core_stop,
|
stop_in => dbg_core_stop,
|
||||||
f_in => fetch1_to_fetch2,
|
e_in => execute1_to_fetch1,
|
||||||
f_out => fetch2_to_decode1
|
i_out => fetch1_to_icache
|
||||||
);
|
);
|
||||||
|
|
||||||
fetch2_stall_in <= decode2_stall_out;
|
fetch1_stall_in <= icache_stall_out or decode2_stall_out;
|
||||||
|
|
||||||
icache_0: entity work.icache
|
icache_0: entity work.icache
|
||||||
generic map(
|
generic map(
|
||||||
@@ -137,13 +121,27 @@ begin
|
|||||||
port map(
|
port map(
|
||||||
clk => clk,
|
clk => clk,
|
||||||
rst => icache_rst,
|
rst => icache_rst,
|
||||||
i_in => fetch2_to_icache,
|
i_in => fetch1_to_icache,
|
||||||
i_out => icache_to_fetch2,
|
i_out => icache_to_fetch2,
|
||||||
|
flush_in => flush,
|
||||||
|
stall_out => icache_stall_out,
|
||||||
wishbone_out => wishbone_insn_out,
|
wishbone_out => wishbone_insn_out,
|
||||||
wishbone_in => wishbone_insn_in
|
wishbone_in => wishbone_insn_in
|
||||||
);
|
);
|
||||||
|
|
||||||
icache_rst <= rst or dbg_icache_rst;
|
icache_rst <= rst or dbg_icache_rst;
|
||||||
|
|
||||||
|
fetch2_0: entity work.fetch2
|
||||||
|
port map (
|
||||||
|
clk => clk,
|
||||||
|
rst => core_rst,
|
||||||
|
stall_in => fetch2_stall_in,
|
||||||
|
flush_in => flush,
|
||||||
|
i_in => icache_to_fetch2,
|
||||||
|
f_out => fetch2_to_decode1
|
||||||
|
);
|
||||||
|
|
||||||
|
fetch2_stall_in <= decode2_stall_out;
|
||||||
|
|
||||||
decode1_0: entity work.decode1
|
decode1_0: entity work.decode1
|
||||||
port map (
|
port map (
|
||||||
@@ -274,7 +272,7 @@ begin
|
|||||||
icache_rst => dbg_icache_rst,
|
icache_rst => dbg_icache_rst,
|
||||||
terminate => terminate,
|
terminate => terminate,
|
||||||
core_stopped => dbg_core_is_stopped,
|
core_stopped => dbg_core_is_stopped,
|
||||||
nia => fetch1_to_fetch2.nia,
|
nia => fetch1_to_icache.nia,
|
||||||
terminated_out => terminated_out
|
terminated_out => terminated_out
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -91,15 +91,15 @@ begin
|
|||||||
reg_write: process(clk)
|
reg_write: process(clk)
|
||||||
begin
|
begin
|
||||||
if rising_edge(clk) then
|
if rising_edge(clk) then
|
||||||
|
-- Reset the 1-cycle "do" signals
|
||||||
|
do_step <= '0';
|
||||||
|
do_reset <= '0';
|
||||||
|
do_icreset <= '0';
|
||||||
|
|
||||||
if (rst) then
|
if (rst) then
|
||||||
stopping <= '0';
|
stopping <= '0';
|
||||||
terminated <= '0';
|
terminated <= '0';
|
||||||
else
|
else
|
||||||
-- Reset the 1-cycle "do" signals
|
|
||||||
do_step <= '0';
|
|
||||||
do_reset <= '0';
|
|
||||||
do_icreset <= '0';
|
|
||||||
|
|
||||||
-- Edge detect on dmi_req for 1-shot pulses
|
-- Edge detect on dmi_req for 1-shot pulses
|
||||||
dmi_req_1 <= dmi_req;
|
dmi_req_1 <= dmi_req;
|
||||||
if dmi_req = '1' and dmi_req_1 = '0' then
|
if dmi_req = '1' and dmi_req_1 = '0' then
|
||||||
|
|||||||
90
fetch1.vhdl
90
fetch1.vhdl
@@ -16,51 +16,111 @@ entity fetch1 is
|
|||||||
-- Control inputs:
|
-- Control inputs:
|
||||||
stall_in : in std_ulogic;
|
stall_in : in std_ulogic;
|
||||||
flush_in : in std_ulogic;
|
flush_in : in std_ulogic;
|
||||||
|
stop_in : in std_ulogic;
|
||||||
|
|
||||||
-- redirect from execution unit
|
-- redirect from execution unit
|
||||||
e_in : in Execute1ToFetch1Type;
|
e_in : in Execute1ToFetch1Type;
|
||||||
|
|
||||||
-- fetch data out
|
-- Request to icache
|
||||||
f_out : out Fetch1ToFetch2Type
|
i_out : out Fetch1ToIcacheType
|
||||||
);
|
);
|
||||||
end entity fetch1;
|
end entity fetch1;
|
||||||
|
|
||||||
architecture behaviour of fetch1 is
|
architecture behaviour of fetch1 is
|
||||||
signal r, r_next : Fetch1ToFetch2Type;
|
type stop_state_t is (RUNNING, STOPPED, RESTARTING);
|
||||||
|
type reg_internal_t is record
|
||||||
|
stop_state: stop_state_t;
|
||||||
|
end record;
|
||||||
|
signal r, r_next : Fetch1ToIcacheType;
|
||||||
|
signal r_int, r_next_int : reg_internal_t;
|
||||||
begin
|
begin
|
||||||
|
|
||||||
regs : process(clk)
|
regs : process(clk)
|
||||||
begin
|
begin
|
||||||
if rising_edge(clk) then
|
if rising_edge(clk) then
|
||||||
if rst = '1' or e_in.redirect = '1' or stall_in = '0' then
|
if r /= r_next then
|
||||||
r <= r_next;
|
report "fetch1 rst:" & std_ulogic'image(rst) &
|
||||||
|
" R:" & std_ulogic'image(e_in.redirect) &
|
||||||
|
" S:" & std_ulogic'image(stall_in) &
|
||||||
|
" T:" & std_ulogic'image(stop_in) &
|
||||||
|
" nia:" & to_hstring(r_next.nia) &
|
||||||
|
" SM:" & std_ulogic'image(r_next.stop_mark);
|
||||||
end if;
|
end if;
|
||||||
|
r <= r_next;
|
||||||
|
r_int <= r_next_int;
|
||||||
end if;
|
end if;
|
||||||
end process;
|
end process;
|
||||||
|
|
||||||
comb : process(all)
|
comb : process(all)
|
||||||
variable v : Fetch1ToFetch2Type;
|
variable v : Fetch1ToIcacheType;
|
||||||
|
variable v_int : reg_internal_t;
|
||||||
|
variable increment : boolean;
|
||||||
begin
|
begin
|
||||||
v := r;
|
v := r;
|
||||||
|
v_int := r_int;
|
||||||
|
|
||||||
if rst = '1' then
|
if rst = '1' then
|
||||||
v.nia := RESET_ADDRESS;
|
v.nia := RESET_ADDRESS;
|
||||||
|
v_int.stop_state := RUNNING;
|
||||||
elsif e_in.redirect = '1' then
|
elsif e_in.redirect = '1' then
|
||||||
v.nia := e_in.redirect_nia;
|
v.nia := e_in.redirect_nia;
|
||||||
else
|
elsif stall_in = '0' then
|
||||||
v.nia := std_logic_vector(unsigned(v.nia) + 4);
|
|
||||||
|
-- For debug stop/step to work properly we need a little bit of
|
||||||
|
-- trickery here. If we just stop incrementing and send stop marks
|
||||||
|
-- when stop_in is set, then we'll increment on the cycle it clears
|
||||||
|
-- and end up never executing the instruction we were stopped on.
|
||||||
|
--
|
||||||
|
-- Avoiding this, along with the opposite issue when stepping (stop is
|
||||||
|
-- cleared for only one cycle) is handled by the state machine below
|
||||||
|
--
|
||||||
|
-- By default, increment addresses
|
||||||
|
increment := true;
|
||||||
|
case v_int.stop_state is
|
||||||
|
when RUNNING =>
|
||||||
|
-- If we are running and stop_in is set, then stop incrementing,
|
||||||
|
-- we are now stopped.
|
||||||
|
if stop_in = '1' then
|
||||||
|
increment := false;
|
||||||
|
v_int.stop_state := STOPPED;
|
||||||
|
end if;
|
||||||
|
when STOPPED =>
|
||||||
|
-- When stopped, never increment. If stop is cleared, go to state
|
||||||
|
-- "restarting" but still don't increment that cycle. stop_in is
|
||||||
|
-- now 0 so we'll send the NIA down without a stop mark.
|
||||||
|
increment := false;
|
||||||
|
if stop_in = '0' then
|
||||||
|
v_int.stop_state := RESTARTING;
|
||||||
|
end if;
|
||||||
|
when RESTARTING =>
|
||||||
|
-- We have just sent the NIA down, we can start incrementing again.
|
||||||
|
-- If stop_in is still not set, go back to running normally.
|
||||||
|
-- If stop_in is set again (that was a one-cycle "step"), go
|
||||||
|
-- back to "stopped" state which means we'll stop incrementing
|
||||||
|
-- on the next cycle. This ensures we increment the PC once after
|
||||||
|
-- sending one instruction without a stop mark. Since stop_in is
|
||||||
|
-- now set, the new PC will be sent with a stop mark and thus not
|
||||||
|
-- executed.
|
||||||
|
if stop_in = '0' then
|
||||||
|
v_int.stop_state := RUNNING;
|
||||||
|
else
|
||||||
|
v_int.stop_state := STOPPED;
|
||||||
|
end if;
|
||||||
|
end case;
|
||||||
|
|
||||||
|
if increment then
|
||||||
|
v.nia := std_logic_vector(unsigned(v.nia) + 4);
|
||||||
|
end if;
|
||||||
end if;
|
end if;
|
||||||
|
|
||||||
|
v.req := not rst;
|
||||||
|
v.stop_mark := stop_in;
|
||||||
|
|
||||||
r_next <= v;
|
r_next <= v;
|
||||||
|
r_next_int <= v_int;
|
||||||
|
|
||||||
-- Update outputs to the icache
|
-- Update outputs to the icache
|
||||||
f_out <= r;
|
i_out <= r;
|
||||||
|
|
||||||
report "fetch1 rst:" & std_ulogic'image(rst) &
|
|
||||||
" R:" & std_ulogic'image(e_in.redirect) &
|
|
||||||
" S:" & std_ulogic'image(stall_in) &
|
|
||||||
" nia_next:" & to_hstring(r_next.nia) &
|
|
||||||
" nia:" & to_hstring(r.nia);
|
|
||||||
|
|
||||||
end process;
|
end process;
|
||||||
|
|
||||||
|
|||||||
85
fetch2.vhdl
85
fetch2.vhdl
@@ -12,55 +12,108 @@ entity fetch2 is
|
|||||||
rst : in std_ulogic;
|
rst : in std_ulogic;
|
||||||
|
|
||||||
stall_in : in std_ulogic;
|
stall_in : in std_ulogic;
|
||||||
stall_out : out std_ulogic;
|
|
||||||
|
|
||||||
flush_in : in std_ulogic;
|
flush_in : in std_ulogic;
|
||||||
stop_in : in std_ulogic;
|
|
||||||
|
|
||||||
|
-- Results from icache
|
||||||
i_in : in IcacheToFetch2Type;
|
i_in : in IcacheToFetch2Type;
|
||||||
i_out : out Fetch2ToIcacheType;
|
|
||||||
|
|
||||||
f_in : in Fetch1ToFetch2Type;
|
|
||||||
|
|
||||||
|
-- Output to decode
|
||||||
f_out : out Fetch2ToDecode1Type
|
f_out : out Fetch2ToDecode1Type
|
||||||
);
|
);
|
||||||
end entity fetch2;
|
end entity fetch2;
|
||||||
|
|
||||||
architecture behaviour of fetch2 is
|
architecture behaviour of fetch2 is
|
||||||
|
|
||||||
|
-- The icache cannot stall, so we need to stash a cycle
|
||||||
|
-- of output from it when we stall.
|
||||||
|
type reg_internal_type is record
|
||||||
|
stash : IcacheToFetch2Type;
|
||||||
|
stash_valid : std_ulogic;
|
||||||
|
stopped : std_ulogic;
|
||||||
|
end record;
|
||||||
|
|
||||||
|
signal r_int, rin_int : reg_internal_type;
|
||||||
signal r, rin : Fetch2ToDecode1Type;
|
signal r, rin : Fetch2ToDecode1Type;
|
||||||
|
|
||||||
begin
|
begin
|
||||||
regs : process(clk)
|
regs : process(clk)
|
||||||
begin
|
begin
|
||||||
if rising_edge(clk) then
|
if rising_edge(clk) then
|
||||||
|
|
||||||
|
if (r /= rin) then
|
||||||
|
report "fetch2 rst:" & std_ulogic'image(rst) &
|
||||||
|
" S:" & std_ulogic'image(stall_in) &
|
||||||
|
" F:" & std_ulogic'image(flush_in) &
|
||||||
|
" T:" & std_ulogic'image(rin.stop_mark) &
|
||||||
|
" V:" & std_ulogic'image(rin.valid) &
|
||||||
|
" nia:" & to_hstring(rin.nia);
|
||||||
|
end if;
|
||||||
|
|
||||||
-- Output state remains unchanged on stall, unless we are flushing
|
-- Output state remains unchanged on stall, unless we are flushing
|
||||||
if rst = '1' or flush_in = '1' or stall_in = '0' then
|
if rst = '1' or flush_in = '1' or stall_in = '0' then
|
||||||
r <= rin;
|
r <= rin;
|
||||||
end if;
|
end if;
|
||||||
|
|
||||||
|
-- Internal state is updated on every clock
|
||||||
|
r_int <= rin_int;
|
||||||
end if;
|
end if;
|
||||||
end process;
|
end process;
|
||||||
|
|
||||||
comb : process(all)
|
comb : process(all)
|
||||||
variable v : Fetch2ToDecode1Type;
|
variable v : Fetch2ToDecode1Type;
|
||||||
|
variable v_int : reg_internal_type;
|
||||||
|
variable v_i_in : IcacheToFetch2Type;
|
||||||
begin
|
begin
|
||||||
v := r;
|
v := r;
|
||||||
|
v_int := r_int;
|
||||||
|
|
||||||
-- asynchronous icache lookup
|
-- If stalling, stash away the current input from the icache
|
||||||
i_out.req <= '1';
|
if stall_in = '1' and v_int.stash_valid = '0' then
|
||||||
i_out.addr <= f_in.nia;
|
v_int.stash := i_in;
|
||||||
v.valid := i_in.ack;
|
v_int.stash_valid := '1';
|
||||||
v.nia := f_in.nia;
|
end if;
|
||||||
v.insn := i_in.insn;
|
|
||||||
stall_out <= stop_in or not i_in.ack;
|
-- If unstalling, source input from the stash and invalidate it,
|
||||||
|
-- otherwise source normally from the icache.
|
||||||
|
--
|
||||||
|
v_i_in := i_in;
|
||||||
|
if v_int.stash_valid = '1' and stall_in = '0' then
|
||||||
|
v_i_in := v_int.stash;
|
||||||
|
v_int.stash_valid := '0';
|
||||||
|
end if;
|
||||||
|
|
||||||
|
v.valid := v_i_in.valid;
|
||||||
|
v.stop_mark := v_i_in.stop_mark;
|
||||||
|
v.nia := v_i_in.nia;
|
||||||
|
v.insn := v_i_in.insn;
|
||||||
|
|
||||||
|
-- Clear stash internal valid bit on flush. We still mark
|
||||||
|
-- the stash itself as valid since we still want to override
|
||||||
|
-- whatever comes from icache when unstalling, but we'll
|
||||||
|
-- override it with something invalid.
|
||||||
|
--
|
||||||
|
if flush_in = '1' then
|
||||||
|
v_int.stash.valid := '0';
|
||||||
|
end if;
|
||||||
|
|
||||||
|
-- If we are flushing or the instruction comes with a stop mark
|
||||||
|
-- we tag it as invalid so it doesn't get decoded and executed
|
||||||
|
if flush_in = '1' or v.stop_mark = '1' then
|
||||||
|
|
||||||
if flush_in = '1' or stop_in = '1' then
|
|
||||||
v.valid := '0';
|
v.valid := '0';
|
||||||
end if;
|
end if;
|
||||||
v.stop_mark := stop_in;
|
|
||||||
|
-- Clear stash on reset
|
||||||
|
if rst = '1' then
|
||||||
|
v_int.stash_valid := '0';
|
||||||
|
end if;
|
||||||
|
|
||||||
-- Update registers
|
-- Update registers
|
||||||
rin <= v;
|
rin <= v;
|
||||||
|
rin_int <= v_int;
|
||||||
|
|
||||||
-- Update outputs
|
-- Update outputs
|
||||||
f_out <= r;
|
f_out <= r;
|
||||||
end process;
|
end process;
|
||||||
|
|
||||||
end architecture behaviour;
|
end architecture behaviour;
|
||||||
|
|||||||
190
icache.vhdl
190
icache.vhdl
@@ -19,9 +19,12 @@ entity icache is
|
|||||||
clk : in std_ulogic;
|
clk : in std_ulogic;
|
||||||
rst : in std_ulogic;
|
rst : in std_ulogic;
|
||||||
|
|
||||||
i_in : in Fetch2ToIcacheType;
|
i_in : in Fetch1ToIcacheType;
|
||||||
i_out : out IcacheToFetch2Type;
|
i_out : out IcacheToFetch2Type;
|
||||||
|
|
||||||
|
stall_out : out std_ulogic;
|
||||||
|
flush_in : in std_ulogic;
|
||||||
|
|
||||||
wishbone_out : out wishbone_master_out;
|
wishbone_out : out wishbone_master_out;
|
||||||
wishbone_in : in wishbone_slave_out
|
wishbone_in : in wishbone_slave_out
|
||||||
);
|
);
|
||||||
@@ -59,113 +62,194 @@ architecture rtl of icache is
|
|||||||
subtype cacheline_tag_type is std_logic_vector(TAG_BITS-1 downto 0);
|
subtype cacheline_tag_type is std_logic_vector(TAG_BITS-1 downto 0);
|
||||||
type cacheline_tag_array is array(0 to NUM_LINES-1) of cacheline_tag_type;
|
type cacheline_tag_array is array(0 to NUM_LINES-1) of cacheline_tag_type;
|
||||||
|
|
||||||
signal cachelines : cacheline_array := (others => (others => '0'));
|
-- Storage. Hopefully "cachelines" is a BRAM, the rest is LUTs
|
||||||
signal tags : cacheline_tag_array := (others => (others => '0'));
|
signal cachelines : cacheline_array;
|
||||||
signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0) := (others => '0');
|
signal tags : cacheline_tag_array;
|
||||||
|
signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0);
|
||||||
attribute ram_style : string;
|
attribute ram_style : string;
|
||||||
attribute ram_style of cachelines : signal is "block";
|
attribute ram_style of cachelines : signal is "block";
|
||||||
|
|
||||||
attribute ram_decomp : string;
|
attribute ram_decomp : string;
|
||||||
attribute ram_decomp of cachelines : signal is "power";
|
attribute ram_decomp of cachelines : signal is "power";
|
||||||
|
|
||||||
|
-- Cache reload state machine
|
||||||
type state_type is (IDLE, WAIT_ACK);
|
type state_type is (IDLE, WAIT_ACK);
|
||||||
|
|
||||||
type reg_internal_type is record
|
type reg_internal_type is record
|
||||||
state : state_type;
|
-- Cache hit state (1 cycle BRAM access)
|
||||||
w : wishbone_master_out;
|
hit_line : cacheline_type;
|
||||||
store_index : integer range 0 to (NUM_LINES-1);
|
hit_nia : std_ulogic_vector(63 downto 0);
|
||||||
store_word : integer range 0 to (LINE_SIZE-1);
|
hit_smark : std_ulogic;
|
||||||
|
hit_valid : std_ulogic;
|
||||||
|
|
||||||
|
-- Cache miss state (reload state machine)
|
||||||
|
state : state_type;
|
||||||
|
wb : wishbone_master_out;
|
||||||
|
store_index : integer range 0 to (NUM_LINES-1);
|
||||||
|
store_mask : std_ulogic_vector(LINE_SIZE_DW-1 downto 0);
|
||||||
end record;
|
end record;
|
||||||
|
|
||||||
signal r : reg_internal_type;
|
signal r : reg_internal_type;
|
||||||
|
|
||||||
signal read_index : integer range 0 to NUM_LINES-1;
|
-- Async signals decoding incoming requests
|
||||||
signal read_tag : std_ulogic_vector(63-OFFSET_BITS-INDEX_BITS downto 0);
|
signal req_index : integer range 0 to NUM_LINES-1;
|
||||||
signal read_miss : boolean;
|
signal req_tag : std_ulogic_vector(TAG_BITS-1 downto 0);
|
||||||
|
signal req_word : integer range 0 to LINE_SIZE_DW*2-1;
|
||||||
|
signal req_is_hit : std_ulogic;
|
||||||
|
|
||||||
|
-- Return the cache line index (tag index) for an address
|
||||||
function get_index(addr: std_ulogic_vector(63 downto 0)) return integer is
|
function get_index(addr: std_ulogic_vector(63 downto 0)) return integer is
|
||||||
begin
|
begin
|
||||||
return to_integer(unsigned(addr((OFFSET_BITS+INDEX_BITS-1) downto OFFSET_BITS)));
|
return to_integer(unsigned(addr((OFFSET_BITS+INDEX_BITS-1) downto OFFSET_BITS)));
|
||||||
end;
|
end;
|
||||||
|
|
||||||
function get_word(addr: std_ulogic_vector(63 downto 0); data: cacheline_type) return std_ulogic_vector is
|
-- Return the word index in a cache line for an address
|
||||||
variable word : integer;
|
function get_word(addr: std_ulogic_vector(63 downto 0)) return integer is
|
||||||
begin
|
begin
|
||||||
word := to_integer(unsigned(addr(OFFSET_BITS-1 downto 2)));
|
return to_integer(unsigned(addr(OFFSET_BITS-1 downto 2)));
|
||||||
return data((word+1)*32-1 downto word*32);
|
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
-- Read a word in a cache line for an address
|
||||||
|
function read_word(word: integer; data: cacheline_type) return std_ulogic_vector is
|
||||||
|
begin
|
||||||
|
return data((word+1)*32-1 downto word*32);
|
||||||
|
end;
|
||||||
|
|
||||||
|
-- Calculate the tag value from the address
|
||||||
function get_tag(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
|
function get_tag(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
|
||||||
begin
|
begin
|
||||||
return addr(63 downto OFFSET_BITS+INDEX_BITS);
|
return addr(63 downto OFFSET_BITS+INDEX_BITS);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
begin
|
begin
|
||||||
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
|
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
|
||||||
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
|
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
|
||||||
|
|
||||||
icache_read : process(all)
|
icache_comb : process(all)
|
||||||
begin
|
begin
|
||||||
read_index <= get_index(i_in.addr);
|
-- Calculate next index and tag index
|
||||||
read_tag <= get_tag(i_in.addr);
|
req_index <= get_index(i_in.nia);
|
||||||
read_miss <= false;
|
req_tag <= get_tag(i_in.nia);
|
||||||
|
req_word <= get_word(i_in.nia);
|
||||||
|
|
||||||
i_out.ack <= '0';
|
-- Test if pending request is a hit
|
||||||
i_out.insn <= get_word(i_in.addr, cachelines(read_index));
|
if tags(req_index) = req_tag then
|
||||||
|
req_is_hit <= tags_valid(req_index);
|
||||||
|
else
|
||||||
|
req_is_hit <= '0';
|
||||||
|
end if;
|
||||||
|
|
||||||
if i_in.req = '1' then
|
-- Output instruction from current cache line
|
||||||
if (tags_valid(read_index) = '1') and (tags(read_index) = read_tag) then
|
--
|
||||||
-- report hit asynchronously
|
-- Note: This is a mild violation of our design principle of having pipeline
|
||||||
i_out.ack <= '1';
|
-- stages output from a clean latch. In this case we output the result
|
||||||
else
|
-- of a mux. The alternative would be output an entire cache line
|
||||||
read_miss <= true;
|
-- which I prefer not to do just yet.
|
||||||
end if;
|
--
|
||||||
end if;
|
i_out.valid <= r.hit_valid;
|
||||||
|
i_out.insn <= read_word(get_word(r.hit_nia), r.hit_line);
|
||||||
|
i_out.nia <= r.hit_nia;
|
||||||
|
i_out.stop_mark <= r.hit_smark;
|
||||||
|
|
||||||
|
-- This needs to match the latching of a new request in icache_hit
|
||||||
|
stall_out <= not req_is_hit;
|
||||||
|
|
||||||
|
-- Wishbone requests output (from the cache miss reload machine)
|
||||||
|
wishbone_out <= r.wb;
|
||||||
end process;
|
end process;
|
||||||
|
|
||||||
wishbone_out <= r.w;
|
icache_hit : process(clk)
|
||||||
|
begin
|
||||||
|
if rising_edge(clk) then
|
||||||
|
-- Assume we have nothing valid first
|
||||||
|
r.hit_valid <= '0';
|
||||||
|
|
||||||
icache_write : process(clk)
|
-- Are we free to latch a new request ?
|
||||||
|
--
|
||||||
|
-- Note: this test needs to match the equation for generating stall_out
|
||||||
|
--
|
||||||
|
if i_in.req = '1' and req_is_hit = '1' and flush_in = '0' then
|
||||||
|
-- Read the cache line (BRAM read port) and remember the NIA
|
||||||
|
r.hit_line <= cachelines(req_index);
|
||||||
|
r.hit_nia <= i_in.nia;
|
||||||
|
r.hit_smark <= i_in.stop_mark;
|
||||||
|
r.hit_valid <= '1';
|
||||||
|
|
||||||
|
report "cache hit nia:" & to_hstring(i_in.nia) &
|
||||||
|
" SM:" & std_ulogic'image(i_in.stop_mark) &
|
||||||
|
" idx:" & integer'image(req_index) &
|
||||||
|
" tag:" & to_hstring(req_tag);
|
||||||
|
end if;
|
||||||
|
|
||||||
|
-- Flush requested ? discard...
|
||||||
|
if flush_in then
|
||||||
|
r.hit_valid <= '0';
|
||||||
|
end if;
|
||||||
|
end if;
|
||||||
|
end process;
|
||||||
|
|
||||||
|
icache_miss : process(clk)
|
||||||
|
variable store_dword : std_ulogic_vector(OFFSET_BITS-4 downto 0);
|
||||||
begin
|
begin
|
||||||
if rising_edge(clk) then
|
if rising_edge(clk) then
|
||||||
if rst = '1' then
|
if rst = '1' then
|
||||||
tags_valid <= (others => '0');
|
tags_valid <= (others => '0');
|
||||||
|
r.store_mask <= (others => '0');
|
||||||
r.state <= IDLE;
|
r.state <= IDLE;
|
||||||
r.w.cyc <= '0';
|
r.wb.cyc <= '0';
|
||||||
r.w.stb <= '0';
|
r.wb.stb <= '0';
|
||||||
|
|
||||||
|
-- We only ever do reads on wishbone
|
||||||
|
r.wb.dat <= (others => '0');
|
||||||
|
r.wb.sel <= "11111111";
|
||||||
|
r.wb.we <= '0';
|
||||||
end if;
|
end if;
|
||||||
|
|
||||||
r.w.dat <= (others => '0');
|
-- State machine
|
||||||
r.w.sel <= "11111111";
|
|
||||||
r.w.we <= '0';
|
|
||||||
|
|
||||||
case r.state is
|
case r.state is
|
||||||
when IDLE =>
|
when IDLE =>
|
||||||
if read_miss = true then
|
-- We need to read a cache line
|
||||||
|
if i_in.req = '1' and req_is_hit = '0' then
|
||||||
|
|
||||||
|
report "cache miss nia:" & to_hstring(i_in.nia) &
|
||||||
|
" SM:" & std_ulogic'image(i_in.stop_mark) &
|
||||||
|
" idx:" & integer'image(req_index) &
|
||||||
|
" tag:" & to_hstring(req_tag);
|
||||||
|
|
||||||
r.state <= WAIT_ACK;
|
r.state <= WAIT_ACK;
|
||||||
r.store_word <= 0;
|
r.store_mask <= (0 => '1', others => '0');
|
||||||
r.store_index <= read_index;
|
r.store_index <= req_index;
|
||||||
|
|
||||||
tags(read_index) <= read_tag;
|
-- Force misses while reloading that line
|
||||||
tags_valid(read_index) <= '0';
|
tags_valid(req_index) <= '0';
|
||||||
|
tags(req_index) <= req_tag;
|
||||||
|
|
||||||
r.w.adr <= i_in.addr(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0');
|
-- Prep for first dword read
|
||||||
r.w.cyc <= '1';
|
r.wb.adr <= i_in.nia(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0');
|
||||||
r.w.stb <= '1';
|
r.wb.cyc <= '1';
|
||||||
|
r.wb.stb <= '1';
|
||||||
end if;
|
end if;
|
||||||
when WAIT_ACK =>
|
when WAIT_ACK =>
|
||||||
if wishbone_in.ack = '1' then
|
if wishbone_in.ack = '1' then
|
||||||
cachelines(r.store_index)((r.store_word+1)*64-1 downto ((r.store_word)*64)) <= wishbone_in.dat;
|
-- Store the current dword in both the cache
|
||||||
r.store_word <= r.store_word + 1;
|
for i in 0 to LINE_SIZE_DW-1 loop
|
||||||
|
if r.store_mask(i) = '1' then
|
||||||
|
cachelines(r.store_index)(63 + i*64 downto i*64) <= wishbone_in.dat;
|
||||||
|
end if;
|
||||||
|
end loop;
|
||||||
|
|
||||||
if r.store_word = (LINE_SIZE_DW-1) then
|
-- That was the last word ? We are done
|
||||||
|
if r.store_mask(LINE_SIZE_DW-1) = '1' then
|
||||||
r.state <= IDLE;
|
r.state <= IDLE;
|
||||||
tags_valid(r.store_index) <= '1';
|
tags_valid(r.store_index) <= '1';
|
||||||
r.w.cyc <= '0';
|
r.wb.cyc <= '0';
|
||||||
r.w.stb <= '0';
|
r.wb.stb <= '0';
|
||||||
else
|
else
|
||||||
r.w.adr(OFFSET_BITS-1 downto 3) <= std_ulogic_vector(to_unsigned(r.store_word+1, OFFSET_BITS-3));
|
store_dword := r.wb.adr(OFFSET_BITS-1 downto 3);
|
||||||
|
store_dword := std_ulogic_vector(unsigned(store_dword) + 1);
|
||||||
|
r.wb.adr(OFFSET_BITS-1 downto 3) <= store_dword;
|
||||||
end if;
|
end if;
|
||||||
|
-- Advance to next word
|
||||||
|
r.store_mask <= r.store_mask(LINE_SIZE_DW-2 downto 0) & '0';
|
||||||
end if;
|
end if;
|
||||||
end case;
|
end case;
|
||||||
end if;
|
end if;
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ architecture behave of icache_tb is
|
|||||||
signal clk : std_ulogic;
|
signal clk : std_ulogic;
|
||||||
signal rst : std_ulogic;
|
signal rst : std_ulogic;
|
||||||
|
|
||||||
signal i_out : Fetch2ToIcacheType;
|
signal i_out : Fetch1ToIcacheType;
|
||||||
signal i_in : IcacheToFetch2Type;
|
signal i_in : IcacheToFetch2Type;
|
||||||
|
|
||||||
signal wb_bram_in : wishbone_master_out;
|
signal wb_bram_in : wishbone_master_out;
|
||||||
@@ -30,6 +30,7 @@ begin
|
|||||||
rst => rst,
|
rst => rst,
|
||||||
i_in => i_out,
|
i_in => i_out,
|
||||||
i_out => i_in,
|
i_out => i_in,
|
||||||
|
flush_in => '0',
|
||||||
wishbone_out => wb_bram_in,
|
wishbone_out => wb_bram_in,
|
||||||
wishbone_in => wb_bram_out
|
wishbone_in => wb_bram_out
|
||||||
);
|
);
|
||||||
@@ -66,16 +67,16 @@ begin
|
|||||||
stim: process
|
stim: process
|
||||||
begin
|
begin
|
||||||
i_out.req <= '0';
|
i_out.req <= '0';
|
||||||
i_out.addr <= (others => '0');
|
i_out.nia <= (others => '0');
|
||||||
|
|
||||||
wait for 4*clk_period;
|
wait for 4*clk_period;
|
||||||
|
|
||||||
i_out.req <= '1';
|
i_out.req <= '1';
|
||||||
i_out.addr <= x"0000000000000004";
|
i_out.nia <= x"0000000000000004";
|
||||||
|
|
||||||
wait for 30*clk_period;
|
wait for 30*clk_period;
|
||||||
|
|
||||||
assert i_in.ack = '1';
|
assert i_in.valid = '1';
|
||||||
assert i_in.insn = x"00000001";
|
assert i_in.insn = x"00000001";
|
||||||
|
|
||||||
i_out.req <= '0';
|
i_out.req <= '0';
|
||||||
@@ -84,31 +85,31 @@ begin
|
|||||||
|
|
||||||
-- hit
|
-- hit
|
||||||
i_out.req <= '1';
|
i_out.req <= '1';
|
||||||
i_out.addr <= x"0000000000000008";
|
i_out.nia <= x"0000000000000008";
|
||||||
wait for clk_period/2;
|
wait for clk_period;
|
||||||
assert i_in.ack = '1';
|
assert i_in.valid = '1';
|
||||||
assert i_in.insn = x"00000002";
|
assert i_in.insn = x"00000002";
|
||||||
wait for clk_period/2;
|
wait for clk_period;
|
||||||
|
|
||||||
-- another miss
|
-- another miss
|
||||||
i_out.req <= '1';
|
i_out.req <= '1';
|
||||||
i_out.addr <= x"0000000000000040";
|
i_out.nia <= x"0000000000000040";
|
||||||
|
|
||||||
wait for 30*clk_period;
|
wait for 30*clk_period;
|
||||||
|
|
||||||
assert i_in.ack = '1';
|
assert i_in.valid = '1';
|
||||||
assert i_in.insn = x"00000010";
|
assert i_in.insn = x"00000010";
|
||||||
|
|
||||||
-- test something that aliases
|
-- test something that aliases
|
||||||
i_out.req <= '1';
|
i_out.req <= '1';
|
||||||
i_out.addr <= x"0000000000000100";
|
i_out.nia <= x"0000000000000100";
|
||||||
wait for clk_period/2;
|
wait for clk_period;
|
||||||
assert i_in.ack = '0';
|
assert i_in.valid = '0';
|
||||||
wait for clk_period/2;
|
wait for clk_period;
|
||||||
|
|
||||||
wait for 30*clk_period;
|
wait for 30*clk_period;
|
||||||
|
|
||||||
assert i_in.ack = '1';
|
assert i_in.valid = '1';
|
||||||
assert i_in.insn = x"00000040";
|
assert i_in.insn = x"00000040";
|
||||||
|
|
||||||
i_out.req <= '0';
|
i_out.req <= '0';
|
||||||
|
|||||||
Reference in New Issue
Block a user