mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-04-15 23:51:40 +00:00
dcache: Trim one cycle from the load hit path
Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to three cycles by taking the low-order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU.

With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access.

One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
This commit is contained in:
@@ -229,6 +229,8 @@ package common is
|
||||
xerc : xer_common_t;
|
||||
reserve : std_ulogic;
|
||||
rc : std_ulogic;
|
||||
early_low_addr : std_ulogic_vector(11 downto 0);
|
||||
early_valid : std_ulogic;
|
||||
end record;
|
||||
|
||||
type DcacheToWritebackType is record
|
||||
|
||||
98
dcache.vhdl
98
dcache.vhdl
@@ -124,6 +124,7 @@ architecture rtl of dcache is
|
||||
|
||||
-- Cache state machine
|
||||
type state_t is (IDLE, -- Normal load hit processing
|
||||
PRE_NEXT_DWORD, -- Extra state before NEXT_DWORD
|
||||
NEXT_DWORD, -- Starting the 2nd xfer of misaligned
|
||||
LOAD_UPDATE, -- Load with update extra cycle
|
||||
LOAD_UPDATE2, -- Load with update extra cycle
|
||||
@@ -184,24 +185,6 @@ architecture rtl of dcache is
|
||||
|
||||
signal r1 : reg_stage_1_t;
|
||||
|
||||
-- Second stage register, only used for load hits
|
||||
--
|
||||
type reg_stage_2_t is record
|
||||
hit_way : way_t;
|
||||
hit_load_valid : std_ulogic;
|
||||
load_is_update : std_ulogic;
|
||||
load_reg : std_ulogic_vector(4 downto 0);
|
||||
data_shift : std_ulogic_vector(2 downto 0);
|
||||
length : std_ulogic_vector(3 downto 0);
|
||||
sign_extend : std_ulogic;
|
||||
byte_reverse : std_ulogic;
|
||||
xerc : xer_common_t;
|
||||
last_dword : std_ulogic;
|
||||
second_dword : std_ulogic;
|
||||
end record;
|
||||
|
||||
signal r2 : reg_stage_2_t;
|
||||
|
||||
-- Reservation information
|
||||
--
|
||||
type reservation_t is record
|
||||
@@ -221,6 +204,10 @@ architecture rtl of dcache is
|
||||
signal req_addr : std_ulogic_vector(63 downto 0);
|
||||
signal req_laddr : std_ulogic_vector(63 downto 0);
|
||||
signal req_sel : std_ulogic_vector(7 downto 0);
|
||||
signal next_addr : std_ulogic_vector(63 downto 0);
|
||||
|
||||
signal early_req_addr : std_ulogic_vector(11 downto 0);
|
||||
signal early_req_row : row_t;
|
||||
|
||||
signal cancel_store : std_ulogic;
|
||||
signal set_rsrv : std_ulogic;
|
||||
@@ -404,6 +391,12 @@ begin
|
||||
end generate;
|
||||
end generate;
|
||||
|
||||
-- Wishbone read and write and BRAM write sel bits generation
|
||||
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);
|
||||
|
||||
-- See if the operation crosses two doublewords
|
||||
two_dwords <= or (bus_sel(15 downto 8));
|
||||
|
||||
-- Cache request parsing and hit detection
|
||||
dcache_request : process(all)
|
||||
variable is_hit : std_ulogic;
|
||||
@@ -444,6 +437,9 @@ begin
|
||||
req_laddr <= req_addr(63 downto LINE_OFF_BITS) &
|
||||
(LINE_OFF_BITS-1 downto 0 => '0');
|
||||
|
||||
-- Address of next doubleword, used for unaligned accesses
|
||||
next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";
|
||||
|
||||
-- Test if pending request is a hit on any way
|
||||
hit_way := 0;
|
||||
is_hit := '0';
|
||||
@@ -480,17 +476,21 @@ begin
|
||||
|
||||
req_op <= op;
|
||||
|
||||
-- Versions of the address and row number that are valid one cycle earlier
|
||||
-- in the cases where we need to read the cache data BRAM.
|
||||
if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then
|
||||
early_req_addr <= next_addr(11 downto 0);
|
||||
elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then
|
||||
early_req_addr <= r1.next_addr(11 downto 0);
|
||||
else
|
||||
early_req_addr <= d_in.early_low_addr;
|
||||
end if;
|
||||
early_req_row <= get_row(x"0000000000000" & early_req_addr);
|
||||
end process;
|
||||
|
||||
-- Wire up wishbone request latch out of stage 1
|
||||
wishbone_out <= r1.wb;
|
||||
|
||||
-- Wishbone read and write and BRAM write sel bits generation
|
||||
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);
|
||||
|
||||
-- See if the operation crosses two doublewords
|
||||
two_dwords <= or (bus_sel(15 downto 8));
|
||||
|
||||
-- TODO: Generate errors
|
||||
-- err_nc_collision <= '1' when req_op = OP_BAD else '0';
|
||||
|
||||
@@ -540,14 +540,14 @@ begin
|
||||
-- The mux on d_out.write reg defaults to the normal load hit case.
|
||||
d_out.write_enable <= '0';
|
||||
d_out.valid <= '0';
|
||||
d_out.write_reg <= r2.load_reg;
|
||||
d_out.write_data <= cache_out(r2.hit_way);
|
||||
d_out.write_len <= r2.length;
|
||||
d_out.write_shift <= r2.data_shift;
|
||||
d_out.sign_extend <= r2.sign_extend;
|
||||
d_out.byte_reverse <= r2.byte_reverse;
|
||||
d_out.second_word <= r2.second_dword;
|
||||
d_out.xerc <= r2.xerc;
|
||||
d_out.write_reg <= r1.req.write_reg;
|
||||
d_out.write_data <= cache_out(r1.hit_way);
|
||||
d_out.write_len <= r1.req.length;
|
||||
d_out.write_shift <= r1.req.addr(2 downto 0);
|
||||
d_out.sign_extend <= r1.req.sign_extend;
|
||||
d_out.byte_reverse <= r1.req.byte_reverse;
|
||||
d_out.second_word <= r1.second_dword;
|
||||
d_out.xerc <= r1.req.xerc;
|
||||
d_out.rc <= '0'; -- loads never have rc=1
|
||||
d_out.store_done <= '0';
|
||||
|
||||
@@ -562,26 +562,27 @@ begin
|
||||
--
|
||||
|
||||
-- Sanity: Only one of these must be set in any given cycle
|
||||
assert (r1.update_valid and r2.hit_load_valid) /= '1' report
|
||||
assert (r1.update_valid and r1.hit_load_valid) /= '1' report
|
||||
"unexpected hit_load_delayed collision with update_valid"
|
||||
severity FAILURE;
|
||||
assert (r1.slow_valid and r1.stcx_fail) /= '1' report
|
||||
"unexpected slow_valid collision with stcx_fail"
|
||||
severity FAILURE;
|
||||
assert ((r1.slow_valid or r1.stcx_fail) and r2.hit_load_valid) /= '1' report
|
||||
assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report
|
||||
"unexpected hit_load_delayed collision with slow_valid"
|
||||
severity FAILURE;
|
||||
assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report
|
||||
"unexpected update_valid collision with slow_valid or stcx_fail"
|
||||
severity FAILURE;
|
||||
|
||||
-- Delayed load hit case is the standard path
|
||||
if r2.hit_load_valid = '1' then
|
||||
-- Load hit case is the standard path
|
||||
if r1.hit_load_valid = '1' then
|
||||
d_out.write_enable <= '1';
|
||||
|
||||
-- If there isn't another dword to go and
|
||||
-- it's not a load with update, complete it now
|
||||
if r2.last_dword = '1' and r2.load_is_update = '0' then
|
||||
if (r1.second_dword or not r1.two_dwords) = '1' and
|
||||
r1.req.update = '0' then
|
||||
report "completing load hit";
|
||||
d_out.valid <= '1';
|
||||
end if;
|
||||
@@ -693,7 +694,7 @@ begin
|
||||
begin
|
||||
-- Cache hit reads
|
||||
do_read <= '1';
|
||||
rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
|
||||
rd_addr <= std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
|
||||
cache_out(i) <= dout;
|
||||
|
||||
-- Write mux:
|
||||
@@ -732,23 +733,11 @@ begin
|
||||
|
||||
--
|
||||
-- Cache hit synchronous machine for the easy case. This handles
|
||||
-- non-update form load hits and stage 1 to stage 2 transfers
|
||||
-- non-update form load hits
|
||||
--
|
||||
dcache_fast_hit : process(clk)
|
||||
begin
|
||||
if rising_edge(clk) then
|
||||
-- stage 1 -> stage 2
|
||||
r2.hit_load_valid <= r1.hit_load_valid;
|
||||
r2.hit_way <= r1.hit_way;
|
||||
r2.load_is_update <= r1.req.update;
|
||||
r2.load_reg <= r1.req.write_reg;
|
||||
r2.data_shift <= r1.req.addr(2 downto 0);
|
||||
r2.length <= r1.req.length;
|
||||
r2.sign_extend <= r1.req.sign_extend;
|
||||
r2.byte_reverse <= r1.req.byte_reverse;
|
||||
r2.second_dword <= r1.second_dword;
|
||||
r2.last_dword <= r1.second_dword or not r1.two_dwords;
|
||||
|
||||
-- If we have a request incoming, we have to latch it as d_in.valid
|
||||
-- is only set for a single cycle. It's up to the control logic to
|
||||
-- ensure we don't override an uncompleted request (for now we are
|
||||
@@ -759,7 +748,7 @@ begin
|
||||
r1.req <= d_in;
|
||||
r1.second_dword <= '0';
|
||||
r1.two_dwords <= two_dwords;
|
||||
r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";
|
||||
r1.next_addr <= next_addr;
|
||||
r1.next_sel <= bus_sel(15 downto 8);
|
||||
|
||||
report "op:" & op_t'image(req_op) &
|
||||
@@ -912,6 +901,9 @@ begin
|
||||
when OP_BAD =>
|
||||
end case;
|
||||
|
||||
when PRE_NEXT_DWORD =>
|
||||
r1.state <= NEXT_DWORD;
|
||||
|
||||
when RELOAD_WAIT_ACK =>
|
||||
-- Requests are all sent if stb is 0
|
||||
stbs_done := r1.wb.stb = '0';
|
||||
@@ -958,7 +950,7 @@ begin
|
||||
-- we also need to do the deferred update cycle.
|
||||
r1.slow_valid <= '1';
|
||||
if r1.two_dwords and not r1.second_dword then
|
||||
r1.state <= NEXT_DWORD;
|
||||
r1.state <= PRE_NEXT_DWORD;
|
||||
elsif r1.req.update = '1' then
|
||||
r1.state <= LOAD_UPDATE2;
|
||||
report "completing miss with load-update !";
|
||||
|
||||
@@ -89,5 +89,9 @@ begin
|
||||
|
||||
-- Update outputs
|
||||
l_out <= r;
|
||||
|
||||
-- Asynchronous output of the low-order address bits (latched in dcache)
|
||||
l_out.early_low_addr <= lsu_sum(11 downto 0);
|
||||
l_out.early_valid <= l_in.valid;
|
||||
end process;
|
||||
end;
|
||||
|
||||
Reference in New Issue
Block a user