mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-04-14 15:34:56 +00:00
dcache: Add support for unaligned loads and stores
For an unaligned load or store, we do the first doubleword (dword) of the transfer as normal, but then go to a new NEXT_DWORD state of the state machine to do the cache tag lookup for the second dword of the transfer. From the NEXT_DWORD state we have much the same transitions to other states as from the IDLE state (the transitions for OP_LOAD_HIT are a bit different but almost identical for the other op values). We now do the preparation of the data to be written in loadstore1, that is, byte reversal if necessary and rotation by a number of bytes based on the low 3 bits of the address. We do rotation not shifting so we have the bytes that need to go into the second doubleword in the right place in the low bytes of the data sent to dcache. The rotation and byte reversal are done in a single step with one multiplexer per byte by setting the select inputs for each byte appropriately. This also fixes writeback to not write the register value until it has received both pieces of an unaligned load value. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
This commit is contained in:
179
dcache.vhdl
179
dcache.vhdl
@@ -124,6 +124,7 @@ architecture rtl of dcache is
|
||||
|
||||
-- Cache state machine
|
||||
type state_t is (IDLE, -- Normal load hit processing
|
||||
NEXT_DWORD, -- Starting the 2nd xfer of misaligned
|
||||
LOAD_UPDATE, -- Load with update extra cycle
|
||||
LOAD_UPDATE2, -- Load with update extra cycle
|
||||
RELOAD_WAIT_ACK, -- Cache reload wait ack
|
||||
@@ -157,6 +158,12 @@ architecture rtl of dcache is
|
||||
hit_way : way_t;
|
||||
hit_load_valid : std_ulogic;
|
||||
|
||||
-- Info for doing the second transfer of a misaligned load/store
|
||||
two_dwords : std_ulogic;
|
||||
second_dword : std_ulogic;
|
||||
next_addr : std_ulogic_vector(63 downto 0);
|
||||
next_sel : std_ulogic_vector(7 downto 0);
|
||||
|
||||
-- Register update (load/store with update)
|
||||
update_valid : std_ulogic;
|
||||
|
||||
@@ -186,6 +193,8 @@ architecture rtl of dcache is
|
||||
sign_extend : std_ulogic;
|
||||
byte_reverse : std_ulogic;
|
||||
xerc : xer_common_t;
|
||||
last_dword : std_ulogic;
|
||||
second_dword : std_ulogic;
|
||||
end record;
|
||||
|
||||
signal r2 : reg_stage_2_t;
|
||||
@@ -196,7 +205,10 @@ architecture rtl of dcache is
|
||||
signal req_hit_way : way_t;
|
||||
signal req_tag : cache_tag_t;
|
||||
signal req_op : op_t;
|
||||
signal req_data : std_ulogic_vector(63 downto 0);
|
||||
signal req_addr : std_ulogic_vector(63 downto 0);
|
||||
signal req_laddr : std_ulogic_vector(63 downto 0);
|
||||
signal req_sel : std_ulogic_vector(7 downto 0);
|
||||
|
||||
-- Cache RAM interface
|
||||
type cache_ram_out_t is array(way_t) of cache_row_t;
|
||||
@@ -208,8 +220,9 @@ architecture rtl of dcache is
|
||||
signal replace_way : way_t;
|
||||
|
||||
-- Wishbone read/write/cache write formatting signals
|
||||
signal bus_sel : wishbone_sel_type;
|
||||
signal store_data : wishbone_data_type;
|
||||
signal bus_sel : std_ulogic_vector(15 downto 0);
|
||||
|
||||
signal two_dwords : std_ulogic;
|
||||
|
||||
--
|
||||
-- Helper functions to decode incoming requests
|
||||
@@ -307,17 +320,17 @@ architecture rtl of dcache is
|
||||
end case;
|
||||
end function length_to_sel;
|
||||
|
||||
-- Calculate shift and byte enables for wishbone
|
||||
function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is
|
||||
begin
|
||||
return to_integer(unsigned(address(2 downto 0))) * 8;
|
||||
end function wishbone_data_shift;
|
||||
|
||||
-- Calculate byte enables for wishbone
|
||||
-- This returns 16 bits, giving the select signals for two transfers,
|
||||
-- to account for unaligned loads or stores
|
||||
function wishbone_data_sel(size : in std_logic_vector(3 downto 0);
|
||||
address : in std_logic_vector(63 downto 0))
|
||||
return std_ulogic_vector is
|
||||
variable longsel : std_ulogic_vector(15 downto 0);
|
||||
begin
|
||||
return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)),
|
||||
longsel := (others => '0');
|
||||
longsel(7 downto 0) := length_to_sel(size);
|
||||
return std_ulogic_vector(shift_left(unsigned(longsel),
|
||||
to_integer(unsigned(address(2 downto 0)))));
|
||||
end function wishbone_data_sel;
|
||||
|
||||
@@ -383,23 +396,43 @@ begin
|
||||
variable tmp : std_ulogic_vector(63 downto 0);
|
||||
variable data : std_ulogic_vector(63 downto 0);
|
||||
variable opsel : std_ulogic_vector(3 downto 0);
|
||||
variable go : std_ulogic;
|
||||
variable is_load : std_ulogic;
|
||||
variable is_nc : std_ulogic;
|
||||
begin
|
||||
-- Extract line, row and tag from request
|
||||
req_index <= get_index(d_in.addr);
|
||||
req_row <= get_row(d_in.addr);
|
||||
req_tag <= get_tag(d_in.addr);
|
||||
if r1.state /= NEXT_DWORD then
|
||||
req_addr <= d_in.addr;
|
||||
req_data <= d_in.data;
|
||||
req_sel <= bus_sel(7 downto 0);
|
||||
go := d_in.valid;
|
||||
is_load := d_in.load;
|
||||
is_nc := d_in.nc;
|
||||
|
||||
-- Calculate address of beginning of cache line, will be
|
||||
-- used for cache miss processing if needed
|
||||
--
|
||||
req_laddr <= d_in.addr(63 downto LINE_OFF_BITS) &
|
||||
(LINE_OFF_BITS-1 downto 0 => '0');
|
||||
else
|
||||
req_addr <= r1.next_addr;
|
||||
req_data <= r1.req.data;
|
||||
req_sel <= r1.next_sel;
|
||||
go := '1';
|
||||
is_load := r1.req.load;
|
||||
is_nc := r1.req.nc;
|
||||
end if;
|
||||
|
||||
req_index <= get_index(req_addr);
|
||||
req_row <= get_row(req_addr);
|
||||
req_tag <= get_tag(req_addr);
|
||||
|
||||
-- Calculate address of beginning of cache line, will be
|
||||
-- used for cache miss processing if needed
|
||||
--
|
||||
req_laddr <= req_addr(63 downto LINE_OFF_BITS) &
|
||||
(LINE_OFF_BITS-1 downto 0 => '0');
|
||||
|
||||
-- Test if pending request is a hit on any way
|
||||
hit_way := 0;
|
||||
is_hit := '0';
|
||||
for i in way_t loop
|
||||
if d_in.valid = '1' and cache_valids(req_index)(i) = '1' then
|
||||
if go = '1' and cache_valids(req_index)(i) = '1' then
|
||||
if read_tag(i, cache_tags(req_index)) = req_tag then
|
||||
hit_way := i;
|
||||
is_hit := '1';
|
||||
@@ -416,7 +449,7 @@ begin
|
||||
-- Combine the request and cache his status to decide what
|
||||
-- operation needs to be done
|
||||
--
|
||||
opsel := d_in.valid & d_in.load & d_in.nc & is_hit;
|
||||
opsel := go & is_load & is_nc & is_hit;
|
||||
case opsel is
|
||||
when "1101" => op := OP_LOAD_HIT;
|
||||
when "1100" => op := OP_LOAD_MISS;
|
||||
@@ -433,22 +466,15 @@ begin
|
||||
|
||||
end process;
|
||||
|
||||
--
|
||||
-- Misc signal assignments
|
||||
--
|
||||
|
||||
-- Wire up wishbone request latch out of stage 1
|
||||
wishbone_out <= r1.wb;
|
||||
|
||||
-- Wishbone & BRAM write data formatting for stores (most of it already
|
||||
-- happens in loadstore1, this is the remaining data shifting)
|
||||
--
|
||||
store_data <= std_logic_vector(shift_left(unsigned(d_in.data),
|
||||
wishbone_data_shift(d_in.addr)));
|
||||
|
||||
-- Wishbone read and write and BRAM write sel bits generation
|
||||
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);
|
||||
|
||||
-- See if the operation crosses two doublewords
|
||||
two_dwords <= or (bus_sel(15 downto 8));
|
||||
|
||||
-- TODO: Generate errors
|
||||
-- err_nc_collision <= '1' when req_op = OP_BAD else '0';
|
||||
|
||||
@@ -469,7 +495,7 @@ begin
|
||||
d_out.write_shift <= r2.data_shift;
|
||||
d_out.sign_extend <= r2.sign_extend;
|
||||
d_out.byte_reverse <= r2.byte_reverse;
|
||||
d_out.second_word <= '0';
|
||||
d_out.second_word <= r2.second_dword;
|
||||
d_out.xerc <= r2.xerc;
|
||||
|
||||
-- We have a valid load or store hit or we just completed a slow
|
||||
@@ -497,8 +523,10 @@ begin
|
||||
if r2.hit_load_valid = '1' then
|
||||
d_out.write_enable <= '1';
|
||||
|
||||
-- If it's not a load with update, complete it now
|
||||
if r2.load_is_update = '0' then
|
||||
-- If there isn't another dword to go and
|
||||
-- it's not a load with update, complete it now
|
||||
if r2.last_dword = '1' and r2.load_is_update = '0' then
|
||||
report "completing load hit";
|
||||
d_out.valid <= '1';
|
||||
end if;
|
||||
end if;
|
||||
@@ -521,10 +549,14 @@ begin
|
||||
d_out.byte_reverse <= r1.req.byte_reverse;
|
||||
d_out.write_len <= r1.req.length;
|
||||
d_out.xerc <= r1.req.xerc;
|
||||
d_out.second_word <= r1.second_dword;
|
||||
end if;
|
||||
|
||||
-- If it's a store or a non-update load form, complete now
|
||||
if r1.req.load = '0' or r1.req.update = '0' then
|
||||
-- unless we need to do another dword transfer
|
||||
if (r1.req.load = '0' or r1.req.update = '0') and
|
||||
(r1.two_dwords = '0' or r1.second_dword = '1') then
|
||||
report "completing store or load miss";
|
||||
d_out.valid <= '1';
|
||||
end if;
|
||||
end if;
|
||||
@@ -543,11 +575,13 @@ begin
|
||||
d_out.sign_extend <= '0';
|
||||
d_out.byte_reverse <= '0';
|
||||
d_out.xerc <= r1.req.xerc;
|
||||
d_out.second_word <= '0';
|
||||
|
||||
-- If it was a load, this completes the operation (load with
|
||||
-- update case).
|
||||
--
|
||||
if r1.req.load = '1' then
|
||||
report "completing after load update";
|
||||
d_out.valid <= '1';
|
||||
end if;
|
||||
end if;
|
||||
@@ -605,11 +639,11 @@ begin
|
||||
-- For timing, the mux on wr_data/sel/addr is not dependent on anything
|
||||
-- other than the current state. Only the do_write signal is.
|
||||
--
|
||||
if r1.state = IDLE then
|
||||
-- When IDLE, the only write path is the store-hit update case
|
||||
if r1.state = IDLE or r1.state = NEXT_DWORD then
|
||||
-- In these states, the only write path is the store-hit update case
|
||||
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
|
||||
wr_data <= store_data;
|
||||
wr_sel <= bus_sel;
|
||||
wr_data <= req_data;
|
||||
wr_sel <= req_sel;
|
||||
else
|
||||
-- Otherwise, we might be doing a reload
|
||||
wr_data <= wishbone_in.dat;
|
||||
@@ -648,6 +682,8 @@ begin
|
||||
r2.length <= r1.req.length;
|
||||
r2.sign_extend <= r1.req.sign_extend;
|
||||
r2.byte_reverse <= r1.req.byte_reverse;
|
||||
r2.second_dword <= r1.second_dword;
|
||||
r2.last_dword <= r1.second_dword or not r1.two_dwords;
|
||||
|
||||
-- If we have a request incoming, we have to latch it as d_in.valid
|
||||
-- is only set for a single cycle. It's up to the control logic to
|
||||
@@ -655,8 +691,12 @@ begin
|
||||
-- single issue on load/stores so we are fine, later, we can generate
|
||||
-- a stall output if necessary).
|
||||
|
||||
if req_op /= OP_NONE then
|
||||
if req_op /= OP_NONE and d_in.valid = '1' then
|
||||
r1.req <= d_in;
|
||||
r1.second_dword <= '0';
|
||||
r1.two_dwords <= two_dwords;
|
||||
r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";
|
||||
r1.next_sel <= bus_sel(15 downto 8);
|
||||
|
||||
report "op:" & op_t'image(req_op) &
|
||||
" addr:" & to_hstring(d_in.addr) &
|
||||
@@ -666,6 +706,8 @@ begin
|
||||
" idx:" & integer'image(req_index) &
|
||||
" tag:" & to_hstring(req_tag) &
|
||||
" way: " & integer'image(req_hit_way);
|
||||
elsif r1.state = NEXT_DWORD then
|
||||
r1.second_dword <= '1';
|
||||
end if;
|
||||
|
||||
-- Fast path for load/store hits. Set signals for the writeback controls.
|
||||
@@ -713,24 +755,36 @@ begin
|
||||
r1.update_valid <= '0';
|
||||
|
||||
-- We cannot currently process a new request when not idle
|
||||
assert req_op = OP_NONE or r1.state = IDLE report "request " &
|
||||
assert d_in.valid = '0' or r1.state = IDLE report "request " &
|
||||
op_t'image(req_op) & " while in state " & state_t'image(r1.state)
|
||||
severity FAILURE;
|
||||
|
||||
-- Main state machine
|
||||
case r1.state is
|
||||
when IDLE =>
|
||||
when IDLE | NEXT_DWORD =>
|
||||
case req_op is
|
||||
when OP_LOAD_HIT =>
|
||||
-- We have a load with update hit, we need the delayed update cycle
|
||||
if d_in.update = '1' then
|
||||
r1.state <= LOAD_UPDATE;
|
||||
end if;
|
||||
when OP_LOAD_HIT =>
|
||||
if r1.state = IDLE then
|
||||
-- If the load is misaligned then we will need to start
|
||||
-- the state machine
|
||||
if two_dwords = '1' then
|
||||
r1.state <= NEXT_DWORD;
|
||||
elsif d_in.update = '1' then
|
||||
-- We have a load with update hit, we need the delayed update cycle
|
||||
r1.state <= LOAD_UPDATE;
|
||||
end if;
|
||||
else
|
||||
if r1.req.update = '1' then
|
||||
r1.state <= LOAD_UPDATE;
|
||||
else
|
||||
r1.state <= IDLE;
|
||||
end if;
|
||||
end if;
|
||||
|
||||
when OP_LOAD_MISS =>
|
||||
-- Normal load cache miss, start the reload machine
|
||||
--
|
||||
report "cache miss addr:" & to_hstring(d_in.addr) &
|
||||
report "cache miss addr:" & to_hstring(req_addr) &
|
||||
" idx:" & integer'image(req_index) &
|
||||
" way:" & integer'image(replace_way) &
|
||||
" tag:" & to_hstring(req_tag);
|
||||
@@ -765,8 +819,8 @@ begin
|
||||
r1.state <= RELOAD_WAIT_ACK;
|
||||
|
||||
when OP_LOAD_NC =>
|
||||
r1.wb.sel <= bus_sel;
|
||||
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000";
|
||||
r1.wb.sel <= req_sel;
|
||||
r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
|
||||
r1.wb.cyc <= '1';
|
||||
r1.wb.stb <= '1';
|
||||
r1.wb.we <= '0';
|
||||
@@ -774,12 +828,10 @@ begin
|
||||
|
||||
when OP_STORE_HIT | OP_STORE_MISS =>
|
||||
-- For store-with-update do the register update
|
||||
if d_in.update = '1' then
|
||||
r1.update_valid <= '1';
|
||||
end if;
|
||||
r1.wb.sel <= bus_sel;
|
||||
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000";
|
||||
r1.wb.dat <= store_data;
|
||||
r1.update_valid <= d_in.valid and d_in.update;
|
||||
r1.wb.sel <= req_sel;
|
||||
r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
|
||||
r1.wb.dat <= req_data;
|
||||
r1.wb.cyc <= '1';
|
||||
r1.wb.stb <= '1';
|
||||
r1.wb.we <= '1';
|
||||
@@ -831,11 +883,13 @@ begin
|
||||
-- Cache line is now valid
|
||||
cache_valids(r1.store_index)(r1.store_way) <= '1';
|
||||
|
||||
-- Complete the load that missed. For load with update
|
||||
-- Write back the load data that we got, and start
|
||||
-- the second dword if necessary. Otherwise, see if
|
||||
-- we also need to do the deferred update cycle.
|
||||
--
|
||||
r1.slow_valid <= '1';
|
||||
if r1.req.update = '1' then
|
||||
if r1.two_dwords and not r1.second_dword then
|
||||
r1.state <= NEXT_DWORD;
|
||||
elsif r1.req.update = '1' then
|
||||
r1.state <= LOAD_UPDATE2;
|
||||
report "completing miss with load-update !";
|
||||
else
|
||||
@@ -864,12 +918,15 @@ begin
|
||||
|
||||
-- Got ack ? complete.
|
||||
if wishbone_in.ack = '1' then
|
||||
r1.state <= IDLE;
|
||||
if r1.two_dwords and not r1.second_dword then
|
||||
r1.state <= NEXT_DWORD;
|
||||
elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then
|
||||
r1.state <= LOAD_UPDATE2;
|
||||
else
|
||||
r1.state <= IDLE;
|
||||
end if;
|
||||
if r1.state = NC_LOAD_WAIT_ACK then
|
||||
r1.slow_data <= wishbone_in.dat;
|
||||
if r1.req.update = '1' then
|
||||
r1.state <= LOAD_UPDATE2;
|
||||
end if;
|
||||
end if;
|
||||
r1.slow_valid <= '1';
|
||||
r1.wb.cyc <= '0';
|
||||
|
||||
@@ -35,12 +35,15 @@ begin
|
||||
|
||||
loadstore1_1: process(all)
|
||||
variable v : Loadstore1ToDcacheType;
|
||||
variable brev_lenm1 : unsigned(2 downto 0);
|
||||
variable byte_offset : unsigned(2 downto 0);
|
||||
variable j : integer;
|
||||
variable k : unsigned(2 downto 0);
|
||||
begin
|
||||
v := r;
|
||||
|
||||
v.valid := l_in.valid;
|
||||
v.load := l_in.load;
|
||||
v.data := l_in.data;
|
||||
v.write_reg := l_in.write_reg;
|
||||
v.length := l_in.length;
|
||||
v.byte_reverse := l_in.byte_reverse;
|
||||
@@ -63,9 +66,18 @@ begin
|
||||
|
||||
-- XXX Do length_to_sel here ?
|
||||
|
||||
-- byte reverse stores in the first cycle
|
||||
if v.load = '0' and l_in.byte_reverse = '1' then
|
||||
v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length)));
|
||||
-- Do byte reversing and rotating for stores in the first cycle
|
||||
if v.load = '0' then
|
||||
byte_offset := unsigned(lsu_sum(2 downto 0));
|
||||
brev_lenm1 := "000";
|
||||
if l_in.byte_reverse = '1' then
|
||||
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
|
||||
end if;
|
||||
for i in 0 to 7 loop
|
||||
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
|
||||
j := to_integer(k) * 8;
|
||||
v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
|
||||
end loop;
|
||||
end if;
|
||||
|
||||
v.addr := lsu_sum;
|
||||
|
||||
@@ -116,12 +116,12 @@ begin
|
||||
if l_in.byte_reverse = '1' then
|
||||
brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
|
||||
end if;
|
||||
w_out.write_enable <= '1';
|
||||
second_word <= l_in.second_word;
|
||||
if l_in.valid = '0' and (data_len + byte_offset > 8) then
|
||||
partial_write <= '1';
|
||||
end if;
|
||||
xe := l_in.xerc;
|
||||
w_out.write_enable <= not partial_write or second_word;
|
||||
end if;
|
||||
|
||||
-- shift and byte-reverse data bytes
|
||||
|
||||
Reference in New Issue
Block a user