antonblanchard.microwatt/litedram/extras/litedram-wrapper-l2.vhdl
Paul Mackerras ca4eb46aea Make wishbone addresses be in units of doublewords or words
This makes the 64-bit wishbone buses have the address expressed in
units of doublewords (64 bits), and similarly for the 32-bit buses the
address is in units of words (32 bits).  This is to comply with the
wishbone spec.  Previously the addresses on the wishbone buses were in
units of bytes regardless of the bus data width, which is not correct
and caused problems with interfacing with externally-generated logic.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2021-09-15 18:18:09 +10:00
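
In concrete terms, a byte address of 0x1000 is presented as wishbone address 0x200 on the 64-bit data bus (0x1000 / 8) and as 0x400 on the 32-bit control bus (0x1000 / 4); the wrapper below re-appends the implied low bits (e.g. adr & "000") when it prints byte addresses in its trace output.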

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use std.textio.all;
library work;
use work.wishbone_types.all;
use work.utils.all;
use work.helpers.all;
entity litedram_wrapper is
generic (
DRAM_ABITS : positive;
DRAM_ALINES : natural;
DRAM_DLINES : natural;
DRAM_PORT_WIDTH : positive;
-- Pseudo-ROM payload
PAYLOAD_SIZE : natural;
PAYLOAD_FILE : string;
-- L2 cache --
-- Line size in bytes
LINE_SIZE : positive := 128;
-- Number of lines in a set
NUM_LINES : positive := 64;
-- Number of ways
NUM_WAYS : positive := 4;
-- Max number of stores in the queue
STOREQ_DEPTH : positive := 8;
-- Don't send loads until all pending stores acked in litedram
NO_LS_OVERLAP : boolean := false;
-- Debug
LITEDRAM_TRACE : boolean := false;
TRACE : boolean := false
);
port(
-- LiteDRAM generates the system clock and reset
-- from the input clkin
clk_in : in std_ulogic;
rst : in std_ulogic;
system_clk : out std_ulogic;
system_reset : out std_ulogic;
core_alt_reset : out std_ulogic;
pll_locked : out std_ulogic;
-- Wishbone ports:
wb_in : in wishbone_master_out;
wb_out : out wishbone_slave_out;
wb_ctrl_in : in wb_io_master_out;
wb_ctrl_out : out wb_io_slave_out;
wb_ctrl_is_csr : in std_ulogic;
wb_ctrl_is_init : in std_ulogic;
-- Misc
init_done : out std_ulogic;
init_error : out std_ulogic;
-- DRAM wires
ddram_a : out std_ulogic_vector(DRAM_ALINES-1 downto 0);
ddram_ba : out std_ulogic_vector(2 downto 0);
ddram_ras_n : out std_ulogic;
ddram_cas_n : out std_ulogic;
ddram_we_n : out std_ulogic;
ddram_cs_n : out std_ulogic;
ddram_dm : out std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
ddram_dq : inout std_ulogic_vector(DRAM_DLINES-1 downto 0);
ddram_dqs_p : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
ddram_dqs_n : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
ddram_clk_p : out std_ulogic;
ddram_clk_n : out std_ulogic;
ddram_cke : out std_ulogic;
ddram_odt : out std_ulogic;
ddram_reset_n : out std_ulogic
);
end entity litedram_wrapper;
architecture behaviour of litedram_wrapper is
component litedram_core port (
clk : in std_ulogic;
rst : in std_ulogic;
pll_locked : out std_ulogic;
ddram_a : out std_ulogic_vector(DRAM_ALINES-1 downto 0);
ddram_ba : out std_ulogic_vector(2 downto 0);
ddram_ras_n : out std_ulogic;
ddram_cas_n : out std_ulogic;
ddram_we_n : out std_ulogic;
ddram_cs_n : out std_ulogic;
ddram_dm : out std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
ddram_dq : inout std_ulogic_vector(DRAM_DLINES-1 downto 0);
ddram_dqs_p : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
ddram_dqs_n : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
ddram_clk_p : out std_ulogic;
ddram_clk_n : out std_ulogic;
ddram_cke : out std_ulogic;
ddram_odt : out std_ulogic;
ddram_reset_n : out std_ulogic;
init_done : out std_ulogic;
init_error : out std_ulogic;
user_clk : out std_ulogic;
user_rst : out std_ulogic;
wb_ctrl_adr : in std_ulogic_vector(29 downto 0);
wb_ctrl_dat_w : in std_ulogic_vector(31 downto 0);
wb_ctrl_dat_r : out std_ulogic_vector(31 downto 0);
wb_ctrl_sel : in std_ulogic_vector(3 downto 0);
wb_ctrl_cyc : in std_ulogic;
wb_ctrl_stb : in std_ulogic;
wb_ctrl_ack : out std_ulogic;
wb_ctrl_we : in std_ulogic;
wb_ctrl_cti : in std_ulogic_vector(2 downto 0);
wb_ctrl_bte : in std_ulogic_vector(1 downto 0);
wb_ctrl_err : out std_ulogic;
user_port_native_0_cmd_valid : in std_ulogic;
user_port_native_0_cmd_ready : out std_ulogic;
user_port_native_0_cmd_we : in std_ulogic;
user_port_native_0_cmd_addr : in std_ulogic_vector(DRAM_ABITS-1 downto 0);
user_port_native_0_wdata_valid : in std_ulogic;
user_port_native_0_wdata_ready : out std_ulogic;
user_port_native_0_wdata_we : in std_ulogic_vector(DRAM_PORT_WIDTH/8-1 downto 0);
user_port_native_0_wdata_data : in std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0);
user_port_native_0_rdata_valid : out std_ulogic;
user_port_native_0_rdata_ready : in std_ulogic;
user_port_native_0_rdata_data : out std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0)
);
end component;
signal user_port0_cmd_valid : std_ulogic;
signal user_port0_cmd_ready : std_ulogic;
signal user_port0_cmd_we : std_ulogic;
signal user_port0_cmd_addr : std_ulogic_vector(DRAM_ABITS-1 downto 0);
signal user_port0_wdata_valid : std_ulogic;
signal user_port0_wdata_ready : std_ulogic;
signal user_port0_wdata_we : std_ulogic_vector(DRAM_PORT_WIDTH/8-1 downto 0);
signal user_port0_wdata_data : std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0);
signal user_port0_rdata_valid : std_ulogic;
signal user_port0_rdata_ready : std_ulogic;
signal user_port0_rdata_data : std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0);
signal wb_ctrl_adr : std_ulogic_vector(29 downto 0);
signal wb_ctrl_dat_w : std_ulogic_vector(31 downto 0);
signal wb_ctrl_dat_r : std_ulogic_vector(31 downto 0);
signal wb_ctrl_sel : std_ulogic_vector(3 downto 0);
signal wb_ctrl_cyc : std_ulogic := '0';
signal wb_ctrl_stb : std_ulogic;
signal wb_ctrl_ack : std_ulogic;
signal wb_ctrl_we : std_ulogic;
signal wb_init_in : wb_io_master_out;
signal wb_init_out : wb_io_slave_out;
-- DRAM data port width
constant DRAM_DBITS : natural := DRAM_PORT_WIDTH;
-- DRAM data port sel bits
constant DRAM_SBITS : natural := (DRAM_DBITS / 8);
-- WB geometry (just a few shortcuts)
constant WBL : positive := wb_in.dat'length;
constant WBSL : positive := wb_in.sel'length;
-- Select a WB word inside DRAM port width
constant WB_WORD_COUNT : positive := DRAM_DBITS/WBL;
constant WB_WSEL_BITS : positive := log2(WB_WORD_COUNT);
-- BRAM organisation: the data BRAM is as wide as the LiteDRAM port, so
-- each row holds the data of one LiteDRAM transaction, and a cache
-- "line" is made of consecutive rows.
--
-- ROW_SIZE is the width in bytes of the BRAM, i.e. the litedram port width
constant ROW_SIZE : natural := DRAM_DBITS / 8;
-- ROW_PER_LINE is the number of rows (litedram transactions) in a line
constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
-- BRAM_ROWS is the number of rows in BRAM needed to represent the full
-- dcache
constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
-- Bit fields counts in the address
-- ROW_BITS is the number of bits to select a row
constant ROW_BITS : natural := log2(BRAM_ROWS);
-- ROW_LINEBITS is the number of bits to select a row within a line
constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
-- LINE_OFF_BITS is the number of bits for the offset in a cache line
constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
-- ROW_OFF_BITS is the number of bits for the offset in a row
constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
-- REAL_ADDR_BITS is the number of real address bits that we store
constant REAL_ADDR_BITS : positive := DRAM_ABITS + ROW_OFF_BITS;
-- INDEX_BITS is the number of bits to select a cache line
constant INDEX_BITS : natural := log2(NUM_LINES);
-- SET_SIZE_BITS is the log base 2 of the set size
constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
-- TAG_BITS is the number of bits of the tag part of the address
constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
-- WAY_BITS is the number of bits to select a way
constant WAY_BITS : natural := log2(NUM_WAYS);
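-- Example geometry, assuming a 128-bit LiteDRAM port (DRAM_PORT_WIDTH = 128)
-- and the default generics above (LINE_SIZE = 128, NUM_LINES = 64, NUM_WAYS = 4):
-- ROW_SIZE = 16 bytes, ROW_PER_LINE = 8, BRAM_ROWS = 512, ROW_BITS = 9,
-- ROW_LINEBITS = 3, LINE_OFF_BITS = 7, ROW_OFF_BITS = 4, INDEX_BITS = 6,
-- SET_SIZE_BITS = 13, WAY_BITS = 2 and TAG_BITS = DRAM_ABITS - 9.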
subtype row_t is integer range 0 to BRAM_ROWS-1;
subtype index_t is integer range 0 to NUM_LINES-1;
subtype way_t is integer range 0 to NUM_WAYS-1;
subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
-- The cache data BRAM organized as described above for each way
subtype cache_row_t is std_ulogic_vector(DRAM_DBITS-1 downto 0);
-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-- not handle a clean (commented) definition of the cache tags as a 3d
-- memory. For now, work around it by putting all the tags of a set in one wide row.
subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
-- type cache_tags_set_t is array(way_t) of cache_tag_t;
-- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
type cache_tags_array_t is array(index_t) of cache_tags_set_t;
-- The cache valid bits
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
type cache_valids_t is array(index_t) of cache_way_valids_t;
-- "Temporary" valid bits for the rows of the currently refilled line
type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
-- Storage. The data rows live in the per-way cache_ram instances below
-- (hopefully BRAM); the tags and valid bits here should map to LUTs.
signal cache_tags : cache_tags_array_t;
signal cache_valids : cache_valids_t;
attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed";
--
-- Store queue signals
--
-- We store a single wishbone dword per entry (64-bit)
-- along with the wishbone sel bits and the necessary address
-- bits to select which part of the DRAM port to write to.
constant STOREQ_BITS : positive := WBL + WBSL + WB_WSEL_BITS;
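-- With the 64-bit wishbone and a 128-bit DRAM port this is 64 data bits +
-- 8 sel bits + 1 word-select bit = 73 bits per entry, packed as
-- dat & sel & wsel (see storeq_control below).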
signal storeq_rd_ready : std_ulogic;
signal storeq_rd_valid : std_ulogic;
signal storeq_rd_data : std_ulogic_vector(STOREQ_BITS-1 downto 0);
signal storeq_wr_ready : std_ulogic;
signal storeq_wr_valid : std_ulogic;
signal storeq_wr_data : std_ulogic_vector(STOREQ_BITS-1 downto 0);
--
-- Cache management signals
--
-- Cache state machine
type state_t is (IDLE, -- Normal load hit processing
REFILL_CLR_TAG, -- Cache refill clear tag
REFILL_WAIT_ACK); -- Cache refill wait ack
signal state : state_t;
-- Latched WB request
signal wb_req : wishbone_master_out := wishbone_master_out_init;
-- Stashed WB request
signal wb_stash : wishbone_master_out := wishbone_master_out_init;
-- Read pipeline (to handle cache RAM latency)
signal read_ack_0 : std_ulogic := '0';
signal read_ack_1 : std_ulogic := '0';
signal read_wsl_0 : std_ulogic_vector(WB_WSEL_BITS-1 downto 0) := (others => '0');
signal read_wsl_1 : std_ulogic_vector(WB_WSEL_BITS-1 downto 0) := (others => '0');
signal read_way_0 : way_t;
signal read_way_1 : way_t;
-- Store ack pipeline
signal store_ack_0 : std_ulogic := '0';
signal store_ack_1 : std_ulogic := '0';
-- Async signals decoding latched request
type req_op_t is (OP_NONE,
OP_LOAD_HIT,
OP_LOAD_MISS,
OP_STORE_HIT,
OP_STORE_MISS,
OP_STORE_DELAYED);
signal req_index : index_t;
signal req_row : row_t;
signal req_hit_way : way_t;
signal req_tag : cache_tag_t;
signal req_op : req_op_t;
signal req_laddr : std_ulogic_vector(REAL_ADDR_BITS-1 downto 0);
signal req_wsl : std_ulogic_vector(WB_WSEL_BITS-1 downto 0);
signal req_we : std_ulogic_vector(DRAM_SBITS-1 downto 0);
signal req_wdata : std_ulogic_vector(DRAM_DBITS-1 downto 0);
signal stall : std_ulogic;
-- Line refill command signals and latches
signal refill_cmd_valid : std_ulogic;
signal refill_cmd_addr : std_ulogic_vector(DRAM_ABITS-1 downto 0);
signal refill_way : way_t;
signal refill_index : index_t;
signal refill_row : row_t;
signal refill_end_row : row_in_line_t;
signal refill_rows_valid : row_per_line_valid_t;
-- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t;
-- PLRU output interface
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_victim : plru_out_t;
--
-- Helper functions to decode incoming requests
--
-- Return the DRAM real address from a wishbone address
function get_real_addr(addr: wishbone_addr_type) return std_ulogic_vector is
variable ra: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0) := (others => '0');
begin
ra(REAL_ADDR_BITS - 1 downto wishbone_log2_width) :=
addr(REAL_ADDR_BITS - wishbone_log2_width - 1 downto 0);
return ra;
end;
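-- e.g. with the 64-bit data bus (wishbone_log2_width = 3), wishbone
-- address 0x200 becomes real byte address 0x1000, with the low 3 bits zero.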
-- Return the cache line index (tag index) for an address
function get_index(addr: wishbone_addr_type) return index_t is
begin
return to_integer(unsigned(addr(SET_SIZE_BITS - wishbone_log2_width - 1 downto
LINE_OFF_BITS - wishbone_log2_width)));
end;
-- Return the cache row index (data memory) for an address
function get_row(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto 0)) return row_t is
begin
return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)));
end;
-- Return the index of a row within a line
function get_row_of_line(row: row_t) return row_in_line_t is
variable row_v : unsigned(ROW_BITS-1 downto 0);
begin
row_v := to_unsigned(row, ROW_BITS);
return row_v(ROW_LINEBITS-1 downto 0);
end;
-- Returns whether this is the last row of a line. It takes a DRAM address
function is_last_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS);
last: row_in_line_t)
return boolean is
begin
return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
end;
-- Returns whether this is the last row of a line
function is_last_row(row: row_t; last: row_in_line_t) return boolean is
begin
return get_row_of_line(row) = last;
end;
-- Return the address of the next row in the current cache line. It takes a
-- DRAM address
function next_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS))
return std_ulogic_vector is
variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
variable result : std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS);
begin
-- Is there no simpler way in VHDL to generate that 3 bits adder ?
row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
result := addr;
result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
return result;
end;
-- Return the next row in the current cache line. We use a dedicated
-- function in order to limit the size of the generated adder to be
-- only the bits within a cache line (3 bits with default settings)
--
function next_row(row: row_t) return row_t is
variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
begin
row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
row_idx := row_v(ROW_LINEBITS-1 downto 0);
row_v(ROW_LINEBITS-1 downto 0) := std_ulogic_vector(unsigned(row_idx) + 1);
return to_integer(unsigned(row_v));
end;
-- Get the tag value from the address
function get_tag(addr: wishbone_addr_type) return cache_tag_t is
begin
return addr(REAL_ADDR_BITS - wishbone_log2_width - 1 downto
SET_SIZE_BITS - wishbone_log2_width);
end;
-- Read a tag from a tag memory row
function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
begin
return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
end;
-- Write a tag to tag memory row
procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
tag: cache_tag_t) is
begin
tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
end;
begin
-- Sanity checks
assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
report "geometry bits don't add up" severity FAILURE;
assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
report "geometry bits don't add up" severity FAILURE;
assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
report "geometry bits don't add up" severity FAILURE;
assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
report "geometry bits don't add up" severity FAILURE;
-- The core uses its alternate reset address while DRAM is not yet initialized.
core_alt_reset <= not init_done;
-- Init code BRAM memory slave
init_ram_0: entity work.dram_init_mem
generic map(
EXTRA_PAYLOAD_FILE => PAYLOAD_FILE,
EXTRA_PAYLOAD_SIZE => PAYLOAD_SIZE
)
port map(
clk => system_clk,
wb_in => wb_init_in,
wb_out => wb_init_out
);
--
-- Control bus wishbone: this muxes the control wishbone between the
-- LiteDRAM CSRs and a small internal bus to the init code BRAM
--
-- Init DRAM wishbone IN signals
wb_init_in.adr <= wb_ctrl_in.adr;
wb_init_in.dat <= wb_ctrl_in.dat;
wb_init_in.sel <= wb_ctrl_in.sel;
wb_init_in.we <= wb_ctrl_in.we;
wb_init_in.stb <= wb_ctrl_in.stb;
wb_init_in.cyc <= wb_ctrl_in.cyc and wb_ctrl_is_init;
-- DRAM CSR IN signals. Extra latch to help with timing
csr_latch: process(system_clk)
begin
if rising_edge(system_clk) then
if system_reset = '1' then
wb_ctrl_cyc <= '0';
wb_ctrl_stb <= '0';
else
-- XXX Maybe only update addr when cyc = '1' to save power ?
wb_ctrl_adr <= x"0000" & wb_ctrl_in.adr(13 downto 0);
wb_ctrl_dat_w <= wb_ctrl_in.dat;
wb_ctrl_sel <= wb_ctrl_in.sel;
wb_ctrl_we <= wb_ctrl_in.we;
wb_ctrl_cyc <= wb_ctrl_in.cyc and wb_ctrl_is_csr;
wb_ctrl_stb <= wb_ctrl_in.stb and wb_ctrl_is_csr;
-- Clear stb on ack otherwise the memory will latch
-- the write twice which breaks levelling. On the next
-- cycle we will latch an updated stb that takes the
-- ack into account.
if wb_ctrl_ack = '1' then
wb_ctrl_stb <= '0';
end if;
end if;
end if;
end process;
-- Ctrl bus wishbone OUT signals. XXX Consider adding latch on
-- CSR response to help timing
wb_ctrl_out.ack <= wb_ctrl_ack when wb_ctrl_is_csr = '1'
else wb_init_out.ack;
wb_ctrl_out.dat <= wb_ctrl_dat_r when wb_ctrl_is_csr = '1'
else wb_init_out.dat;
wb_ctrl_out.stall <= wb_init_out.stall when wb_ctrl_is_init else
'0' when wb_ctrl_in.cyc = '0' else not wb_ctrl_ack;
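-- (The CSR path thus behaves as a simple non-pipelined slave: stall is
-- asserted whenever a cycle is in progress and not yet acked.)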
-- Generate a cache RAM for each way
rams: for i in 0 to NUM_WAYS-1 generate
signal do_read : std_ulogic;
signal do_write : std_ulogic;
signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal wr_data : std_ulogic_vector(DRAM_DBITS-1 downto 0);
signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
signal dout : cache_row_t;
begin
way: entity work.cache_ram
generic map (
ROW_BITS => ROW_BITS,
WIDTH => DRAM_DBITS,
ADD_BUF => true
)
port map (
clk => system_clk,
rd_en => do_read,
rd_addr => rd_addr,
rd_data => dout,
wr_sel => wr_sel_m,
wr_addr => wr_addr,
wr_data => wr_data
);
process(all)
begin
--
-- Read port
--
do_read <= '1';
cache_out(i) <= dout;
rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
--
-- Write mux: cache refills from DRAM or writes from Wishbone
--
if req_op = OP_STORE_HIT and req_hit_way = i then
-- Write from wishbone
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= req_wdata;
wr_sel <= req_we;
else
-- Refill from DRAM
wr_data <= user_port0_rdata_data;
wr_sel <= (others => '1');
wr_addr <= std_ulogic_vector(to_unsigned(refill_row, ROW_BITS));
end if;
--
-- Write enable logic
--
do_write <= '0';
if req_op = OP_STORE_HIT and req_hit_way = i then
do_write <= '1';
elsif user_port0_rdata_valid = '1' and refill_way = i then
do_write <= '1';
end if;
-- Mask write selects with do_write since BRAM doesn't always
-- have a global write-enable (Vivado generates TDP instead
-- of SDP when using one, thus doubling cache BRAM usage).
for b in 0 to ROW_SIZE-1 loop
wr_sel_m(b) <= wr_sel(b) and do_write;
end loop;
if TRACE and rising_edge(system_clk) then
if do_write = '1' then
report "cache write way:" & integer'image(i) &
" addr:" & to_hstring(wr_addr) &
" sel:" & to_hstring(wr_sel_m) &
" data:" & to_hstring(wr_data);
end if;
end if;
end process;
end generate;
-- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate
begin
plrus: for i in 0 to NUM_LINES-1 generate
-- PLRU interface
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_acc_en : std_ulogic;
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
plru : entity work.plru
generic map (
BITS => WAY_BITS
)
port map (
clk => system_clk,
rst => system_reset,
acc => plru_acc,
acc_en => plru_acc_en,
lru => plru_out
);
process(req_index, req_op, req_hit_way, plru_out)
begin
-- PLRU interface
if (req_op = OP_LOAD_HIT or
req_op = OP_STORE_HIT) and req_index = i then
plru_acc_en <= '1';
else
plru_acc_en <= '0';
end if;
plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
plru_victim(i) <= plru_out;
end process;
end generate;
end generate;
--
-- Wishbone request interface:
--
-- - Incoming wishbone request latch (to help with timing)
-- - Read response pipeline (to match BRAM output buffer delay)
-- - Stall generation
--
-- XXX TODO: Properly handle cyc drops before all acks are sent...
--
request_latch: process(system_clk)
begin
if rising_edge(system_clk) then
-- Implement a stash buffer. If we are stalled and stash is
-- free, fill it up. This will generate a WB stall on the
-- next cycle.
if stall = '1' and wb_out.stall = '0' and wb_in.cyc = '1' and wb_in.stb = '1' then
wb_stash <= wb_in;
if TRACE then
report "stashed wb req ! addr:" & to_hstring(wb_in.adr & "000") &
" we:" & std_ulogic'image(wb_in.we) &
" sel:" & to_hstring(wb_in.sel);
end if;
end if;
-- We aren't stalled, see what we can do
if stall = '0' then
if wb_stash.cyc = '1' then
-- Something in stash ! use it and clear stash
wb_req <= wb_stash;
wb_stash.cyc <= '0';
if TRACE then
report "unstashed wb req ! addr:" & to_hstring(wb_stash.adr & "000") &
" we:" & std_ulogic'image(wb_stash.we) &
" sel:" & to_hstring(wb_stash.sel);
end if;
else
-- Grab request from WB
if wb_in.cyc = '1' then
wb_req <= wb_in;
else
wb_req.cyc <= wb_in.cyc;
wb_req.stb <= wb_in.stb;
end if;
if TRACE then
if wb_in.cyc = '1' and wb_in.stb = '1' then
report "latch new wb req ! addr:" & to_hstring(wb_in.adr & "000") &
" we:" & std_ulogic'image(wb_in.we) &
" sel:" & to_hstring(wb_in.sel);
end if;
end if;
end if;
end if;
end if;
end process;
-- Stall when stash is full
wb_out.stall <= wb_stash.cyc;
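-- (The stash holds a single request, so at most one extra request is
-- accepted after the internal stall rises before the master sees wb_out.stall.)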
--
-- Read response pipeline
--
read_pipe: process(system_clk)
begin
if rising_edge(system_clk) then
read_ack_0 <= '1' when req_op = OP_LOAD_HIT else '0';
read_wsl_0 <= req_wsl;
read_way_0 <= req_hit_way;
read_ack_1 <= read_ack_0;
read_wsl_1 <= read_wsl_0;
read_way_1 <= read_way_0;
if TRACE then
if req_op = OP_LOAD_HIT then
report "Load hit addr:" & to_hstring(wb_req.adr & "000") &
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way:" & integer'image(req_hit_way);
elsif req_op = OP_LOAD_MISS then
report "Load miss addr:" & to_hstring(wb_req.adr & "000");
end if;
if read_ack_0 = '1' then
report "read data:" & to_hstring(cache_out(read_way_0));
end if;
end if;
end if;
end process;
--
-- Store acks pipeline
--
store_ack_pipe: process(system_clk)
begin
if rising_edge(system_clk) then
store_ack_1 <= store_ack_0;
end if;
end process;
--
-- Wishbone response generation
--
wb_response: process(all)
variable rdata : std_ulogic_vector(DRAM_DBITS-1 downto 0);
variable store_done : std_ulogic;
variable accept_store : std_ulogic;
variable wsel : natural range 0 to WB_WORD_COUNT-1;
begin
-- Can we accept a store ? This is set when the store queue & command
-- queue are not full.
--
-- This does *not* mean that we will accept the store, there are other
-- reasons to delay them (see OP_STORE_DELAYED).
--
-- A store is fully accepted when *both* req_op is not OP_STORE_DELAYED
-- and accept_store is '1'.
--
-- The reason for this split is to avoid a circular dependency inside
-- LiteDRAM, since cmd_ready from litedram is driven from cmd_valid (*)
-- we don't want to generate cmd_valid from cmd_ready. So we generate
-- it instead from all the *other* conditions that make a store valid.
--
-- (*) It's my understanding that user_port0_cmd_ready from LiteDRAM is
-- combinational from user_port0_cmd_valid along with a bunch of other
-- internal signals, i.e. we won't know that LiteDRAM cannot accept a
-- command until we try to send one.
--
accept_store := user_port0_cmd_ready and storeq_wr_ready;
-- Generate stalls. For stores we stall if we can't accept it.
-- For loads, we stall if we are going to take a load miss or
-- are in the middle of a refill and it isn't a partial hit.
if req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then
stall <= not accept_store;
elsif req_op = OP_LOAD_MISS or req_op = OP_STORE_DELAYED then
stall <= '1';
else
stall <= '0';
end if;
-- Data out mux
rdata := cache_out(read_way_1);
-- Hard wired for 64-bit wishbone
wsel := to_integer(unsigned(read_wsl_1));
wb_out.dat <= rdata((wsel+1)*WBL-1 downto wsel*WBL);
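-- e.g. with a 128-bit DRAM row, read_wsl_1 selects either the low or the
-- high 64-bit half of the row.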
-- Early-complete stores on wishbone.
if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
store_done := accept_store;
else
store_done := '0';
end if;
-- Pipeline store acks
store_ack_0 <= store_done;
-- Generate Wishbone ACKs on read hits and store completions
--
-- These could collide when a store follows right behind loads ! This is
-- why we delay a store when a load ack is in the pipeline in the
-- request decoder below.
--
wb_out.ack <= read_ack_1 or store_ack_1;
assert read_ack_1 = '0' or store_ack_1 = '0' report
"Read ack and store ack collision !"
severity failure;
end process;
--
-- Cache request decode
--
request_decode: process(all)
variable valid : boolean;
variable is_hit : boolean;
variable store_delay : boolean;
variable hit_way : way_t;
begin
-- Extract line, row and tag from request
req_index <= get_index(wb_req.adr);
req_row <= get_row(get_real_addr(wb_req.adr));
req_tag <= get_tag(wb_req.adr);
-- Calculate the real address of the request; it will be
-- used for cache miss processing if needed
req_laddr <= get_real_addr(wb_req.adr);
-- Do we have a valid request in the WB latch ?
valid := wb_req.cyc = '1' and wb_req.stb = '1';
-- Store signals (hard wired for 64-bit wishbone at the moment)
req_wsl <= wb_req.adr(WB_WSEL_BITS-1 downto 0);
for i in 0 to WB_WORD_COUNT-1 loop
if to_integer(unsigned(req_wsl)) = i then
req_we(WBSL*(i+1)-1 downto WBSL*i) <= wb_req.sel;
else
req_we(WBSL*(i+1)-1 downto WBSL*i) <= x"00";
end if;
req_wdata(WBL*(i+1)-1 downto WBL*i) <= wb_req.dat;
end loop;
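-- The store data is thus replicated across every 64-bit lane of the DRAM
-- word; req_we enables only the lane selected by req_wsl.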
-- Test if pending request is a hit on any way
hit_way := 0;
is_hit := false;
for i in way_t loop
if valid and
(cache_valids(req_index)(i) = '1' or
(state = REFILL_WAIT_ACK and
req_index = refill_index and i = refill_way and
refill_rows_valid(req_row mod ROW_PER_LINE) = '1')) then
if read_tag(i, cache_tags(req_index)) = req_tag then
hit_way := i;
is_hit := true;
end if;
end if;
end loop;
-- We need to delay stores under some circumstances to avoid
-- collisions with the refill machine.
--
-- Corner case !!! The read acks pipeline takes two extra cycles
-- which means a store ack can collide with a previous load hit
-- ack. Thus we stall stores if we have a load ack pending.
--
if read_ack_0 = '1' or read_ack_1 = '1' then
-- Clash with pending read acks, delay..
store_delay := true;
elsif state /= IDLE then
-- If the reload machine is active, we cannot accept a store
-- for now.
--
-- We could improve this a bit by allowing stores if we have sent
-- all the requests down to litedram (we are only waiting for the
-- responses) *and* either of those conditions is true:
--
-- * It's a miss (doesn't require a write to BRAM) and isn't
-- for the line being reloaded (otherwise we might reload
-- stale data into the cache).
-- * It's a hit on a different way than the one being reloaded
-- in which case there is no conflict for BRAM access.
--
-- Otherwise we delay it...
--
store_delay := true;
else
store_delay := false;
end if;
-- Generate the req op. We only allow OP_LOAD_* when in the
-- IDLE state as our PLRU and ACK generation rely on this;
-- stores are likewise only accepted in the IDLE state.
--
req_op <= OP_NONE;
if valid then
if wb_req.we = '1' then
if store_delay then
req_op <= OP_STORE_DELAYED;
elsif is_hit then
req_op <= OP_STORE_HIT;
else
req_op <= OP_STORE_MISS;
end if;
else
if is_hit then
req_op <= OP_LOAD_HIT;
else
req_op <= OP_LOAD_MISS;
end if;
end if;
end if;
req_hit_way <= hit_way;
end process;
--
-- Store queue
--
-- For now, queue up to STOREQ_DEPTH stores (8 by default)
store_queue: entity work.sync_fifo
generic map (
DEPTH => STOREQ_DEPTH,
WIDTH => STOREQ_BITS
)
port map (
clk => system_clk,
reset => system_reset,
rd_ready => storeq_rd_ready,
rd_valid => storeq_rd_valid,
rd_data => storeq_rd_data,
wr_ready => storeq_wr_ready,
wr_valid => storeq_wr_valid,
wr_data => storeq_wr_data
);
storeq_control : process(all)
variable stq_data : wishbone_data_type;
variable stq_sel : wishbone_sel_type;
variable stq_wsl : std_ulogic_vector(WB_WSEL_BITS-1 downto 0);
begin
storeq_wr_data <= wb_req.dat & wb_req.sel &
wb_req.adr(WB_WSEL_BITS-1 downto 0);
-- Only queue stores if we can also send a command
if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
storeq_wr_valid <= user_port0_cmd_ready;
else
storeq_wr_valid <= '0';
end if;
-- Store signals (hard wired for 64-bit wishbone at the moment)
stq_data := storeq_rd_data(storeq_rd_data'left downto WBSL+WB_WSEL_BITS);
stq_sel := storeq_rd_data(WBSL+WB_WSEL_BITS-1 downto WB_WSEL_BITS);
stq_wsl := storeq_rd_data(WB_WSEL_BITS-1 downto 0);
for i in 0 to WB_WORD_COUNT-1 loop
if to_integer(unsigned(stq_wsl)) = i then
user_port0_wdata_we(WBSL*(i+1)-1 downto WBSL*i) <= stq_sel;
else
user_port0_wdata_we(WBSL*(i+1)-1 downto WBSL*i) <= x"00";
end if;
user_port0_wdata_data(WBL*(i+1)-1 downto WBL*i) <= stq_data;
end loop;
-- Note: Current litedram ignores user_port0_wdata_valid. We
-- must make sure to always have the data available at the
-- output of the store queue when we send the write command.
--
-- Thankfully this is always the case with this design.
--
user_port0_wdata_valid <= storeq_rd_valid;
storeq_rd_ready <= user_port0_wdata_ready;
if TRACE then
if rising_edge(system_clk) then
if req_op = OP_STORE_HIT then
report "Store hit to:" &
to_hstring(wb_req.adr(DRAM_ABITS downto 0) & "000") &
" data:" & to_hstring(req_wdata) &
" we:" & to_hstring(req_we) &
" V:" & std_ulogic'image(user_port0_cmd_ready);
elsif req_op = OP_STORE_MISS then
report "Store miss to:" &
to_hstring(wb_req.adr(DRAM_ABITS downto 0) & "000") &
" data:" & to_hstring(req_wdata) &
" we:" & to_hstring(req_we) &
" V:" & std_ulogic'image(user_port0_cmd_ready);
end if;
if storeq_wr_valid = '1' and storeq_wr_ready = '1' then
report "storeq push " & to_hstring(storeq_wr_data);
end if;
if storeq_rd_valid = '1' and storeq_rd_ready = '1' then
report "storeq pop " & to_hstring(storeq_rd_data);
end if;
end if;
end if;
end process;
-- LiteDRAM command mux
dram_commands: process(all)
begin
if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
-- For stores, forward signals directly. Only send command if
-- the FIFO can accept a store.
user_port0_cmd_addr <= wb_req.adr(DRAM_ABITS + ROW_OFF_BITS - wishbone_log2_width - 1 downto
ROW_OFF_BITS - wishbone_log2_width);
user_port0_cmd_we <= '1';
user_port0_cmd_valid <= storeq_wr_ready;
else
-- For loads, we route via a latch controlled by the refill machine
user_port0_cmd_addr <= refill_cmd_addr;
user_port0_cmd_valid <= refill_cmd_valid;
user_port0_cmd_we <= '0';
end if;
-- Note: litedram ignores this signal and assumes we are
-- always ready to accept read data.
user_port0_rdata_ready <= '1'; -- Always 1
end process;
-- LiteDRAM refill machine
--
-- This handles the cache line refills
--
refill_machine : process(system_clk)
variable tagset : cache_tags_set_t;
variable cmds_done : boolean;
variable wait_qdrain : boolean;
begin
if rising_edge(system_clk) then
-- On reset, clear all valid bits to force misses
if system_reset = '1' then
for i in index_t loop
cache_valids(i) <= (others => '0');
end loop;
state <= IDLE;
refill_cmd_valid <= '0';
else
-- Main state machine
case state is
when IDLE =>
assert refill_cmd_valid = '0' report "refill cmd valid in IDLE state !"
severity failure;
-- Reset per-row valid flags, only used in WAIT_ACK
for i in 0 to ROW_PER_LINE - 1 loop
refill_rows_valid(i) <= '0';
end loop;
-- If NO_LS_OVERLAP is set, disallow a load miss if the store
-- queue still has data in it.
wait_qdrain := false;
if NO_LS_OVERLAP then
wait_qdrain := storeq_rd_valid = '1';
end if;
-- We need to read a cache line
if req_op = OP_LOAD_MISS and not wait_qdrain then
-- Grab way to replace
refill_way <= to_integer(unsigned(plru_victim(req_index)));
-- Keep track of our index and way for subsequent stores
refill_index <= req_index;
refill_row <= get_row(req_laddr);
refill_end_row <= get_row_of_line(get_row(req_laddr)) - 1;
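-- The refill starts at the requested row and wraps around the line, so
-- the row holding the requested data comes back from LiteDRAM first
-- (critical word first) and the last row fetched is the one just before it.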
-- Prep for first DRAM read
--
-- XXX TODO: We could start a cycle early here by using
-- combo logic to generate the first command in
-- "dram_commands". In fact, we could make refill_cmd_addr
-- only contain the "counter" bits and wire it with the
-- other bits from req_laddr.
refill_cmd_addr <= req_laddr(DRAM_ABITS+ROW_OFF_BITS-1 downto ROW_OFF_BITS);
refill_cmd_valid <= '1';
if TRACE then
report "refill addr " & to_hstring(req_laddr);
end if;
-- Track that we had one request sent
state <= REFILL_CLR_TAG;
end if;
when REFILL_CLR_TAG | REFILL_WAIT_ACK =>
-- Delayed tag clearing to help timing on PLRU output
if state = REFILL_CLR_TAG then
-- Force misses on that way while refilling that line
cache_valids(req_index)(refill_way) <= '0';
-- Store new tag in selected way
for i in 0 to NUM_WAYS-1 loop
if i = refill_way then
tagset := cache_tags(refill_index);
write_tag(i, tagset, req_tag);
cache_tags(refill_index) <= tagset;
end if;
end loop;
state <= REFILL_WAIT_ACK;
end if;
-- Commands are all sent once refill_cmd_valid is 0
cmds_done := refill_cmd_valid = '0';
-- If we are still sending requests, was one accepted ?
if user_port0_cmd_ready = '1' and not cmds_done then
-- That was the last word ? We are done sending. Clear
-- command valid and set cmds_done so we can handle an
-- eventual last ack on the same cycle.
--
if TRACE then
report "got refill cmd ack !";
end if;
if is_last_row_addr(refill_cmd_addr, refill_end_row) then
refill_cmd_valid <= '0';
cmds_done := true;
if TRACE then
report "all refill cmds done !";
end if;
else
-- Calculate the next row address
refill_cmd_addr <= next_row_addr(refill_cmd_addr);
if TRACE then
report "refill addr " &
to_hstring(next_row_addr(refill_cmd_addr));
end if;
end if;
end if;
-- Incoming read data processing
if user_port0_rdata_valid = '1' then
if TRACE then
report "got refill data ack !";
end if;
-- Mark partial line valid
refill_rows_valid(refill_row mod ROW_PER_LINE) <= '1';
-- Check for completion
if cmds_done and is_last_row(refill_row, refill_end_row) then
if TRACE then
report "all refill data done !";
end if;
-- Cache line is now valid
cache_valids(refill_index)(refill_way) <= '1';
-- We are done
state <= IDLE;
end if;
-- Increment the refill row counter (next row written into the BRAM)
refill_row <= next_row(refill_row);
end if;
end case;
end if;
end if;
end process;
may_trace: if LITEDRAM_TRACE generate
component litedram_trace_stub
end component;
begin
litedram_trace: litedram_trace_stub;
end generate;
litedram: litedram_core
port map(
clk => clk_in,
rst => rst,
pll_locked => pll_locked,
ddram_a => ddram_a,
ddram_ba => ddram_ba,
ddram_ras_n => ddram_ras_n,
ddram_cas_n => ddram_cas_n,
ddram_we_n => ddram_we_n,
ddram_cs_n => ddram_cs_n,
ddram_dm => ddram_dm,
ddram_dq => ddram_dq,
ddram_dqs_p => ddram_dqs_p,
ddram_dqs_n => ddram_dqs_n,
ddram_clk_p => ddram_clk_p,
ddram_clk_n => ddram_clk_n,
ddram_cke => ddram_cke,
ddram_odt => ddram_odt,
ddram_reset_n => ddram_reset_n,
init_done => init_done,
init_error => init_error,
user_clk => system_clk,
user_rst => system_reset,
wb_ctrl_adr => wb_ctrl_adr,
wb_ctrl_dat_w => wb_ctrl_dat_w,
wb_ctrl_dat_r => wb_ctrl_dat_r,
wb_ctrl_sel => wb_ctrl_sel,
wb_ctrl_cyc => wb_ctrl_cyc,
wb_ctrl_stb => wb_ctrl_stb,
wb_ctrl_ack => wb_ctrl_ack,
wb_ctrl_we => wb_ctrl_we,
wb_ctrl_cti => "000",
wb_ctrl_bte => "00",
wb_ctrl_err => open,
user_port_native_0_cmd_valid => user_port0_cmd_valid,
user_port_native_0_cmd_ready => user_port0_cmd_ready,
user_port_native_0_cmd_we => user_port0_cmd_we,
user_port_native_0_cmd_addr => user_port0_cmd_addr,
user_port_native_0_wdata_valid => user_port0_wdata_valid,
user_port_native_0_wdata_ready => user_port0_wdata_ready,
user_port_native_0_wdata_we => user_port0_wdata_we,
user_port_native_0_wdata_data => user_port0_wdata_data,
user_port_native_0_rdata_valid => user_port0_rdata_valid,
user_port_native_0_rdata_ready => user_port0_rdata_ready,
user_port_native_0_rdata_data => user_port0_rdata_data
);
end architecture behaviour;