mirror of
https://github.com/antonblanchard/microwatt.git
synced 2026-01-11 23:43:15 +00:00
This makes the 64-bit wishbone buses have the address expressed in units of doublewords (64 bits), and similarly for the 32-bit buses the address is in units of words (32 bits). This is to comply with the wishbone spec. Previously the addresses on the wishbone buses were in units of bytes regardless of the bus data width, which is not correct and caused problems with interfacing with externally-generated logic. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
1176 lines
48 KiB
VHDL
1176 lines
48 KiB
VHDL
library ieee;
|
|
use ieee.std_logic_1164.all;
|
|
use ieee.numeric_std.all;
|
|
use std.textio.all;
|
|
|
|
library work;
|
|
use work.wishbone_types.all;
|
|
use work.utils.all;
|
|
use work.helpers.all;
|
|
|
|
-- Wrapper around the litedram_core component: exposes a wishbone data
-- port (wb_in/wb_out), a wishbone control port (wb_ctrl_*) and the DRAM
-- pins. The generics size the DRAM interface, the init payload and the
-- L2 cache.
entity litedram_wrapper is
    generic (
        -- Width of the LiteDRAM native port command address
        DRAM_ABITS      : positive;
        -- Number of DRAM address lines (width of ddram_a)
        DRAM_ALINES     : natural;
        -- Number of DRAM data lines (width of ddram_dq)
        DRAM_DLINES     : natural;
        -- Width in bits of the LiteDRAM native data port
        DRAM_PORT_WIDTH : positive;

        -- Pseudo-ROM payload (passed on to the init memory)
        PAYLOAD_SIZE    : natural;
        PAYLOAD_FILE    : string;

        -- L2 cache --

        -- Line size in bytes
        LINE_SIZE       : positive := 128;
        -- Number of lines in a set
        NUM_LINES       : positive := 64;
        -- Number of ways
        NUM_WAYS        : positive := 4;
        -- Max number of stores in the queue
        STOREQ_DEPTH    : positive := 8;
        -- Don't send loads until all pending stores acked in litedram
        NO_LS_OVERLAP   : boolean  := false;

        -- Debug
        LITEDRAM_TRACE  : boolean  := false;
        TRACE           : boolean  := false
        );
    port(
        -- LiteDRAM generates the system clock and reset
        -- from the input clk_in
        clk_in          : in  std_ulogic;
        rst             : in  std_ulogic;
        system_clk      : out std_ulogic;
        system_reset    : out std_ulogic;
        core_alt_reset  : out std_ulogic;
        pll_locked      : out std_ulogic;

        -- Wishbone ports:
        wb_in           : in  wishbone_master_out;
        wb_out          : out wishbone_slave_out;
        wb_ctrl_in      : in  wb_io_master_out;
        wb_ctrl_out     : out wb_io_slave_out;
        wb_ctrl_is_csr  : in  std_ulogic;
        wb_ctrl_is_init : in  std_ulogic;

        -- Misc
        init_done       : out std_ulogic;
        init_error      : out std_ulogic;

        -- DRAM wires
        ddram_a         : out   std_ulogic_vector(DRAM_ALINES-1 downto 0);
        ddram_ba        : out   std_ulogic_vector(2 downto 0);
        ddram_ras_n     : out   std_ulogic;
        ddram_cas_n     : out   std_ulogic;
        ddram_we_n      : out   std_ulogic;
        ddram_cs_n      : out   std_ulogic;
        ddram_dm        : out   std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dq        : inout std_ulogic_vector(DRAM_DLINES-1 downto 0);
        ddram_dqs_p     : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dqs_n     : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_clk_p     : out   std_ulogic;
        ddram_clk_n     : out   std_ulogic;
        ddram_cke       : out   std_ulogic;
        ddram_odt       : out   std_ulogic;
        ddram_reset_n   : out   std_ulogic
        );
end entity litedram_wrapper;
|
|
|
|
architecture behaviour of litedram_wrapper is
|
|
|
|
-- Component declaration for the LiteDRAM controller core.
component litedram_core
    port (
        -- Clocking, reset and PLL status
        clk            : in  std_ulogic;
        rst            : in  std_ulogic;
        pll_locked     : out std_ulogic;

        -- DRAM pins
        ddram_a        : out   std_ulogic_vector(DRAM_ALINES-1 downto 0);
        ddram_ba       : out   std_ulogic_vector(2 downto 0);
        ddram_ras_n    : out   std_ulogic;
        ddram_cas_n    : out   std_ulogic;
        ddram_we_n     : out   std_ulogic;
        ddram_cs_n     : out   std_ulogic;
        ddram_dm       : out   std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dq       : inout std_ulogic_vector(DRAM_DLINES-1 downto 0);
        ddram_dqs_p    : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dqs_n    : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_clk_p    : out   std_ulogic;
        ddram_clk_n    : out   std_ulogic;
        ddram_cke      : out   std_ulogic;
        ddram_odt      : out   std_ulogic;
        ddram_reset_n  : out   std_ulogic;

        -- Initialisation status and user-side clock/reset
        init_done      : out std_ulogic;
        init_error     : out std_ulogic;
        user_clk       : out std_ulogic;
        user_rst       : out std_ulogic;

        -- Control wishbone (32-bit data)
        wb_ctrl_adr    : in  std_ulogic_vector(29 downto 0);
        wb_ctrl_dat_w  : in  std_ulogic_vector(31 downto 0);
        wb_ctrl_dat_r  : out std_ulogic_vector(31 downto 0);
        wb_ctrl_sel    : in  std_ulogic_vector(3 downto 0);
        wb_ctrl_cyc    : in  std_ulogic;
        wb_ctrl_stb    : in  std_ulogic;
        wb_ctrl_ack    : out std_ulogic;
        wb_ctrl_we     : in  std_ulogic;
        wb_ctrl_cti    : in  std_ulogic_vector(2 downto 0);
        wb_ctrl_bte    : in  std_ulogic_vector(1 downto 0);
        wb_ctrl_err    : out std_ulogic;

        -- Native user port 0: command channel
        user_port_native_0_cmd_valid   : in  std_ulogic;
        user_port_native_0_cmd_ready   : out std_ulogic;
        user_port_native_0_cmd_we      : in  std_ulogic;
        user_port_native_0_cmd_addr    : in  std_ulogic_vector(DRAM_ABITS-1 downto 0);

        -- Native user port 0: write data channel
        user_port_native_0_wdata_valid : in  std_ulogic;
        user_port_native_0_wdata_ready : out std_ulogic;
        user_port_native_0_wdata_we    : in  std_ulogic_vector(DRAM_PORT_WIDTH/8-1 downto 0);
        user_port_native_0_wdata_data  : in  std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0);

        -- Native user port 0: read data channel
        user_port_native_0_rdata_valid : out std_ulogic;
        user_port_native_0_rdata_ready : in  std_ulogic;
        user_port_native_0_rdata_data  : out std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0)
        );
end component;
|
|
|
|
signal user_port0_cmd_valid : std_ulogic;
|
|
signal user_port0_cmd_ready : std_ulogic;
|
|
signal user_port0_cmd_we : std_ulogic;
|
|
signal user_port0_cmd_addr : std_ulogic_vector(DRAM_ABITS-1 downto 0);
|
|
signal user_port0_wdata_valid : std_ulogic;
|
|
signal user_port0_wdata_ready : std_ulogic;
|
|
signal user_port0_wdata_we : std_ulogic_vector(DRAM_PORT_WIDTH/8-1 downto 0);
|
|
signal user_port0_wdata_data : std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0);
|
|
signal user_port0_rdata_valid : std_ulogic;
|
|
signal user_port0_rdata_ready : std_ulogic;
|
|
signal user_port0_rdata_data : std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0);
|
|
|
|
signal wb_ctrl_adr : std_ulogic_vector(29 downto 0);
|
|
signal wb_ctrl_dat_w : std_ulogic_vector(31 downto 0);
|
|
signal wb_ctrl_dat_r : std_ulogic_vector(31 downto 0);
|
|
signal wb_ctrl_sel : std_ulogic_vector(3 downto 0);
|
|
signal wb_ctrl_cyc : std_ulogic := '0';
|
|
signal wb_ctrl_stb : std_ulogic;
|
|
signal wb_ctrl_ack : std_ulogic;
|
|
signal wb_ctrl_we : std_ulogic;
|
|
|
|
signal wb_init_in : wb_io_master_out;
|
|
signal wb_init_out : wb_io_slave_out;
|
|
|
|
-- DRAM data port width
|
|
constant DRAM_DBITS : natural := DRAM_PORT_WIDTH;
|
|
-- DRAM data port sel bits
|
|
constant DRAM_SBITS : natural := (DRAM_DBITS / 8);
|
|
|
|
-- WB geometry (just a few shortcuts)
|
|
constant WBL : positive := wb_in.dat'length;
|
|
constant WBSL : positive := wb_in.sel'length;
|
|
|
|
-- Select a WB word inside DRAM port width
|
|
constant WB_WORD_COUNT : positive := DRAM_DBITS/WBL;
|
|
constant WB_WSEL_BITS : positive := log2(WB_WORD_COUNT);
|
|
|
|
-- BRAM organisation: We never access more than wishbone_data_bits at
|
|
-- a time so to save resources we make the array only that wide, and
|
|
-- use consecutive indices to make a cache "line"
|
|
--
|
|
-- ROW_SIZE is the width in bytes of the BRAM, ie, litedram port width
|
|
constant ROW_SIZE : natural := DRAM_DBITS / 8;
|
|
-- ROW_PER_LINE is the number of row (litedram transactions) in a line
|
|
constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
|
|
-- BRAM_ROWS is the number of rows in BRAM needed to represent the full
|
|
-- dcache
|
|
constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
|
|
|
|
-- Bit fields counts in the address
|
|
|
|
-- ROW_BITS is the number of bits to select a row
|
|
constant ROW_BITS : natural := log2(BRAM_ROWS);
|
|
-- ROW_LINEBITS is the number of bits to select a row within a line
|
|
constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
|
|
-- LINE_OFF_BITS is the number of bits for the offset in a cache line
|
|
constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
|
|
-- ROW_OFF_BITS is the number of bits for the offset in a row
|
|
constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
|
|
-- REAL_ADDR_BITS is the number of real address bits that we store
|
|
constant REAL_ADDR_BITS : positive := DRAM_ABITS + ROW_OFF_BITS;
|
|
-- INDEX_BITS is the number of bits to select a cache line
|
|
constant INDEX_BITS : natural := log2(NUM_LINES);
|
|
-- SET_SIZE_BITS is the log base 2 of the set size
|
|
constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
|
|
-- TAG_BITS is the number of bits of the tag part of the address
|
|
constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
|
|
-- WAY_BITS is the number of bits to select a way
|
|
constant WAY_BITS : natural := log2(NUM_WAYS);
|
|
|
|
subtype row_t is integer range 0 to BRAM_ROWS-1;
|
|
subtype index_t is integer range 0 to NUM_LINES-1;
|
|
subtype way_t is integer range 0 to NUM_WAYS-1;
|
|
subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
|
|
|
|
-- The cache data BRAM organized as described above for each way
|
|
subtype cache_row_t is std_ulogic_vector(DRAM_DBITS-1 downto 0);
|
|
|
|
-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
|
|
-- not handle a clean (commented) definition of the cache tags as a 3d
|
|
-- memory. For now, work around it by putting all the tags of a set
-- side by side in a single wide vector (one TAG_BITS slice per way).
|
|
subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
|
|
-- type cache_tags_set_t is array(way_t) of cache_tag_t;
|
|
-- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
|
|
constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
|
|
subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
|
|
type cache_tags_array_t is array(index_t) of cache_tags_set_t;
|
|
|
|
-- The cache valid bits
|
|
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
|
|
type cache_valids_t is array(index_t) of cache_way_valids_t;
|
|
|
|
-- "Temporary" valid bits for the rows of the currently refilled line
|
|
type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
|
|
|
|
-- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
|
|
signal cache_tags : cache_tags_array_t;
|
|
signal cache_valids : cache_valids_t;
|
|
|
|
attribute ram_style : string;
|
|
attribute ram_style of cache_tags : signal is "distributed";
|
|
|
|
--
|
|
-- Store queue signals
|
|
--
|
|
-- We store a single wishbone dword per entry (64-bit)
|
|
-- along with the wishbone sel bits and the necessary address
|
|
-- bits to select which part of DRAM port to write to.
|
|
constant STOREQ_BITS : positive := WBL + WBSL + WB_WSEL_BITS;
|
|
|
|
signal storeq_rd_ready : std_ulogic;
|
|
signal storeq_rd_valid : std_ulogic;
|
|
signal storeq_rd_data : std_ulogic_vector(STOREQ_BITS-1 downto 0);
|
|
signal storeq_wr_ready : std_ulogic;
|
|
signal storeq_wr_valid : std_ulogic;
|
|
signal storeq_wr_data : std_ulogic_vector(STOREQ_BITS-1 downto 0);
|
|
|
|
--
|
|
-- Cache management signals
|
|
--
|
|
|
|
-- Cache state machine
|
|
type state_t is (IDLE, -- Normal load hit processing
|
|
REFILL_CLR_TAG, -- Cache refill clear tag
|
|
REFILL_WAIT_ACK); -- Cache refill wait ack
|
|
signal state : state_t;
|
|
|
|
-- Latched WB request
|
|
signal wb_req : wishbone_master_out := wishbone_master_out_init;
|
|
-- Stashed WB request
|
|
signal wb_stash : wishbone_master_out := wishbone_master_out_init;
|
|
|
|
-- Read pipeline (to handle cache RAM latency)
|
|
signal read_ack_0 : std_ulogic := '0';
|
|
signal read_ack_1 : std_ulogic := '0';
|
|
signal read_wsl_0 : std_ulogic_vector(WB_WSEL_BITS-1 downto 0) := (others => '0');
|
|
signal read_wsl_1 : std_ulogic_vector(WB_WSEL_BITS-1 downto 0) := (others => '0');
|
|
signal read_way_0 : way_t;
|
|
signal read_way_1 : way_t;
|
|
|
|
-- Store ack pipeline
|
|
signal store_ack_0 : std_ulogic := '0';
|
|
signal store_ack_1 : std_ulogic := '0';
|
|
|
|
-- Async signals decoding latched request
|
|
type req_op_t is (OP_NONE,
|
|
OP_LOAD_HIT,
|
|
OP_LOAD_MISS,
|
|
OP_STORE_HIT,
|
|
OP_STORE_MISS,
|
|
OP_STORE_DELAYED);
|
|
|
|
signal req_index : index_t;
|
|
signal req_row : row_t;
|
|
signal req_hit_way : way_t;
|
|
signal req_tag : cache_tag_t;
|
|
signal req_op : req_op_t;
|
|
signal req_laddr : std_ulogic_vector(REAL_ADDR_BITS-1 downto 0);
|
|
signal req_wsl : std_ulogic_vector(WB_WSEL_BITS-1 downto 0);
|
|
signal req_we : std_ulogic_vector(DRAM_SBITS-1 downto 0);
|
|
signal req_wdata : std_ulogic_vector(DRAM_DBITS-1 downto 0);
|
|
signal stall : std_ulogic;
|
|
|
|
-- Line refill command signals and latches
|
|
signal refill_cmd_valid : std_ulogic;
|
|
signal refill_cmd_addr : std_ulogic_vector(DRAM_ABITS-1 downto 0);
|
|
signal refill_way : way_t;
|
|
signal refill_index : index_t;
|
|
signal refill_row : row_t;
|
|
signal refill_end_row : row_in_line_t;
|
|
signal refill_rows_vlid : row_per_line_valid_t;
|
|
|
|
-- Cache RAM interface
|
|
type cache_ram_out_t is array(way_t) of cache_row_t;
|
|
signal cache_out : cache_ram_out_t;
|
|
|
|
-- PLRU output interface
|
|
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
|
|
signal plru_victim : plru_out_t;
|
|
|
|
--
|
|
-- Helper functions to decode incoming requests
|
|
--
|
|
|
|
-- Build the DRAM real address for a wishbone address: the wishbone
-- address bits are placed above wishbone_log2_width zero bits.
function get_real_addr(addr: wishbone_addr_type) return std_ulogic_vector is
    -- Number of wishbone address bits that fit in the real address
    constant WB_BITS   : integer := REAL_ADDR_BITS - wishbone_log2_width;
    variable real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
begin
    real_addr := (others => '0');
    real_addr(REAL_ADDR_BITS - 1 downto wishbone_log2_width) := addr(WB_BITS - 1 downto 0);
    return real_addr;
end;
|
|
|
|
-- Extract the cache line index (which is also the tag row index) from
-- a wishbone address.
function get_index(addr: wishbone_addr_type) return index_t is
    -- Position of the index field within the wishbone address
    constant IDX_HI : integer := SET_SIZE_BITS - wishbone_log2_width - 1;
    constant IDX_LO : integer := LINE_OFF_BITS - wishbone_log2_width;
    variable idx    : unsigned(IDX_HI downto IDX_LO);
begin
    idx := unsigned(addr(IDX_HI downto IDX_LO));
    return to_integer(idx);
end;
|
|
|
|
-- Extract the cache row index (data memory address) from a real address:
-- the bits between the row offset and the top of the set-size field.
function get_row(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto 0)) return row_t is
    variable row_field : unsigned(SET_SIZE_BITS - 1 downto ROW_OFF_BITS);
begin
    row_field := unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS));
    return to_integer(row_field);
end;
|
|
|
|
-- Return the position of a row inside its cache line, i.e. the low
-- ROW_LINEBITS bits of the row number.
function get_row_of_line(row: row_t) return row_in_line_t is
    variable full_row : unsigned(ROW_BITS-1 downto 0) := to_unsigned(row, ROW_BITS);
begin
    return full_row(ROW_LINEBITS-1 downto 0);
end;
|
|
-- Tell whether a DRAM address points at the row designated by "last"
-- within its cache line. Takes a DRAM (row-granular) address.
function is_last_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS);
                          last: row_in_line_t)
    return boolean is
    variable row_in_line : unsigned(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
begin
    row_in_line := unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS));
    return row_in_line = last;
end;
|
|
|
|
-- Tell whether a cache row number is the row designated by "last"
-- within its cache line.
function is_last_row(row: row_t; last: row_in_line_t) return boolean is
    variable row_in_line : row_in_line_t;
begin
    row_in_line := get_row_of_line(row);
    return row_in_line = last;
end;
|
|
|
|
-- Return the DRAM address of the next row in the current cache line.
-- Only the row-in-line field is incremented (and wraps around), so the
-- adder is just ROW_LINEBITS wide. Takes a DRAM address.
function next_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS))
    return std_ulogic_vector is
    variable in_line   : unsigned(ROW_LINEBITS-1 downto 0);
    variable next_addr : std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS);
begin
    -- Increment the row-in-line field on its own
    in_line := unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) + 1;

    -- Everything else in the address is passed through untouched
    next_addr := addr;
    next_addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := std_ulogic_vector(in_line);
    return next_addr;
end;
|
|
|
|
-- Return the next row in the current cache line. Only the low
-- ROW_LINEBITS bits of the row number are incremented (wrapping within
-- the line), which keeps the generated adder limited to the bits within
-- a cache line (3 bits with default settings).
--
function next_row(row: row_t) return row_t is
    variable row_v : unsigned(ROW_BITS-1 downto 0);
begin
    row_v := to_unsigned(row, ROW_BITS);
    -- Upper (line index) bits are left alone; the low field wraps
    row_v(ROW_LINEBITS-1 downto 0) := row_v(ROW_LINEBITS-1 downto 0) + 1;
    return to_integer(row_v);
end;
|
|
|
|
-- Extract the tag field from a wishbone address: the bits above the
-- set-size field, up to the top of the real address.
function get_tag(addr: wishbone_addr_type) return cache_tag_t is
    constant TAG_HI : integer := REAL_ADDR_BITS - wishbone_log2_width - 1;
    constant TAG_LO : integer := SET_SIZE_BITS - wishbone_log2_width;
    variable tag    : cache_tag_t;
begin
    tag := addr(TAG_HI downto TAG_LO);
    return tag;
end;
|
|
|
|
-- Pick the tag of one way out of a tag memory row. Way N occupies bits
-- (N+1)*TAG_BITS-1 downto N*TAG_BITS of the row.
function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
    constant LO : natural := way * TAG_BITS;
begin
    return tagset(LO + TAG_BITS - 1 downto LO);
end;
|
|
|
|
-- Overwrite the tag of one way inside a tag memory row, leaving the
-- tags of the other ways unchanged.
procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
                    tag: cache_tag_t) is
    constant LO : natural := way * TAG_BITS;
begin
    tagset(LO + TAG_BITS - 1 downto LO) := tag;
end;
|
|
|
|
begin
|
|
|
|
-- Sanity checks
|
|
assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
|
|
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
|
|
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
|
|
assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
|
|
assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
|
|
report "geometry bits don't add up" severity FAILURE;
|
|
assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
|
|
report "geometry bits don't add up" severity FAILURE;
|
|
assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
|
|
report "geometry bits don't add up" severity FAILURE;
|
|
assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
|
|
report "geometry bits don't add up" severity FAILURE;
|
|
|
|
-- alternate core reset address set when DRAM is not initialized.
|
|
core_alt_reset <= not init_done;
|
|
|
|
-- Init code BRAM memory slave
|
|
init_ram_0: entity work.dram_init_mem
|
|
generic map(
|
|
EXTRA_PAYLOAD_FILE => PAYLOAD_FILE,
|
|
EXTRA_PAYLOAD_SIZE => PAYLOAD_SIZE
|
|
)
|
|
port map(
|
|
clk => system_clk,
|
|
wb_in => wb_init_in,
|
|
wb_out => wb_init_out
|
|
);
|
|
|
|
--
|
|
-- Control bus wishbone: This muxes the wishbone to the CSRs
|
|
-- and an internal small one to the init BRAM
|
|
--
|
|
|
|
-- Init DRAM wishbone IN signals
|
|
wb_init_in.adr <= wb_ctrl_in.adr;
|
|
wb_init_in.dat <= wb_ctrl_in.dat;
|
|
wb_init_in.sel <= wb_ctrl_in.sel;
|
|
wb_init_in.we <= wb_ctrl_in.we;
|
|
wb_init_in.stb <= wb_ctrl_in.stb;
|
|
wb_init_in.cyc <= wb_ctrl_in.cyc and wb_ctrl_is_init;
|
|
|
|
-- DRAM CSR IN signals. The control wishbone request is registered
-- before being presented to the LiteDRAM CSR bus to help with timing.
csr_latch: process(system_clk)
begin
    if rising_edge(system_clk) then
        if system_reset = '1' then
            wb_ctrl_cyc <= '0';
            wb_ctrl_stb <= '0';
        else
            -- XXX Maybe only update addr when cyc = '1' to save power ?
            wb_ctrl_adr   <= x"0000" & wb_ctrl_in.adr(13 downto 0);
            wb_ctrl_dat_w <= wb_ctrl_in.dat;
            wb_ctrl_sel   <= wb_ctrl_in.sel;
            wb_ctrl_we    <= wb_ctrl_in.we;
            wb_ctrl_cyc   <= wb_ctrl_in.cyc and wb_ctrl_is_csr;

            if wb_ctrl_ack = '1' then
                -- Clear stb on ack otherwise the memory will latch
                -- the write twice which breaks levelling. On the next
                -- cycle we will latch an updated stb that takes the
                -- ack into account.
                wb_ctrl_stb <= '0';
            else
                wb_ctrl_stb <= wb_ctrl_in.stb and wb_ctrl_is_csr;
            end if;
        end if;
    end if;
end process;
|
|
|
|
-- Ctrl bus wishbone OUT signals. XXX Consider adding latch on
|
|
-- CSR response to help timing
|
|
wb_ctrl_out.ack <= wb_ctrl_ack when wb_ctrl_is_csr = '1'
|
|
else wb_init_out.ack;
|
|
wb_ctrl_out.dat <= wb_ctrl_dat_r when wb_ctrl_is_csr = '1'
|
|
else wb_init_out.dat;
|
|
wb_ctrl_out.stall <= wb_init_out.stall when wb_ctrl_is_init else
|
|
'0' when wb_ctrl_in.cyc = '0' else not wb_ctrl_ack;
|
|
|
|
|
|
-- Generate a cache RAM for each way, together with the combinational
-- logic driving its read and write ports.
rams: for i in 0 to NUM_WAYS-1 generate
    signal do_read  : std_ulogic;
    signal do_write : std_ulogic;
    signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal wr_data  : std_ulogic_vector(DRAM_DBITS-1 downto 0);
    signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
    signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
    signal dout     : cache_row_t;
begin
    way: entity work.cache_ram
        generic map (
            ROW_BITS => ROW_BITS,
            WIDTH    => DRAM_DBITS,
            ADD_BUF  => true
            )
        port map (
            clk     => system_clk,
            rd_en   => do_read,
            rd_addr => rd_addr,
            rd_data => dout,
            wr_sel  => wr_sel_m,
            wr_addr => wr_addr,
            wr_data => wr_data
            );

    process(all)
        -- True when the decoded request is a store hitting this way
        variable wb_store : boolean;
    begin
        --
        -- Read port: always enabled, addressed by the requested row
        --
        do_read      <= '1';
        cache_out(i) <= dout;
        rd_addr      <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));

        wb_store := req_op = OP_STORE_HIT and req_hit_way = i;

        --
        -- Write mux: cache refills from DRAM or writes from Wishbone
        --
        if wb_store then
            -- Write from wishbone
            wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
            wr_data <= req_wdata;
            wr_sel  <= req_we;
        else
            -- Refill from DRAM
            wr_addr <= std_ulogic_vector(to_unsigned(refill_row, ROW_BITS));
            wr_data <= user_port0_rdata_data;
            wr_sel  <= (others => '1');
        end if;

        --
        -- Write enable logic: a wishbone store hit on this way, or
        -- refill data arriving for this way
        --
        if wb_store or (user_port0_rdata_valid = '1' and refill_way = i) then
            do_write <= '1';
        else
            do_write <= '0';
        end if;

        -- Mask write selects with do_write since BRAM doesn't always
        -- have a global write-enable (Vivado generates TDP instead
        -- of SDP when using one, thus doubling cache BRAM usage).
        for b in 0 to ROW_SIZE-1 loop
            wr_sel_m(b) <= wr_sel(b) and do_write;
        end loop;

        if TRACE and rising_edge(system_clk) then
            if do_write = '1' then
                report "cache write way:" & integer'image(i) &
                    " addr:" & to_hstring(wr_addr) &
                    " sel:" & to_hstring(wr_sel_m) &
                    " data:" & to_hstring(wr_data);
            end if;
        end if;
    end process;
end generate;
|
|
|
|
-- Generate PLRUs: one instance per cache line index, only when there is
-- more than one way to choose from.
maybe_plrus: if NUM_WAYS > 1 generate
begin
    plrus: for i in 0 to NUM_LINES-1 generate
        -- PLRU interface
        signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
        signal plru_acc_en : std_ulogic;
        signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
    begin
        plru : entity work.plru
            generic map (
                BITS => WAY_BITS
                )
            port map (
                clk    => system_clk,
                rst    => system_reset,
                acc    => plru_acc,
                acc_en => plru_acc_en,
                lru    => plru_out
                );

        process(req_index, req_op, req_hit_way, plru_out)
        begin
            -- Record an access only for hits (load or store) on this index
            plru_acc_en <= '0';
            if req_index = i and
                (req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT) then
                plru_acc_en <= '1';
            end if;

            -- The accessed way is the hit way; export the current victim
            plru_acc       <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
            plru_victim(i) <= plru_out;
        end process;
    end generate;
end generate;
|
|
|
|
--
-- Wishbone request interface:
--
-- - Incoming wishbone request latch (to help with timing)
-- - Read response pipeline (to match BRAM output buffer delay)
-- - Stall generation
--
-- XXX TODO: Properly handle cyc drops before all acks are sent...
--
request_latch: process(system_clk)
begin
    if rising_edge(system_clk) then

        -- Stash buffer: a request arriving while we are stalled and
        -- the stash is free gets parked in the stash. This will
        -- generate a WB stall on the next cycle.
        if stall = '1' and wb_out.stall = '0' and wb_in.cyc = '1' and wb_in.stb = '1' then
            wb_stash <= wb_in;
            if TRACE then
                report "stashed wb req ! addr:" & to_hstring(wb_in.adr & "000") &
                    " we:" & std_ulogic'image(wb_in.we) &
                    " sel:" & to_hstring(wb_in.sel);
            end if;
        end if;

        -- Not stalled: load the request latch, stash first
        if stall = '0' then
            if wb_stash.cyc = '1' then
                -- Something in stash ! use it and clear stash
                wb_req       <= wb_stash;
                wb_stash.cyc <= '0';
                if TRACE then
                    report "unstashed wb req ! addr:" & to_hstring(wb_stash.adr & "000") &
                        " we:" & std_ulogic'image(wb_stash.we) &
                        " sel:" & to_hstring(wb_stash.sel);
                end if;
            elsif wb_in.cyc = '1' then
                -- Grab request from WB
                wb_req <= wb_in;
                if TRACE and wb_in.stb = '1' then
                    report "latch new wb req ! addr:" & to_hstring(wb_in.adr & "000") &
                        " we:" & std_ulogic'image(wb_in.we) &
                        " sel:" & to_hstring(wb_in.sel);
                end if;
            else
                -- No cycle in progress: only track cyc/stb, keep the rest
                wb_req.cyc <= wb_in.cyc;
                wb_req.stb <= wb_in.stb;
            end if;
        end if;
    end if;
end process;
|
|
|
|
-- Stall when stash is full
|
|
wb_out.stall <= wb_stash.cyc;
|
|
|
|
--
-- Read response pipeline: two register stages carrying the load-hit
-- ack, the word select and the hit way alongside the cache RAM read.
--
read_pipe: process(system_clk)
begin
    if rising_edge(system_clk) then
        -- Stage 0: capture the decoded request
        if req_op = OP_LOAD_HIT then
            read_ack_0 <= '1';
        else
            read_ack_0 <= '0';
        end if;
        read_wsl_0 <= req_wsl;
        read_way_0 <= req_hit_way;

        -- Stage 1: one more cycle of delay
        read_ack_1 <= read_ack_0;
        read_wsl_1 <= read_wsl_0;
        read_way_1 <= read_way_0;

        if TRACE then
            case req_op is
                when OP_LOAD_HIT =>
                    report "Load hit addr:" & to_hstring(wb_req.adr & "000") &
                        " idx:" & integer'image(req_index) &
                        " tag:" & to_hstring(req_tag) &
                        " way:" & integer'image(req_hit_way);
                when OP_LOAD_MISS =>
                    report "Load miss addr:" & to_hstring(wb_req.adr & "000");
                when others =>
                    null;
            end case;
            if read_ack_0 = '1' then
                report "read data:" & to_hstring(cache_out(read_way_0));
            end if;
        end if;
    end if;
end process;
|
|
|
|
--
-- Store acks pipeline: store_ack_1 is store_ack_0 delayed by one
-- system_clk cycle.
--
store_ack_pipe: process(system_clk)
begin
    if rising_edge(system_clk) then
        store_ack_1 <= store_ack_0;
    end if;
end process;
|
|
|
|
--
-- Wishbone response generation: stall, read data mux, store completion
-- and ack.
--
wb_rseponse: process(all)
    variable rdata        : std_ulogic_vector(DRAM_DBITS-1 downto 0);
    variable store_done   : std_ulogic;
    variable accept_store : std_ulogic;
    variable wsel         : natural range 0 to WB_WORD_COUNT-1;
begin
    -- Can we accept a store ? This is set when the store queue & command
    -- queue are not full.
    --
    -- This does *not* mean that we will accept the store, there are other
    -- reasons to delay them (see OP_STORE_DELAYED).
    --
    -- A store is fully accepted when *both* req_op is not OP_STORE_DELAYED
    -- and accept_store is '1'.
    --
    -- The reason for this split is to avoid a circular dependency inside
    -- LiteDRAM, since cmd_ready from litedram is driven from cmd_valid (*)
    -- we don't want to generate cmd_valid from cmd_ready. So we generate
    -- it instead from all the *other* conditions that make a store valid.
    --
    -- (*) It's my understanding that user_port0_cmd_ready from LiteDRAM is
    -- combinational from user_port0_cmd_valid along with a bunch of other
    -- internal signals. IE. we won't know that LiteDRAM cannot accept a
    -- command until we try to send one.
    --
    accept_store := user_port0_cmd_ready and storeq_wr_ready;

    -- Stall and early store completion, per decoded operation:
    --  * stores stall until they can be accepted, and complete on the
    --    wishbone as soon as they are;
    --  * load misses and delayed stores always stall;
    --  * anything else (load hit, no request) doesn't stall.
    case req_op is
        when OP_STORE_HIT | OP_STORE_MISS =>
            stall      <= not accept_store;
            store_done := accept_store;
        when OP_LOAD_MISS | OP_STORE_DELAYED =>
            stall      <= '1';
            store_done := '0';
        when others =>
            stall      <= '0';
            store_done := '0';
    end case;

    -- Data out mux: row of the way that hit, then the wishbone word
    -- inside that row selected by the pipelined word select
    rdata := cache_out(read_way_1);
    wsel  := to_integer(unsigned(read_wsl_1));
    wb_out.dat <= rdata((wsel+1)*WBL-1 downto wsel*WBL);

    -- Pipeline store acks
    store_ack_0 <= store_done;

    -- Generate Wishbone ACKs on read hits and store complete
    --
    -- This can happen on store right behind loads ! This is why
    -- we delay a store when a load ack is in the pipeline in the
    -- request decoder below.
    --
    wb_out.ack <= read_ack_1 or store_ack_1;
    assert read_ack_1 = '0' or store_ack_1 = '0' report
        "Read ack and store ack collision !"
        severity failure;
end process;
|
|
|
|
--
-- Cache request decode
--
-- Combinationally decodes the latched wishbone request (wb_req) into
-- the cache index, row and tag, the write enables and write data laid
-- out across the DRAM port width, the hit way and the operation to
-- perform (req_op).
--
request_decode: process(all)
    -- All-zero byte enables for one wishbone word, sized from the
    -- wishbone sel width rather than hard-coded to 8 bits
    constant NO_SEL      : std_ulogic_vector(WBSL-1 downto 0) := (others => '0');
    variable valid       : boolean;
    variable is_hit      : boolean;
    variable store_delay : boolean;
    variable hit_way     : way_t;
begin
    -- Extract line, row and tag from request
    req_index <= get_index(wb_req.adr);
    req_row   <= get_row(get_real_addr(wb_req.adr));
    req_tag   <= get_tag(wb_req.adr);

    -- Real address of the request, will be used for cache miss
    -- processing if needed
    req_laddr <= get_real_addr(wb_req.adr);

    -- Do we have a valid request in the WB latch ?
    valid := wb_req.cyc = '1' and wb_req.stb = '1';

    -- Store signals: the low wishbone address bits select which
    -- wishbone word of the DRAM port gets the byte enables; the data
    -- is replicated on every word.
    req_wsl <= wb_req.adr(WB_WSEL_BITS-1 downto 0);
    for i in 0 to WB_WORD_COUNT-1 loop
        if to_integer(unsigned(req_wsl)) = i then
            req_we(WBSL*(i+1)-1 downto WBSL*i) <= wb_req.sel;
        else
            req_we(WBSL*(i+1)-1 downto WBSL*i) <= NO_SEL;
        end if;
        req_wdata(WBL*(i+1)-1 downto WBL*i) <= wb_req.dat;
    end loop;

    -- Test if pending request is a hit on any way. A way is a
    -- candidate if it is valid, or if it is the way being refilled
    -- and the requested row of the line has already been received.
    hit_way := 0;
    is_hit  := false;
    for i in way_t loop
        if valid and
            (cache_valids(req_index)(i) = '1' or
             (state = REFILL_WAIT_ACK and
              req_index = refill_index and i = refill_way and
              refill_rows_vlid(req_row mod ROW_PER_LINE) = '1')) then
            if read_tag(i, cache_tags(req_index)) = req_tag then
                hit_way := i;
                is_hit  := true;
            end if;
        end if;
    end loop;

    -- We need to delay stores under some circumstances to avoid
    -- collisions with the refill machine.
    --
    -- Corner case !!! The read acks pipeline takes two extra cycles
    -- which means a store ack can collide with a previous load hit
    -- ack. Thus we stall stores if we have a load ack pending.
    --
    if read_ack_0 = '1' or read_ack_1 = '1' then
        -- Clash with pending read acks, delay..
        store_delay := true;
    elsif state /= IDLE then
        -- If the reload machine is active, we cannot accept a store
        -- for now.
        --
        -- We could improve this a bit by allowing stores if we have sent
        -- all the requests down to litedram (we are only waiting for the
        -- responses) *and* either of those conditions is true:
        --
        --  * It's a miss (doesn't require a write to BRAM) and isn't
        --    for the line being reloaded (otherwise we might reload
        --    stale data into the cache).
        --  * It's a hit on a different way than the one being reloaded
        --    in which case there is no conflict for BRAM access.
        --
        -- Otherwise we delay it...
        --
        store_delay := true;
    else
        store_delay := false;
    end if;

    -- Generate the req op. Loads are decoded as hit or miss whatever
    -- the state (a load can hit a row of the line being refilled, see
    -- the hit test above). Stores are only decoded as hit or miss when
    -- store_delay is false, i.e. in the IDLE state with no read ack
    -- pending; otherwise they become OP_STORE_DELAYED.
    --
    req_op <= OP_NONE;
    if valid then
        if wb_req.we = '1' then
            if store_delay then
                req_op <= OP_STORE_DELAYED;
            elsif is_hit then
                req_op <= OP_STORE_HIT;
            else
                req_op <= OP_STORE_MISS;
            end if;
        else
            if is_hit then
                req_op <= OP_LOAD_HIT;
            else
                req_op <= OP_LOAD_MISS;
            end if;
        end if;
    end if;
    req_hit_way <= hit_way;
end process;
|
|
|
|
--
|
|
-- Store queue
|
|
--
|
|
-- Queue up to STOREQ_DEPTH stores (generic, default 8)
|
|
store_queue: entity work.sync_fifo
|
|
generic map (
|
|
DEPTH => STOREQ_DEPTH,
|
|
WIDTH => STOREQ_BITS
|
|
)
|
|
port map (
|
|
clk => system_clk,
|
|
reset => system_reset,
|
|
rd_ready => storeq_rd_ready,
|
|
rd_valid => storeq_rd_valid,
|
|
rd_data => storeq_rd_data,
|
|
wr_ready => storeq_wr_ready,
|
|
wr_valid => storeq_wr_valid,
|
|
wr_data => storeq_wr_data
|
|
);
|
|
|
|
storeq_control : process(all)
    -- Fields unpacked from the entry at the head of the store queue
    variable stq_data : wishbone_data_type;
    variable stq_sel  : wishbone_sel_type;
    variable stq_wsl  : std_ulogic_vector(WB_WSEL_BITS-1 downto 0);
begin
    -- Store queue control
    --
    -- Packs incoming wishbone stores into store queue entries, and
    -- unpacks the entry at the head of the queue onto the LiteDRAM
    -- write data channel.
    --
    -- Entry layout, MSB to LSB:
    --   wishbone data | wishbone byte selects | word select (low address bits)
    storeq_wr_data <= wb_req.dat & wb_req.sel &
                      wb_req.adr(WB_WSEL_BITS-1 downto 0);

    -- Only queue stores if we can also send a command
    if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
        storeq_wr_valid <= user_port0_cmd_ready;
    else
        storeq_wr_valid <= '0';
    end if;

    -- Store signals (hard wired for 64-bit wishbone at the moment)
    stq_data := storeq_rd_data(storeq_rd_data'left downto WBSL+WB_WSEL_BITS);
    stq_sel  := storeq_rd_data(WBSL+WB_WSEL_BITS-1 downto WB_WSEL_BITS);
    stq_wsl  := storeq_rd_data(WB_WSEL_BITS-1 downto 0);

    -- The store data is replicated into every wishbone-sized slot of the
    -- DRAM port word; only the slot picked by the word select gets its
    -- byte enables, all other slots are masked off.
    for i in 0 to WB_WORD_COUNT-1 loop
        if to_integer(unsigned(stq_wsl)) = i then
            user_port0_wdata_we(WBSL*(i+1)-1 downto WBSL*i) <= stq_sel;
        else
            user_port0_wdata_we(WBSL*(i+1)-1 downto WBSL*i) <= x"00";
        end if;
        user_port0_wdata_data(WBL*(i+1)-1 downto WBL*i) <= stq_data;
    end loop;

    -- Note: Current litedram ignores user_port0_wdata_valid. We
    -- must make sure to always have the data available at the
    -- output of the store queue when we send the write command.
    --
    -- Thankfully this is always the case with this design.
    --
    user_port0_wdata_valid <= storeq_rd_valid;
    storeq_rd_ready <= user_port0_wdata_ready;

    -- Simulation-only tracing, sampled on the rising clock edge
    if TRACE then
        if rising_edge(system_clk) then
            -- Only report when the current request really is a store;
            -- other ops (none, loads, delayed stores) print nothing.
            if req_op = OP_STORE_HIT then
                report "Store hit to:" &
                    to_hstring(wb_req.adr(DRAM_ABITS downto 0) & "000") &
                    " data:" & to_hstring(req_wdata) &
                    " we:" & to_hstring(req_we) &
                    " V:" & std_ulogic'image(user_port0_cmd_ready);
            elsif req_op = OP_STORE_MISS then
                report "Store miss to:" &
                    to_hstring(wb_req.adr(DRAM_ABITS downto 0) & "000") &
                    " data:" & to_hstring(req_wdata) &
                    " we:" & to_hstring(req_we) &
                    " V:" & std_ulogic'image(user_port0_cmd_ready);
            end if;
            if storeq_wr_valid = '1' and storeq_wr_ready = '1' then
                report "storeq push " & to_hstring(storeq_wr_data);
            end if;
            if storeq_rd_valid = '1' and storeq_rd_ready = '1' then
                report "storeq pop " & to_hstring(storeq_rd_data);
            end if;
        end if;
    end if;
end process;
|
|
|
|
-- LiteDRAM command mux
|
|
dram_commands: process(all)
begin
    -- Selects what drives the LiteDRAM command channel: the current
    -- wishbone store, or the refill machine's registered read command.
    case req_op is
        when OP_STORE_HIT | OP_STORE_MISS =>
            -- Stores are forwarded straight from the wishbone request.
            -- The command is only offered while the store queue has
            -- room for the matching data.
            user_port0_cmd_we    <= '1';
            user_port0_cmd_valid <= storeq_wr_ready;
            user_port0_cmd_addr  <=
                wb_req.adr(DRAM_ABITS + ROW_OFF_BITS - wishbone_log2_width - 1 downto
                           ROW_OFF_BITS - wishbone_log2_width);

        when others =>
            -- Anything else: reads come from the address/valid
            -- registers maintained by the refill machine.
            user_port0_cmd_we    <= '0';
            user_port0_cmd_valid <= refill_cmd_valid;
            user_port0_cmd_addr  <= refill_cmd_addr;
    end case;

    -- Note: litedram ignores this signal and assumes we are
    -- always ready to accept read data.
    user_port0_rdata_ready <= '1';
end process;
|
|
|
|
-- LiteDRAM refill machine
|
|
--
|
|
-- This handles the cache line refills
|
|
--
|
|
refill_machine : process(system_clk)
    -- Scratch copy of one tag set, used to update a single way's tag
    variable tagset      : cache_tags_set_t;
    -- True once every read command of the current refill has been accepted
    variable cmds_done   : boolean;
    -- True when a load miss must wait for the store queue to drain
    variable wait_qdrain : boolean;
begin
    -- Cache line refill state machine.
    --
    --   IDLE            : wait for a load miss, latch the victim way,
    --                     index and row counters, and issue the first
    --                     DRAM read command.
    --   REFILL_CLR_TAG  : one cycle later, invalidate the victim way and
    --                     write its new tag, then fall through to the
    --                     same processing as REFILL_WAIT_ACK.
    --   REFILL_WAIT_ACK : keep issuing read commands until the last row
    --                     was accepted, count returning data rows, and
    --                     mark the line valid when the last row arrives.
    if rising_edge(system_clk) then
        -- On reset, clear all valid bits to force misses
        if system_reset = '1' then
            for i in index_t loop
                cache_valids(i) <= (others => '0');
            end loop;
            state <= IDLE;
            refill_cmd_valid <= '0';
        else
            -- Main state machine
            case state is
            when IDLE =>
                assert refill_cmd_valid = '0' report "refill cmd valid in IDLE state !"
                    severity failure;

                -- Reset per-row valid flags, only used in WAIT_ACK
                for i in 0 to ROW_PER_LINE - 1 loop
                    refill_rows_vlid(i) <= '0';
                end loop;

                -- If NO_LS_OVERLAP is set, disallow a load miss if the store
                -- queue still has data in it.
                wait_qdrain := false;
                if NO_LS_OVERLAP then
                    wait_qdrain := storeq_rd_valid = '1';
                end if;

                -- We need to read a cache line
                if req_op = OP_LOAD_MISS and not wait_qdrain then
                    -- Grab way to replace
                    refill_way <= to_integer(unsigned(plru_victim(req_index)));

                    -- Keep track of our index and way for subsequent stores
                    refill_index <= req_index;
                    refill_row <= get_row(req_laddr);
                    refill_end_row <= get_row_of_line(get_row(req_laddr)) - 1;

                    -- Prep for first DRAM read
                    --
                    -- XXX TODO: We could start a cycle early here by using
                    -- combo logic to generate the first command in
                    -- "dram_commands". In fact, we could make refill_cmd_addr
                    -- only contain the "counter" bits and wire it with the
                    -- other bits from req_laddr.
                    refill_cmd_addr <= req_laddr(DRAM_ABITS+ROW_OFF_BITS-1 downto ROW_OFF_BITS);
                    refill_cmd_valid <= '1';

                    if TRACE then
                        report "refill addr " & to_hstring(req_laddr);
                    end if;

                    -- Track that we had one request sent
                    state <= REFILL_CLR_TAG;
                end if;

            when REFILL_CLR_TAG | REFILL_WAIT_ACK =>

                -- Delayed tag clearing to help timing on PLRU output
                if state = REFILL_CLR_TAG then
                    -- Force misses on that way while refilling that line.
                    -- Use the index latched in IDLE so the invalidation,
                    -- the tag write below and the final re-validation all
                    -- address the same line.
                    cache_valids(refill_index)(refill_way) <= '0';

                    -- Store new tag in selected way
                    -- NOTE(review): req_tag is still taken from the live
                    -- request; this presumably relies on the request
                    -- being held stable during the refill - confirm.
                    for i in 0 to NUM_WAYS-1 loop
                        if i = refill_way then
                            tagset := cache_tags(refill_index);
                            write_tag(i, tagset, req_tag);
                            cache_tags(refill_index) <= tagset;
                        end if;
                    end loop;
                    state <= REFILL_WAIT_ACK;
                end if;

                -- Commands are all sent if refill_cmd_valid is 0
                cmds_done := refill_cmd_valid = '0';

                -- If we are still sending requests, was one accepted ?
                if user_port0_cmd_ready = '1' and not cmds_done then
                    -- That was the last word ? We are done sending. Clear
                    -- command valid and set cmds_done so we can handle an
                    -- eventual last ack on the same cycle.
                    --
                    if TRACE then
                        report "got refill cmd ack !";
                    end if;
                    if is_last_row_addr(refill_cmd_addr, refill_end_row) then
                        refill_cmd_valid <= '0';
                        cmds_done := true;
                        if TRACE then
                            report "all refill cmds done !";
                        end if;
                    else
                        -- Calculate the next row address
                        refill_cmd_addr <= next_row_addr(refill_cmd_addr);
                        if TRACE then
                            report "refill addr " &
                                to_hstring(next_row_addr(refill_cmd_addr));
                        end if;
                    end if;
                end if;

                -- Incoming read data processing
                if user_port0_rdata_valid = '1' then
                    if TRACE then
                        report "got refill data ack !";
                    end if;

                    -- Mark partial line valid
                    refill_rows_vlid(refill_row mod ROW_PER_LINE) <= '1';

                    -- Check for completion
                    if cmds_done and is_last_row(refill_row, refill_end_row) then
                        if TRACE then
                            report "all refill data done !";
                        end if;
                        -- Cache line is now valid
                        cache_valids(refill_index)(refill_way) <= '1';
                        -- We are done
                        state <= IDLE;
                    end if;

                    -- Increment store row counter
                    refill_row <= next_row(refill_row);
                end if;
            end case;
        end if;
    end if;
end process;
|
|
|
|
may_trace: if LITEDRAM_TRACE generate
    -- Port-less stub component, only instantiated when the
    -- LITEDRAM_TRACE generic is set.
    component litedram_trace_stub
    end component litedram_trace_stub;
begin
    litedram_trace: component litedram_trace_stub;
end generate may_trace;
|
|
|
|
litedram: component litedram_core
    port map(
        -- Input clock and reset; the core generates the system clock
        -- and reset from these.
        clk        => clk_in,
        rst        => rst,
        pll_locked => pll_locked,
        user_clk   => system_clk,
        user_rst   => system_reset,

        -- Initialisation status
        init_done  => init_done,
        init_error => init_error,

        -- DDR pins
        ddram_a       => ddram_a,
        ddram_ba      => ddram_ba,
        ddram_ras_n   => ddram_ras_n,
        ddram_cas_n   => ddram_cas_n,
        ddram_we_n    => ddram_we_n,
        ddram_cs_n    => ddram_cs_n,
        ddram_dm      => ddram_dm,
        ddram_dq      => ddram_dq,
        ddram_dqs_p   => ddram_dqs_p,
        ddram_dqs_n   => ddram_dqs_n,
        ddram_clk_p   => ddram_clk_p,
        ddram_clk_n   => ddram_clk_n,
        ddram_cke     => ddram_cke,
        ddram_odt     => ddram_odt,
        ddram_reset_n => ddram_reset_n,

        -- Control wishbone; burst tags tied to constants, error unused
        wb_ctrl_adr   => wb_ctrl_adr,
        wb_ctrl_dat_w => wb_ctrl_dat_w,
        wb_ctrl_dat_r => wb_ctrl_dat_r,
        wb_ctrl_sel   => wb_ctrl_sel,
        wb_ctrl_cyc   => wb_ctrl_cyc,
        wb_ctrl_stb   => wb_ctrl_stb,
        wb_ctrl_ack   => wb_ctrl_ack,
        wb_ctrl_we    => wb_ctrl_we,
        wb_ctrl_cti   => "000",
        wb_ctrl_bte   => "00",
        wb_ctrl_err   => open,

        -- Native user port 0: command channel
        user_port_native_0_cmd_valid => user_port0_cmd_valid,
        user_port_native_0_cmd_ready => user_port0_cmd_ready,
        user_port_native_0_cmd_we    => user_port0_cmd_we,
        user_port_native_0_cmd_addr  => user_port0_cmd_addr,

        -- Native user port 0: write data channel
        user_port_native_0_wdata_valid => user_port0_wdata_valid,
        user_port_native_0_wdata_ready => user_port0_wdata_ready,
        user_port_native_0_wdata_we    => user_port0_wdata_we,
        user_port_native_0_wdata_data  => user_port0_wdata_data,

        -- Native user port 0: read data channel
        user_port_native_0_rdata_valid => user_port0_rdata_valid,
        user_port_native_0_rdata_ready => user_port0_rdata_ready,
        user_port_native_0_rdata_data  => user_port0_rdata_data
    );
|
|
|
|
end architecture behaviour;
|