mirror of
https://github.com/mist-devel/mist-board.git
synced 2026-02-07 08:27:07 +00:00
416 lines
11 KiB
Verilog
416 lines
11 KiB
Verilog
// Two-way cache, supporting a 64 meg address space.
// It would be easy enough to extend to wider address spaces; we'd just need
// to increase the width of the "tag" blockram.

// If we're targeting Cyclone 3, then we have 9kbit blockrams to play with.
// Each burst, assuming we stick with 4-word bursts, is 64 bits, so a single M9K
// can hold 128 cachelines. Since we're building a 2-way cache, this will end
// up being 2 overlapping sets of 64 cachelines.
// (The module below has since been widened: it indexes the data RAM with
// cpu_addr[10:1], i.e. 256 four-word cachelines per way.)

// The address is broken down as follows:
// bit 0 is irrelevant because we're working in 16-bit words.
// bits 2:1 specify which word of a burst we're interested in.
// Bits 10:3 specify the eight-bit address of the cacheline;
// this will map to {1'b0,addr[10:3]} and {1'b1,addr[10:3]} respectively.
// Bits 25:11 have to be stored in the tag, which, it turns out, is no problem,
// since we can use 18-bit wide words. (The code below stores cpu_addr[25:9],
// i.e. 17 bits, in each tag.) The highest bit will be used as
// a "most recently used" flag, so we can support 64 meg
// without changing bit widths.
// (Storing the MRU flag in both tags in a 2-way cache is redundant, so we'll only
// store it in the first tag.)

// FIXME - add bus snooping.
// Bus snooping works simply by invalidating cachelines that match
// addresses appearing on the snoop_addr lines. This is triggered by
// the snoop_req line, which should be taken high when a chipset write to ChipRAM
// takes place.
// Since we can't afford to delay the chipset write accesses, we need to latch
// the snoop address.


// TwoWayCache
//
// Two-way set-associative read cache sitting between a 16-bit CPU port and an
// SDRAM controller that delivers 4-word bursts.
//
//  * Data RAM: 2048 x 18 bits. Address = {way, cpu_addr[10:1]}.
//    Bits 15:0 hold the data word, bits 17:16 are per-byte valid flags;
//    a word is only treated as valid when both flags are set.
//  * Tag RAM:  512 x 18 bits. Address = {way, cpu_addr[10:3]}.
//    Bits 16:0 hold cpu_addr[25:9]; bit 17 of the way-1 tag is the
//    "way 1 most recently used" flag.
//
// Reads that hit are answered from the data RAM; reads that miss are passed
// to the SDRAM controller, the first returned word is forwarded to the CPU
// and all four words of the burst are written into the least recently used
// way. Writes update any way that currently holds the address and are then
// acknowledged with cpu_wr_ack; this module does not itself forward the write
// to SDRAM (data_to_sdram is never driven here and sdram_rw is only ever set
// to 1).
//
// snoop_addr / snoop_req are declared for future bus snooping but are not
// used yet (see FIXME in the file header).

module TwoWayCache
(
    input clk,
    input reset, // active low
    output ready, // low while the INIT states are clearing the valid flags
    input [31:0] cpu_addr,
    input cpu_req, // 1 to request attention
    output reg cpu_ack, // 1 to signal that data is ready.
    output reg cpu_wr_ack, // 1 to signal that write cycles have been actioned
    input cpu_rw, // 1 for read cycles, 0 for write cycles
    input cpu_rwl, // lower byte strobe: 0 marks the lower byte valid on a write
    input cpu_rwu, // upper byte strobe: 0 marks the upper byte valid on a write
    input [15:0] data_from_cpu,
    output reg [15:0] data_to_cpu,
    output reg [31:0] sdram_addr,
    input [15:0] data_from_sdram,
    output reg [15:0] data_to_sdram, // not driven by this module
    output reg sdram_req,
    input sdram_fill,
    output reg sdram_rw, // 1 for read cycles, 0 for write cycles
    input [20:0] snoop_addr, // Address of chipram writes (currently unused)
    input snoop_req // 1 when snoop_addr contains an address that requires invalidation (currently unused).
);

    // States for state machine
    parameter WAITING=0, WAITRD=1, WAITFILL=2,
        FILL2=3, FILL3=4, FILL4=5, FILL5=6, PAUSE1=7,
        WRITE1=8, WRITE2=9, INIT1=10, INIT2=11;

    reg [4:0] state = INIT1;
    reg init;

    // Sweeps every word of one way during initialisation. It must be as wide
    // as the per-way word index (cpu_addr[10:1], 10 bits): a narrower counter
    // is zero-extended onto the 11-bit RAM address and leaves most of the
    // cache - including the whole of way 2 - un-invalidated after a reset.
    reg [9:0] initctr;

    assign ready=~init;


    // BlockRAM and related signals for data

    wire [10:0] data_port1_addr;
    wire [10:0] data_port2_addr;
    wire [17:0] data_port1_r;
    wire [17:0] data_port2_r;
    reg [17:0] data_ports_w;
    reg data_wren1;
    reg data_wren2;

    Cache_DataRAM dataram(
        .clock(clk),
        .address_a(data_port1_addr),
        .address_b(data_port2_addr),
        .data_a(data_ports_w),
        .data_b(data_ports_w),
        .q_a(data_port1_r),
        .q_b(data_port2_r),
        .wren_a(data_wren1),
        .wren_b(data_wren2)
    );

    // A cached word is only usable when both of its byte-valid flags are set.
    wire data_valid1;
    wire data_valid2;

    assign data_valid1 = data_port1_r[17] & data_port1_r[16];
    assign data_valid2 = data_port2_r[17] & data_port2_r[16];


    // BlockRAM and related signals for tags.

    wire [8:0] tag_port1_addr;
    wire [8:0] tag_port2_addr;
    wire [17:0] tag_port1_r;
    wire [17:0] tag_port2_r;
    wire [17:0] tag_port1_w;
    wire [17:0] tag_port2_w;

    reg tag_wren1;
    reg tag_wren2;
    reg tag_mru1;

    CacheBlockRAM tagram(
        .clock(clk),
        .address_a(tag_port1_addr),
        .address_b(tag_port2_addr),
        .data_a(tag_port1_w),
        .data_b(tag_port2_w),
        .q_a(tag_port1_r),
        .q_b(tag_port2_r),
        .wren_a(tag_wren1),
        .wren_b(tag_wren2)
    );

    // bits 2:1 specify which word of a burst we're interested in.
    // Bits 10:3 specify the eight-bit address of the cacheline;
    // Since we're building a 2-way cache, we'll map this to
    // {1'b0,addr[10:3]} and {1'b1,addr[10:3]} respectively.

    wire [10:0] cacheline1;
    wire [10:0] cacheline2;

    reg readword_burst; // Set to 1 when the cache address should come from
                        // the latched readword rather than from cpu_addr.
    reg [9:0] readword;

    //assign cacheline1 = {1'b0,cpu_addr[10:3],(readword_burst ? readword : cpu_addr[2:1])};
    //assign cacheline2 = {1'b1,cpu_addr[10:3],(readword_burst ? readword : cpu_addr[2:1])};

    assign cacheline1 = {1'b0,readword_burst ? readword : cpu_addr[10:1]};
    assign cacheline2 = {1'b1,readword_burst ? readword : cpu_addr[10:1]};

    // We share each tag between all four words of a cacheline. We therefore only need
    // one M9K tag RAM for four M9Ks of data RAM.

    assign tag_port1_addr = cacheline1[10:2];
    assign tag_port2_addr = cacheline2[10:2];

    // The first port contains the mru flag, so we have to write to it on every
    // access. The second tag only needs writing when a cacheline in the second
    // block is updated.
    // The address part of each tag is toggled between old and new data,
    // depending upon the state of the mru flag: the way that has just become
    // most recently used takes the CPU address, the other way keeps what it had.

    assign tag_port1_w = {tag_mru1,(tag_mru1 ? cpu_addr[25:9] : tag_port1_r[16:0])};
    assign tag_port2_w = {1'b0,(!tag_mru1 ? cpu_addr[25:9] : tag_port2_r[16:0])};
    //assign tag_port2_w = {1'b0,cpu_addr[25:9]};


    // Boolean signals to indicate cache hits.

    wire tag_hit1;
    wire tag_hit2;

    assign tag_hit1 = tag_port1_r[16:0]==cpu_addr[25:9];
    assign tag_hit2 = tag_port2_r[16:0]==cpu_addr[25:9];


    // In the data blockram the lower two bits of the address determine
    // which word of the burst we're reading. When reading from the cache, this comes
    // from the CPU address; when writing to the cache it's determined by the state
    // machine. During initialisation the address comes from initctr instead.

    assign data_port1_addr = init ? {1'b0,initctr} : cacheline1;
    assign data_port2_addr = init ? {1'b1,initctr} : cacheline2;


    always @(posedge clk)
    begin

        // Defaults
        tag_wren1<=1'b0;
        tag_wren2<=1'b0;
        data_wren1<=1'b0;
        data_wren2<=1'b0;
        init<=1'b0;
        readword_burst<=1'b0;
        cpu_wr_ack<=1'b0;

        case(state)

            // The INIT states loop through the data RAM clearing the valid
            // flags (bits 17:16 of each data entry) in both ways.

            INIT1:
                begin
                    init<=1'b1; // need to mark the entire cache as invalid before starting.
                    initctr<=10'b00_0000_0000;
                    data_ports_w<=18'b0; // Mark entire cache as invalid
                    data_wren1<=1'b1;
                    data_wren2<=1'b1;
                    state<=INIT2;
                end

            INIT2:
                begin
                    init<=1'b1;
                    initctr<=initctr+1;
                    data_wren1<=1'b1;
                    data_wren2<=1'b1;
                    // Stop once the last word of each way has been addressed.
                    if(initctr==10'b11_1111_1111)
                        state<=WAITING;
                end

            WAITING:
                begin
                    state<=WAITING;
                    if(cpu_req==1'b1)
                    begin
                        if(cpu_rw==1'b1) // Read cycle
                            state<=WAITRD;
                        else // Write cycle
                            state<=WRITE1;
                    end
                end

            WRITE1:
                begin
                    // If the current address is in cache,
                    // we must update the appropriate cacheline

                    // We mark the two halves of the word separately.
                    // If this is a byte write, the byte not being written
                    // will be marked as invalid, triggering a re-read if
                    // the other byte or whole word is read.
                    data_ports_w<={~cpu_rwu,~cpu_rwl,data_from_cpu};

                    if(tag_hit1)
                    begin
                        // Write the data to the first cache way
                        data_wren1<=1'b1;
                        // Mark tag1 as most recently used.
                        tag_mru1<=1'b1;
                        tag_wren1<=1'b1;
                    end
                    // Note: it's possible that both ways of the cache will end up caching
                    // the same address; if so, we must write to both ways, or at least
                    // invalidate them both, otherwise we'll have problems with stale data.
                    if(tag_hit2)
                    begin
                        // Write the data to the second cache way
                        data_wren2<=1'b1;
                        // Mark tag2 as most recently used.
                        tag_mru1<=1'b0;
                        tag_wren1<=1'b1;
                    end
                    // FIXME - ultimately we should clear a cacheline here and cache
                    // the data for future use.
                    state<=WRITE2;
                end

            WRITE2:
                begin
                    cpu_wr_ack<=1'b1; // Indicate to the Write cache that it's safe to proceed.
                    if(cpu_req==1'b0) // Wait for the write cycle to finish
                        state<=WAITING;
                end

            WAITRD:
                begin
                    state<=PAUSE1;
                    // Check both tags for a match...
                    if(tag_hit1 && data_valid1)
                    begin
                        // Copy data to output (dropping the two valid flags)
                        data_to_cpu<=data_port1_r[15:0];
                        cpu_ack<=1'b1;

                        // Mark tag1 as most recently used.
                        tag_mru1<=1'b1;
                        tag_wren1<=1'b1;
                    end
                    else if(tag_hit2 && data_valid2)
                    begin
                        // Copy data to output (dropping the two valid flags)
                        data_to_cpu<=data_port2_r[15:0];
                        cpu_ack<=1'b1;

                        // Mark tag2 as most recently used.
                        tag_mru1<=1'b0;
                        tag_wren1<=1'b1;
                    end
                    else // No matches? How do we decide which one to use?
                    begin
                        // Invert the most recently used flag.
                        // (Whichever way was least recently used will be overwritten, so
                        // is now the most recently used.)
                        // If either tag matches, but the corresponding data is stale,
                        // we re-use the stale cacheline.

                        if(tag_hit1)
                            tag_mru1<=1'b1; // Way 1 contains stale data
                        else if(tag_hit2)
                            tag_mru1<=1'b0; // Way 2 contains stale data
                        else
                            tag_mru1<=!tag_port1_r[17];

                        // For simulation only, to avoid the unknown value of uninitialised blockram
                        // tag_mru1<=cpu_addr[1];

                        tag_wren1<=1'b1;
                        tag_wren2<=1'b1;
                        // If r[17] is 1, tag_mru1 is 0, so we need to write to the second tag.
                        // FIXME - might be simpler to just write every cycle and switch between new and old data.
                        // tag_wren2<=tag_port1_r[17];

                        // Pass request on to RAM controller.
                        sdram_addr<={cpu_addr[31:3],3'b000};
                        sdram_req<=1'b1;
                        sdram_rw<=1'b1; // Read cycle
                        state<=WAITFILL;
                    end
                end

            PAUSE1:
                begin
                    state<=PAUSE1;
                    if(cpu_req==1'b0)
                        state<=WAITING;
                end

            WAITFILL:
                begin
                    readword_burst<=1'b1;

                    // In the interests of performance, read the word we're waiting for first.
                    readword<=cpu_addr[10:1];

                    if (sdram_fill==1'b1)
                    begin
                        sdram_req<=1'b0;

                        // Forward data to CPU
                        // (We now latch the address until the current cycle is complete.
                        // TAGRAM is already written, so just need to take care of
                        // Data RAM addresses, which we do with the readword signal.)
                        data_to_cpu<=data_from_sdram;
                        cpu_ack<=1'b1;

                        // write first word to Cache...
                        data_ports_w<={2'b11,data_from_sdram};
                        data_wren1<=tag_mru1;
                        data_wren2<=!tag_mru1;
                        state<=FILL2;
                    end
                end

            FILL2:
                begin
                    // write second word to Cache...
                    readword_burst<=1'b1;
                    readword[1:0]<=readword[1:0]+1; // wraps within the 4-word line
                    data_ports_w<={2'b11,data_from_sdram};
                    data_wren1<=tag_mru1;
                    data_wren2<=!tag_mru1;
                    state<=FILL3;
                end

            FILL3:
                begin
                    // write third word to Cache...
                    readword_burst<=1'b1;
                    readword[1:0]<=readword[1:0]+1;
                    data_ports_w<={2'b11,data_from_sdram};
                    data_wren1<=tag_mru1;
                    data_wren2<=!tag_mru1;
                    state<=FILL4;
                end

            FILL4:
                begin
                    // write last word to Cache...
                    readword_burst<=1'b1;
                    readword[1:0]<=readword[1:0]+1;
                    data_ports_w<={2'b11,data_from_sdram};
                    data_wren1<=tag_mru1;
                    data_wren2<=!tag_mru1;
                    state<=FILL5;
                end

            FILL5:
                begin
                    state<=FILL5;
                    // Shouldn't need to worry about readword now - only used during burst
                    // readword=cpu_addr[2:1];

                    // Remain on state 5 until cpu_ack is low.
                    // We use this rather than cpu_req because in the time it's taken us to
                    // reach this point, it's possible the next request could have started.
                    if(cpu_ack==1'b0)
                        state<=WAITING;
                end

            default:
                state<=WAITING;
        endcase

        // Cancel the ack flag as soon as req drops.
        // The state machine will wait for this to happen before starting a new cycle.
        if(cpu_req==1'b0)
            cpu_ack<=1'b0;

        // Synchronous, active-low reset. Takes priority over everything above.
        if(reset==1'b0)
        begin
            state<=INIT1;
            cpu_ack<=1'b0;
            // Drop any outstanding fill request: the state that would have
            // completed it is abandoned, so leaving it asserted would present
            // a stale request to the SDRAM controller after reset.
            sdram_req<=1'b0;
        end
    end

endmodule
|