// Two Way Cache, supporting 64 meg address space.
// Would be easy enough to extend to wider address spaces, just need
// to increase the width of the "tag" blockram.

// Target is Cyclone 3, which provides 9kbit (M9K) blockrams, used here as
// 18-bit wide words.  Each burst from SDRAM is four 16-bit words (64 bits),
// i.e. one cacheline.
//
// The CPU address is broken down as follows:
//   bit 0      is irrelevant because we're working in 16-bit words.
//   bits 2:1   specify which word of a burst we're interested in.
//   bits 10:3  are the eight-bit index of the cacheline within a way.
//              Way 1 uses data RAM addresses {1'b0,addr[10:1]} and
//              way 2 uses {1'b1,addr[10:1]}, so the data RAM is addressed
//              with 11 bits (2048 words).
//   bits 25:9  are stored in the tag (17 bits).  The tag RAM is 18 bits wide;
//              the highest bit of the first tag is used as a "most recently
//              used" flag.
// (Storing the MRU flag in both tags in a 2-way cache is redundant, so we only
// store it in the first tag.)
//
// Data RAM words are 18 bits: bits 15:0 hold the data, bits 17 and 16 are
// per-byte valid flags.  A word only counts as valid when both flags are set.

// FIXME - add bus snooping.
// Bus snooping works simply by invalidating cachelines that match
// addresses appearing on the snoop_addr lines.  This is triggered by
// the snoop_req line, which should be taken high when a chipset write to ChipRAM
// takes place.
// Since we can't afford to delay the chipset write access, we need to latch
// the snoop address.
// (snoop_addr and snoop_req are currently unused by this module.)

module TwoWayCache
(
    input clk,
    input reset, // active low
    output ready, // high once the invalidation sweep after reset has finished
    input [31:0] cpu_addr,
    input cpu_req, // 1 to request attention
    output reg cpu_ack, // 1 to signal that data is ready.
    output reg cpu_wr_ack, // 1 to signal that write cycles have been actioned
    input cpu_rw, // 1 for read cycles, 0 for write cycles
    input cpu_rwl, // byte strobes; a byte is marked valid on write when its strobe is 0
    input cpu_rwu,
    input [15:0] data_from_cpu,
    output reg [15:0] data_to_cpu,
    output reg [31:0] sdram_addr,
    input [15:0] data_from_sdram,
    output reg [15:0] data_to_sdram, // NOTE(review): never driven by this module
    output reg sdram_req,
    input sdram_fill, // 1 when the first word of a burst is on data_from_sdram
    output reg sdram_rw, // 1 for read cycles, 0 for write cycles
    input [20:0] snoop_addr, // Address of chipram writes
    input snoop_req // 1 when snoop_addr contains an address that requires invalidation.
);

// States for state machine
parameter WAITING=0, WAITRD=1, WAITFILL=2,
    FILL2=3, FILL3=4, FILL4=5, FILL5=6, PAUSE1=7,
    WRITE1=8, WRITE2=9, INIT1=10, INIT2=11;
reg [4:0] state = INIT1;

// Invalidation sweep control.  initctr must be as wide as the per-way data
// RAM address (10 bits) so that {way,initctr} reaches every one of the
// 2 x 1024 data words; a narrower counter would be zero-extended and leave
// most of the RAM (including all of way 2) untouched.
reg init;
reg [9:0] initctr;
assign ready=~init;


// BlockRAM and related signals for data

wire [10:0] data_port1_addr;
wire [10:0] data_port2_addr;
wire [17:0] data_port1_r;
wire [17:0] data_port2_r;
reg [17:0] data_ports_w;
reg data_wren1;
reg data_wren2;

Cache_DataRAM dataram(
    .clock(clk),
    .address_a(data_port1_addr),
    .address_b(data_port2_addr),
    .data_a(data_ports_w),
    .data_b(data_ports_w),
    .q_a(data_port1_r),
    .q_b(data_port2_r),
    .wren_a(data_wren1),
    .wren_b(data_wren2)
);

// A data word is valid only if both of its byte-valid flags are set.
wire data_valid1;
wire data_valid2;
assign data_valid1 = data_port1_r[17] & data_port1_r[16];
assign data_valid2 = data_port2_r[17] & data_port2_r[16];


// BlockRAM and related signals for tags.

wire [8:0] tag_port1_addr;
wire [8:0] tag_port2_addr;
wire [17:0] tag_port1_r;
wire [17:0] tag_port2_r;
wire [17:0] tag_port1_w;
wire [17:0] tag_port2_w;

reg tag_wren1;
reg tag_wren2;
reg tag_mru1;

CacheBlockRAM tagram(
    .clock(clk),
    .address_a(tag_port1_addr),
    .address_b(tag_port2_addr),
    .data_a(tag_port1_w),
    .data_b(tag_port2_w),
    .q_a(tag_port1_r),
    .q_b(tag_port2_r),
    .wren_a(tag_wren1),
    .wren_b(tag_wren2)
);

// bits 2:1 specify which word of a burst we're interested in.
// Bits 10:3 specify the eight-bit index of the cacheline;
// since we're building a 2-way cache, the data RAM address is
// {1'b0,addr[10:1]} for way 1 and {1'b1,addr[10:1]} for way 2.

wire [10:0] cacheline1;
wire [10:0] cacheline2;

reg readword_burst; // Set to 1 when the lsb of the cache address should
                    // track the SDRAM controller.
reg [9:0] readword; // Latched line index + word counter used during a fill.

assign cacheline1 = {1'b0,readword_burst ? readword : cpu_addr[10:1]};
assign cacheline2 = {1'b1,readword_burst ? readword : cpu_addr[10:1]};

// We share each tag between all four words of a cacheline.  We therefore only need
// one M9K tag RAM for four M9Ks of data RAM.
assign tag_port1_addr = cacheline1[10:2];
assign tag_port2_addr = cacheline2[10:2];

// The first port contains the mru flag, so we have to write to it on every
// access.  Each tag port writes either the new tag (taken from the CPU address)
// or the value it just read back, depending upon the state of the mru flag:
// when tag_mru1 is 1 way 1 receives the new tag and way 2 keeps its old one,
// and vice versa.
// (Writing both ports on every access for troubleshooting)

assign tag_port1_w = {tag_mru1,(tag_mru1 ? cpu_addr[25:9] : tag_port1_r[16:0])};
assign tag_port2_w = {1'b0,(!tag_mru1 ? cpu_addr[25:9] : tag_port2_r[16:0])};

// Boolean signals to indicate cache hits.

wire tag_hit1;
wire tag_hit2;

assign tag_hit1 = tag_port1_r[16:0]==cpu_addr[25:9];
assign tag_hit2 = tag_port2_r[16:0]==cpu_addr[25:9];


// In the data blockram the lower two bits of the address determine
// which word of the burst we're reading.  When reading from the cache, this comes
// from the CPU address; when writing to the cache it's determined by the state
// machine.  During the invalidation sweep the address comes from initctr;
// both operands of each ?: are 11 bits wide so no implicit extension occurs.

assign data_port1_addr = init ? {1'b0,initctr} : cacheline1;
assign data_port2_addr = init ? {1'b1,initctr} : cacheline2;


always @(posedge clk)
begin

    // Defaults
    tag_wren1<=1'b0;
    tag_wren2<=1'b0;
    data_wren1<=1'b0;
    data_wren2<=1'b0;
    init<=1'b0;
    readword_burst<=1'b0;
    cpu_wr_ack<=1'b0;

    case(state)

        // Invalidation sweep: write zero (both valid flags clear) to every
        // word of both ways of the data RAM before accepting requests.
        INIT1:
        begin
            init<=1'b1;
            initctr<=10'd0;
            data_ports_w<=18'b0; // Mark entire cache as invalid
            data_wren1<=1'b1;
            data_wren2<=1'b1;
            // Make sure no stale request is presented to the SDRAM controller.
            sdram_req<=1'b0;
            state<=INIT2;
        end

        INIT2:
        begin
            init<=1'b1;
            initctr<=initctr+1'b1;
            data_wren1<=1'b1;
            data_wren2<=1'b1;
            // The write for the current initctr value is performed on this
            // edge, so once the counter shows all ones the sweep is complete.
            if(initctr==10'h3FF)
                state<=WAITING;
        end

        WAITING:
        begin
            state<=WAITING;
            if(cpu_req==1'b1)
            begin
                if(cpu_rw==1'b1) // Read cycle
                    state<=WAITRD;
                else // Write cycle
                    state<=WRITE1;
            end
        end

        WRITE1:
        begin
            // If the current address is in cache,
            // we must update the appropriate cacheline

            // We mark the two halves of the word separately.
            // If this is a byte write, the byte not being written
            // will be marked as invalid, triggering a re-read if
            // the other byte or whole word is read.
            data_ports_w<={~cpu_rwu,~cpu_rwl,data_from_cpu};

            if(tag_hit1)
            begin
                // Write the data to the first cache way
                data_wren1<=1'b1;
                // Mark tag1 as most recently used.
                tag_mru1<=1'b1;
                tag_wren1<=1'b1;
            end

            // Note: it's possible that both ways of the cache will end up caching
            // the same address; if so, we must write to both ways, or at least
            // invalidate them both, otherwise we'll have problems with stale data.
            // (When both hit, this later assignment to tag_mru1 takes effect.)
            if(tag_hit2)
            begin
                // Write the data to the second cache way
                data_wren2<=1'b1;
                // Mark tag2 as most recently used.
                tag_mru1<=1'b0;
                tag_wren1<=1'b1;
            end

            // FIXME - ultimately we should clear a cacheline here and cache
            // the data for future use.  Need to have a working valid flag first.
            state<=WRITE2;
        end

        WRITE2:
        begin
            cpu_wr_ack<=1'b1; // Indicate to the Write cache that it's safe to proceed.
            if(cpu_req==1'b0) // Wait for the write cycle to finish
                state<=WAITING;
        end

        WAITRD:
        begin
            state<=PAUSE1;
            // Check both tags for a match...
            if(tag_hit1 && data_valid1)
            begin
                // Copy data to output (drop the two valid-flag bits)
                data_to_cpu<=data_port1_r[15:0];
                cpu_ack<=1'b1;

                // Mark tag1 as most recently used.
                tag_mru1<=1'b1;
                tag_wren1<=1'b1;
            end
            else if(tag_hit2 && data_valid2)
            begin
                // Copy data to output (drop the two valid-flag bits)
                data_to_cpu<=data_port2_r[15:0];
                cpu_ack<=1'b1;

                // Mark tag2 as most recently used.
                tag_mru1<=1'b0;
                tag_wren1<=1'b1;
            end
            else // No matches?  How do we decide which one to use?
            begin
                // invert most recently used flags on both tags.
                // (Whichever one was least recently used will be overwritten, so
                // is now the most recently used.)
                // If either tag matches, but the corresponding data is stale,
                // we re-use the stale cacheline.

                if(tag_hit1)
                    tag_mru1<=1'b1; // Way 1 contains stale data
                else if(tag_hit2)
                    tag_mru1<=1'b0; // Way 2 contains stale data
                else
                    tag_mru1<=!tag_port1_r[17];

                // For simulation only, to avoid the unknown value of uninitialised blockram
                // tag_mru1<=cpu_addr[1];

                // Both tag ports are written; tag_port1_w / tag_port2_w select
                // new or old tag contents according to tag_mru1.
                tag_wren1<=1'b1;
                tag_wren2<=1'b1;
                // FIXME - might be simpler to just write every cycle and switch between new and old data.

                // Pass request on to RAM controller.
                // NOTE(review): the request address is line-aligned, yet the fill
                // below stores the first returned word at the requested word
                // position - this presumes the SDRAM controller delivers the
                // requested word first; confirm against the controller.
                sdram_addr<={cpu_addr[31:3],3'b000};
                sdram_req<=1'b1;
                sdram_rw<=1'b1; // Read cycle
                state<=WAITFILL;
            end
        end

        PAUSE1:
        begin
            // Hit path: hold here until the CPU drops its request.
            state<=PAUSE1;
            if(cpu_req==1'b0)
                state<=WAITING;
        end

        WAITFILL:
        begin
            readword_burst<=1'b1;
            // In the interests of performance, read the word we're waiting for first.
            readword<=cpu_addr[10:1];
            if(sdram_fill==1'b1)
            begin
                sdram_req<=1'b0;
                // Forward data to CPU
                // (We now latch the address until the current cycle is complete.
                // TAGRAM is already written, so just need to take care of
                // Data RAM addresses, which we do with the readword signal.)
                data_to_cpu<=data_from_sdram;
                cpu_ack<=1'b1;

                // write first word to Cache...
                data_ports_w<={2'b11,data_from_sdram};
                data_wren1<=tag_mru1;
                data_wren2<=!tag_mru1;
                state<=FILL2;
            end
        end

        FILL2:
        begin
            // write second word to Cache...
            readword_burst<=1'b1;
            readword[1:0]<=readword[1:0]+1'b1; // wraps within the cacheline
            data_ports_w<={2'b11,data_from_sdram};
            data_wren1<=tag_mru1;
            data_wren2<=!tag_mru1;
            state<=FILL3;
        end

        FILL3:
        begin
            // write third word to Cache...
            readword_burst<=1'b1;
            readword[1:0]<=readword[1:0]+1'b1;
            data_ports_w<={2'b11,data_from_sdram};
            data_wren1<=tag_mru1;
            data_wren2<=!tag_mru1;
            state<=FILL4;
        end

        FILL4:
        begin
            // write last word to Cache...
            readword_burst<=1'b1;
            readword[1:0]<=readword[1:0]+1'b1;
            data_ports_w<={2'b11,data_from_sdram};
            data_wren1<=tag_mru1;
            data_wren2<=!tag_mru1;
            state<=FILL5;
        end

        FILL5:
        begin
            state<=FILL5;
            // Shouldn't need to worry about readword now - only used during burst

            // Remain on state 5 until cpu_ack is low.
            // We use this rather than cpu_req because in the time it's taken us to
            // reach this point, it's possible the next request could have started.
            if(cpu_ack==1'b0)
                state<=WAITING;
        end

        default:
            state<=WAITING;
    endcase

    // Cancel the ack flag as soon as req drops.
    // The state machine will wait for this to happen before starting a new cycle.
    if(cpu_req==1'b0)
        cpu_ack<=1'b0;

    // Synchronous, active-low reset: restart the invalidation sweep and
    // withdraw any outstanding acknowledge or SDRAM request.
    if(reset==1'b0)
    begin
        state<=INIT1;
        cpu_ack<=1'b0;
        sdram_req<=1'b0;
    end
end

endmodule