From fe35d86b89dc969b56b936d56fafc1035b1da309 Mon Sep 17 00:00:00 2001 From: harbaum Date: Tue, 28 Jan 2014 19:58:48 +0000 Subject: [PATCH] Obsession\!\!\! --- cores/mist/TG68KdotC_Kernel.vhd | 2 +- cores/mist/acia.v | 11 +- cores/mist/blitter.v | 23 ++- cores/mist/cache.v | 106 +++++++--- cores/mist/clock.v | 6 +- cores/mist/clock_bb.v | 4 +- cores/mist/data_io.v | 9 +- cores/mist/dongle.v | 122 +++++++++-- cores/mist/mfp.v | 131 +++++------- cores/mist/mfp_timer.v | 16 +- cores/mist/mist_top.v | 349 ++++++++++++++++++-------------- cores/mist/sdram.v | 6 +- cores/mist/ste_dma_snd.v | 26 ++- cores/mist/video.v | 224 +++++++++++++------- cores/mist/video_modes.v | 53 +++-- 15 files changed, 697 insertions(+), 391 deletions(-) diff --git a/cores/mist/TG68KdotC_Kernel.vhd b/cores/mist/TG68KdotC_Kernel.vhd index e2958fc..a032fde 100644 --- a/cores/mist/TG68KdotC_Kernel.vhd +++ b/cores/mist/TG68KdotC_Kernel.vhd @@ -269,7 +269,7 @@ architecture logic of TG68KdotC_Kernel is signal set_oddout : std_logic; signal PCbase : std_logic; signal set_PCbase : std_logic; - + signal last_data_read : std_logic_vector(31 downto 0); signal last_data_in : std_logic_vector(31 downto 0); diff --git a/cores/mist/acia.v b/cores/mist/acia.v index eb2d9de..9a49e0a 100644 --- a/cores/mist/acia.v +++ b/cores/mist/acia.v @@ -99,16 +99,15 @@ always @(negedge clk) begin readTimer <= 14'd11138; end end -end - +end + // ------------------ cpu interface -------------------- wire ikbd_irq = ikbd_cr[7] && ikbd_rx_data_available; // rx irq -wire [7:0] ikbd_rx_data = fifoIn[readPin]; - -wire ikbd_rx_data_available; -assign ikbd_rx_data_available = (readPin != writePin) && (readTimer == 0); +wire [7:0] ikbd_rx_data = ikbd_rx_data_available?fifoIn[readPin]:fifoIn[readPin-4'd1]; + +wire ikbd_rx_data_available = (readPin != writePin) && (readTimer == 0); // in a real ST the irqs are active low open collector outputs and are simply wired // tegether ("wired or") diff --git a/cores/mist/blitter.v 
b/cores/mist/blitter.v index e8c10ac..1969779 100644 --- a/cores/mist/blitter.v +++ b/cores/mist/blitter.v @@ -23,10 +23,6 @@ // http://mikro.naprvyraz.sk/docs/ST_E/BLITTER.TXT // http://paradox.atari.org/files/BLIT_FAQ.TXT -// TODO: -// - Try to use bus cycle 3 as well to make a "turbo blitter" being twice as fast -// - Don't spend a whole state 0 if nfsr && last_word_in_row - module blitter ( input [1:0] bus_cycle, @@ -52,6 +48,7 @@ module blitter ( input br_in, output reg br_out, output irq, + input bg, input turbo // 16Mhz blitter ); @@ -93,6 +90,13 @@ reg fxsr; wire cycle_advance = (bus_cycle == 2'd0) || (turbo && (bus_cycle == 2'd2)); wire cycle_read = (bus_cycle == 2'd1) || (turbo && (bus_cycle == 2'd3)); +// latch bus cycle information to use at the end of the cycle (posedge clk) +reg cycle_advanceL, cycle_readL; +always @(negedge clk) begin + cycle_advanceL <= cycle_advance; + cycle_readL <= cycle_read; +end + // ------------------ cpu interface -------------------- // CPU READ @@ -150,7 +154,7 @@ reg [15:0] bm_data_in_latch; // latch incoming data at end of bus cycle always @(posedge clk) - if(cycle_read) + if(cycle_readL) bm_data_in_latch <= bm_data_in; always @(negedge clk) begin @@ -249,8 +253,11 @@ always @(negedge clk) begin // change between both states (bus grabbed and bus released) if(bus_coop_cnt == 0) begin - bus_coop_cnt <= 6'd63; - wait4bus <= !wait4bus; + // release bus immediately, grab bus only if bg is set + if(!wait4bus || (wait4bus && bg)) begin + bus_coop_cnt <= 6'd63; + wait4bus <= !wait4bus; + end end // blitter has just been setup, so init the state machine in first step @@ -352,7 +359,7 @@ always @(posedge clk) begin bm_read <= 1'b0; bm_write <= 1'b0; - if(br_out && !br_in && (y_count != 0) && cycle_advance) begin + if(br_out && !br_in && (y_count != 0) && cycle_advanceL) begin if(state == 2'd0) bm_read <= 1'b1; else if(state == 2'd1) bm_read <= 1'b1; else if(state == 2'd2) bm_write <= 1'b1; diff --git a/cores/mist/cache.v 
b/cores/mist/cache.v index 0894760..18ff54b 100644 --- a/cores/mist/cache.v +++ b/cores/mist/cache.v @@ -19,7 +19,7 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . // - + module cache ( input clk_128, input clk_8, @@ -27,15 +27,18 @@ module cache ( input flush, input [22:0] addr, // cpu word address - input wr, - input rd, + input [1:0] ds, // upper (0) and lower (1) data strobe - output [15:0] dout, + output reg [15:0] dout, output hit, - // interface to update entire caches when read from ram + // interface to store entire cache lines when read from ram input [63:0] din64, - input update64 + input store, + + // interface to update existing cache lines on cpu ram write + input [15:0] din16, + input update ); reg [3:0] t; @@ -48,53 +51,100 @@ always @(posedge clk_128) begin t <= t + 4'd1; end -// de-multiplex 64 bit data into word requested by cpu -assign dout = (word == 2'd0)?current_data[15: 0]: - (word == 2'd1)?current_data[31:16]: - (word == 2'd2)?current_data[47:32]: - current_data[63:48]; - -// wire entry according to line/address -wire [63:0] current_data = data_latch[line]; - // cache size configuration -localparam BITS = 5; -localparam ENTRIES = 32; // 2 ** BITS -localparam ALLZERO = 32'd0; // 2 ** BITS zero bits +// the cache sizein bytes is 8*(2^BITS), e.g. 
2kBytes if BITS == 8 +localparam BITS = 6; +localparam ENTRIES = 64; // 2 ** BITS +localparam ALLZERO = 64'd0; // 2 ** BITS zero bits // _word_ address mapping example with 16 cache lines (BITS == 4) // 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 // T T T T T T T T T T T T T T T T T L L L L W W // T = stored in tag RAM // L = cache line -// W = 16 bit word select -wire [21-BITS-1:0] tag = addr[22:2+BITS] /* synthesis keep */; -wire [BITS-1:0] line = addr[2+BITS-1:2]; -wire [1:0] word = addr[1:0]; +// W = 16 bit word select +wire [21-BITS-1:0] tag = addr[22:2+BITS]; +reg [BITS-1:0] line; /* ------------------------------------------------------------------------------ */ /* --------------------------------- cache memory ------------------------------- */ /* ------------------------------------------------------------------------------ */ -reg [63:0] data_latch [ENTRIES-1:0]; +reg [63:56] data_latch_7 [ENTRIES-1:0]; +reg [55:48] data_latch_6 [ENTRIES-1:0]; +reg [47:40] data_latch_5 [ENTRIES-1:0]; +reg [39:32] data_latch_4 [ENTRIES-1:0]; +reg [31:24] data_latch_3 [ENTRIES-1:0]; +reg [23:16] data_latch_2 [ENTRIES-1:0]; +reg [15: 8] data_latch_1 [ENTRIES-1:0]; +reg [ 7: 0] data_latch_0 [ENTRIES-1:0]; + reg [21-BITS-1:0] tag_latch [ENTRIES-1:0]; reg [ENTRIES-1:0] valid; +reg [21-BITS-1:0] current_tag; + // signal indicating the currently selected cache line is valid and matches the // address the cpu is currently requesting -assign hit = valid[line] && (tag_latch[line] == tag); +// assign hit = valid[line] && (tag_latch[line] == tag); +assign hit = valid[line] && (current_tag == tag); +// permanently output data according to current line +// de-multiplex 64 bit data into word requested by cpu +always @(posedge clk_128) begin + dout <= (addr[1:0] == 2'd0)?{data_latch_1[line], data_latch_0[line]}: + (addr[1:0] == 2'd1)?{data_latch_3[line], data_latch_2[line]}: + (addr[1:0] == 2'd2)?{data_latch_5[line], data_latch_4[line]}: + {data_latch_7[line], 
data_latch_6[line]}; + current_tag <= tag_latch[line]; +end + +always @(negedge clk_128) + line <= addr[2+BITS-1:2]; + always @(posedge clk_128) begin if(reset || flush) begin valid <= ALLZERO; end else begin - - // update64 indicates that a whole cache line is to be updated - if(update64) begin - data_latch[line] <= din64; + // store indicates that a whole cache line is to be stored + if(store) begin + data_latch_7[line] <= din64[63:56]; + data_latch_6[line] <= din64[55:48]; + data_latch_5[line] <= din64[47:40]; + data_latch_4[line] <= din64[39:32]; + data_latch_3[line] <= din64[31:24]; + data_latch_2[line] <= din64[23:16]; + data_latch_1[line] <= din64[15: 8]; + data_latch_0[line] <= din64[ 7: 0]; + tag_latch[line] <= tag; valid[line] <= 1'b1; end + + // cpu (or other bus master!) writes to ram, so update cache contents if necessary + else if(update && hit) begin + // no need to care for "tag_latch" or "valid" as they simply stay the same + + if(addr[1:0] == 2'd0) begin + if(ds[1]) data_latch_0[line] <= din16[7:0]; + if(ds[0]) data_latch_1[line] <= din16[15:8]; + end + + if(addr[1:0] == 2'd1) begin + if(ds[1]) data_latch_2[line] <= din16[7:0]; + if(ds[0]) data_latch_3[line] <= din16[15:8]; + end + + if(addr[1:0] == 2'd2) begin + if(ds[1]) data_latch_4[line] <= din16[7:0]; + if(ds[0]) data_latch_5[line] <= din16[15:8]; + end + + if(addr[1:0] == 2'd3) begin + if(ds[1]) data_latch_6[line] <= din16[7:0]; + if(ds[0]) data_latch_7[line] <= din16[15:8]; + end + end end end diff --git a/cores/mist/clock.v b/cores/mist/clock.v index 1b2defe..c85e787 100644 --- a/cores/mist/clock.v +++ b/cores/mist/clock.v @@ -126,7 +126,7 @@ module clock ( altpll_component.clk2_divide_by = 18, altpll_component.clk2_duty_cycle = 50, altpll_component.clk2_multiply_by = 85, - altpll_component.clk2_phase_shift = "-1000", + altpll_component.clk2_phase_shift = "-1500", altpll_component.clk3_divide_by = 5625, altpll_component.clk3_duty_cycle = 50, altpll_component.clk3_multiply_by = 512, @@ 
-265,7 +265,7 @@ endmodule // Retrieval info: PRIVATE: PHASE_RECONFIG_INPUTS_CHECK STRING "0" // Retrieval info: PRIVATE: PHASE_SHIFT0 STRING "0.00000000" // Retrieval info: PRIVATE: PHASE_SHIFT1 STRING "0.00000000" -// Retrieval info: PRIVATE: PHASE_SHIFT2 STRING "-1000.00000000" +// Retrieval info: PRIVATE: PHASE_SHIFT2 STRING "-1500.00000000" // Retrieval info: PRIVATE: PHASE_SHIFT3 STRING "0.00000000" // Retrieval info: PRIVATE: PHASE_SHIFT_STEP_ENABLED_CHECK STRING "0" // Retrieval info: PRIVATE: PHASE_SHIFT_UNIT0 STRING "deg" @@ -323,7 +323,7 @@ endmodule // Retrieval info: CONSTANT: CLK2_DIVIDE_BY NUMERIC "18" // Retrieval info: CONSTANT: CLK2_DUTY_CYCLE NUMERIC "50" // Retrieval info: CONSTANT: CLK2_MULTIPLY_BY NUMERIC "85" -// Retrieval info: CONSTANT: CLK2_PHASE_SHIFT STRING "-1000" +// Retrieval info: CONSTANT: CLK2_PHASE_SHIFT STRING "-1500" // Retrieval info: CONSTANT: CLK3_DIVIDE_BY NUMERIC "5625" // Retrieval info: CONSTANT: CLK3_DUTY_CYCLE NUMERIC "50" // Retrieval info: CONSTANT: CLK3_MULTIPLY_BY NUMERIC "512" diff --git a/cores/mist/clock_bb.v b/cores/mist/clock_bb.v index 37be15a..7882d3f 100644 --- a/cores/mist/clock_bb.v +++ b/cores/mist/clock_bb.v @@ -196,9 +196,9 @@ endmodule // Retrieval info: CONSTANT: CLK2_DUTY_CYCLE NUMERIC "50" // Retrieval info: CONSTANT: CLK2_MULTIPLY_BY NUMERIC "85" // Retrieval info: CONSTANT: CLK2_PHASE_SHIFT STRING "-1000" -// Retrieval info: CONSTANT: CLK3_DIVIDE_BY NUMERIC "5625" +// Retrieval info: CONSTANT: CLK3_DIVIDE_BY NUMERIC "27000000" // Retrieval info: CONSTANT: CLK3_DUTY_CYCLE NUMERIC "50" -// Retrieval info: CONSTANT: CLK3_MULTIPLY_BY NUMERIC "512" +// Retrieval info: CONSTANT: CLK3_MULTIPLY_BY NUMERIC "2457599" // Retrieval info: CONSTANT: CLK3_PHASE_SHIFT STRING "0" // Retrieval info: CONSTANT: COMPENSATE_CLOCK STRING "CLK0" // Retrieval info: CONSTANT: INCLK0_INPUT_FREQUENCY NUMERIC "37037" diff --git a/cores/mist/data_io.v b/cores/mist/data_io.v index 7197e0c..f4c1243 100644 --- 
a/cores/mist/data_io.v +++ b/cores/mist/data_io.v @@ -50,13 +50,18 @@ reg brI; // signals to bring br into local clock domain // address auto increment takes place at the beginning of each transfer assign addr = (cmd==2)?(addrR[22:0]-23'd1):addrR[22:0]; +// latch bus cycle to have it stable at the end of the cycle (rising edge of clk8) +reg [1:0] bus_cycle_L; +always @(negedge clk_8) + bus_cycle_L <= bus_cycle; + // generate state signals required to control the sdram host interface always @(posedge clk_8) begin // start io transfers clock cycles after bus_cycle 0 // (after the cpu cycle) - writeD <= writeCmd && ((bus_cycle == 3) || writeD); + writeD <= writeCmd && ((bus_cycle_L == 3) || writeD); writeD2 <= writeD; - readD <= readCmd && ((bus_cycle == 3) || readD); + readD <= readCmd && ((bus_cycle_L == 3) || readD); readD2 <= readD; br <= brI; diff --git a/cores/mist/dongle.v b/cores/mist/dongle.v index cf3302c..7a51ca6 100644 --- a/cores/mist/dongle.v +++ b/cores/mist/dongle.v @@ -1,19 +1,117 @@ -// Empty dongle module. Just a placeholder ... 
- module dongle ( // cpu register interface - input clk, - input sel, - input cpu_as, - input uds, - input rw, - input [14:0] addr, - output[7:0] dout, + input clk, + input sel, + input cpu_as, // cpu_cycle && as + input uds, + input rw, + input [14:0] addr, + output reg [7:0] dout, - output present + output present ); -assign present = 1'b0; -assign dout = 8'h00; +assign present = 1'b0; // 0 = deactivate dongle + +// ------------------------------------------------------------------------------------ +// ------------------------------------ CUBASE 2 DONGLE ------------------------------- +// ------------------------------------------------------------------------------------ +reg [15:8] d; +reg [15:8] next_d; + +// read +always @(sel, uds, rw, d) begin + dout = 8'd0; + if(sel && ~uds && rw) + dout = d; +end + +wire [8:1] a = addr[7:0]; + +// special addresses: +// a[8:1] = 8'b11011000,0 -> 0x1b0 clear all +// a[8:1] = 8'bxxx00xx0,0 -> a5+a4+a1 = 0 sets all, incl. $0c + +// update register in the middle of the transfer +always @(negedge clk) begin + if(cpu_as && ~uds) begin + next_d[15] <= !(( a[8] & a[7] & !a[6] & a[5] & a[4] & !a[3] & !a[2] & !a[1]) | + (!d[15] & !d[14] & !d[13] & !d[12] & !d[11] & d[10] & !d[9] & a[4] ) | + ( d[14] & d[12] & d[10] & a[1]) | + ( d[13] & !d[10] & a[4] ) | + ( !d[14] & !d[10] & a[1]) | + ( d[15] & !d[10] & a[4] ) | + ( !d[12] & !d[10] & a[1]) | + (!d[8] & a[5] )); + + next_d[14] <= !(( a[8] & a[7] & !a[6] & a[5] & a[4] & !a[3] & !a[2] & !a[1]) | + (!d[15] & !d[14] & !d[13] & !d[12] & !d[11] & !d[10] & !d[9] & d[8] & a[4] ) | + ( d[14] & d[12] & d[10] & d[8] & a[1]) | + ( !d[10] & !d[8] & a[1]) | + ( !d[12] & !d[8] & a[1]) | + ( d[15] & !d[8] & a[4] ) | + ( !d[14] & !d[8] & a[1]) | + (!d[15] & a[5] )); + + next_d[13] <= !(( a[8] & a[7] & !a[6] & a[5] & a[4] & !a[3] & !a[2] & !a[1]) | + (d[15]&d[14]&d[13]&d[12]&d[11]&d[10]&d[8]&a[1]) | + (!d[15]&!d[13]&d[11]&a[4]) | + (d[13]&!d[11]&a[4]) | + (!d[12]&!d[11]&a[1]) | + 
(d[15]&!d[11]&a[4]) | + (!d[14]&!d[11]&a[1]) | + (!d[9]&a[5])); + + next_d[12] <= !(( a[8] & a[7] & !a[6] & a[5] & a[4] & !a[3] & !a[2] & !a[1]) | + (d[15]&d[14]&d[13]&d[12]&d[10]&d[8]&a[1]) | + (!d[13]&!d[10]&a[1]) | + (!d[15]&d[13]&a[4]) | + (!d[13]&!d[12]&a[1]) | + (d[15]&!d[13]&a[4]) | + (!d[14]&!d[13]&a[1]) | + (!d[11]&a[5])); + + next_d[11] <= !(( a[8] & a[7] & !a[6] & a[5] & a[4] & !a[3] & !a[2] & !a[1]) | + (d[15]&d[14]&d[12]&d[10]&d[8]&a[1]) | + (!d[15]&!d[8]&a[1]) | + (!d[15]&!d[10]&a[1]) | + (!d[15]&!d[12]&a[1]) | + (!d[15]&!d[14]&a[1]) | + (d[15]&a[4]) | + (!d[13]&a[5])); + + next_d[10] <= !(( a[8] & a[7] & !a[6] & a[5] & a[4] & !a[3] & !a[2] & !a[1]) | + (d[15]&d[14]&d[13]&d[12]&d[11]&d[10]&d[9]&d[8]&a[1]) | + (!d[15]&!d[13]&!d[11]&d[9]&a[4]) | + (d[11]&!d[9]&a[4]) | + (d[13]&!d[9]&a[4]) | + (d[15]&!d[9]&a[4]) | + (!d[14]&!d[9]&a[1]) | + (!d[14]&a[5])); + + next_d[9] <= !(( a[8] & a[7] & !a[6] & a[5] & a[4] & !a[3] & !a[2] & !a[1]) | + (!d[15]&d[14]&!d[13]&!d[11]&!d[9]&a[4]) | + (!d[14]&d[9]&a[4]) | + (!d[14]&d[11]&a[4]) | + (!d[14]&d[13]&a[4]) | + (d[15]&!d[14]&a[4]) | + (d[14]&a[1]) | + (!d[12]&a[5])); + + next_d[8] <= !(( a[8] & a[7] & !a[6] & a[5] & a[4] & !a[3] & !a[2] & !a[1]) | + (!d[15]&!d[14]&!d[13]&d[12]&!d[11]&!d[9]&a[4]) | + (d[14]&d[12]&a[1]) | + (!d[12]&d[11]&a[4]) | + (d[13]&!d[12]&a[4]) | + (d[15]&!d[12]&a[4]) | + (!d[14]&!d[12]&a[1]) | + (!d[10]&a[5])); + + + end +end + +always @(posedge clk) + d <= next_d; endmodule \ No newline at end of file diff --git a/cores/mist/mfp.v b/cores/mist/mfp.v index 06ea3de..a09e124 100644 --- a/cores/mist/mfp.v +++ b/cores/mist/mfp.v @@ -136,83 +136,54 @@ reg [7:0] aer, ddr, gpip; reg [15:0] ipr, ier, imr, isr; // interrupt registers reg [7:0] vr; -// any pending and not masked interrupt causes the irq line to go high -// if highest_irq_pending != higest_irq_active then there's a high prio -// irq in service and no irq is generated until this one is finished -//assign irq = ((ipr & imr) != 16'h0000) && 
(highest_irq_active == highest_irq_pending); - -// handle pending and in service irqs -//wire [15:0] irq_active_map = (ipr | isr) & imr; - -// (i am pretty sure this can be done much more elegant ...) -// check the number of the highest active irq -//wire [3:0] highest_irq_active= -// ( irq_active_map[15] == 1'b1)?4'd15: -// ((irq_active_map[15:14] == 2'b01)?4'd14: -// ((irq_active_map[15:13] == 3'b001)?4'd13: -// ((irq_active_map[15:12] == 4'b0001)?4'd12: -// ((irq_active_map[15:11] == 5'b00001)?4'd11: -// ((irq_active_map[15:10] == 6'b000001)?4'd10: -// ((irq_active_map[15:9] == 7'b0000001)?4'd9: -// ((irq_active_map[15:8] == 8'b00000001)?4'd8: -// ((irq_active_map[15:7] == 9'b000000001)?4'd7: -// ((irq_active_map[15:6] == 10'b000000001)?4'd6: -// ((irq_active_map[15:5] == 11'b0000000001)?4'd5: -// ((irq_active_map[15:4] == 12'b00000000001)?4'd4: -// ((irq_active_map[15:3] == 13'b000000000001)?4'd3: -// ((irq_active_map[15:2] == 14'b0000000000001)?4'd2: -// ((irq_active_map[15:1] == 15'b00000000000001)?4'd1: -// ((irq_active_map[15:0] == 16'b000000000000001)?4'd0: -// 4'd0))))))))))))))); - // generate irq signal if an irq is pending and no other irq of same or higher prio is in service -assign irq = ((ipr & imr) != 16'h0000) && ((isr == 16'h0000) || (highest_irq_pending > irq_in_service)); +assign irq = ((ipr & imr) != 16'h0000) && (highest_irq_pending >= irq_in_service); // check number of current interrupt in service wire [3:0] irq_in_service = - ( isr[15] == 1'b1)?4'd15: - ((isr[15:14] == 2'b01)?4'd14: - ((isr[15:13] == 3'b001)?4'd13: - ((isr[15:12] == 4'b0001)?4'd12: - ((isr[15:11] == 5'b00001)?4'd11: - ((isr[15:10] == 6'b000001)?4'd10: - ((isr[15:9] == 7'b0000001)?4'd9: - ((isr[15:8] == 8'b00000001)?4'd8: - ((isr[15:7] == 9'b000000001)?4'd7: - ((isr[15:6] == 10'b000000001)?4'd6: - ((isr[15:5] == 11'b0000000001)?4'd5: - ((isr[15:4] == 12'b00000000001)?4'd4: - ((isr[15:3] == 13'b000000000001)?4'd3: - ((isr[15:2] == 14'b0000000000001)?4'd2: - ((isr[15:1] == 
15'b00000000000001)?4'd1: - ((isr[15:0] == 16'b000000000000001)?4'd0: - 4'd0))))))))))))))); + (isr[15] == 1'b1)?4'd15: + (isr[15:14] == 2'b1)?4'd14: + (isr[15:13] == 3'b1)?4'd13: + (isr[15:12] == 4'b1)?4'd12: + (isr[15:11] == 5'b1)?4'd11: + (isr[15:10] == 6'b1)?4'd10: + (isr[15:9] == 7'b1)?4'd9: + (isr[15:8] == 8'b1)?4'd8: + (isr[15:7] == 9'b1)?4'd7: + (isr[15:6] == 10'b1)?4'd6: + (isr[15:5] == 11'b1)?4'd5: + (isr[15:4] == 12'b1)?4'd4: + (isr[15:3] == 13'b1)?4'd3: + (isr[15:2] == 14'b1)?4'd2: + (isr[15:1] == 15'b1)?4'd1: + (isr[15:0] == 16'b1)?4'd0: + 4'd0; wire [15:0] irq_pending_map = ipr & imr; // check the number of the highest pending irq wire [3:0] highest_irq_pending = - ( irq_pending_map[15] == 1'b1)?4'd15: - ((irq_pending_map[15:14] == 2'b01)?4'd14: - ((irq_pending_map[15:13] == 3'b001)?4'd13: - ((irq_pending_map[15:12] == 4'b0001)?4'd12: - ((irq_pending_map[15:11] == 5'b00001)?4'd11: - ((irq_pending_map[15:10] == 6'b000001)?4'd10: - ((irq_pending_map[15:9] == 7'b0000001)?4'd9: - ((irq_pending_map[15:8] == 8'b00000001)?4'd8: - ((irq_pending_map[15:7] == 9'b000000001)?4'd7: - ((irq_pending_map[15:6] == 10'b000000001)?4'd6: - ((irq_pending_map[15:5] == 11'b0000000001)?4'd5: - ((irq_pending_map[15:4] == 12'b00000000001)?4'd4: - ((irq_pending_map[15:3] == 13'b000000000001)?4'd3: - ((irq_pending_map[15:2] == 14'b0000000000001)?4'd2: - ((irq_pending_map[15:1] == 15'b00000000000001)?4'd1: - ((irq_pending_map[15:0] == 16'b000000000000001)?4'd0: - 4'd0))))))))))))))); + (irq_pending_map[15] == 1'b1)?4'd15: + (irq_pending_map[15:14] == 2'b1)?4'd14: + (irq_pending_map[15:13] == 3'b1)?4'd13: + (irq_pending_map[15:12] == 4'b1)?4'd12: + (irq_pending_map[15:11] == 5'b1)?4'd11: + (irq_pending_map[15:10] == 6'b1)?4'd10: + (irq_pending_map[15:9] == 7'b1)?4'd9: + (irq_pending_map[15:8] == 8'b1)?4'd8: + (irq_pending_map[15:7] == 9'b1)?4'd7: + (irq_pending_map[15:6] == 10'b1)?4'd6: + (irq_pending_map[15:5] == 11'b1)?4'd5: + (irq_pending_map[15:4] == 12'b1)?4'd4: + 
(irq_pending_map[15:3] == 13'b1)?4'd3: + (irq_pending_map[15:2] == 14'b1)?4'd2: + (irq_pending_map[15:1] == 15'b1)?4'd1: + (irq_pending_map[15:0] == 16'b1)?4'd0: + 4'd0; // gpip as output to the cpu (ddr bit == 1 -> gpip pin is output) wire [7:0] gpip_cpu_out = (i & ~ddr) | (gpip & ddr); - + // cpu read interface always @(iack, sel, ds, rw, addr, gpip_cpu_out, aer, ddr, ier, ipr, isr, imr, vr, serial_data_out_fifo_full, timera_dat_o, timerb_dat_o, @@ -234,7 +205,7 @@ always @(iack, sel, ds, rw, addr, gpip_cpu_out, aer, ddr, ier, ipr, isr, imr, if(addr == 5'h08) dout = isr[7:0]; if(addr == 5'h0a) dout = imr[7:0]; if(addr == 5'h0b) dout = vr; - + // timers if(addr == 5'h0c) dout = { 3'b000, timera_ctrl_o}; if(addr == 5'h0d) dout = { 3'b000, timerb_ctrl_o}; @@ -248,36 +219,41 @@ always @(iack, sel, ds, rw, addr, gpip_cpu_out, aer, ddr, ier, ipr, isr, imr, if(addr == 5'h16) dout = serial_data_out_fifo_full?8'h00:8'h80; end else if(iack) begin - dout = { vr[7:4], highest_irq_pending }; + dout = irq_vec; end end -// delay inputs to detect changes -reg [7:0] iD, iD2; - // mask of input irqs which are overwritten by timer a/b inputs wire [7:0] ti_irq_mask = { 3'b000, pulse_mode, 3'b000}; wire [7:0] ti_irq = { 3'b000, t_i[0], t_i[1], 3'b000}; +// delay inputs to detect changes +reg [7:0] iD, iD2; reg iackD; -always @(posedge clk) begin + +// latch to keep irq vector stable during irq ack cycle +reg [7:0] irq_vec; + +always @(negedge clk) begin iackD <= iack; + // update the irq vector periodically unless we are in the + // middle of an interrupt acknowledge phase + if(!iack) + irq_vec <= { vr[7:4], highest_irq_pending }; + // delay inputs for irq generation, apply aer (irq edge) iD <= aer ^ ((i & ~ti_irq_mask) | (ti_irq & ti_irq_mask)); iD2 <= iD; -end - -always @(negedge clk) begin if(reset) begin ipr <= 16'h0000; ier <= 16'h0000; imr <= 16'h0000; isr <= 16'h0000; writePout <= 0; - end else begin - + end else begin + // ack pending irqs and set isr if enabled - if(iackD) 
begin + if(iack && !iackD) begin // remove active bit from ipr ipr[highest_irq_pending] <= 1'b0; @@ -319,6 +295,7 @@ always @(negedge clk) begin if(addr == 5'h06) ipr[7:0] <= ipr[7:0] & din; if(addr == 5'h08) isr[7:0] <= isr[7:0] & din; // zero bits are cleared + if(addr == 5'h0a) imr[7:0] <= din; if(addr == 5'h0b) vr <= din; diff --git a/cores/mist/mfp_timer.v b/cores/mist/mfp_timer.v index e2447c7..6bc245f 100644 --- a/cores/mist/mfp_timer.v +++ b/cores/mist/mfp_timer.v @@ -57,11 +57,19 @@ always @(posedge XCLK_I) begin prescaler_counter <= prescaler_counter + 8'd1; end end - + +// pulse is generate in rising edge and detected in main mfp on falling edge +always @(posedge CLK) begin + T_O_PULSE <= 1'b0; + + if (!RST && count && (down_counter === 8'd1)) + T_O_PULSE <= 1'b1; +end + always @(negedge CLK) begin if (RST === 1'b1) begin - T_O_PULSE <= 1'b0; +// T_O_PULSE <= 1'b0; T_O <= 1'b0; control <= 4'd0; data <= 8'd0; @@ -76,7 +84,7 @@ always @(negedge CLK) begin xclk_r <= (prescaler_counter === 8'd0); xclk_r2 <= xclk_r; - T_O_PULSE <= 1'b0; +// T_O_PULSE <= 1'b0; // if a write request comes from the main unit // then write the data to the appropriate register. 
@@ -118,7 +126,7 @@ always @(negedge CLK) begin // pulse the timer out T_O <= ~T_O; down_counter <= data; - T_O_PULSE <= 1'b1; +// T_O_PULSE <= 1'b1; end else begin diff --git a/cores/mist/mist_top.v b/cores/mist/mist_top.v index a384d2f..51f8cae 100644 --- a/cores/mist/mist_top.v +++ b/cores/mist/mist_top.v @@ -1,16 +1,15 @@ /********************************************/ /* */ /********************************************/ - - + module mist_top ( // clock inputsxque input wire [ 2-1:0] CLOCK_27, // 27 MHz // LED outputs output wire LED, // LED Yellow // UART - output wire UART_TX, // UART Transmitter - input wire UART_RX, // UART Receiver + output wire UART_TX, // UART Transmitter (MIDI out) + input wire UART_RX, // UART Receiver (MIDI in) // VGA output wire VGA_HS, // VGA H_SYNC output wire VGA_VS, // VGA V_SYNC @@ -65,7 +64,7 @@ wire io_dtack = vreg_sel || mmu_sel || mfp_sel || mfp_iack || // required to properly detect that a blitter is not present. // a bus error is now generated once no dtack is seen for 63 clock cycles. wire tg68_clr_berr; -wire tg68_berr = (dtack_timeout == 3'd7); +wire tg68_berr = (dtack_timeout == 4'd15); // count bus errors for debugging purposes. 
we can thus trigger for a // certain bus error @@ -86,27 +85,35 @@ always @(posedge clk_8) begin end end -reg [2:0] dtack_timeout; +reg bus_ok, cpu_cycle_L; +always @(negedge clk_8) begin + // bus error if cpu owns bus, but no dtack, nor ram access, + // nor fast cpu cycle + bus_ok <= tg68_dtack || br || cpu2mem || cpu_fast_cycle; + cpu_cycle_L <= cpu_cycle; +end + +reg [3:0] dtack_timeout; always @(posedge clk_8) begin if(reset || tg68_clr_berr) begin - dtack_timeout <= 3'd0; + dtack_timeout <= 4'd0; end else begin - if(cpu_cycle) begin + if(cpu_cycle_L) begin // timeout only when cpu owns the bus and when // neither dtack nor another bus master are active - // also cacheable areas should never generate a - // bus error (TODO: check for write on first eight words) - if(dtack_timeout != 3'd7) begin - if(!tg68_dtack || br || tg68_as || cacheable) - dtack_timeout <= 3'd0; + // also ram areas should never generate a + // bus error for reading. But rom does for writing + if(dtack_timeout != 4'd15) begin + if(bus_ok) + dtack_timeout <= 4'd0; else - dtack_timeout <= dtack_timeout + 3'd1; + dtack_timeout <= dtack_timeout + 4'd1; end end end end - + // no tristate busses exist inside the FPGA. so bus request doesn't do // much more than halting the cpu by suppressing dtack `define BRWIRE @@ -120,21 +127,20 @@ always @(negedge clk_8) `endif // request interrupt ack from mfp for IPL == 6 -wire mfp_iack = cpu_cycle && cpu2iack && address_strobe && (tg68_adr[3:1] == 3'b110); +wire mfp_iack = cpu_cycle && cpu2iack && tg68_as && (tg68_adr[3:1] == 3'b110); -// the tg68k core with the wrapper of the minimig doesn't support non-autovector -// interrupts. Also the existing support for them inside the tg68 kernel is/was broken. -// For the atari i've fixed the non-autovector support inside the kernel and switched -// entirely to non-autovector interrupts. 
This means that i now have to provide +// the tg68k core doesn't reliably support mixed usage of autovector and non-autovector +// interrupts. +// For the atari we've fixed the non-autovector support inside the kernel and switched +// entirely to non-autovector interrupts. This means that we now have to provide // the vectors for those interrupts that oin the ST are autovector ones. This needs // to be done for IPL2 (hbi) and IPL4 (vbi) -wire auto_iack = cpu_cycle && cpu2iack && address_strobe && - ((tg68_adr[3:1] == 3'b100) || (tg68_adr[3:1] == 3'b010)); +wire auto_iack = cpu_cycle && cpu2iack && tg68_as && + ((tg68_adr[3:1] == 3'b100) || (tg68_adr[3:1] == 3'b010)); wire [7:0] auto_vector_vbi = (auto_iack && (tg68_adr[3:1] == 3'b100))?8'h1c:8'h00; wire [7:0] auto_vector_hbi = (auto_iack && (tg68_adr[3:1] == 3'b010))?8'h1a:8'h00; - wire [7:0] auto_vector = auto_vector_vbi | auto_vector_hbi; - + // interfaces not implemented: // $fff00000 - $fff000ff - IDE // $ffff8780 - $ffff878f - SCSI @@ -142,10 +148,10 @@ wire [7:0] auto_vector = auto_vector_vbi | auto_vector_hbi; // $fffffc20 - $fffffc3f - RTC // $ffff8e00 - $ffff8e0f - VME (only fake implementation) -wire io_sel = cpu_cycle && cpu2io && address_strobe ; +wire io_sel = cpu_cycle && cpu2io && tg68_as ; // dongle interface at $fb0000 - $fbffff -wire dongle_sel = dongle_present && cpu_cycle && address_strobe && tg68_rw && (tg68_adr[23:16] == 8'hfb); +wire dongle_sel = dongle_present && cpu_cycle && tg68_as && tg68_rw && (tg68_adr[23:16] == 8'hfb); wire [7:0] dongle_data_out; // mmu 8 bit interface at $ff8000 - $ff8001 @@ -176,7 +182,7 @@ wire [15:0] ste_dma_snd_data_out; // mfp 8 bit interface at $fffa00 - $fffa3f wire mfp_sel = io_sel && ({tg68_adr[15:6], 6'd0} == 16'hfa00); wire [7:0] mfp_data_out; - + // acia 8 bit interface at $fffc00 - $fffc07 wire acia_sel = io_sel && ({tg68_adr[15:8], 8'd0} == 16'hfc00); wire [7:0] acia_data_out; @@ -203,9 +209,9 @@ wire [15:0] io_data_out = vreg_data_out | dma_data_out | 
blitter_data_out | wire init = ~pll_locked; video video ( - .clk (clk_32 ), + .clk (clk_32 ), .clk27 (CLOCK_27[0]), - .bus_cycle (bus_cycle ), + .bus_cycle (bus_cycle ), // spi for OSD .sdi (SPI_DI ), @@ -337,12 +343,27 @@ acia acia ( wire [23:1] blitter_master_addr; wire blitter_master_write; wire blitter_master_read; -wire blitter_br; wire blitter_irq; wire [15:0] blitter_master_data_out; +wire blitter_br = blitter_br_out; +wire blitter_bg = 1'b1; + +//wire blitter_bg = blitter_br; + +//reg blitter_br; +//always @(posedge clk_128) begin +// if(blitter_br_out && (tg68_busstate == 2'd0)) +// blitter_br <= 1'b1; + +// else if(!blitter_br_out) +// blitter_br <= 1'b0; +//end + +wire blitter_br_out; + blitter blitter ( - .bus_cycle (bus_cycle_8 ), + .bus_cycle (bus_cycle ), // cpu interface .clk (clk_8 ), @@ -362,7 +383,8 @@ blitter blitter ( .bm_data_in (ram_data_out), .br_in (data_io_br ), - .br_out (blitter_br ), + .br_out (blitter_br_out ), + .bg (blitter_bg ), .irq (blitter_irq ), .turbo (steroids ) @@ -386,7 +408,7 @@ dongle dongle ( .sel (dongle_sel ), .present (dongle_present), .addr (tg68_adr[15:1]), - .cpu_as (address_strobe), + .cpu_as (cpu_cycle && tg68_as && !br), .uds (tg68_uds ), .rw (tg68_rw ), .dout (dongle_data_out) @@ -552,26 +574,22 @@ clock clock ( ); //// 8MHz clock //// -wire [3:0] bus_cycle; reg [3:0] clk_cnt; -reg [1:0] bus_cycle_8; - +reg [1:0] bus_cycle; always @ (posedge clk_32, negedge pll_locked) begin - if (!pll_locked) begin clk_cnt <= #1 4'b0010; - bus_cycle_8 <= 2'd3; + bus_cycle <= 2'd0; end else begin clk_cnt <= #1 clk_cnt + 4'd1; - if(clk_cnt[1:0] == 2'd2) begin - bus_cycle_8 <= bus_cycle_8 + 2'd1; + if(clk_cnt[1:0] == 2'd1) begin + bus_cycle <= bus_cycle + 2'd1; end end end assign clk_8 = clk_cnt[1]; -assign bus_cycle = clk_cnt - 4'd2; // bus cycle counter for debugging reg [31:0] cycle_counter /* synthesis noprune */; @@ -583,62 +601,55 @@ always @ (posedge clk_8) begin end -// tg68 +// tg68 bus interface. 
These are the signals which are latched +// for the 8MHz bus. wire [15:0] tg68_dat_in; reg [15:0] tg68_dat_out; reg [31:0] tg68_adr; wire [2:0] tg68_IPL; wire tg68_dtack; -reg tg68_as; reg tg68_uds; reg tg68_lds; reg tg68_rw; +reg [2:0] tg68_fc; wire reset = system_ctrl[0]; // ------------- generate VBI (IPL = 4) -------------- -wire vbi_ack; -assign vbi_ack = cpu2iack && address_strobe && (tg68_adr[3:1] == 3'b100); +reg vsD, vsD2, vbi; +wire vbi_ack = cpu2iack && cpu_cycle && tg68_as && (tg68_adr[3:1] == 3'b100); -reg vsD, vsD2, vsI, vbi; -always @(negedge clk_8) +always @(negedge clk_8) begin vsD <= st_vs; - -always @(posedge clk_8) begin vsD2 <= vsD; // delay by one - vsI <= vsD && !vsD2; // create single event if(reset || vbi_ack) vbi <= 1'b0; - else if(vsI) + else if(vsD && !vsD2) vbi <= 1'b1; end // ------------- generate HBI (IPL = 2) -------------- -wire hbi_ack; -assign hbi_ack = cpu2iack && address_strobe && (tg68_adr[3:1] == 3'b010); +reg hsD, hsD2, hbi; +wire hbi_ack = cpu2iack && cpu_cycle && tg68_as && (tg68_adr[3:1] == 3'b010); -reg hsD, hsD2, hsI, hbi; -always @(negedge clk_8) +always @(negedge clk_8) begin hsD <= st_hs; - -always @(posedge clk_8) begin hsD2 <= hsD; // delay by one - hsI <= hsD && !hsD2; // create single event if(reset || hbi_ack) hbi <= 1'b0; - else if(hsI) + else if(hsD && !hsD2) hbi <= 1'b1; end wire mfp_irq; -reg [2:0] ipl; -always @(posedge clk_8) begin - if(reset) begin - ipl <= 3'b111; - end else begin +reg [2:0] ipl; +always @(posedge clk_128) begin + if(reset) + ipl <= 3'b111; + else begin // ipl[0] is tied high on the atari if(mfp_irq) ipl <= 3'b001; // mfp has IPL 6 else if(vbi) ipl <= 3'b011; // vbi has IPL 4 @@ -651,19 +662,14 @@ end /* ------------------------------ TG68 CPU interface ---------------------- */ /* -------------------------------------------------------------------------- */ -// signal indicating that the cpu is making use of the current 8mhz cycle -// this means that the cpu owns the bus and either a 
normal bus cycle -// ends or a bus error has happened. -wire cpu_uses_8mhz_cycle = cpu_cycle && !br && (!tg68_dtack || tg68_berr || (tg68_busstate == 2'b01)); - -wire cpu_req_bus = !(tg68_busstate == 2'b01); - // the 128 Mhz cpu clock is gated by clkena. Since the CPU cannot run at full 128MHz // speed a certain amount of idle cycles have to be inserted between two subsequent // cpu clocks. This idle time is implemented using the cpu_throttle counter. reg [3:0] cpu_throttle; reg clkena; +reg iCacheStore; +reg dCacheStore; reg cacheUpdate; reg cacheRead; reg cpuDoes8MhzCycle; @@ -672,16 +678,16 @@ reg cpuDoes8MhzCycle; // requirements wire [15:0] tg68_dat_out_S; wire [31:0] tg68_adr_S; +wire [2:0] tg68_fc_S; wire tg68_uds_S; wire tg68_lds_S; wire tg68_rw_S; - -reg address_strobe; // should be "cpu_active" oe similar +reg tg68_as; +reg cpu_fast_cycle; // signal indicating that the cpu runs from cache always @(posedge clk_8) begin // tg68 core does not provide a as signal, so we generate it - tg68_as <= ~(tg68_busstate != 2'b01); - address_strobe <= cpu_cycle_is_next && !tg68_as && !br; + tg68_as <= (tg68_busstate != 2'b01) && !br; // all other output signals are simply latched to make sure // they don't change within a 8Mhz cycle even if the CPU @@ -692,6 +698,7 @@ always @(posedge clk_8) begin tg68_uds <= tg68_uds_S; tg68_lds <= tg68_lds_S; tg68_rw <= tg68_rw_S; + tg68_fc <= tg68_fc_S; end // the CPU throttle counter limits the CPU speed to a rate the tg68 core can @@ -701,7 +708,7 @@ end localparam CPU_THROTTLE = 4'd5; reg [3:0] clkcnt; -reg [15:0] cacheReadLatch; +reg trigger /* synthesis noprune */; always @(posedge clk_128) begin // count 0..15 within a 8MHz cycle @@ -712,44 +719,69 @@ always @(posedge clk_128) begin // default: cpu does not run clkena <= 1'b0; + iCacheStore <= 1'b0; + dCacheStore <= 1'b0; cacheUpdate <= 1'b0; + trigger <= 1'b0; - // assume the cpu uses the following 8 Mhz cycles - if(clkcnt == 15) + // cpuDoes8MhzCycle has same timing as 
tg68_as + if(clkcnt == 15) begin cpuDoes8MhzCycle <= 1'b1; + cpu_fast_cycle <= 1'b0; + end + + // evaluate cache one cycle before cpu is allowed to access the bus again + // to make sure cache signals are routed to the cpu if the cpu is supposed + // to use it + if(cpu_throttle == 4'd1) // tg68_busstate[0] == 0 -> cpu read access + if(!br && steroids && (tg68_busstate[0] == 1'b0) && cache_hit) + cacheRead <= 1'b1; + + if(clkena) + cacheRead <= 1'b0; // only run cpu if throttle counter has run down if((cpu_throttle == 4'd0) && !reset) begin - cacheRead <= 1'b0; - + // cpu does internal processing -> let it do this immediately - // don't let this happen in the cpu cycle as this may result in a - // read/write state which suddenly happens right in the middle of - // the ongoing cpu cycle - if(tg68_busstate == 2'b01) begin + // cpu wants to read and the requested data is available from the cache -> run immediately + if((tg68_busstate == 2'b01) || cacheRead) begin clkena <= 1'b1; cpu_throttle <= CPU_THROTTLE; - cpuDoes8MhzCycle <= 1'b0; - end else if( !br && steroids && (tg68_busstate == 2'b00) && cache_hit && cacheable) begin - clkena <= 1'b1; - cacheRead <= 1'b1; - cacheReadLatch <= cache_data_out; - cpu_throttle <= CPU_THROTTLE; cpuDoes8MhzCycle <= 1'b0; - end begin + cpu_fast_cycle <= 1'b1; + end else begin // this ends a normal 8MHz bus cycle. This requires that the // cpu/chipset had the entire cycle and not e.g. started just in - // the middle. This is verified using the puDoes8MhzCycle signal - // which is invalidated whenever the cpu uses a - + // the middle. 
This is verified using the cpuDoes8MhzCycle signal + // which is invalidated whenever the cpu uses a internal cycle or + // runs from cache + // clkcnt == 14 -> clkena in cycle 15 -> cpu runs in cycle 15 - if(clkcnt == 13) begin - if(cpu_uses_8mhz_cycle && cpuDoes8MhzCycle) begin - clkena <= 1'b1; - cpu_throttle <= CPU_THROTTLE; - - // update cache on cpu instruction read + if((clkcnt == 13) && cpuDoes8MhzCycle && cpu_cycle && !br && (tg68_dtack || tg68_berr)) begin + clkena <= 1'b1; + cpu_throttle <= CPU_THROTTLE; + cpuDoes8MhzCycle <= 1'b0; + + // ---------- cache debugging --------------- + // if the cache reports a hit, it should be the same data that's also + // returned by ram. Otherwise the cache is broken +// if(cache_hit && (tg68_busstate[0] == 1'b0)) begin +// if(cache_data_out != system_data_out) +// trigger <= 1'b1; +// end + + if(cacheable && tg68_dtack) begin + // store data in instruction cache on cpu instruction read if(tg68_busstate == 2'b00) + iCacheStore <= 1'b1; + + // store data in data cache on cpu data read + if(tg68_busstate == 2'b10) + dCacheStore <= 1'b1; + + // update cache on data write + if(tg68_busstate == 2'b11) cacheUpdate <= 1'b1; end end @@ -758,10 +790,14 @@ always @(posedge clk_128) begin cpu_throttle <= cpu_throttle - 4'd1; end +// TODO: generate cacheUpdate from ram_wr, so other bus masters also trigger this +// same goes for cache lds/uds and the address used for update16 !!!! 
+ wire [1:0] tg68_busstate; // feed data from cache into the cpu -wire [15:0] cpu_data_in = cacheRead?cacheReadLatch:system_data_out; +wire [15:0] cache_data_out = data_cache_hit?data_cache_data_out:inst_cache_data_out; +wire [15:0] cpu_data_in = cacheRead?cache_data_out:system_data_out; TG68KdotC_Kernel #(2,2,2,2,2,2) tg68k ( .clk (clk_128 ), @@ -769,7 +805,7 @@ TG68KdotC_Kernel #(2,2,2,2,2,2) tg68k ( .clkena_in (clkena ), .data_in (cpu_data_in ), .IPL (ipl ), - .IPL_autovector (1'b0 ), + .IPL_autovector(1'b0 ), .berr (tg68_berr ), .clr_berr (tg68_clr_berr ), .CPU (system_ctrl[5:4] ), // 00=68000 @@ -780,13 +816,15 @@ TG68KdotC_Kernel #(2,2,2,2,2,2) tg68k ( .nWr (tg68_rw_S ), .busstate (tg68_busstate ), // 00-> fetch code 10->read data 11->write data 01->no memaccess .nResetOut ( ), - .FC ( ) + .FC (tg68_fc_S ) ); /* ------------------------------------------------------------------------------ */ /* ---------------------------------- cpu cache --------------------------------- */ /* ------------------------------------------------------------------------------ */ +wire cache_hit = cacheable && (data_cache_hit || inst_cache_hit); + // Any type of memory that may use a cache. Since it's a pure read cache we don't // have to differentiate between ram and rom. 
Cartridge is not cached since me might // attach dongles there someday @@ -797,14 +835,11 @@ wire cacheable = ((tg68_adr_S[23:22] == 2'b00) || // ordinary 4MB (tg68_adr_S[23:18] == 6'b111000) || // 256k TOS (tg68_adr_S[23:17] == 7'b1111110) || // first 128k of 192k TOS (tg68_adr_S[23:16] == 8'b11111110) ); // second 64k of 192k TOS - -wire cache_hit /* synthesis keep */; -wire [15:0] cache_data_out /* synthesis keep */; + +wire data_cache_hit; +wire [15:0] data_cache_data_out; -wire tg68_rd = (tg68_busstate != 2'b01) && !tg68_rw_S; -wire tg68_wr = (tg68_busstate != 2'b01) && tg68_rw_S; - -cache cache ( +cache data_cache ( .clk_128 ( clk_128 ), .clk_8 ( clk_8 ), .reset ( reset ), @@ -812,18 +847,47 @@ cache cache ( // use the tg68_*_S signals here to quickly react on cpu requests .addr ( tg68_adr_S[23:1] ), -// .ds ( { tg68_uds_S, tg68_lds_S } ), - .wr ( tg68_wr ), - .rd ( tg68_rd ), + .ds ( { ~tg68_lds_S, ~tg68_uds_S } ), - // at the same time the 8mhz ram access is required to monitor - // cpu accesses to the ram as well as other bus masters - - .hit ( cache_hit ), - .dout ( cache_data_out ), + // the interface to the cpus read interface is pretty simple + .hit ( data_cache_hit ), + .dout ( data_cache_data_out ), - .update64 ( cacheUpdate ), + // interface to update entire cache lines on ram read + .store ( dCacheStore ), .din64 ( ram_data_out_64 ), + + // this is a write through cache. 
Thus the cpus write access to ram + // is not intercepted but only used to update matching cache lines + .update ( cacheUpdate ), + .din16 ( ram_data_in ) +); + +wire inst_cache_hit; +wire [15:0] inst_cache_data_out; + +cache instruction_cache ( + .clk_128 ( clk_128 ), + .clk_8 ( clk_8 ), + .reset ( reset ), + .flush ( br ), + + // use the tg68_*_S signals here to quickly react on cpu requests + .addr ( tg68_adr_S[23:1] ), + .ds ( { ~tg68_lds_S, ~tg68_uds_S } ), + + // the interface to the cpus read interface is pretty simple + .hit ( inst_cache_hit ), + .dout ( inst_cache_data_out ), + + // interface to update entire cache lines on ram read + .store ( iCacheStore ), + .din64 ( ram_data_out_64 ), + + // this is a write through cache. Thus the cpus write access to ram + // is not intercepted but only used to update matching cache lines + .update ( cacheUpdate ), + .din16 ( ram_data_in ) ); /* ------------------------------------------------------------------------------ */ @@ -872,11 +936,11 @@ wire cpu2mem = cpu2ram14 || (tg68_rw && (cpu2tos192k || cpu2tos256k || cpu2cart) // io from 0xff0000 wire cpu2io = (tg68_adr[23:16] == 8'hff); -// irq ack happens on 0xfffffX -wire cpu2iack = (tg68_adr[23:4] == 20'hfffff); +// irq ack happens +wire cpu2iack = (tg68_fc == 3'b111); -// generate dtack (for st ram only and rom, no dtack for rom write) -assign tg68_dtack = ~(((cpu2mem && address_strobe) || io_dtack ) && !br); +// generate dtack (for st ram and rom on read, no dtack for rom write) +assign tg68_dtack = ((cpu2mem && cpu_cycle && tg68_as) || io_dtack ) && !br; /* ------------------------------------------------------------------------------ */ /* ------------------------------- bus multiplexer ------------------------------ */ @@ -888,16 +952,11 @@ wire second_cpu_slot = (mste && enable_16mhz) || steroids; // Two of the four cycles are being used. One for video (+STE audio) and one for // cpu, DMA and Blitter. 
A third is optionally being used for faster CPU -wire video_cycle = (bus_cycle[3:2] == 0); -wire cpu_cycle = (bus_cycle[3:2] == 1) || (second_cpu_slot && (bus_cycle[3:2] == 3)); - -// if things are to be latched is usually required to know what type the next cycle -// will be -wire video_cycle_is_next = (bus_cycle[3:2] == 3); -wire cpu_cycle_is_next = (bus_cycle[3:2] == 0) || (second_cpu_slot && (bus_cycle[3:2] == 2)); +wire video_cycle = (bus_cycle == 0); +wire cpu_cycle = (bus_cycle == 1) || (second_cpu_slot && (bus_cycle == 3)); // ----------------- RAM address -------------- -wire [22:0] video_cycle_addr = (st_hs&&ste)?ste_dma_snd_addr:video_address; +wire [22:0] video_cycle_addr = (st_hs && ste)?ste_dma_snd_addr:video_address; wire [22:0] cpu_cycle_addr = data_io_br?data_io_addr:(blitter_br?blitter_master_addr:tg68_adr[23:1]); wire [22:0] ram_address = video_cycle?video_cycle_addr:cpu_cycle_addr; @@ -905,12 +964,12 @@ wire [22:0] ram_address = video_cycle?video_cycle_addr:cpu_cycle_addr; // memory access during the video cycle is shared between video and ste_dma_snd wire video_cycle_oe = (st_hs && ste)?ste_dma_snd_read:video_read; // memory access during the cpu cycle is shared between blitter and cpu -wire cpu_cycle_oe = data_io_br?data_io_read:(blitter_br?blitter_master_read:(address_strobe && tg68_rw && cpu2mem)); +wire cpu_cycle_oe = data_io_br?data_io_read:(blitter_br?blitter_master_read:(cpu_cycle && tg68_as && tg68_rw && cpu2mem)); wire ram_oe = video_cycle?video_cycle_oe:(cpu_cycle?cpu_cycle_oe:1'b0); // ----------------- RAM write ----------------- wire video_cycle_wr = 1'b0; -wire cpu_cycle_wr = data_io_br?data_io_write:(blitter_br?blitter_master_write:(address_strobe && ~tg68_rw && cpu2ram)); +wire cpu_cycle_wr = data_io_br?data_io_write:(blitter_br?blitter_master_write:(cpu_cycle && tg68_as && ~tg68_rw && cpu2ram)); wire ram_wr = video_cycle?video_cycle_wr:(cpu_cycle?cpu_cycle_wr:1'b0); wire [15:0] ram_data_out; @@ -921,27 +980,19 @@ wire [15:0] 
ram_data_in = data_io_br?data_io_dout:(blitter_br?blitter_master_dat wire ram_uds = video_cycle?1'b1:((blitter_br||data_io_br)?1'b1:~tg68_uds); wire ram_lds = video_cycle?1'b1:((blitter_br||data_io_br)?1'b1:~tg68_lds); -assign SDRAM_CKE = 1'b1; - // sdram controller has 64 bit output wire [63:0] ram_data_out_64; -// latch lowest address bits for 64 bit word decomposition at the -// begin of a cpu cycle -//reg [1:0] ram_word_sel; -//always @(posedge clk_8) -// ram_word_sel <= cpu_cycle_addr[1:0]; - -wire [1:0] ram_word_sel = cpu_cycle_addr[1:0] /* synthesis keep */; - // select right word of 64 bit ram output for those devices that only want 16 bits // this is only used for the cpu and other bus masters opoerating within the cpu // cycle but neither video nor dma audio use it. They are both fed with 64 bits assign ram_data_out = - (ram_word_sel == 2'd0)?ram_data_out_64[15:0]: - ((ram_word_sel == 2'd1)?ram_data_out_64[31:16]: - ((ram_word_sel == 2'd2)?ram_data_out_64[47:32]: - ram_data_out_64[63:48])); + (cpu_cycle_addr[1:0] == 2'd0)?ram_data_out_64[15:0]: + ((cpu_cycle_addr[1:0] == 2'd1)?ram_data_out_64[31:16]: + ((cpu_cycle_addr[1:0] == 2'd2)?ram_data_out_64[47:32]: + ram_data_out_64[63:48])); + +assign SDRAM_CKE = 1'b1; sdram sdram ( // interface to the MT48LC16M16 chip @@ -1020,12 +1071,12 @@ wire [22:0] data_io_addr; wire [15:0] data_io_dout; wire data_io_write, data_io_read; wire data_io_br; - + data_io data_io ( // system control .clk_8 (clk_8 ), .reset (init ), - .bus_cycle (bus_cycle[3:2]), + .bus_cycle (bus_cycle ), .ctrl_out (system_ctrl ), // spi diff --git a/cores/mist/sdram.v b/cores/mist/sdram.v index 2f55b51..6e0e848 100644 --- a/cores/mist/sdram.v +++ b/cores/mist/sdram.v @@ -155,7 +155,11 @@ always @(posedge clk_128) begin sd_cmd <= CMD_ACTIVE; sd_addr <= { 1'b0, addr[19:8] }; sd_ba <= addr[21:20]; - sd_dqm <= ~ds; + + // always return both bytes in a read. 
The cpu may not + // need it, but the caches need to be able to store everything + if(!we) sd_dqm <= 2'b00; + else sd_dqm <= ~ds; // lowest address for burst read burst_addr <= addr[1:0]; diff --git a/cores/mist/ste_dma_snd.v b/cores/mist/ste_dma_snd.v index 775812f..7142533 100644 --- a/cores/mist/ste_dma_snd.v +++ b/cores/mist/ste_dma_snd.v @@ -35,7 +35,7 @@ module ste_dma_snd ( // memory interface input clk32, // 31.875 MHz - input [3:0] bus_cycle, // bus-cycle + input [1:0] bus_cycle, // bus-cycle input hsync, // to synchronize with video output read, output [22:0] saddr, @@ -48,9 +48,28 @@ module ste_dma_snd ( output xsint, output xsint_d ); +// --------------------------------------------------------------------------- +// --------------------------- internal state counter ------------------------ +// --------------------------------------------------------------------------- + +reg [1:0] t /* synthesis noprune */ ; +always @(posedge clk32) begin + // 32Mhz counter synchronous to 8 Mhz clock + // force counter to pass state 0 exactly after the rising edge of clk (8Mhz) + if(((t == 2'd3) && ( clk == 0)) || + ((t == 2'd0) && ( clk == 1)) || + ((t != 2'd3) && (t != 2'd0))) + t <= t + 2'd1; +end + +// create internal bus_cycle signal which is stable on the positive clock +// edge and extends the previous state by half a 32 Mhz clock cycle +reg [3:0] bus_cycle_L; +always @(negedge clk32) + bus_cycle_L <= { bus_cycle, t }; assign saddr = snd_adr; // drive data -assign read = (bus_cycle[3:2] == 0) && hsync && !fifo_full && dma_enable; +assign read = (bus_cycle == 0) && hsync && !fifo_full && dma_enable; // --------------------------------------------------------------------------- // ------------------------------ clock generation --------------------------- @@ -322,7 +341,8 @@ always @(posedge clk32) begin frame_done <= (snd_adr == snd_end_latched-23'd1); // fifo not full? 
read something during hsync using the video cycle - if((!fifo_full) && hsync && (bus_cycle == 3)) begin + // bus_cycle_L = 3 is the end of the video cycle + if((!fifo_full) && hsync && (bus_cycle_L == 3)) begin if(snd_adr != snd_end_latched) begin // read right word from ram using the 64 bit memory interface diff --git a/cores/mist/video.v b/cores/mist/video.v index 325a7ae..4918014 100644 --- a/cores/mist/video.v +++ b/cores/mist/video.v @@ -20,15 +20,7 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . -// TODO: -// - async timing - -// Overscan: -// http://codercorner.com/fullscrn.txt -// Examples: automation 000 + 001 + 097: bottom border -// automation 196: top + bottom border - -// Todo STE: +// Implemented STE features // http://alive.atari.org/alive12/ste_hwsc.php // http://atari-ste.anvil-soft.com/html/devdocu2.htm // + 3*4 bit palette (4096 colors) @@ -36,13 +28,15 @@ // + video counter writeable // + pixel offset // + line offset -// - undocumented 16 pixel "line offset overscan" +// + botton overscan +// + top overscan (this is really an unreliable hack which barely works for obsession pinball) +// + undocumented 16 pixel "line offset overscan" module video ( // system interface input clk, // 31.875 MHz input clk27, // 27.000 Mhz - input [3:0] bus_cycle, // bus-cycle for sync + input [1:0] bus_cycle, // bus-cycle for sync // SPI interface for OSD input sck, @@ -79,9 +73,9 @@ module video ( input ste, // enable STE featurss // signals not affected by scan doubler for internal use like irqs - output st_de, - output st_vs, - output st_hs + output st_de, + output reg st_vs, + output reg st_hs ); localparam LINE_WIDTH = 10'd640; @@ -92,22 +86,71 @@ localparam STATE_BLANK = 2'd1; localparam STATE_BORDER = 2'd2; localparam STATE_DISP = 2'd3; +// --------------------------------------------------------------------------- +// --------------------------- internal state counter ------------------------ 
+// --------------------------------------------------------------------------- + +reg [1:0] t; +always @(posedge clk) begin + // 32Mhz counter synchronous to 8 Mhz clock + // force counter to pass state 0 exactly after the rising edge of clk_reg (8Mhz) + if(((t == 2'd3) && ( reg_clk == 0)) || + ((t == 2'd0) && ( reg_clk == 1)) || + ((t != 2'd3) && (t != 2'd0))) + t <= t + 2'd1; +end + +// create internal bus_cycle signal which is stable on the positive clock +// edge and extends the previous state by half a 32 Mhz clock cycle +reg [3:0] bus_cycle_L; +always @(negedge clk) + bus_cycle_L <= { bus_cycle, t }; + // --------------------------------------------------------------------------- // ------------------------------ internal signals --------------------------- // --------------------------------------------------------------------------- // st_de is the internal display enable signal as used by the mfp. This is used // by software to generate a line interrupt and to e.g. do 512 color effects. -// st_de is active low. Using memory enable (me) for this makes sure the cpu has +// st_de is active low. Using display enable (de) for this makes sure the cpu has // plenty of time before data for the next line is starting to be fetched -assign st_de = ~me; +assign st_de = ~de; -// hsync irq is generated at the rising edge of st_hs -assign st_hs = (st_h_state == STATE_SYNC); +always @(posedge clk) begin -// vsync irq is generated at the rising edge of st_vs -assign st_vs = (v_state == STATE_SYNC); + // hsync irq is generated after the rightmost border pixel column has been displayed + // Run st timing at full speed if no scan doubler is being used. 
Otherwise run + // it at half speed + if((!scan_doubler_enable) || vga_hcnt[0]) begin + + // hsync starts at begin of blanking phase + if(st_hcnt == (t1_h_blank_right - memory_prefetch)) + st_hs <= 1'b1; + + // hsync ends at begin of left border + if(st_hcnt == (t4_h_border_left - memory_prefetch)) + st_hs <= 1'b0; + end + + // vsync irq is generated right after the last border line has been displayed + + // TODO: check where these additional -10'd2 come from. Obsession pinball + // needs this to get the colors right. This means it's needed for the correct + // relationship between vbi and hbi. But why? + + // v_event is the begin of hsync. The hatari video.h says vbi happens 64 clock cycles + // ST hor counter runs at 16Mhz, thus the trigger is 128 events after h_sync + // xyz + if(st_h_active && (st_hcnt == (v_event))) begin + // vsync starts at begin of blanking phase + if(vcnt == t7_v_blank_bot - de_v_offset - 10'd2) st_vs <= 1'b1; + + // vsync ends at begin of top border + if(vcnt == t10_v_border_top - de_v_offset - 10'd2) st_vs <= 1'b0; + end +end + // --------------------------------------------------------------------------- // -------------------------------- video mode ------------------------------- // --------------------------------------------------------------------------- @@ -143,15 +186,13 @@ wire [9:0] t4_h_border_left = config_string[80:71]; wire [9:0] t5_h_end = config_string[70:61]; wire v_sync_pol = config_string[60]; -// in overscan mode the bottom border is removed and data is displayed instead -wire [9:0] t6_v_border_bot = overscan?config_string[49:40]:config_string[59:50]; +wire [9:0] t6_v_border_bot = config_string[59:50]; wire [9:0] t7_v_blank_bot = config_string[49:40]; wire [9:0] t8_v_sync = config_string[39:30]; wire [9:0] t9_v_blank_top = config_string[29:20]; wire [9:0] t10_v_border_top = config_string[19:10]; wire [9:0] t11_v_end = config_string[9:0]; - // default video mode is monochrome parameter DEFAULT_MODE = 2'd2; @@ -182,6 
+223,7 @@ reg [3:0] palette_b[15:0]; // STE-only registers reg [7:0] line_offset; // number of words to skip at the end of each line reg [3:0] pixel_offset; // number of pixels to skip at begin of line +reg ste_overscan_enable; // STE has a special 16 bit overscan // --------------------------------------------------------------------------- // ----------------------------- CPU register read --------------------------- @@ -247,6 +289,7 @@ always @(negedge reg_clk) begin // disable STE hard scroll features line_offset <= 8'h00; pixel_offset <= 4'h0; + ste_overscan_enable <= 1'b0; if(DEFAULT_MODE == 0) begin // TOS default palette, can be disabled after tests @@ -292,12 +335,22 @@ always @(negedge reg_clk) begin // writing special STE registers if(ste && !reg_lds) begin if(reg_addr == 6'h07) line_offset <= reg_din[7:0]; - if(reg_addr == 6'h32) pixel_offset <= reg_din[3:0]; + if(reg_addr == 6'h32) begin + pixel_offset <= reg_din[3:0]; + ste_overscan_enable <= 1'b0; + end // Writing the video address counter happens directly inside the // memory engine further below!!! 
end - + + // byte write of 0 to ff8264 while ff8365 (pixel_offset) != 0 results in extra + // ste overscan + if(ste && !reg_uds && reg_lds) begin + if((reg_addr == 6'h32) && (pixel_offset != 0)) + ste_overscan_enable <= 1'b1; + end + // the color palette registers, always write bit 3 with zero if not in // ste mode as this is the lsb of ste if(reg_addr >= 6'h20 && reg_addr < 6'h30 ) begin @@ -353,7 +406,7 @@ osd osd ( // mono uses the lsb of blue palette entry 0 to invert video wire [3:0] blue0 = palette_b[0]; wire mono_bit = blue0[0]^shift_0[15]; -wire [3:0] mono_rgb = de?{mono_bit, mono_bit, mono_bit, mono_bit}:4'b1000; +wire [3:0] mono_rgb = { mono_bit, mono_bit, mono_bit, mono_bit }; // ------------------------- colour video signal ----------------------------- @@ -368,7 +421,7 @@ wire [3:0] color_b = { color[ 2:0], color[ 3] }; wire [3:0] stvid_r = mono?mono_rgb:color_r; wire [3:0] stvid_g = mono?mono_rgb:color_g; wire [3:0] stvid_b = mono?mono_rgb:color_b; - + // shift registers for up to 4 planes reg [15:0] shift_0, shift_1, shift_2, shift_3; @@ -435,7 +488,12 @@ end // the top border should also be easy. Opening the side borders is basically // impossible as this requires a 100% perfect CPU and shifter timing. -reg last_syncmode, overscan_detect, overscan; +reg last_syncmode; +reg [3:0] bottom_overscan_cnt; +reg [3:0] top_overscan_cnt; + +wire bottom_overscan = (bottom_overscan_cnt != 0) /* synthesis keep */; +wire top_overscan = (top_overscan_cnt != 0) /* synthesis keep */; always @(posedge clk) begin last_syncmode <= syncmode[1]; // delay syncmode to detect changes @@ -443,18 +501,29 @@ always @(posedge clk) begin // this is the magic used to do "overscan". // the magic actually involves more than writing zero (60hz) // within line 200. 
But this is sufficient for our detection - if(vcnt[9:2] == 8'd99) begin - // syncmode has changed from 1 to 0 (50 to 60 hz) - if((syncmode[1] == 1'b0) && (last_syncmode == 1'b1)) - overscan_detect <= 1'b1; - end - // latch overscan state at topleft screen edge - if((vga_hcnt == t4_h_border_left) && (vcnt == t10_v_border_top)) begin - // save and reset overscan - overscan <= overscan_detect; - overscan_detect <= 1'b0; - end + // trigger in line 198/199 + if((vcnt == { 8'd97, 2'b00} ) && (vga_hcnt == 10'd0) && (bottom_overscan_cnt != 0)) + bottom_overscan_cnt <= bottom_overscan_cnt - 4'd1; + + if((vcnt[9:2] == 8'd98)||(vcnt[9:2] == 8'd99)||(vcnt[9:2] == 8'd100)) begin + // syncmode has changed from 1 to 0 (50 to 60 hz) + if((syncmode[1] == 1'b0) && (last_syncmode == 1'b1)) + bottom_overscan_cnt <= 4'd15; + end + + // trigger in line 284/285 + if((vcnt == {8'd133, 2'b00 }) && (vga_hcnt == 10'd0) && (top_overscan_cnt != 0)) + top_overscan_cnt <= top_overscan_cnt - 4'd1; + + if((vcnt[9:2] == 8'd134)||(vcnt[9:2] == 8'd135)||(vcnt[9:2] == 8'd136)) begin + // syncmode has changed from 1 to 0 (50 to 60 hz) + if((syncmode[1] == 1'b0) && (last_syncmode == 1'b1)) + top_overscan_cnt <= 4'd15; + end + +// top_overscan <= 1'b1; +// bottom_overscan <= 1'b1; end // --------------------------------------------------------------------------- @@ -497,7 +566,7 @@ ste_shifter ste_shifter_3 ( // move data into STE hard scroll shift registers always @(posedge clk) begin - if((bus_cycle == 4'd14) && (plane == 2'd0)) begin + if((bus_cycle_L == 4'd14) && (plane == 2'd0)) begin // shift up 16 pixels and load new data into lower bits of shift registers ste_shift_0 <= { ste_shift_0[15:0], data_latch[0] }; ste_shift_1 <= { ste_shift_1[15:0], (planes > 3'd1)?data_latch[1]:16'h0000 }; @@ -521,7 +590,7 @@ reg [15:0] sd_shift_0, sd_shift_1, sd_shift_2, sd_shift_3; // msb of the shift registers is the index used to access the palette registers. 
// Return border color index (0) if outside display area -wire [3:0] sd_index = (!me_v)?4'd0: +wire [3:0] sd_index = (!de_v)?4'd0: { sd_shift_3[15], sd_shift_2[15], sd_shift_1[15], sd_shift_0[15]}; // line buffer for two lines of 720 pixels (640 + 2 * 40 border) 3 * 4 (STE!) bit rgb data @@ -537,19 +606,18 @@ always @(posedge clk) begin // vertical state changes at end of hsync (begin of left blank) if(vga_hcnt == v_event) begin - // reset state counter two vga lines before screen start since scan doubler + // reset state counter two vga lines before display start since scan doubler // starts prefetching data two vga lines before - if(vcnt == (t11_v_end-10'd2)) sd_vcnt <= 2'd0; - else sd_vcnt <= sd_vcnt + 2'd1; + if(vcnt == (de_v_start-10'd2)) sd_vcnt <= 2'd0; + else sd_vcnt <= sd_vcnt + 2'd1; end // permanently move data from data_latch into scan doublers shift registers - if((bus_cycle == 4'd15) && (plane == 2'd0)) begin - + if((bus_cycle_L == 4'd15) && (plane == 2'd0)) begin // normally data is directly moved from the input latches into the // shift registers. Only on an ste with pixel scrolling enabled // the data is moved through additional shift registers - if(!ste || (pixel_offset == 0)) begin + if(!ste || (pixel_offset == 0) || ste_overscan_enable) begin // load data into shift registers as required by color depth sd_shift_0 <= data_latch[0]; sd_shift_1 <= (planes > 3'd1)?data_latch[1]:16'h0000; @@ -606,53 +674,60 @@ end // ------------------------------- memory engine ----------------------------- // --------------------------------------------------------------------------- -assign read = (bus_cycle[3:2] == 0) && me; // memory enable can directly be used as a ram read signal +assign read = (bus_cycle == 0) && de; // display enable can directly be used as a ram read signal // current plane to be read from memory reg [1:0] plane; // To be able to output the first pixel we need to have one word for every plane already -// present in memory. 
We thus need a "memory enable" signal which is (depending on color depth) +// present in memory. We thus need a display enable signal which is (depending on color depth) // 16, 32 or 64 pixel ahead of display enable -reg me, me_v; +reg de, de_v; // required pixel offset allowing for prefetch of 16 pixels in 1, 2 or 4 planes (16, 32 or 64 cycles) wire [9:0] memory_prefetch = scan_doubler_enable?{ 4'd0, planes, 3'd0 }:{ 3'd0, planes, 4'd0 }; +wire [9:0] ste_overscan = ste_overscan_enable?memory_prefetch:10'd0; // ste is starting another 16 pixels earlier if horizontal hard scroll is being used -wire [9:0] ste_prefetch = (ste && (pixel_offset != 0))?memory_prefetch:10'd0; -wire [9:0] me_h_start = t5_h_end - memory_prefetch - ste_prefetch; -wire [9:0] me_h_end = t0_h_border_right - memory_prefetch; +wire [9:0] ste_prefetch = (ste && ((pixel_offset != 0) && !ste_overscan_enable))?memory_prefetch:10'd0; +wire [9:0] de_h_start = t5_h_end - memory_prefetch - ste_prefetch; +wire [9:0] de_h_end = t0_h_border_right - memory_prefetch + ste_overscan; + +// extra lines required by overscan +wire [9:0] de_v_top_extra = top_overscan?10'd58:10'd0; // 29 extra ST lines at top +wire [9:0] de_v_bot_extra = bottom_overscan?10'd76:10'd0; // 38 extra ST lines at bottom // line offset required for scan doubler -wire [9:0] me_v_offset = scan_doubler_enable?10'd2:10'd0; -wire [9:0] me_v_start = t11_v_end - me_v_offset; -wire [9:0] me_v_end = t6_v_border_bot - me_v_offset; +wire [9:0] de_v_offset = scan_doubler_enable?10'd2:10'd0; + +// calculate lines in which active display starts end ends +wire [9:0] de_v_start = t11_v_end - de_v_offset - de_v_top_extra; +wire [9:0] de_v_end = t6_v_border_bot - de_v_offset + de_v_bot_extra; // with scan doubler being active, there are two main clock cycles per st hor counter // st_h_active makes sure these events only trigger once -wire st_h_active = (!scan_doubler_enable || bus_cycle[0]); +wire st_h_active = (!scan_doubler_enable || t[0]); always 
@(posedge clk) begin // line in which memory access is enabled // in scan doubler mode two lines ahead of vertical display enable if(vga_hcnt == v_event) begin - if(vcnt == me_v_start) me_v <= 1'b1; - if(vcnt == me_v_end) me_v <= 1'b0; + if(vcnt == de_v_start) de_v <= 1'b1; + if(vcnt == de_v_end) de_v <= 1'b0; end - // memory enable signal 16/32/64 bits (16*planes) ahead of display enable (de) + // display enable signal 16/32/64 bits (16*planes) ahead of display enable (de) // include bus cycle to stay in sync in scna doubler mode - if(me_v && st_h_active) begin - if(st_hcnt == me_h_start) me <= 1'b1; - if(st_hcnt == me_h_end) me <= 1'b0; + if(de_v && st_h_active) begin + if(st_hcnt == de_h_start) de <= 1'b1; + if(st_hcnt == de_h_end) de <= 1'b0; end - + // make sure each line starts with plane 0 - if(st_hcnt == me_h_start) + if(st_hcnt == de_h_start) plane <= 2'd0; - // The video address counter is reloaded slightly before vsync - if((vga_hcnt == t4_h_border_left) && (vcnt == t8_v_sync - 10'd3)) begin + // The video address counter is reloaded right before display starts + if((vga_hcnt == t3_h_blank_left) && (vcnt == t7_v_blank_bot)) begin vaddr <= _v_bas_ad; // copy syncmode @@ -660,10 +735,10 @@ always @(posedge clk) begin end else begin // video transfer happens in cycle 3 (end of video cycle) - if(bus_cycle == 3) begin + if(bus_cycle_L == 3) begin - // read if memory enable is active - if(me) begin + // read if display enable is active + if(de) begin // move incoming video data into data latch // ST shifter only uses 16 out of possible 64 bits, so select the right word @@ -689,12 +764,12 @@ always @(posedge clk) begin // STE has additional ways to influence video address if(ste) begin // add line offset at the end of each video line - if(me_v && st_h_active && (st_hcnt == t2_h_sync)) + if(de_v && st_h_active && (st_hcnt == t2_h_sync)) vaddr <= vaddr + line_offset; // STE vaddr write handling // bus_cycle 6 is in the middle of a cpu cycle - if((bus_cycle == 6) 
&& ste_vaddr_write) begin + if((bus_cycle_L == 6) && ste_vaddr_write) begin if(reg_addr == 6'h02) vaddr[22:15] <= reg_din[7:0]; if(reg_addr == 6'h03) vaddr[14: 7] <= reg_din[7:0]; if(reg_addr == 6'h04) vaddr[ 6: 0] <= reg_din[7:1]; @@ -723,7 +798,6 @@ reg [1:0] v_state; // 0=sync, 1=blank, 2=border, 3=display // blank level is also used during sync wire blank = (v_state == STATE_BLANK) || (vga_h_state == STATE_BLANK) || (v_state == STATE_SYNC) || (vga_h_state == STATE_SYNC); -wire de = (v_state == STATE_DISP) && (vga_h_state == STATE_DISP); // time in horizontal timing where vertical states change (at the begin of the sync phase) wire [9:0] v_event = t2_h_sync; @@ -741,7 +815,7 @@ always @(posedge clk) begin // the scan doubler is a special case as the atari line timing then expands over two vga // lines and may/must be asynchronous to the vga timing at the end of the first line if(vga_hcnt == t5_h_end) begin - if((bus_cycle == 4'd15) || (scan_doubler_enable && sd_vcnt[0])) + if((bus_cycle_L == 4'd15) || (scan_doubler_enable && sd_vcnt[0])) vga_hcnt <= 10'd0; end else vga_hcnt <= vga_hcnt + 10'd1; @@ -770,7 +844,7 @@ always @(posedge clk) begin // generate horizontal video signal states if( st_hcnt == t2_h_sync ) st_h_state <= STATE_SYNC; - if((st_hcnt == t0_h_border_right) || (st_hcnt == t4_h_border_left)) st_h_state <= STATE_BORDER; + if((st_hcnt == t0_h_border_right + ste_overscan) || (st_hcnt == t4_h_border_left)) st_h_state <= STATE_BORDER; if((st_hcnt == t1_h_blank_right) || (st_hcnt == t3_h_blank_left)) st_h_state <= STATE_BLANK; if( st_hcnt == t5_h_end) st_h_state <= STATE_DISP; end diff --git a/cores/mist/video_modes.v b/cores/mist/video_modes.v index 0a62045..1d09c6b 100644 --- a/cores/mist/video_modes.v +++ b/cores/mist/video_modes.v @@ -28,6 +28,17 @@ // NTSC 32042400 Hz // MIST 31875000 Hz +// real ST timing + +// Starting with VBI +// Atari Timing as hatari sees it: sync 34, border 29, disp 200, 47 border, 3 ?? 
= 313, vbi@310 +// 47 bottom border lines seem to be too much, some intros have artifacts in the lower lines +// 38 bottom border lines seem to be good + +// 60Hz sync 5, border 29, disp 200, 29 border = 263, vbi@261 + +// vbl at cycle counter 64 (64 cycles after hbl) + module video_modes ( inout mono, // select monochrome mode (and not color) input pal, // select pal mode (and not ntsc) if a color mode is selected @@ -42,11 +53,11 @@ module video_modes ( localparam H_ACT = 10'd640; localparam V_ACT = 10'd400; - + // TIMING CONSTRAINTS: // The total width (act+both blank+2*border+sync) must be a multiple of 16, for // scan doubled modes a multiple of 8 - + // --------------------------------------------------------------------------- // ----------------------------- pal56 timing ------------------------------- // --------------------------------------------------------------------------- @@ -57,9 +68,9 @@ localparam V_ACT = 10'd400; wire [121:0] pal56_config_str; conf pal56_conf( -// front porch sync width back porch border width sync polarity - .h_fp ( 10'd44), .h_s (10'd120), .h_bp ( 10'd44), .h_bd (10'd40), .h_sp (1'b1), - .v_fp ( 10'd24), .v_s ( 10'd4), .v_bp ( 10'd24), .v_bd (10'd80), .v_sp (1'b1), + // front porch sync width back porch border width sync polarity + .h_fp ( 10'd44), .h_s (10'd120), .h_bp ( 10'd44), .h_bd (10'd40), .h_sp (1'b1), + .v_fp ( 10'd24), .v_s ( 10'd4), .v_bp ( 10'd24), .v_tb (10'd80), .v_bb (10'd80), .v_sp (1'b1), .str (pal56_config_str) ); @@ -73,9 +84,10 @@ conf pal56_conf( wire [121:0] pal50_config_str; conf pal50_conf( -// front porch sync width back porch border width sync polarity - .h_fp ( 10'd80), .h_s ( 10'd64), .h_bp ( 10'd80), .h_bd (10'd80), .h_sp (1'b1), - .v_fp ( 10'd30), .v_s ( 10'd6), .v_bp ( 10'd30), .v_bd (10'd80), .v_sp (1'b1), + // front porch sync width back porch border width sync polarity + .h_fp ( 10'd80), .h_s ( 10'd64), .h_bp ( 10'd80), .h_bd (10'd80), .h_sp (1'b1), +// .v_fp ( 10'd42), .v_s ( 10'd8), .v_bp
( 10'd42), .v_tb (10'd58), .v_bb (10'd76), .v_sp (1'b1), + .v_fp ( 10'd30), .v_s ( 10'd6), .v_bp ( 10'd30), .v_tb (10'd80), .v_bb (10'd80), .v_sp (1'b1), .str (pal50_config_str) ); @@ -89,9 +101,9 @@ conf pal50_conf( wire [121:0] ntsc_config_str; conf ntsc_conf( -// front porch sync width back porch border width sync polarity - .h_fp ( 10'd76), .h_s ( 10'd64), .h_bp ( 10'd76), .h_bd (10'd80), .h_sp (1'b1), - .v_fp ( 10'd20), .v_s ( 10'd6), .v_bp ( 10'd20), .v_bd (10'd40), .v_sp (1'b0), + // front porch sync width back porch border width sync polarity + .h_fp ( 10'd76), .h_s ( 10'd64), .h_bp ( 10'd76), .h_bd (10'd80), .h_sp (1'b1), + .v_fp ( 10'd20), .v_s ( 10'd6), .v_bp ( 10'd20), .v_tb (10'd40), .v_bb (10'd40), .v_sp (1'b0), .str (ntsc_config_str) ); @@ -105,9 +117,9 @@ conf ntsc_conf( wire [121:0] mono_config_str; conf mono_conf( -// front porch sync width back porch border width sync polarity - .h_fp (10'd108), .h_s ( 10'd40), .h_bp (10'd108), .h_bd ( 10'd0), .h_sp (1'b0), - .v_fp ( 10'd48), .v_s ( 10'd5), .v_bp ( 10'd48), .v_bd ( 10'd0), .v_sp (1'b0), + // front porch sync width back porch border width sync polarity + .h_fp (10'd108), .h_s ( 10'd40), .h_bp (10'd108), .h_bd ( 10'd0), .h_sp (1'b0), + .v_fp ( 10'd48), .v_s ( 10'd5), .v_bp ( 10'd48), .v_tb ( 10'd0), .v_bb ( 10'd0), .v_sp (1'b0), .str (mono_config_str) ); @@ -130,7 +142,8 @@ module conf ( input [9:0] v_fp, // vertical front porch width input [9:0] v_s, // vertical sync width input [9:0] v_bp, // vertical back porch width - input [9:0] v_bd, // vertical border width + input [9:0] v_tb, // vertical border width top + input [9:0] v_bb, // vertical border width bottom input v_sp, // vertical sync polarity output [121:0] str @@ -151,11 +164,11 @@ wire [60:0] h_str = { h_sp, wire [60:0] v_str = { v_sp, V_ACT - 10'd1, - V_ACT + v_bd - 10'd1, - V_ACT + v_bd + v_fp - 10'd1, - V_ACT + v_bd + v_fp + v_s - 10'd1, - V_ACT + v_bd + v_fp + v_s + v_bp - 10'd1, - V_ACT + v_bd + v_fp + v_s + v_bp + v_bd - 10'd1}; + 
V_ACT + v_bb - 10'd1, + V_ACT + v_bb + v_fp - 10'd1, + V_ACT + v_bb + v_fp + v_s - 10'd1, + V_ACT + v_bb + v_fp + v_s + v_bp - 10'd1, + V_ACT + v_bb + v_fp + v_s + v_bp + v_tb - 10'd1}; assign str = { h_str, v_str };