diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c b/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c index 41a14ad..c96a69b 100644 --- a/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c @@ -124,6 +124,7 @@ struct scrolltest { #define GOBLIN_FILL _IOW('X', 1, struct scrolltest) #define GOBLIN_FILLROP _IOW('X', 2, struct scrolltest) #define GOBLIN_COPY _IOW('X', 3, struct scrolltest) +#define GOBLIN_COPYREV _IOW('X', 4, struct scrolltest) static int goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *); static paddr_t goblin_mmap(void *, void *, off_t, int); @@ -146,6 +147,7 @@ static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose, static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n); static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop); static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop); +static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop); static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9, 0x00004005,0xfb000809,0x0000000a,0x0000000a }; static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005, @@ -153,24 +155,33 @@ static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013 static const uint32_t program_fill256[14] = { 0x01bc0014,0x001a6087,0x013c6814,0x403c0012,0x00146086,0xe03c1013,0x00165146,0xfe800148, 0x000e10c6,0x010000c9,0x00004005,0xfb800809,0x0000000a,0x0000000a }; -static const uint32_t program_fill[38] = { 0x11800089,0x110000c9,0x01bc0014,0x0800000d,0x013c2014,0x001400c0,0x00180000,0x403c0192, - 
0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00166007,0x00145946, - 0x0014214f,0x00005005,0x00085086,0x08000089,0x001a6087,0x013c6814,0x403c0012,0x00146086, - 0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060,0xc03c7013, - 0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a }; +static const uint32_t program_fill[39] = { 0x12000089,0x118000c9,0x01bc0014,0x0880000d,0x013c2014,0x001400c0,0x00180000,0x403c0192, + 0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00226007,0x00208946, + 0x0020220f,0x00008005,0x00088086,0x01048050,0x08000089,0x001a6087,0x013c6814,0x403c0012, + 0x00146086,0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060, + 0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a }; -static const uint32_t program_fillrop[41] = { 0x13000089,0x128000c9,0x01bc0014,0x003c014c,0x0800000d,0x013c2014,0x002000c0,0x00180000, +static const uint32_t program_fillrop[42] = { 0x13800089,0x130000c9,0x01bc0014,0x003c014c,0x0880000d,0x013c2014,0x002000c0,0x00180000, 0x403c0192,0x801c0013,0x001c11e2,0xc03c7013,0x00184185,0x00221206,0xfc800208,0x00226007, - 0x00208946,0x0020220f,0x00008005,0x00088086,0x09000089,0x001a6087,0x013c6814,0x403c0012, - 0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,0x0180018d, - 0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a, - 0x0000000a}; + 0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x09000089,0x001a6087,0x013c6814, + 0x403c0012,0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208, + 0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809, + 0x0000000a,0x0000000a }; -static const uint32_t program_copy[49] = { 
0x17000089,0x168000c9,0x01bc0014,0x0b80000d,0x013f0014,0x003f0054,0x00380011,0x001400c0,0x00180000,0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00161146,0xfc000148,0x0016f007,0x00145c06,0x0014214f,0x00140150,0x00005005,0x00085086,0x0b800089,0x013f0814,0x00045045,0x003f0054,0x001af087,0x403c0012,0x00146086,0xa0a00013,0x02800149,0x001c0220,0x603c7013,0x00170146,0x20a08015,0xfd800148,0x0280018d,0x013c6814,0x001c0013,0x001c0220,0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a }; +static const uint32_t program_copy[48] = { 0x16800089,0x160000c9,0x01bc0014,0x0b00000d,0x013f0014,0x003f0054,0x002400c0,0x00180000, + 0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248, + 0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814, + 0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220, + 0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220, + 0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a }; -static const uint32_t* programs[7] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, program_copy, NULL }; -static const uint32_t program_len[7] = { 12, 11, 14, 38, 41, 49, 0 }; -static uint32_t program_offset[7]; +static const uint32_t program_copyrev[66] = { 
0x1f800089,0x1f0000c9,0x01bc0014,0x003af007,0x00280000,0x002c0040,0x00340080,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x028003c9,0x003c03d0,0x0028f285,0x002cf2c5,0x00800188,0x002b0286,0x003ef2c7,0x020003c9,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013f0014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a }; + +static const uint32_t* programs[8] = { program_scroll128, program_fill128, program_fill256, program_fill, + program_fillrop, program_copy, program_copyrev, NULL }; +static const uint32_t program_len[8] = { 12, 11, 14, 39, + 42, 48, 66, 0 }; +static uint32_t program_offset[8]; static void goblin_set_depth(struct goblin_softc *, int); @@ -413,6 +424,12 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l) } break; + case GOBLIN_COPYREV: { + struct scrolltest *st = (struct scrolltest *)data; + jareth_copyrev(sc, jareth_verbose, st->y0, st->y1, st->x0, st->w, st->n, /* x1 is carried in the pm field */ st->pm, st->rop); + } + break; + default: return (ENOTTY); } @@ -938,6 +955,53 @@ static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, i return 0; } +static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) { + const uint32_t base = 0; + const int pidx = 6; // slot of program_copyrev in programs[] / program_len[] + /* int i; */ + + /* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */ + + power_on(sc); + + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,0), (sc->sc_internal_adr + y1 * sc->sc_stride + x1)); + 
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,1), (sc->sc_internal_adr + y0 * sc->sc_stride + x0)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,0), (sc->sc_internal_adr + y0 * sc->sc_stride + x0)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,1), (sc->sc_internal_adr + y1 * sc->sc_stride + x1)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,0), (w)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,0), (n)); + /* for (i = 1 ; i < 8 ; i++) { */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,i), 0); */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), 0); */ + /* } */ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,0), (sc->sc_stride)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,1), (sc->sc_stride)); + jareth_mpstart_write(sc, program_offset[pidx]); + jareth_mplen_write(sc, program_len[pidx]); + +#if 0 + { + uint32_t data[8]; + int i, j; + char buf[512]; + for (i = 0 ; i < 16 ; i++) { + for (j = 0 ; j < 8 ; j++) + data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j)); + snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]); + aprint_normal("reg%d : %s\n", i, buf); + } + } +#endif + + (void)start_job(sc, verbose); + delay(1); + (void)wait_job(sc, 1, verbose); + + power_off(sc); + + return 0; +} + static void jareth_copyrows(void *cookie, int src, int dst, int n) { diff --git a/sbus-to-ztex-gateware-migen/jareth.py b/sbus-to-ztex-gateware-migen/jareth.py index bbd9506..c59fe02 100644 --- a/sbus-to-ztex-gateware-migen/jareth.py +++ b/sbus-to-ztex-gateware-migen/jareth.py @@ -11,31 +11,48 @@ field_latex = "$\mathbf{{F}}_{{{{2^{{255}}}}-19}}$" opcode_bits = 5 # number of bits used to encode the opcode field 
opcodes = { # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shift a/b/q (star) - "UDF" : [-1, "Placeholder for undefined opcodes"], - "PSA" : [0, "Wd $\gets$ Ra // pass A"], - "PSB" : [1, "Wd $\gets$ Rb // pass B"], # for star version mostly + "UDF" : [-1, "Placeholder for undefined opcodes"], + "PSA" : [0, "Wd $\gets$ Ra // pass A"], + "PSB" : [1, "Wd $\gets$ Rb // pass B"], # for star version mostly "ROP32V" : [2, "Wd $\gets$ ((Rb ROP Ra) & planemask) | (Ra & ~planemask)" ], # replace MSK - "XOR" : [3, "Wd $\gets$ Ra ^ Rb // bitwise XOR"], - "NOT" : [4, "Wd $\gets$ ~Ra // binary invert"], + "XOR" : [3, "Wd $\gets$ Ra ^ Rb // bitwise XOR"], + "NOT" : [4, "Wd $\gets$ ~Ra // binary invert"], "ADD32V" : [5, "Wd[x..x+32] $\gets$ Ra[x..x+32] + Rb[x..x+32] // vector 32-bit binary add"], "SUB32V" : [6, "Wd[x..x+32] $\gets$ Ra[x..x+32] - Rb[x..x+32] // vector 32-bit binary sub"], - "AND" : [7, "Wd $\gets$ Ra & Rb // bitwise AND"], # replace MUL + "AND" : [7, "Wd $\gets$ Ra & Rb // bitwise AND"], # replace MUL "BRNZ32" : [8, "If Ra[0:32] != 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if non-zero (32-bits)"], # replace TRD - "BRZ32" : [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (32-bits)"], - "FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"], - "SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"], - "SROP" : [12, "set planemask & rop from Ra[0:32] and Ra[32:36]" ], # was XBT - "BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"], - "BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"], + "BRZ32" : [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if 
zero (32-bits)"], + "FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"], + "SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"], + "SROP" : [12, "set planemask & rop from Ra[0:32] and Ra[32:36]" ], # was XBT + "BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"], + "BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"], "MIN32V" : [15, "Wd[x..x+32] $\gets$ umin(Ra[x..x+32], Rb[x..x+32]) // vector 32-bit umin"], - "BCAST32" : [16, "Wd[x..x+32] $\gets$ Ra[0..32]"], + "MANIP32": [16, "imm[0..1] == 0 BCAST32 = Wd[x..x+32] $\gets$ Ra[0..32], imm[0..1] == 1 SWAP32, imm[0..1] == 2 ROTR32V"], + "GETM": [17, "GETM: getmask" ], + "ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ], # for MEM, bit #31 (imm[8]) indicates both lanes are needed; imm[31] == 0 faster as the second access is not done ; - "GETM": [17, "GETM: getmask" ], - "ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ], - "MEM" : [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc in imm[6], address in addr[imm[0...]]" ], - "SETM" : [20, "SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ], - "LOADH" : [21, "LOADH: imm[7] == 0 for LOAD, address in addr[imm[0...]], high->low & load a+16 into high" ], - "MAX" : [22, "Maximum opcode number (for bounds checking)"], + # "MEM" imm: + # imm[8]: 256 bits mode + # imm[7]: L/S + # imm[6]: post-inc + # imm[5]: post-dec + # imm[4]: + # imm[3]: + # imm[0..2]: adr reg + "MEM" : [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc 
in imm[6], address in addr[imm[0...]]" ], + "SETM" : [20, "SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ], + # "LOADH/L" imm: + # imm[8]: 0 + # imm[7]: 0 + # imm[6]: post-inc + # imm[5]: post-dec + # imm[4]: + # imm[3]: + # imm[0..2]: adr reg + "LOADH" : [21, "LOADH: high->low & load *Adr into high" ], + "LOADL" : [22, "LOADL: low->high & load *Adr into low" ], + "MAX" : [23, "Maximum opcode number (for bounds checking)"], } num_registers = 32 @@ -334,7 +351,7 @@ passthrough. class ExecAddSub(ExecUnit, AutoDoc): def __init__(self, width=256): - ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "BCAST32" ]) + ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "MANIP32" ]) self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f""" """) @@ -347,12 +364,25 @@ class ExecAddSub(ExecUnit, AutoDoc): [ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] + self.b[x*32:(x+1)*32]) for x in range(0, width//32) ], ).Elif(self.instruction.opcode == opcodes["SUB32V"][0], [ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] - self.b[x*32:(x+1)*32]) for x in range(0, width//32) ], - ).Elif(self.instruction.opcode == opcodes["BCAST32"][0], - [ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ], ).Elif(self.instruction.opcode == opcodes["MIN32V"][0], [ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]), self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32]) - ).Else(self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ], + ).Else( + self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32])) + for x in range(0, width//32) ], + ).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 0), # BCAST32 + [ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ], + ).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 1), # SWAP32 + [ 
self.q[x*32:(x+1)*32].eq(self.a[(x^1)*32:((x^1)+1)*32]) for x in range(0, width//32) ], + ).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 2), # ROTR32V + Case(self.b[0:2], { + 0: [ self.q[x*32:(x+1)*32].eq( self.a[x*32 :(x+1)*32]) for x in range(0, width//32) ], + 1: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+ 8:(x+1)*32], self.a[x*32:x*32+ 8])) for x in range(0, width//32) ], + 2: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+16:(x+1)*32], self.a[x*32:x*32+16])) for x in range(0, width//32) ], + 3: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+24:(x+1)*32], self.a[x*32:x*32+24])) for x in range(0, width//32) ], + }), + ).Else( + [ self.q[x*32:(x+1)*32].eq(0xDEADBEEF) for x in range(0, width//32) ] ) ] @@ -418,7 +448,7 @@ class ExecRop(ExecUnit, AutoDoc): class ExecLS(ExecUnit, AutoDoc): def __init__(self, width=256, interface=None, memoryport=None, r_dat_f=None, r_dat_m=None, granule=0): - ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "GETM"]) + ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "LOADL", "GETM"]) self.notes = ModuleDoc(title=f"Load/Store ExecUnit Subclass", body=f""" """) @@ -470,7 +500,7 @@ class ExecLS(ExecUnit, AutoDoc): lsseq.act("IDLE", If(start_pipe, - If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]), + If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]), NextValue(cpar, 0), NextValue(address, addresses[self.instruction.immediate[0:log2_int(width//32)]]), NextValue(wishbone, ~(addresses[self.instruction.immediate[0:log2_int(width//32)]] == 0x8)), @@ -554,6 +584,26 @@ class ExecLS(ExecUnit, AutoDoc): NextState("MEMh") ) ) + ).Elif(self.instruction.opcode == opcodes["LOADL"][0], + NextValue(self.has_timeout, 0), + NextValue(self.has_failure, 0), + NextValue(timeout, 2047), + NextValue(lbuf[128:256], self.b[0:128]), + 
If(wishbone, + NextValue(interface.cyc, 1), + NextValue(interface.stb, 1), + NextValue(interface.sel, 2**len(interface.sel)-1), + NextValue(interface.adr, address), + NextValue(interface.we, self.instruction.immediate[7]), + NextState("MEMl") + ).Else( + memoryport.cmd.we.eq(self.instruction.immediate[7]), + memoryport.cmd.addr.eq(address[0:]), + memoryport.cmd.valid.eq(1), + If(memoryport.cmd.ready, + NextState("MEMl") + ) + ) ) ) for X in range(0, granule_num): @@ -615,6 +665,8 @@ class ExecLS(ExecUnit, AutoDoc): If(wishbone & ~interface.ack, If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), + ).Elif(self.instruction.immediate[5], # post-dec + NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), If(self.instruction.immediate[8], NextValue(interface.cyc, 1), @@ -627,7 +679,9 @@ class ExecLS(ExecUnit, AutoDoc): NextValue(interface.dat_w, self.b[128:256])), NextState("MEMh") ).Else( - NextValue(lbuf[128:256], 0), + If(self.instruction.opcode == opcodes["MEM"][0], + NextValue(lbuf[128:256], 0), + ), If(cpar, ## checkme NextState("MEM_ODD") ).Else( @@ -643,12 +697,16 @@ class ExecLS(ExecUnit, AutoDoc): If(memoryport.cmd.ready, If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), + ).Elif(self.instruction.immediate[5], # post-dec + NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), NextState("MEMh"), ) ).Else( # no high If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), + 
).Elif(self.instruction.immediate[5], # post-dec + NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), NextValue(lbuf[128:256], 0), If(cpar, ## checkme @@ -691,6 +749,8 @@ class ExecLS(ExecUnit, AutoDoc): If(wishbone & ~interface.ack, If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), + ).Elif(self.instruction.immediate[5], # post-dec + NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), #NextValue(tries, 0), If(cpar, ## checkme @@ -701,6 +761,8 @@ class ExecLS(ExecUnit, AutoDoc): ).Elif(~wishbone, If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), + ).Elif(self.instruction.immediate[5], # post-dec + NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), If(cpar, ## checkme NextState("MEM_ODD") @@ -733,7 +795,7 @@ class ExecLS(ExecUnit, AutoDoc): self.sync.mul_clk += [ If(lsseq.ongoing("MEM_EVEN1") | lsseq.ongoing("MEM_EVEN2"), self.q_valid.eq(1), - If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]), + If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]), If(~self.instruction.immediate[7], self.q.eq(lbuf), ).Else( diff --git a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs index 8d0bd81..792cfd2 100644 --- a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs +++ b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs @@ -212,6 
+212,7 @@ fn main() -> std::io::Result<()> { fin ); +// FILL ******************************************************************************************************** let mcode_fill = assemble_jareth!( // x..x / $DST in %0 // 128-bits pattern in %1 [assumed to be alignement-homogneous] @@ -269,15 +270,17 @@ fn main() -> std::io::Result<()> { loopX_done: // how much did we do (#6 is 31, #5 is 32) - and %5, %0, #6 + and %8, %0, #6 // compute 32-(x&31) - sub32v %5, #5, %5 + sub32v %8, #5, %8 // compute the proper value - min32v %5, %5, %2 + min32v %8, %8, %2 // add that to the address, which will now be aligned - add32v %0, %0, %5 + add32v %0, %0, %8 // remove from X, as we have done it - sub32v %2, %2, %5 + sub32v %2, %2, %8 + // rotate the pattern to match + rotr32v %1, %1, %8 // fall through the aligned loop if not 0 brz32 done256, %2 @@ -329,6 +332,7 @@ fn main() -> std::io::Result<()> { fin ); +// FILL ROP ******************************************************************************************************** let mcode_fillrop = assemble_jareth!( // x..x / $DST in %0 // 128-bits pattern in %1 [assumed to be alignement-homogeneous] @@ -398,6 +402,8 @@ fn main() -> std::io::Result<()> { add32v %0, %0, %8 // remove from X, as we have done it sub32v %2, %2, %8 + // rotate the pattern to match + rotr32v %1, %1, %8 // fall through the aligned loop if not 0, otherwise done brz32 done256, %2 @@ -455,6 +461,7 @@ fn main() -> std::io::Result<()> { +// COPY ******************************************************************************************************** let mcode_copy = assemble_jareth!( // x..x / $SRC / $DST in %0 // x..x / $DST / $SRC in %1 @@ -490,10 +497,8 @@ fn main() -> std::io::Result<()> { // do the first column to align $DST startX: // set alignement; we shift by the addr offset - //and %14, %2, #15 setmq %15, %0, #16 setma %15, %1, #16 - getm %14 // copy Y psa %9, %3 // copy $SRC / $DST @@ -531,6 +536,7 @@ fn main() -> std::io::Result<()> { sub32v %2, %2, %9 
// fall through to the aligned loop if not 0 brz32 done128, %2 + // reset q mask (we will be aligned from now on) setmq %15, #0, #16 // add the count to the addresses, ^1 will have the proper shift for masking @@ -596,6 +602,187 @@ fn main() -> std::io::Result<()> { fin ); +// COPYREV ******************************************************************************************************** + let mcode_copyrev = assemble_jareth!( + // x..x / $SRC / $DST in %0 + // x..x / $DST / $SRC in %1 + // x..x / X size in %2 + // x..x / Y size in %3, + // x..x src_stride / dst_stride in %4 (screen width?) + // ----- + // main loop: + // leftover X in %6 + // data in %7 + // masked data in %7 + // live X count in %9 + // $SRC / $DST in %10 + // $DST / $SRC in %11 + // live Y count in %12, also scratch in header + // todo X count in %13 + // amount of work in tail in %14 + // 0/scrap in %15 + // ----- + // tail loop: + // $SRC / $DST in %0 + // dst data in %7 + // src data in %8 + // live Y count in %9 + // 0/scrap in %15 + + + start: + // if number of line or element in line is 0, exit early + brz32 done128, %2 + brz32 done128, %3 + // reset masks + resm %15 + // compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16) + and %14, %0, #15 + // copy addresses + psa %10, %0 + psa %11, %1 + // set todo X + psa %13, %2 + // if 0, then we don't need a tail loop, so skip extra computation (that would wrongly give 16) + brz32 skip, %14 + + // it is at most 16-($DST & 15) + sub32v %14, #16, %14 + // compute the proper value by bounding to Xsize + min32v %14, %14, %2 + // more than one address to increment + bcast32 %14, %14 + // add the count to the addresses, SRC will now be aligned + add32v %10, %10, %14 + // add the count to the addresses, DST will have the proper alignment to shift input in the aligned loop + add32v %11, %11, %14 + // so, do we do everything there ? 
+ sub32v %13, %2, %14 + // if 0, we do everything in the tail skip the aligned loop + brz32 startX, %13 + + skip: + // reset q mask (we will be aligned from now on) + setmq %15, #0, #16 + // reset a mask to the proper shifting + setma %15, %11, #16 + + // now we need to figure out where we start to go backward + // currently we have the number of 'tail' (first column) elements in %14 (0 for aligned), number of 'loop' elements in %13, + // and $SRC+%14 & $DST+%14 in $10/$11 we $SRC+%14 aligned. + // compute X leftovers (%13 modulo 16 -> #15 is 15) in %6, we will have to start with those + and %6, %13, #15 + // compute the 'aligned' number of elements + sub32v %15, %13, %6 + // if 0, nothing to add to the addresses: skip the bump but still run the skip2/skip3 fix-ups (skip3 is what loads the Y count into %12) + brz32 skip2, %15 + + bcast32 %15, %15 + // add the aligned number of element to $SRC+%14 & $DST+%14 + add32v %10, %10, %15 + add32v %11, %11, %15 + + // if %6 is 0 (no leftovers), then $DST is pointing after the last element so need to remove 16 from $DST + brnz32 skip2, %6 + sub32v %10, %10, #16 + skip2: // if $SRC is not aligned, we also need to add 16 (for prefetch) + and %15, %11, #15 + brz32 skip3, %15 + add32v %11, %11, #16 + psa %15, #16 + swap32 %15, %15 + add32v %10, %10, %15 + + skip3: + // copy Y count + psa %12, %3 + + loop128_y: + // set source and destination addresses for current Y // FIXME : +X, -1? 
+ setadr %15, %10 + // then the rounded value in X + sub32v %9, %13, %6 + // prefetch data + + // prefetch data + load128dec %8, ^1 + + // check for line leftovers + loop128_x_begin: + brz4 loop128_x, %6 + + // set the leftovers mask (offset is 0 as we are aligned) + // IMPROVE ME + setmq %15, #0, %6 + // prefetch data + loadl128dec %8, ^1, %8 + // load old data + load128 %7, ^0 + // insert data + psa* %7, %8 + // rewrite data + store128dec %15, ^0, %7 + // reset the Q mask + // IMPROVE ME + setmq %15, #0, #16 + + loop128_x: + // already 0, bypass aligned stuff + brz32 loop128_x_end, %9 + // prefetch data + loadl128dec %8, ^1, %8 + // insert data + psa* %7, %8 + // write data + store128dec %15, ^0, %7 + // sub 16 (#16 is 16) from live rounded X count + sub32v %9, %9, #16 + // if X count is not 0, keep looping + brnz32 loop128_x, %9 + + loop128_x_end: + // decrement Y count + sub32v %12, %12, #1 + // if 0, finished + brz32 startX, %12 + + // add strides to initial addresses + add32v %10, %10, %4 + // back to loop128_y for the next line (presumably always taken: branch-if-zero on the #0 constant) + brz32 loop128_y, #0 + + startX: + // do the first column if we need to + brz32 done128, %14 + // set alignment; we shift by the addr offset + setmq %15, %0, #16 + setma %15, %1, #16 + // copy Y + psa %9, %3 + loopX_y: + // setadr from the start + setadr %15, %0 + // load src + load256 %8, ^1 + // load old data + load128 %7, ^0 + // insert data + psa* %7, %8 + // rewrite data + store128 %15, ^0, %7 + // increment $SRC / $DST by stride + add32v %0, %0, %4 + // decrement copied Y count + sub32v %9, %9, #1 + // if not zero, continue + brnz32 loopX_y, %9 + + done128: + fin + fin + ); + +// ****** ******************************************************************************************************** let mut pos; @@ -671,5 +858,14 @@ fn main() -> std::io::Result<()> { println!(""); println!("-> {}", mcode_copy.len()); + pos = 0; + println!("copyrev:"); + while pos < mcode_copyrev.len() { + print!("0x{:08x},", mcode_copyrev[pos]); + pos = pos + 1; + 
} + println!(""); + println!("-> {}", mcode_copyrev.len()); + Ok(()) }