diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c b/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c index 0f71ac5..015d319 100644 --- a/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c @@ -160,40 +160,21 @@ static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, i static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop); static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop); static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop); + +static const uint32_t program_fill[35] = { 0x10000089,0x0f8000c9,0x01bc0014,0x0780000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,0xc03c1033,0x00184185,0x00161146,0xfd800148,0x00226007,0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x07000089,0x001a6087,0x013c6814,0x403c0012,0x00146086,0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0080018d,0xc03c1033,0x000e10c6,0x010000c9,0x00004005,0xfa000809,0x0000000a,0x0000000a }; + +static const uint32_t program_fillrop[42] = { 0x13800089,0x130000c9,0x01bc0014,0x003c014c,0x0880000d,0x013c2014,0x002000c0,0x00180000,0x403c0192,0x801c0013,0x001c11e2,0xc03c7013,0x00184185,0x00221206,0xfc800208,0x00226007,0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x09000089,0x001a6087,0x013c6814,0x403c0012,0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,0x0000000a }; + +static const uint32_t program_copy[43] = { 0x14000089,0x138000c9,0x01bc0014,0x013c2014,0x00bf0054,0x0900000d,0x002400c0,0x00180000,0x403c0192,0x80a00013,0x403c8033,0x00184185,0x00261246,0xfd000248,0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0a000089,0x013f0814,0x00049045,0x00bf0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02000249,0x603c8033,0x00270246,0x20a08015,0xfe000248,0x0180018d,0x013c6814,0x403c8033,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,0x0000000a }; + +static const uint32_t program_copyrev[61] = { 0x1d000089,0x1c8000c9,0x01bc0014,0x00280000,0x002c0040,0x00340080,0x003af007,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x11000349,0x013f0814,0x00bf02d4,0x001af347,0x003c6346,0x003c03d0,0x0028f285,0x002cf2c5,0x02000188,0x003c0c00,0x003c03d0,0x0028f286,0x002cf2c6,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0200018d,0x013c6814,0x10a08016,0x503c8033,0x013f0814,0x02000249,0x10a08016,0x503c8033,0x00270246,0xfd800248,0x00321306,0x01000309,0x00284285,0xf7800809,0x04800389,0x013c2014,0x00bf0054,0x002400c0,0x403c0012,0x80a00013,0x403c8033,0x00004005,0x00261246,0xfd000248,0x0000000a,0x0000000a }; + static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9, 0x00004005,0xfb000809,0x0000000a,0x0000000a }; -static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005, - 0xfb800809,0x0000000a,0x0000000a }; -static const uint32_t program_fill256[14] = { 0x01bc0014,0x001a6087,0x013c6814,0x403c0012,0x00146086,0xe03c1013,0x00165146,0xfe800148, - 0x000e10c6,0x010000c9,0x00004005,0xfb800809,0x0000000a,0x0000000a }; -static const uint32_t program_fill[39] = { 0x12000089,0x118000c9,0x01bc0014,0x0880000d,0x013c2014,0x001400c0,0x00180000,0x403c0192, - 0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00226007,0x00208946, - 0x0020220f,0x00008005,0x00088086,0x01048050,0x08000089,0x001a6087,0x013c6814,0x403c0012, - 0x00146086,0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060, - 0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a }; - -static const uint32_t program_fillrop[42] = { 0x13800089,0x130000c9,0x01bc0014,0x003c014c,0x0880000d,0x013c2014,0x002000c0,0x00180000, - 0x403c0192,0x801c0013,0x001c11e2,0xc03c7013,0x00184185,0x00221206,0xfc800208,0x00226007, - 0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x09000089,0x001a6087,0x013c6814, - 0x403c0012,0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208, - 0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809, - 0x0000000a,0x0000000a }; - -static const uint32_t program_copy[48] = { 0x16800089,0x160000c9,0x01bc0014,0x013c2014,0x003f0054,0x0a00000d,0x002400c0,0x00180000, - 0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248, - 0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814, - 0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220, - 0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220, - 0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a }; - -static const uint32_t program_copyrev[66] = { 0x1f800089,0x1f0000c9,0x01bc0014,0x00280000,0x002c0040,0x00340080,0x003af007,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x003c03d0,0x0028f285,0x002cf2c5,0x02000188,0x003c0c00,0x003c03d0,0x0028f286,0x002cf2c6,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013c2014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a }; - -static const uint32_t* programs[8] = { program_scroll128, program_fill128, program_fill256, program_fill, - program_fillrop, program_copy, program_copyrev, NULL }; -static const uint32_t program_len[8] = { 12, 11, 14, 39, - 42, 48, 66, 0 }; -static uint32_t program_offset[8]; +static const uint32_t* programs[6] = { program_fill, program_fillrop, program_copy, program_copyrev, program_scroll128, NULL }; +static const uint32_t program_len[6] = { 35, 42, 43, 61, 12, 0 }; +static uint32_t program_offset[6]; static void goblin_set_depth(struct goblin_softc *, int); @@ -468,10 +449,10 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l) return ENXIO; } switch (fn->off) { - case JARETH_FN_NUM_FILL: pidx = 3; break; - case JARETH_FN_NUM_FILLROP: pidx = 4; break; - case JARETH_FN_NUM_COPY: pidx = 5; break; - case JARETH_FN_NUM_COPYREV: pidx = 6; break; + case JARETH_FN_NUM_FILL: pidx = 0; break; + case JARETH_FN_NUM_FILLROP: pidx = 1; break; + case JARETH_FN_NUM_COPY: pidx = 2; break; + case JARETH_FN_NUM_COPYREV: pidx = 3; break; } if (pidx != -1) { fn->off = program_offset[pidx]; @@ -864,7 +845,7 @@ static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosi static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n) { const uint32_t base = 0; - const int pidx = 0; + const int pidx = 4; /* int i; */ power_on(sc); @@ -901,7 +882,7 @@ static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose, static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n) { const uint32_t base = 0; - const int pidx = 3; // fill + const int pidx = 0; // fill int i; power_on(sc); @@ -931,7 +912,7 @@ static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, i static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop) { const uint32_t base = 0; - const int pidx = 4; // fillrop + const int pidx = 1; // fillrop int i; power_on(sc); @@ -963,7 +944,7 @@ static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) { const uint32_t base = 0; - const int pidx = 5; // copy + const int pidx = 2; // copy /* int i; */ /* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */ @@ -1010,7 +991,7 @@ static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, i static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) { const uint32_t base = 0; - const int pidx = 6; // copyrev + const int pidx = 3; // copyrev /* int i; */ /* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */ diff --git a/sbus-to-ztex-gateware-migen/jareth.py b/sbus-to-ztex-gateware-migen/jareth.py index 80c5cbd..037ec34 100644 --- a/sbus-to-ztex-gateware-migen/jareth.py +++ b/sbus-to-ztex-gateware-migen/jareth.py @@ -551,11 +551,17 @@ class ExecLS(ExecUnit, AutoDoc): If(wishbone, NextValue(interface.cyc, 1), NextValue(interface.stb, 1), - NextValue(interface.sel, 2**len(interface.sel)-1), NextValue(interface.adr, address), NextValue(interface.we, self.instruction.immediate[7]), + NextValue(interface.sel, 2**len(interface.sel)-1), If(self.instruction.immediate[7], # do we need those tests or could we always update dat_w/dat_r ? - NextValue(interface.dat_w, self.b[0:128])), + If(self.instruction.shift, + NextValue(interface.dat_w, (self.b << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2])))[0:128]), + NextValue(interface.sel, r_dat_m[2][0:16]), + ).Else( + NextValue(interface.dat_w, self.b[0:128]), + ), + ), NextState("MEMl") # MEMl ).Else( memoryport.cmd.we.eq(self.instruction.immediate[7]), @@ -660,8 +666,13 @@ class ExecLS(ExecUnit, AutoDoc): NextValue(lbuf[0:128], memoryport.rdata.data), NextState("MEMl2"), ).Elif(~wishbone & self.instruction.immediate[7], - memoryport.wdata.data.eq(self.b[0:128]), memoryport.wdata.valid.eq(1), + If(self.instruction.shift, + memoryport.wdata.data.eq((self.b << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2])))[0:128]), + memoryport.wdata.we.eq(r_dat_m[2][0:16]), + ).Else( + memoryport.wdata.data.eq(self.b[0:128]), + ), If(memoryport.wdata.ready, NextState("MEMl2"), ), @@ -682,12 +693,18 @@ class ExecLS(ExecUnit, AutoDoc): If(self.instruction.immediate[8], NextValue(interface.cyc, 1), NextValue(interface.stb, 1), - NextValue(interface.sel, 2**len(interface.sel)-1), NextValue(interface.adr, address + 1), NextValue(interface.we, self.instruction.immediate[7]), + NextValue(interface.sel, 2**len(interface.sel)-1), NextValue(timeout, 2047), If(self.instruction.immediate[7], - NextValue(interface.dat_w, self.b[128:256])), + If(self.instruction.shift, + NextValue(interface.dat_w, (self.b << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2])))[128:256]), + NextValue(interface.sel, r_dat_m[2][16:32]), + ).Else( + NextValue(interface.dat_w, self.b[128:256]), + ), + ), NextState("MEMh") ).Else( If(self.instruction.opcode == opcodes["MEM"][0], @@ -746,8 +763,13 @@ class ExecLS(ExecUnit, AutoDoc): NextValue(lbuf[128:256], memoryport.rdata.data), NextState("MEMh2"), ).Elif(~wishbone & self.instruction.immediate[7], - memoryport.wdata.data.eq(self.b[128:256]), memoryport.wdata.valid.eq(1), + If(self.instruction.shift, + memoryport.wdata.data.eq((self.b << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2])))[128:256]), + memoryport.wdata.we.eq(r_dat_m[2][16:32]), + ).Else( + memoryport.wdata.data.eq(self.b[128:256]), + ), If(memoryport.wdata.ready, NextState("MEMh2"), ), @@ -849,6 +871,7 @@ class ExecLS(ExecUnit, AutoDoc): self.sync.mul_clk += self.state[6].eq(lsseq.ongoing("MEM_EVEN1")) self.sync.mul_clk += self.state[7].eq(lsseq.ongoing("MEM_EVEN2")) self.sync.mul_clk += self.state[8].eq(lsseq.ongoing("MEM_ERR")) + self.sync.mul_clk += self.state[9].eq(lsseq.ongoing("GENMASK_R0")) self.sync.mul_clk += self.state[28:30].eq((self.state[28:30] & Replicate(~start_pipe, 2)) | self.has_timeout) self.sync.mul_clk += self.state[30:32].eq((self.state[30:32] & Replicate(~start_pipe, 2)) | self.has_failure) @@ -1433,7 +1456,7 @@ Here are the currently implemented opcodes for The Engine: "exec_logic": True, "exec_addsub": False, "exec_rop": True, - "exec_ls": False, + "exec_ls": True, } exec_unit_shift_num = { } index = 0 diff --git a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs index 01c4b18..8660566 100644 --- a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs +++ b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs @@ -255,12 +255,8 @@ fn main() -> std::io::Result<()> { loopX_y: // setadr setadr %15, %6 - // load old data - load256 %7, ^0 - // insert pattern - psa* %7, %1 - // rewrite data - store256 %15, ^0, %7 + // write partial data + store256* %15, ^0, %1 // increment copied $DST by stride add32v %6, %6, %4 // decrement copied Y count @@ -308,13 +304,8 @@ fn main() -> std::io::Result<()> { // check for line leftovers loop256_x_end: brz4 done256_x, %6 - - // load old data - load256 %7, ^0 - // insert pattern - psa* %7, %1 - // rewrite data - store256 %15, ^0, %7 + // write partial data + store256* %15, ^0, %1 done256_x: // decrement Y count @@ -474,13 +465,13 @@ fn main() -> std::io::Result<()> { // leftover X in %6 // // live Y count in %3 // data in %7 - // masked data in %7 + // // masked data in %7 // 0/scrap in %15 // ----- // header loop: // live Y count in %9 // $SRC / $DST in %6 - // dst data in %7 + // // dst data in %7 // src data in %8 // 0/scrap in %15 @@ -493,7 +484,8 @@ fn main() -> std::io::Result<()> { resm %15 // set alignement; we shift by the addr offset setmq %15, %0, %2 - setma %15, %1, #16 + // we use b as that's the data input for Stores + setmb %15, %1, #16 // if $DST is aligned on 128 bits, jump to aligned loop brz4 start128, %0 @@ -508,12 +500,8 @@ fn main() -> std::io::Result<()> { setadr %15, %6 // load src load256 %8, ^1 - // load old data - load128 %7, ^0 - // insert data - psa* %7, %8 - // rewrite data - store128 %15, ^0, %7 + // write partial data + store128* %15, ^0, %8 // increment copied $SRC / $DST by stride add32v %6, %6, %4 // decrement copied Y count @@ -542,7 +530,7 @@ fn main() -> std::io::Result<()> { // add the count to the addresses, ^1 will have the proper shift for masking add32v %1, %1, %9 // reset a mask to the proper shifting - setma %15, %1, #16 + setmb %15, %1, #16 start128: // compute X leftovers (modulo 16 -> #15 is 15) @@ -559,10 +547,8 @@ fn main() -> std::io::Result<()> { brz32 loop128_x_end, %9 loop128_x: - // merge data from input - psa* %7, %8 // store to DST w/ post-increment - store128inc %15, ^0, %7 + store128inc* %15, ^0, %8 // sub 16 (#16 is 16) from live rounded X count sub32v %9, %9, #16 // prefetch data @@ -576,12 +562,8 @@ fn main() -> std::io::Result<()> { // set the leftovers mask (offset is 0 as we are aligned) // IMPROVE ME setmq %15, #0, %6 - // load old data - load128 %7, ^0 - // insert pattern - psa* %7, %8 // rewrite data - store128 %15, ^0, %7 + store128* %15, ^0, %8 // reset the Q mask // IMPROVE ME setmq %15, #0, #16 @@ -613,7 +595,7 @@ fn main() -> std::io::Result<()> { // main loop: // leftover X in %6 // data in %7 - // masked data in %7 + // // masked data in %7 // src data in %8 // live X count in %9 // $SRC / $DST in %10 @@ -625,7 +607,7 @@ fn main() -> std::io::Result<()> { // ----- // tail loop: // $SRC / $DST in %0 - // dst data in %7 + // // dst data in %7 // src data in %8 // live Y count in %9 // 0/scrap in %15 @@ -665,8 +647,8 @@ fn main() -> std::io::Result<()> { skip: // reset q mask (we will be aligned from now on) setmq %15, #0, #16 - // set a mask to the proper shifting - setma %15, %11, #16 + // set b mask to the proper shifting for Stores + setmb %15, %11, #16 // now we need to figure out where we start to go backward // currently we have the number of 'tail' (first column...) elements in %14 (0 for aligned), number of 'loop' elements in %13, @@ -726,12 +708,8 @@ fn main() -> std::io::Result<()> { setmq %15, #0, %6 // prefetch data loadl128dec %8, ^1, %8 - // load old data - load128 %7, ^0 - // insert data - psa* %7, %8 - // rewrite data - store128dec %15, ^0, %7 + // write partial data + store128dec* %15, ^0, %8 // reset the Q mask // IMPROVE ME setmq %15, #0, #16 @@ -741,10 +719,8 @@ fn main() -> std::io::Result<()> { brz32 loop128_x_end, %9 // prefetch data loadl128dec %8, ^1, %8 - // insert data - psa* %7, %8 // write data - store128dec %15, ^0, %7 + store128dec* %15, ^0, %8 // sub 16 (#16 is 16) from live rounded X count sub32v %9, %9, #16 // if X count is not 0, keep looping @@ -766,7 +742,7 @@ fn main() -> std::io::Result<()> { brz32 done128, %14 // set alignement; we shift by the addr offset setmq %15, %0, %2 - setma %15, %1, #16 + setmb %15, %1, #16 // copy Y psa %9, %3 loopX_y: @@ -774,12 +750,8 @@ fn main() -> std::io::Result<()> { setadr %15, %0 // load src load256 %8, ^1 - // load old data - load128 %7, ^0 - // insert data - psa* %7, %8 - // rewrite data - store128 %15, ^0, %7 + // write partial data + store128* %15, ^0, %8 // increment $SRC / $DST by stride add32v %0, %0, %4 // decrement copied Y count