more accel

2026-05-23 22:00:44 +00:00 · 2022-03-16 23:27:07 +01:00
parent a75b2a2d18
commit d8504c8713
3 changed files with 370 additions and 48 deletions
--- a/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c
+++ b/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c
@@ -124,6 +124,7 @@ struct scrolltest {
 #define GOBLIN_FILL     _IOW('X', 1, struct scrolltest)
 #define GOBLIN_FILLROP  _IOW('X', 2, struct scrolltest)
 #define GOBLIN_COPY     _IOW('X', 3, struct scrolltest)
+#define GOBLIN_COPYREV  _IOW('X', 4, struct scrolltest)

 static int 	goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *);
 static paddr_t	goblin_mmap(void *, void *, off_t, int);
@@ -146,6 +147,7 @@ static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose,
 static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n);
 static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop);
 static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
+static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
 static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9,
 												0x00004005,0xfb000809,0x0000000a,0x0000000a };
 static const uint32_t program_fill128[11] =   { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005,
@@ -153,24 +155,33 @@ static const uint32_t program_fill128[11] =   { 0x407c0012,0x00140080,0x607c1013
 static const uint32_t program_fill256[14] =   { 0x01bc0014,0x001a6087,0x013c6814,0x403c0012,0x00146086,0xe03c1013,0x00165146,0xfe800148,
 												0x000e10c6,0x010000c9,0x00004005,0xfb800809,0x0000000a,0x0000000a };

-static const uint32_t program_fill[38] =      { 0x11800089,0x110000c9,0x01bc0014,0x0800000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,
-												0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00166007,0x00145946,
-												0x0014214f,0x00005005,0x00085086,0x08000089,0x001a6087,0x013c6814,0x403c0012,0x00146086,
-												0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060,0xc03c7013,
-												0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a };
+static const uint32_t program_fill[39] =      { 0x12000089,0x118000c9,0x01bc0014,0x0880000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,
+												0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00226007,0x00208946,
+												0x0020220f,0x00008005,0x00088086,0x01048050,0x08000089,0x001a6087,0x013c6814,0x403c0012,
+												0x00146086,0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060,
+												0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a };

-static const uint32_t program_fillrop[41] =   { 0x13000089,0x128000c9,0x01bc0014,0x003c014c,0x0800000d,0x013c2014,0x002000c0,0x00180000,
+static const uint32_t program_fillrop[42] =   { 0x13800089,0x130000c9,0x01bc0014,0x003c014c,0x0880000d,0x013c2014,0x002000c0,0x00180000,
 												0x403c0192,0x801c0013,0x001c11e2,0xc03c7013,0x00184185,0x00221206,0xfc800208,0x00226007,
-												0x00208946,0x0020220f,0x00008005,0x00088086,0x09000089,0x001a6087,0x013c6814,0x403c0012,
-												0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,0x0180018d,
-												0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,
-												0x0000000a};
+												0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x09000089,0x001a6087,0x013c6814,
+												0x403c0012,0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,
+												0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,
+												0x0000000a,0x0000000a };

-static const uint32_t program_copy[49] =      { 0x17000089,0x168000c9,0x01bc0014,0x0b80000d,0x013f0014,0x003f0054,0x00380011,0x001400c0,0x00180000,0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00161146,0xfc000148,0x0016f007,0x00145c06,0x0014214f,0x00140150,0x00005005,0x00085086,0x0b800089,0x013f0814,0x00045045,0x003f0054,0x001af087,0x403c0012,0x00146086,0xa0a00013,0x02800149,0x001c0220,0x603c7013,0x00170146,0x20a08015,0xfd800148,0x0280018d,0x013c6814,0x001c0013,0x001c0220,0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
+static const uint32_t program_copy[48] =      {  0x16800089,0x160000c9,0x01bc0014,0x0b00000d,0x013f0014,0x003f0054,0x002400c0,0x00180000,
+												 0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248,
+												 0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814,
+												 0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220,
+												 0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220,
+												 0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };

-static const uint32_t* programs[7] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, program_copy, NULL };
-static const uint32_t program_len[7] = { 12, 11, 14, 38, 41, 49, 0 };
-static       uint32_t program_offset[7];
+static const uint32_t program_copyrev[66] =   {  0x1f800089,0x1f0000c9,0x01bc0014,0x003af007,0x00280000,0x002c0040,0x00340080,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x060003c9,0x003c03d0,0x0028f285,0x002cf2c5,0x00800188,0x002b0286,0x003ef2c7,0x020003c9,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013f0014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a }; /* copyrev microcode: 66 32-bit words, slot 6 of programs[]; presumably pasted from the "copyrev:" dump printed by jareth_code.rs -- regenerate rather than hand-edit (TODO confirm) */
+
+static const uint32_t* programs[8] = { program_scroll128, program_fill128, program_fill256, program_fill, /* slots 0-3 */
+									   program_fillrop,   program_copy,    program_copyrev, NULL }; /* slots 4-6; slot 7 is NULL (length 0 below) */
+static const uint32_t program_len[8] = { 12, 11, 14, 39, /* length of each program in 32-bit words, same order as programs[]; must equal the array sizes above */
+										 42, 48, 66,  0 };
+static       uint32_t program_offset[8]; /* per-slot value handed to jareth_mpstart_write() when a job is started; presumably filled in when the programs are uploaded (not visible here) */

 static void goblin_set_depth(struct goblin_softc *, int);

@@ -413,6 +424,12 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
 	}
 		break;

+	case GOBLIN_COPYREV: {
+		struct scrolltest *st = (struct scrolltest *)data;
+		jareth_copyrev(sc, jareth_verbose, st->y0, st->y1, st->x0, st->w, st->n, /* x1 */ st->pm, st->rop);
+	}
+		break;
+
 	default:
 		return (ENOTTY);
 	}
@@ -938,6 +955,53 @@ static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, i
 	return 0;
 }

+static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) {
+	const uint32_t base = 0; /* NOTE(review): not referenced anywhere in this function */
+	const int pidx = 6; // copyrev: slot of program_copyrev in programs[] / program_len[] / program_offset[]
+	/* int i; */
+	/* Purpose: write the rectangle operands (two addresses, w, n, stride) into the register file, select the copyrev program and run it as one job. */
+	/* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */
+	/* NOTE(review): 'rop' is accepted but never used below; presumably (x0,y0) is the source and (x1,y1) the destination -- confirm against the microcode's %0/%1 layout. */
+	power_on(sc);
+
+	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,0), (sc->sc_internal_adr + y1 * sc->sc_stride + x1)); /* reg 0, sub 0: address of (x1, y1) */
+	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,1), (sc->sc_internal_adr + y0 * sc->sc_stride + x0)); /* reg 0, sub 1: address of (x0, y0) */
+	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,0), (sc->sc_internal_adr + y0 * sc->sc_stride + x0)); /* reg 1, sub 0: address of (x0, y0), i.e. reg 0 with the two entries swapped */
+	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,1), (sc->sc_internal_adr + y1 * sc->sc_stride + x1)); /* reg 1, sub 1: address of (x1, y1) */
+	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,0), (w)); /* reg 2, sub 0: w (the microcode header calls %2 the X size) */
+	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,0), (n)); /* reg 3, sub 0: n (the microcode header calls %3 the Y size) */
+	/* for (i = 1 ; i < 8 ; i++) { */
+	/* 	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,i), 0); */
+	/* 	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), 0); */
+	/* } */
+	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,0), (sc->sc_stride)); /* reg 4, sub 0: line stride */
+	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,1), (sc->sc_stride)); /* reg 4, sub 1: same stride, so both addresses advance identically per line */
+	jareth_mpstart_write(sc, program_offset[pidx]); /* select the copyrev program by its offset */
+	jareth_mplen_write(sc, program_len[pidx]); /* program length in 32-bit words */
+
+#if 0 /* disabled debug aid: dumps registers 0..15, eight 32-bit sub-registers each, highest sub-register first */
+	{
+			uint32_t data[8];
+			int i, j;
+			char buf[512];
+			for (i = 0 ; i < 16 ; i++) {
+				for (j = 0 ; j < 8 ; j++)
+					data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j));
+				snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]);
+				aprint_normal("reg%d : %s\n", i, buf);
+			}
+		}
+#endif
+	
+	(void)start_job(sc, verbose); /* result deliberately discarded */
+	delay(1);
+	(void)wait_job(sc, 1, verbose); /* result discarded too, so a failed or timed-out job is not reported to the caller */
+
+	power_off(sc);
+
+	return 0; /* unconditional: job status is never propagated */
+}
+
 static void
 jareth_copyrows(void *cookie, int src, int dst, int n)
 {
--- a/sbus-to-ztex-gateware-migen/jareth.py
+++ b/sbus-to-ztex-gateware-migen/jareth.py
@@ -11,31 +11,48 @@ field_latex = "$\mathbf{{F}}_{{{{2^{{255}}}}-19}}$"

 opcode_bits = 5  # number of bits used to encode the opcode field
 opcodes = {  # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shift a/b/q (star)
-    "UDF" : [-1, "Placeholder for undefined opcodes"],
-    "PSA" : [0, "Wd $\gets$ Ra  // pass A"],
-    "PSB" : [1, "Wd $\gets$ Rb  // pass B"], # for star version mostly
+    "UDF" :    [-1, "Placeholder for undefined opcodes"],
+    "PSA" :    [0, "Wd $\gets$ Ra  // pass A"],
+    "PSB" :    [1, "Wd $\gets$ Rb  // pass B"], # for star version mostly
    "ROP32V" : [2, "Wd $\gets$ ((Rb ROP Ra) & planemask) | (Ra & ~planemask)" ], # replace MSK
-    "XOR" : [3, "Wd $\gets$ Ra ^ Rb  // bitwise XOR"],
-    "NOT" : [4, "Wd $\gets$ ~Ra   // binary invert"],
+    "XOR" :    [3, "Wd $\gets$ Ra ^ Rb  // bitwise XOR"],
+    "NOT" :    [4, "Wd $\gets$ ~Ra   // binary invert"],
    "ADD32V" : [5, "Wd[x..x+32] $\gets$ Ra[x..x+32] + Rb[x..x+32] // vector 32-bit binary add"],
    "SUB32V" : [6, "Wd[x..x+32] $\gets$ Ra[x..x+32] - Rb[x..x+32] // vector 32-bit binary sub"],
-    "AND" : [7, "Wd $\gets$ Ra & Rb  // bitwise AND"], # replace MUL
+    "AND" :    [7, "Wd $\gets$ Ra & Rb  // bitwise AND"], # replace MUL
    "BRNZ32" : [8, "If Ra[0:32] != 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1  // Branch if non-zero (32-bits)"], # replace TRD
-    "BRZ32" : [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1  // Branch if zero (32-bits)"],
-    "FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"],
-    "SHL" : [11, "Wd $\gets$ Ra << 1  // shift Ra left by one and store in Wd"],
-    "SROP" : [12, "set planemask & rop from Ra[0:32] and Ra[32:36]" ], # was XBT
-    "BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1  // Branch if zero (4-bits)"],
-    "BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1  // Branch if zero (5-bits)"],
+    "BRZ32" :  [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1  // Branch if zero (32-bits)"],
+    "FIN" :    [10, "halt execution and assert interrupt to host CPU that microcode execution is done"],
+    "SHL" :    [11, "Wd $\gets$ Ra << 1  // shift Ra left by one and store in Wd"],
+    "SROP" :   [12, "set planemask & rop from Ra[0:32] and Ra[32:36]" ], # was XBT
+    "BRZ4" :   [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1  // Branch if zero (4-bits)"],
+    "BRZ5" :   [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1  // Branch if zero (5-bits)"],
    "MIN32V" : [15, "Wd[x..x+32] $\gets$ umin(Ra[x..x+32], Rb[x..x+32]) // vector 32-bit umin"],
-    "BCAST32" : [16, "Wd[x..x+32] $\gets$ Ra[0..32]"],
+    "MANIP32": [16, "imm[0..1] == 0 BCAST32 = Wd[x..x+32] $\gets$ Ra[0..32], imm[0..1] == 1 SWAP32, imm[0..1] == 2 ROTR32V"],
+    "GETM":    [17, "GETM: getmask" ],
+    "ADR":     [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
    # for MEM, bit #31 (imm[8]) indicates both lanes are needed; imm[31] == 0 faster as the second access is not done ;
-    "GETM": [17, "GETM: getmask" ],
-    "ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
-    "MEM" : [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc in imm[6], address in addr[imm[0...]]" ],
-    "SETM" : [20, "SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ],
-    "LOADH" : [21, "LOADH: imm[7] == 0 for LOAD, address in addr[imm[0...]], high->low & load a+16 into high" ],
-    "MAX" : [22, "Maximum opcode number (for bounds checking)"],
+    # "MEM" imm:
+    #   imm[8]: 256 bits mode
+    #   imm[7]: L/S
+    #   imm[6]: post-inc
+    #   imm[5]: post-dec
+    #   imm[4]: 
+    #   imm[3]: 
+    #   imm[0..2]: adr reg
+    "MEM" :    [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc in imm[6], address in addr[imm[0...]]" ],
+    "SETM" :   [20, "SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ],
+    # "LOADH/L" imm:
+    #   imm[8]: 0
+    #   imm[7]: 0
+    #   imm[6]: post-inc
+    #   imm[5]: post-dec
+    #   imm[4]: 
+    #   imm[3]: 
+    #   imm[0..2]: adr reg
+    "LOADH" :  [21, "LOADH: high->low & load *Adr into high" ],
+    "LOADL" :  [22, "LOADL: low->high & load *Adr into low" ],
+    "MAX" :    [23, "Maximum opcode number (for bounds checking)"],
 }

 num_registers = 32
@@ -334,7 +351,7 @@ passthrough.

 class ExecAddSub(ExecUnit, AutoDoc):
    def __init__(self, width=256):
-        ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "BCAST32" ])
+        ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "MANIP32" ])
        self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f"""
        """)

@@ -347,12 +364,25 @@ class ExecAddSub(ExecUnit, AutoDoc):
                   [ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] + self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
            ).Elif(self.instruction.opcode == opcodes["SUB32V"][0],
                   [ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] - self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
-            ).Elif(self.instruction.opcode == opcodes["BCAST32"][0],
-                   [ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ],
            ).Elif(self.instruction.opcode == opcodes["MIN32V"][0],
                   [ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]),
                        self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32])
-                       ).Else(self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ],
+                       ).Else(
+                           self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32]))
+                     for x in range(0, width//32) ],
+            ).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 0), # BCAST32
+                   [ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ],
+            ).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 1), # SWAP32
+                   [ self.q[x*32:(x+1)*32].eq(self.a[(x^1)*32:((x^1)+1)*32]) for x in range(0, width//32) ], 
+            ).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 2), # ROTR32V
+                   Case(self.b[0:2], {
+                       0: [ self.q[x*32:(x+1)*32].eq(    self.a[x*32   :(x+1)*32])                        for x in range(0, width//32) ],
+                       1: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+ 8:(x+1)*32], self.a[x*32:x*32+ 8])) for x in range(0, width//32) ],
+                       2: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+16:(x+1)*32], self.a[x*32:x*32+16])) for x in range(0, width//32) ],
+                       3: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+24:(x+1)*32], self.a[x*32:x*32+24])) for x in range(0, width//32) ],
+                   }),
+            ).Else(
+                [ self.q[x*32:(x+1)*32].eq(0xDEADBEEF) for x in range(0, width//32) ]
            )
        ]
        
@@ -418,7 +448,7 @@ class ExecRop(ExecUnit, AutoDoc):

 class ExecLS(ExecUnit, AutoDoc):
    def __init__(self, width=256, interface=None, memoryport=None, r_dat_f=None, r_dat_m=None, granule=0):
-        ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "GETM"])
+        ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "LOADL", "GETM"])
        
        self.notes = ModuleDoc(title=f"Load/Store ExecUnit Subclass", body=f"""
        """)
@@ -470,7 +500,7 @@ class ExecLS(ExecUnit, AutoDoc):

        lsseq.act("IDLE",
                  If(start_pipe,
-                     If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]),
+                     If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]),
                        NextValue(cpar, 0),
                        NextValue(address, addresses[self.instruction.immediate[0:log2_int(width//32)]]),
                        NextValue(wishbone, ~(addresses[self.instruction.immediate[0:log2_int(width//32)]] == 0x8)),
@@ -554,6 +584,26 @@ class ExecLS(ExecUnit, AutoDoc):
                                NextState("MEMh")
                             )
                         )
+                  ).Elif(self.instruction.opcode == opcodes["LOADL"][0],
+                         NextValue(self.has_timeout, 0),
+                         NextValue(self.has_failure, 0),
+                         NextValue(timeout, 2047),
+                         NextValue(lbuf[128:256], self.b[0:128]),
+                         If(wishbone,
+                            NextValue(interface.cyc, 1),
+                            NextValue(interface.stb, 1),
+                            NextValue(interface.sel, 2**len(interface.sel)-1),
+                            NextValue(interface.adr, address),
+                            NextValue(interface.we, self.instruction.immediate[7]),
+                            NextState("MEMl")
+                         ).Else(
+                             memoryport.cmd.we.eq(self.instruction.immediate[7]),
+                             memoryport.cmd.addr.eq(address[0:]),
+                             memoryport.cmd.valid.eq(1),
+                             If(memoryport.cmd.ready,
+                                NextState("MEMl")
+                             )
+                         )
                  )
        )
        for X in range(0, granule_num):
@@ -615,6 +665,8 @@ class ExecLS(ExecUnit, AutoDoc):
                  If(wishbone & ~interface.ack,
                     If(self.instruction.immediate[6], # post-inc
                        NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
+                     ).Elif(self.instruction.immediate[5], # post-dec
+                        NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
                     ),
                     If(self.instruction.immediate[8],
                        NextValue(interface.cyc, 1),
@@ -627,7 +679,9 @@ class ExecLS(ExecUnit, AutoDoc):
                           NextValue(interface.dat_w, self.b[128:256])),
                        NextState("MEMh")
                     ).Else(
-                         NextValue(lbuf[128:256], 0),
+                         If(self.instruction.opcode == opcodes["MEM"][0],
+                            NextValue(lbuf[128:256], 0),
+                         ),
                         If(cpar, ## checkme
                            NextState("MEM_ODD")
                         ).Else(
@@ -643,12 +697,16 @@ class ExecLS(ExecUnit, AutoDoc):
                            If(memoryport.cmd.ready,
                               If(self.instruction.immediate[6], # post-inc
                                  NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
+                               ).Elif(self.instruction.immediate[5], # post-dec
+                                      NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
                               ),
                               NextState("MEMh"),
                            )
                         ).Else( # no high
                             If(self.instruction.immediate[6], # post-inc
                                NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
+                             ).Elif(self.instruction.immediate[5], # post-dec
+                                    NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
                             ),
                             NextValue(lbuf[128:256], 0),
                             If(cpar, ## checkme
@@ -691,6 +749,8 @@ class ExecLS(ExecUnit, AutoDoc):
                  If(wishbone & ~interface.ack,
                     If(self.instruction.immediate[6], # post-inc
                        NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
+                     ).Elif(self.instruction.immediate[5], # post-dec
+                        NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
                     ),
                     #NextValue(tries, 0),
                     If(cpar, ## checkme
@@ -701,6 +761,8 @@ class ExecLS(ExecUnit, AutoDoc):
                  ).Elif(~wishbone,
                         If(self.instruction.immediate[6], # post-inc
                            NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
+                         ).Elif(self.instruction.immediate[5], # post-dec
+                                NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
                         ),
                         If(cpar, ## checkme
                            NextState("MEM_ODD")
@@ -733,7 +795,7 @@ class ExecLS(ExecUnit, AutoDoc):
        self.sync.mul_clk += [
            If(lsseq.ongoing("MEM_EVEN1") | lsseq.ongoing("MEM_EVEN2"),
               self.q_valid.eq(1),
-               If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]),
+               If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]),
                  If(~self.instruction.immediate[7],
                     self.q.eq(lbuf),
                  ).Else(
--- a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs
+++ b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs
@@ -212,6 +212,7 @@ fn main() -> std::io::Result<()> {
 				fin
 	);

+//	FILL ********************************************************************************************************
    let mcode_fill = assemble_jareth!(
 	// x..x / $DST in %0
 	// 128-bits pattern in %1 [assumed to be alignement-homogneous]
@@ -269,15 +270,17 @@ fn main() -> std::io::Result<()> {

 		loopX_done:
 				// how much did we do (#6 is 31, #5 is 32)
-				and %5, %0, #6
+				and %8, %0, #6
 				// compute 32-(x&31)
-				sub32v %5, #5, %5
+				sub32v %8, #5, %8
 				// compute the proper value
-				min32v %5, %5, %2
+				min32v %8, %8, %2
 				// add that to the address, which will now be aligned
-				add32v %0, %0, %5
+				add32v %0, %0, %8
 				// remove from X, as we have done it
-				sub32v %2, %2, %5
+				sub32v %2, %2, %8
+				// rotate the pattern to match
+				rotr32v %1, %1, %8
 				// fall through the aligned loop if not 0
 				brz32 done256, %2

@@ -329,6 +332,7 @@ fn main() -> std::io::Result<()> {
 				fin
 	);
 	
+//	FILL ROP ********************************************************************************************************
    let mcode_fillrop = assemble_jareth!(
 	// x..x / $DST in %0
 	// 128-bits pattern in %1 [assumed to be alignement-homogeneous]
@@ -398,6 +402,8 @@ fn main() -> std::io::Result<()> {
 				add32v %0, %0, %8
 				// remove from X, as we have done it
 				sub32v %2, %2, %8
+				// rotate the pattern to match
+				rotr32v %1, %1, %8
 				// fall through the aligned loop if not 0, otherwise done
 				brz32 done256, %2

@@ -455,6 +461,7 @@ fn main() -> std::io::Result<()> {



+//	COPY ********************************************************************************************************
    let mcode_copy = assemble_jareth!(
 	// x..x / $SRC / $DST in %0
 	// x..x / $DST / $SRC in %1
@@ -490,10 +497,8 @@ fn main() -> std::io::Result<()> {
 				// do the first column to align $DST
 		startX:
 				// set alignement; we shift by the addr offset
-				//and %14, %2, #15
 				setmq %15, %0, #16
 				setma %15, %1, #16
-				getm %14
 				// copy Y
 				psa %9, %3
 				// copy $SRC / $DST
@@ -531,6 +536,7 @@ fn main() -> std::io::Result<()> {
 				sub32v %2, %2, %9
 				// fall through to the aligned loop if not 0
 				brz32 done128, %2
+				
 				// reset q mask (we will be aligned from now on)
 				setmq %15, #0, #16
 				// add the count to the addresses, ^1 will have the proper shift for masking
@@ -596,6 +602,187 @@ fn main() -> std::io::Result<()> {
 				fin
 	);

+//	COPYREV  ********************************************************************************************************
+    let mcode_copyrev = assemble_jareth!(
+	// x..x / $SRC / $DST in %0
+	// x..x / $DST / $SRC in %1
+	// x..x / X size in %2
+	// x..x / Y size in %3,
+	// x..x src_stride / dst_stride in %4 (screen width?)
+	// -----
+	// main loop:
+	// leftover X in %6
+	// data in %7
+	// masked data in %7
+	// live X count in %9
+	// $SRC / $DST in %10
+	// $DST / $SRC in %11
+	// live Y count in %12, also scratch in header
+	// todo X count in %13
+	// amount of work in tail in %14
+	// 0/scrap in %15
+	// -----
+	// tail loop:
+	// $SRC / $DST in %0
+	// dst data in %7
+	// src data in %8
+	// live Y count in %9
+	// 0/scrap in %15
+	
+
+        start:
+				// if the number of lines or of elements per line is 0, exit early
+				brz32 done128, %2
+				brz32 done128, %3
+				// reset masks
+				resm %15
+				// compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16)
+				and %14, %0, #15
+				// copy addresses
+				psa %10, %0
+				psa %11, %1
+				// set todo X
+				psa %13, %2
+				// if 0, then we don't need a tail loop, so skip extra computation (that would wrongly give 16)
+				brz32 skip, %14
+				
+				// it is at most 16-($DST & 15)
+				sub32v %14, #16, %14
+				// compute the proper value by bounding to Xsize
+				min32v %14, %14, %2
+				// more than one address to increment
+				bcast32 %14, %14
+				// add the count to the addresses, SRC will now be aligned
+				add32v %10, %10, %14
+				// add the count to the addresses, DST will have the proper alignment to shift input in the aligned loop
+				add32v %11, %11, %14
+				// so, do we do everything there ?
+				sub32v %13, %2, %14
+				// if 0, everything is done in the tail loop, so skip the aligned loop
+				brz32 startX, %13
+				
+		skip:
+				// reset q mask (we will be aligned from now on)
+				setmq %15, #0, #16
+				// reset a mask to the proper shifting
+				setma %15, %11, #16
+
+				// now we need to figure out where we start to go backward
+				// currently we have the number of 'tail' (first column) elements in %14 (0 for aligned), number of 'loop' elements in %13,
+				// and $SRC+%14 & $DST+%14 in %10/%11, with $SRC+%14 aligned.
+				// compute X leftovers (%13 modulo 16 -> #15 is 15) in %6, we will have to start with those
+				and %6, %13, #15
+				// compute the 'aligned' number of elements
+				sub32v %15, %13, %6
+				// if 0, jump to the main loop as we already have the proper addresses
+				brz32 loop128_y, %15
+				
+				bcast32 %15, %15
+				// add the aligned number of element to $SRC+%14 & $DST+%14
+				add32v %10, %10, %15
+				add32v %11, %11, %15
+				
+				// if %6 is 0 (no leftovers), then $DST is pointing after the last element, so we need to remove 16 from $DST
+				brnz32 skip2, %6
+				sub32v %10, %10, #16
+		skip2:  // if $SRC is not aligned, we also need to add 16 (for prefetch)
+				and %15, %11, #15
+				brz32 skip3, %15
+				add32v %11, %11, #16
+				psa %15, #16
+				swap32 %15, %15
+				add32v %10, %10, %15
+
+		skip3:
+				// copy Y count
+				psa %12, %3
+				
+		loop128_y:
+				// set source and destination addresses for current Y // FIXME : +X, -1?
+				setadr %15, %10
+				// then the rounded value in X
+				sub32v %9, %13, %6
+				// prefetch data
+				
+				// prefetch data
+				load128dec %8, ^1
+				
+				// check for line leftovers
+		loop128_x_begin:
+				brz4 loop128_x, %6
+				
+				// set the leftovers mask (offset is 0 as we are aligned)
+				// IMPROVE ME
+				setmq %15, #0, %6
+				// prefetch data
+				loadl128dec %8, ^1, %8
+				// load old data
+				load128 %7, ^0
+				// insert data
+				psa* %7, %8
+				// rewrite data
+				store128dec %15, ^0, %7
+				// reset the Q mask
+				// IMPROVE ME
+				setmq %15, #0, #16
+				
+		loop128_x:
+				// already 0, bypass aligned stuff
+				brz32 loop128_x_end, %9
+				// prefetch data
+				loadl128dec %8, ^1, %8
+				// insert data
+				psa* %7, %8
+				// write data
+				store128dec %15, ^0, %7
+				// sub 16 (#16 is 16) from live rounded X count
+				sub32v %9, %9, #16
+				// if X count is not 0, keep looping
+				brnz32 loop128_x, %9
+
+		loop128_x_end:
+				// decrement Y count
+				sub32v %12, %12, #1
+				// if 0, finished
+				brz32 startX, %12
+				
+				// add strides to initial addresses
+				add32v %10, %10, %4
+				// loop128 to do next line
+				brz32 loop128_y, #0
+
+		startX:
+				// do the first column if we need to
+				brz32 done128, %14
+				// set alignment; we shift by the addr offset
+				setmq %15, %0, #16
+				setma %15, %1, #16
+				// copy Y
+				psa %9, %3
+		loopX_y:
+				// setadr from the start
+				setadr %15, %0
+				// load src
+				load256 %8, ^1
+				// load old data
+				load128 %7, ^0
+				// insert data
+				psa* %7, %8
+				// rewrite data
+				store128 %15, ^0, %7
+				// increment $SRC / $DST by stride
+				add32v %0, %0, %4
+				// decrement copied Y count
+				sub32v %9, %9, #1
+				// if not zero, continue
+				brnz32 loopX_y, %9
+				
+		done128:		
+				fin
+				fin
+	);
+
+//	******  ********************************************************************************************************

    let mut pos;

@@ -671,5 +858,14 @@ fn main() -> std::io::Result<()> {
 	println!("");
 	println!("-> {}", mcode_copy.len());

+	pos = 0;
+	println!("copyrev:");
+    while pos < mcode_copyrev.len() {
+		  print!("0x{:08x},", mcode_copyrev[pos]);
+		  pos = pos + 1;
+    }
+	println!("");
+	println!("-> {}", mcode_copyrev.len());
+
 	Ok(())
 }