1
0
mirror of synced 2026-03-03 17:56:30 +00:00

more accel

This commit is contained in:
Romain Dolbeau
2022-03-16 23:27:07 +01:00
parent a75b2a2d18
commit d8504c8713
3 changed files with 370 additions and 48 deletions

View File

@@ -124,6 +124,7 @@ struct scrolltest {
#define GOBLIN_FILL _IOW('X', 1, struct scrolltest)
#define GOBLIN_FILLROP _IOW('X', 2, struct scrolltest)
#define GOBLIN_COPY _IOW('X', 3, struct scrolltest)
#define GOBLIN_COPYREV _IOW('X', 4, struct scrolltest)
static int goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *);
static paddr_t goblin_mmap(void *, void *, off_t, int);
@@ -146,6 +147,7 @@ static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose,
static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n);
static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop);
static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9,
0x00004005,0xfb000809,0x0000000a,0x0000000a };
static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005,
@@ -153,24 +155,33 @@ static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013
static const uint32_t program_fill256[14] = { 0x01bc0014,0x001a6087,0x013c6814,0x403c0012,0x00146086,0xe03c1013,0x00165146,0xfe800148,
0x000e10c6,0x010000c9,0x00004005,0xfb800809,0x0000000a,0x0000000a };
static const uint32_t program_fill[38] = { 0x11800089,0x110000c9,0x01bc0014,0x0800000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,
0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00166007,0x00145946,
0x0014214f,0x00005005,0x00085086,0x08000089,0x001a6087,0x013c6814,0x403c0012,0x00146086,
0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060,0xc03c7013,
0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a };
static const uint32_t program_fill[39] = { 0x12000089,0x118000c9,0x01bc0014,0x0880000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,
0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00226007,0x00208946,
0x0020220f,0x00008005,0x00088086,0x01048050,0x08000089,0x001a6087,0x013c6814,0x403c0012,
0x00146086,0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060,
0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a };
static const uint32_t program_fillrop[41] = { 0x13000089,0x128000c9,0x01bc0014,0x003c014c,0x0800000d,0x013c2014,0x002000c0,0x00180000,
static const uint32_t program_fillrop[42] = { 0x13800089,0x130000c9,0x01bc0014,0x003c014c,0x0880000d,0x013c2014,0x002000c0,0x00180000,
0x403c0192,0x801c0013,0x001c11e2,0xc03c7013,0x00184185,0x00221206,0xfc800208,0x00226007,
0x00208946,0x0020220f,0x00008005,0x00088086,0x09000089,0x001a6087,0x013c6814,0x403c0012,
0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,0x0180018d,
0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,
0x0000000a};
0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x09000089,0x001a6087,0x013c6814,
0x403c0012,0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,
0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,
0x0000000a,0x0000000a };
static const uint32_t program_copy[49] = { 0x17000089,0x168000c9,0x01bc0014,0x0b80000d,0x013f0014,0x003f0054,0x00380011,0x001400c0,0x00180000,0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00161146,0xfc000148,0x0016f007,0x00145c06,0x0014214f,0x00140150,0x00005005,0x00085086,0x0b800089,0x013f0814,0x00045045,0x003f0054,0x001af087,0x403c0012,0x00146086,0xa0a00013,0x02800149,0x001c0220,0x603c7013,0x00170146,0x20a08015,0xfd800148,0x0280018d,0x013c6814,0x001c0013,0x001c0220,0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
static const uint32_t program_copy[48] = { 0x16800089,0x160000c9,0x01bc0014,0x0b00000d,0x013f0014,0x003f0054,0x002400c0,0x00180000,
0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248,
0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814,
0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220,
0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220,
0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
static const uint32_t* programs[7] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, program_copy, NULL };
static const uint32_t program_len[7] = { 12, 11, 14, 38, 41, 49, 0 };
static uint32_t program_offset[7];
static const uint32_t program_copyrev[66] = { 0x1f800089,0x1f0000c9,0x01bc0014,0x003af007,0x00280000,0x002c0040,0x00340080,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x060003c9,0x003c03d0,0x0028f285,0x002cf2c5,0x00800188,0x002b0286,0x003ef2c7,0x020003c9,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013f0014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a };
static const uint32_t* programs[8] = { program_scroll128, program_fill128, program_fill256, program_fill,
program_fillrop, program_copy, program_copyrev, NULL };
static const uint32_t program_len[8] = { 12, 11, 14, 39,
42, 48, 66, 0 };
static uint32_t program_offset[8];
static void goblin_set_depth(struct goblin_softc *, int);
@@ -413,6 +424,12 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
}
break;
case GOBLIN_COPYREV: {
struct scrolltest *st = (struct scrolltest *)data;
jareth_copyrev(sc, jareth_verbose, st->y0, st->y1, st->x0, st->w, st->n, /* x1 */ st->pm, st->rop);
}
break;
default:
return (ENOTTY);
}
@@ -938,6 +955,53 @@ static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, i
return 0;
}
static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) {
const uint32_t base = 0;
const int pidx = 6; // copyrev
/* int i; */
/* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */
power_on(sc);
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,0), (sc->sc_internal_adr + y1 * sc->sc_stride + x1));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,1), (sc->sc_internal_adr + y0 * sc->sc_stride + x0));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,0), (sc->sc_internal_adr + y0 * sc->sc_stride + x0));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,1), (sc->sc_internal_adr + y1 * sc->sc_stride + x1));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,0), (w));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,0), (n));
/* for (i = 1 ; i < 8 ; i++) { */
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,i), 0); */
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), 0); */
/* } */
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,0), (sc->sc_stride));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,1), (sc->sc_stride));
jareth_mpstart_write(sc, program_offset[pidx]);
jareth_mplen_write(sc, program_len[pidx]);
#if 0
{
uint32_t data[8];
int i, j;
char buf[512];
for (i = 0 ; i < 16 ; i++) {
for (j = 0 ; j < 8 ; j++)
data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j));
snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]);
aprint_normal("reg%d : %s\n", i, buf);
}
}
#endif
(void)start_job(sc, verbose);
delay(1);
(void)wait_job(sc, 1, verbose);
power_off(sc);
return 0;
}
static void
jareth_copyrows(void *cookie, int src, int dst, int n)
{

View File

@@ -11,31 +11,48 @@ field_latex = "$\mathbf{{F}}_{{{{2^{{255}}}}-19}}$"
opcode_bits = 5 # number of bits used to encode the opcode field
opcodes = { # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shift a/b/q (star)
"UDF" : [-1, "Placeholder for undefined opcodes"],
"PSA" : [0, "Wd $\gets$ Ra // pass A"],
"PSB" : [1, "Wd $\gets$ Rb // pass B"], # for star version mostly
"UDF" : [-1, "Placeholder for undefined opcodes"],
"PSA" : [0, "Wd $\gets$ Ra // pass A"],
"PSB" : [1, "Wd $\gets$ Rb // pass B"], # for star version mostly
"ROP32V" : [2, "Wd $\gets$ ((Rb ROP Ra) & planemask) | (Ra & ~planemask)" ], # replace MSK
"XOR" : [3, "Wd $\gets$ Ra ^ Rb // bitwise XOR"],
"NOT" : [4, "Wd $\gets$ ~Ra // binary invert"],
"XOR" : [3, "Wd $\gets$ Ra ^ Rb // bitwise XOR"],
"NOT" : [4, "Wd $\gets$ ~Ra // binary invert"],
"ADD32V" : [5, "Wd[x..x+32] $\gets$ Ra[x..x+32] + Rb[x..x+32] // vector 32-bit binary add"],
"SUB32V" : [6, "Wd[x..x+32] $\gets$ Ra[x..x+32] - Rb[x..x+32] // vector 32-bit binary sub"],
"AND" : [7, "Wd $\gets$ Ra & Rb // bitwise AND"], # replace MUL
"AND" : [7, "Wd $\gets$ Ra & Rb // bitwise AND"], # replace MUL
"BRNZ32" : [8, "If Ra[0:32] != 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if non-zero (32-bits)"], # replace TRD
"BRZ32" : [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (32-bits)"],
"FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"],
"SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"],
"SROP" : [12, "set planemask & rop from Ra[0:32] and Ra[32:36]" ], # was XBT
"BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"],
"BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"],
"BRZ32" : [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (32-bits)"],
"FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"],
"SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"],
"SROP" : [12, "set planemask & rop from Ra[0:32] and Ra[32:36]" ], # was XBT
"BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"],
"BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"],
"MIN32V" : [15, "Wd[x..x+32] $\gets$ umin(Ra[x..x+32], Rb[x..x+32]) // vector 32-bit umin"],
"BCAST32" : [16, "Wd[x..x+32] $\gets$ Ra[0..32]"],
"MANIP32": [16, "imm[0..1] == 0 BCAST32 = Wd[x..x+32] $\gets$ Ra[0..32], imm[0..1] == 1 SWAP32, imm[0..1] == 2 ROTR32V"],
"GETM": [17, "GETM: getmask" ],
"ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
# for MEM, bit #31 (imm[8]) indicates both lanes are needed; imm[31] == 0 faster as the second access is not done ;
"GETM": [17, "GETM: getmask" ],
"ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
"MEM" : [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc in imm[6], address in addr[imm[0...]]" ],
"SETM" : [20, "SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ],
"LOADH" : [21, "LOADH: imm[7] == 0 for LOAD, address in addr[imm[0...]], high->low & load a+16 into high" ],
"MAX" : [22, "Maximum opcode number (for bounds checking)"],
# "MEM" imm:
# imm[8]: 256 bits mode
# imm[7]: L/S
# imm[6]: post-inc
# imm[5]: post-dec
# imm[4]:
# imm[3]:
# imm[0..2]: adr reg
"MEM" : [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc in imm[6], address in addr[imm[0...]]" ],
"SETM" : [20, "SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ],
# "LOADH/L" imm:
# imm[8]: 0
# imm[7]: 0
# imm[6]: post-inc
# imm[5]: post-dec
# imm[4]:
# imm[3]:
# imm[0..2]: adr reg
"LOADH" : [21, "LOADH: high->low & load *Adr into high" ],
"LOADL" : [22, "LOADL: low->high & load *Adr into low" ],
"MAX" : [23, "Maximum opcode number (for bounds checking)"],
}
num_registers = 32
@@ -334,7 +351,7 @@ passthrough.
class ExecAddSub(ExecUnit, AutoDoc):
def __init__(self, width=256):
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "BCAST32" ])
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "MANIP32" ])
self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f"""
""")
@@ -347,12 +364,25 @@ class ExecAddSub(ExecUnit, AutoDoc):
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] + self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
).Elif(self.instruction.opcode == opcodes["SUB32V"][0],
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] - self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
).Elif(self.instruction.opcode == opcodes["BCAST32"][0],
[ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ],
).Elif(self.instruction.opcode == opcodes["MIN32V"][0],
[ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]),
self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32])
).Else(self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ],
).Else(
self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32]))
for x in range(0, width//32) ],
).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 0), # BCAST32
[ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ],
).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 1), # SWAP32
[ self.q[x*32:(x+1)*32].eq(self.a[(x^1)*32:((x^1)+1)*32]) for x in range(0, width//32) ],
).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 2), # ROTR32V
Case(self.b[0:2], {
0: [ self.q[x*32:(x+1)*32].eq( self.a[x*32 :(x+1)*32]) for x in range(0, width//32) ],
1: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+ 8:(x+1)*32], self.a[x*32:x*32+ 8])) for x in range(0, width//32) ],
2: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+16:(x+1)*32], self.a[x*32:x*32+16])) for x in range(0, width//32) ],
3: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+24:(x+1)*32], self.a[x*32:x*32+24])) for x in range(0, width//32) ],
}),
).Else(
[ self.q[x*32:(x+1)*32].eq(0xDEADBEEF) for x in range(0, width//32) ]
)
]
@@ -418,7 +448,7 @@ class ExecRop(ExecUnit, AutoDoc):
class ExecLS(ExecUnit, AutoDoc):
def __init__(self, width=256, interface=None, memoryport=None, r_dat_f=None, r_dat_m=None, granule=0):
ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "GETM"])
ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "LOADL", "GETM"])
self.notes = ModuleDoc(title=f"Load/Store ExecUnit Subclass", body=f"""
""")
@@ -470,7 +500,7 @@ class ExecLS(ExecUnit, AutoDoc):
lsseq.act("IDLE",
If(start_pipe,
If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]),
If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]),
NextValue(cpar, 0),
NextValue(address, addresses[self.instruction.immediate[0:log2_int(width//32)]]),
NextValue(wishbone, ~(addresses[self.instruction.immediate[0:log2_int(width//32)]] == 0x8)),
@@ -554,6 +584,26 @@ class ExecLS(ExecUnit, AutoDoc):
NextState("MEMh")
)
)
).Elif(self.instruction.opcode == opcodes["LOADL"][0],
NextValue(self.has_timeout, 0),
NextValue(self.has_failure, 0),
NextValue(timeout, 2047),
NextValue(lbuf[128:256], self.b[0:128]),
If(wishbone,
NextValue(interface.cyc, 1),
NextValue(interface.stb, 1),
NextValue(interface.sel, 2**len(interface.sel)-1),
NextValue(interface.adr, address),
NextValue(interface.we, self.instruction.immediate[7]),
NextState("MEMl")
).Else(
memoryport.cmd.we.eq(self.instruction.immediate[7]),
memoryport.cmd.addr.eq(address[0:]),
memoryport.cmd.valid.eq(1),
If(memoryport.cmd.ready,
NextState("MEMl")
)
)
)
)
for X in range(0, granule_num):
@@ -615,6 +665,8 @@ class ExecLS(ExecUnit, AutoDoc):
If(wishbone & ~interface.ack,
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
If(self.instruction.immediate[8],
NextValue(interface.cyc, 1),
@@ -627,7 +679,9 @@ class ExecLS(ExecUnit, AutoDoc):
NextValue(interface.dat_w, self.b[128:256])),
NextState("MEMh")
).Else(
NextValue(lbuf[128:256], 0),
If(self.instruction.opcode == opcodes["MEM"][0],
NextValue(lbuf[128:256], 0),
),
If(cpar, ## checkme
NextState("MEM_ODD")
).Else(
@@ -643,12 +697,16 @@ class ExecLS(ExecUnit, AutoDoc):
If(memoryport.cmd.ready,
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
NextState("MEMh"),
)
).Else( # no high
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
NextValue(lbuf[128:256], 0),
If(cpar, ## checkme
@@ -691,6 +749,8 @@ class ExecLS(ExecUnit, AutoDoc):
If(wishbone & ~interface.ack,
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
#NextValue(tries, 0),
If(cpar, ## checkme
@@ -701,6 +761,8 @@ class ExecLS(ExecUnit, AutoDoc):
).Elif(~wishbone,
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
If(cpar, ## checkme
NextState("MEM_ODD")
@@ -733,7 +795,7 @@ class ExecLS(ExecUnit, AutoDoc):
self.sync.mul_clk += [
If(lsseq.ongoing("MEM_EVEN1") | lsseq.ongoing("MEM_EVEN2"),
self.q_valid.eq(1),
If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]),
If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]),
If(~self.instruction.immediate[7],
self.q.eq(lbuf),
).Else(

View File

@@ -212,6 +212,7 @@ fn main() -> std::io::Result<()> {
fin
);
// FILL ********************************************************************************************************
let mcode_fill = assemble_jareth!(
// x..x / $DST in %0
// 128-bits pattern in %1 [assumed to be alignment-homogeneous]
@@ -269,15 +270,17 @@ fn main() -> std::io::Result<()> {
loopX_done:
// how much did we do (#6 is 31, #5 is 32)
and %5, %0, #6
and %8, %0, #6
// compute 32-(x&31)
sub32v %5, #5, %5
sub32v %8, #5, %8
// compute the proper value
min32v %5, %5, %2
min32v %8, %8, %2
// add that to the address, which will now be aligned
add32v %0, %0, %5
add32v %0, %0, %8
// remove from X, as we have done it
sub32v %2, %2, %5
sub32v %2, %2, %8
// rotate the pattern to match
rotr32v %1, %1, %8
// fall through the aligned loop if not 0
brz32 done256, %2
@@ -329,6 +332,7 @@ fn main() -> std::io::Result<()> {
fin
);
// FILL ROP ********************************************************************************************************
let mcode_fillrop = assemble_jareth!(
// x..x / $DST in %0
// 128-bits pattern in %1 [assumed to be alignment-homogeneous]
@@ -398,6 +402,8 @@ fn main() -> std::io::Result<()> {
add32v %0, %0, %8
// remove from X, as we have done it
sub32v %2, %2, %8
// rotate the pattern to match
rotr32v %1, %1, %8
// fall through the aligned loop if not 0, otherwise done
brz32 done256, %2
@@ -455,6 +461,7 @@ fn main() -> std::io::Result<()> {
// COPY ********************************************************************************************************
let mcode_copy = assemble_jareth!(
// x..x / $SRC / $DST in %0
// x..x / $DST / $SRC in %1
@@ -490,10 +497,8 @@ fn main() -> std::io::Result<()> {
// do the first column to align $DST
startX:
// set alignment; we shift by the addr offset
//and %14, %2, #15
setmq %15, %0, #16
setma %15, %1, #16
getm %14
// copy Y
psa %9, %3
// copy $SRC / $DST
@@ -531,6 +536,7 @@ fn main() -> std::io::Result<()> {
sub32v %2, %2, %9
// fall through to the aligned loop if not 0
brz32 done128, %2
// reset q mask (we will be aligned from now on)
setmq %15, #0, #16
// add the count to the addresses, ^1 will have the proper shift for masking
@@ -596,6 +602,187 @@ fn main() -> std::io::Result<()> {
fin
);
// COPYREV ********************************************************************************************************
// Microcode for the "copyrev" operation. The aligned part of each line is
// processed with the post-decrementing accesses (load128dec / loadl128dec /
// store128dec), i.e. walking backward through the line; the first
// (unaligned) column, if any, is then handled by the tail loop at `startX`.
// Register conventions on entry and inside each loop are listed below.
let mcode_copyrev = assemble_jareth!(
// x..x / $SRC / $DST in %0
// x..x / $DST / $SRC in %1
// x..x / X size in %2
// x..x / Y size in %3,
// x..x src_stride / dst_stride in %4 (screen width?)
// -----
// main loop:
// leftover X in %6
// data in %7
// masked data in %7
// live X count in %9
// $SRC / $DST in %10
// $DST / $SRC in %11
// live Y count in %12, also scratch in header
// todo X count in %13
// amount of work in tail in %14
// 0/scrap in %15
// -----
// tail loop:
// $SRC / $DST in %0
// dst data in %7
// src data in %8
// live Y count in %9
// 0/scrap in %15
start:
// if the number of lines or of elements per line is 0, exit early
brz32 done128, %2
brz32 done128, %3
// reset masks
resm %15
// compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16)
and %14, %0, #15
// copy addresses
psa %10, %0
psa %11, %1
// set todo X
psa %13, %2
// if 0, then we don't need a tail loop, so skip extra computation (that would wrongly give 16)
brz32 skip, %14
// it is at most 16-($DST & 15)
sub32v %14, #16, %14
// compute the proper value by bounding to Xsize
min32v %14, %14, %2
// more than one address to increment
bcast32 %14, %14
// add the count to the addresses, SRC will now be aligned
add32v %10, %10, %14
// add the count to the addresses, DST will have the proper alignment to shift input in the aligned loop
add32v %11, %11, %14
// so, do we do everything there ?
sub32v %13, %2, %14
// if 0, we do everything in the tail: skip the aligned loop
brz32 startX, %13
skip:
// reset q mask (we will be aligned from now on)
setmq %15, #0, #16
// reset a mask to the proper shifting
setma %15, %11, #16
// now we need to figure out where we start to go backward
// currently we have the number of 'tail' (first column) elements in %14 (0 for aligned), number of 'loop' elements in %13,
// and $SRC+%14 & $DST+%14 in %10/%11, with $SRC+%14 aligned.
// compute X leftovers (%13 modulo 16 -> #15 is 15) in %6, we will have to start with those
and %6, %13, #15
// compute the 'aligned' number of elements
sub32v %15, %13, %6
// if 0, jump to the main loop as we already have the proper addresses
// NOTE(review): this branch bypasses the `psa %12, %3` at skip3, so the live
// Y count in %12 is not initialised on this path — confirm this is intended.
brz32 loop128_y, %15
bcast32 %15, %15
// add the aligned number of element to $SRC+%14 & $DST+%14
add32v %10, %10, %15
add32v %11, %11, %15
// if %6 is 0 (no leftovers), then $DST is pointing after the last element so need to remove 16 from $DST
brnz32 skip2, %6
sub32v %10, %10, #16
skip2: // if $SRC is not aligned, we also need to add 16 (for prefetch)
and %15, %11, #15
brz32 skip3, %15
add32v %11, %11, #16
psa %15, #16
swap32 %15, %15
add32v %10, %10, %15
skip3:
// copy Y count
psa %12, %3
loop128_y:
// set source and destination addresses for current Y // FIXME : +X, -1?
setadr %15, %10
// then the rounded value in X
sub32v %9, %13, %6
// prefetch data
load128dec %8, ^1
// check for line leftovers
loop128_x_begin:
brz4 loop128_x, %6
// set the leftovers mask (offset is 0 as we are aligned)
// IMPROVE ME
setmq %15, #0, %6
// prefetch data
loadl128dec %8, ^1, %8
// load old data
load128 %7, ^0
// insert data
psa* %7, %8
// rewrite data
store128dec %15, ^0, %7
// reset the Q mask
// IMPROVE ME
setmq %15, #0, #16
loop128_x:
// already 0, bypass aligned stuff
brz32 loop128_x_end, %9
// prefetch data
loadl128dec %8, ^1, %8
// insert data
psa* %7, %8
// write data
store128dec %15, ^0, %7
// sub 16 (#16 is 16) from live rounded X count
sub32v %9, %9, #16
// if X count is not 0, keep looping
brnz32 loop128_x, %9
loop128_x_end:
// decrement Y count
sub32v %12, %12, #1
// if 0, finished
brz32 startX, %12
// add strides to initial addresses
add32v %10, %10, %4
// loop128 to do next line
brz32 loop128_y, #0
startX:
// do the first column if we need to
brz32 done128, %14
// set alignment; we shift by the addr offset
setmq %15, %0, #16
setma %15, %1, #16
// copy Y
psa %9, %3
loopX_y:
// setadr from the start
setadr %15, %0
// load src
load256 %8, ^1
// load old data
load128 %7, ^0
// insert data
psa* %7, %8
// rewrite data
store128 %15, ^0, %7
// increment $SRC / $DST by stride
add32v %0, %0, %4
// decrement copied Y count
sub32v %9, %9, #1
// if not zero, continue
brnz32 loopX_y, %9
done128:
fin
fin
);
// ****** ********************************************************************************************************
let mut pos;
@@ -671,5 +858,14 @@ fn main() -> std::io::Result<()> {
println!("");
println!("-> {}", mcode_copy.len());
pos = 0;
println!("copyrev:");
while pos < mcode_copyrev.len() {
print!("0x{:08x},", mcode_copyrev[pos]);
pos = pos + 1;
}
println!("");
println!("-> {}", mcode_copyrev.len());
Ok(())
}