more accel
This commit is contained in:
@@ -124,6 +124,7 @@ struct scrolltest {
|
||||
#define GOBLIN_FILL _IOW('X', 1, struct scrolltest)
|
||||
#define GOBLIN_FILLROP _IOW('X', 2, struct scrolltest)
|
||||
#define GOBLIN_COPY _IOW('X', 3, struct scrolltest)
|
||||
#define GOBLIN_COPYREV _IOW('X', 4, struct scrolltest)
|
||||
|
||||
static int goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *);
|
||||
static paddr_t goblin_mmap(void *, void *, off_t, int);
|
||||
@@ -146,6 +147,7 @@ static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose,
|
||||
static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n);
|
||||
static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop);
|
||||
static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
|
||||
static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
|
||||
static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9,
|
||||
0x00004005,0xfb000809,0x0000000a,0x0000000a };
|
||||
static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005,
|
||||
@@ -153,24 +155,33 @@ static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013
|
||||
static const uint32_t program_fill256[14] = { 0x01bc0014,0x001a6087,0x013c6814,0x403c0012,0x00146086,0xe03c1013,0x00165146,0xfe800148,
|
||||
0x000e10c6,0x010000c9,0x00004005,0xfb800809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_fill[38] = { 0x11800089,0x110000c9,0x01bc0014,0x0800000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,
|
||||
0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00166007,0x00145946,
|
||||
0x0014214f,0x00005005,0x00085086,0x08000089,0x001a6087,0x013c6814,0x403c0012,0x00146086,
|
||||
0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060,0xc03c7013,
|
||||
0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a };
|
||||
static const uint32_t program_fill[39] = { 0x12000089,0x118000c9,0x01bc0014,0x0880000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,
|
||||
0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00226007,0x00208946,
|
||||
0x0020220f,0x00008005,0x00088086,0x01048050,0x08000089,0x001a6087,0x013c6814,0x403c0012,
|
||||
0x00146086,0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060,
|
||||
0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_fillrop[41] = { 0x13000089,0x128000c9,0x01bc0014,0x003c014c,0x0800000d,0x013c2014,0x002000c0,0x00180000,
|
||||
static const uint32_t program_fillrop[42] = { 0x13800089,0x130000c9,0x01bc0014,0x003c014c,0x0880000d,0x013c2014,0x002000c0,0x00180000,
|
||||
0x403c0192,0x801c0013,0x001c11e2,0xc03c7013,0x00184185,0x00221206,0xfc800208,0x00226007,
|
||||
0x00208946,0x0020220f,0x00008005,0x00088086,0x09000089,0x001a6087,0x013c6814,0x403c0012,
|
||||
0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,0x0180018d,
|
||||
0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,
|
||||
0x0000000a};
|
||||
0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x09000089,0x001a6087,0x013c6814,
|
||||
0x403c0012,0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,
|
||||
0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,
|
||||
0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_copy[49] = { 0x17000089,0x168000c9,0x01bc0014,0x0b80000d,0x013f0014,0x003f0054,0x00380011,0x001400c0,0x00180000,0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00161146,0xfc000148,0x0016f007,0x00145c06,0x0014214f,0x00140150,0x00005005,0x00085086,0x0b800089,0x013f0814,0x00045045,0x003f0054,0x001af087,0x403c0012,0x00146086,0xa0a00013,0x02800149,0x001c0220,0x603c7013,0x00170146,0x20a08015,0xfd800148,0x0280018d,0x013c6814,0x001c0013,0x001c0220,0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
|
||||
static const uint32_t program_copy[48] = { 0x16800089,0x160000c9,0x01bc0014,0x0b00000d,0x013f0014,0x003f0054,0x002400c0,0x00180000,
|
||||
0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248,
|
||||
0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814,
|
||||
0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220,
|
||||
0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220,
|
||||
0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t* programs[7] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, program_copy, NULL };
|
||||
static const uint32_t program_len[7] = { 12, 11, 14, 38, 41, 49, 0 };
|
||||
static uint32_t program_offset[7];
|
||||
static const uint32_t program_copyrev[66] = { 0x1f800089,0x1f0000c9,0x01bc0014,0x003af007,0x00280000,0x002c0040,0x00340080,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x060003c9,0x003c03d0,0x0028f285,0x002cf2c5,0x00800188,0x002b0286,0x003ef2c7,0x020003c9,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013f0014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t* programs[8] = { program_scroll128, program_fill128, program_fill256, program_fill,
|
||||
program_fillrop, program_copy, program_copyrev, NULL };
|
||||
static const uint32_t program_len[8] = { 12, 11, 14, 39,
|
||||
42, 48, 66, 0 };
|
||||
static uint32_t program_offset[8];
|
||||
|
||||
static void goblin_set_depth(struct goblin_softc *, int);
|
||||
|
||||
@@ -413,6 +424,12 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
|
||||
}
|
||||
break;
|
||||
|
||||
case GOBLIN_COPYREV: {
|
||||
struct scrolltest *st = (struct scrolltest *)data;
|
||||
jareth_copyrev(sc, jareth_verbose, st->y0, st->y1, st->x0, st->w, st->n, /* x1 */ st->pm, st->rop);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
return (ENOTTY);
|
||||
}
|
||||
@@ -938,6 +955,53 @@ static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, i
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) {
|
||||
const uint32_t base = 0;
|
||||
const int pidx = 6; // copyrev
|
||||
/* int i; */
|
||||
|
||||
/* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */
|
||||
|
||||
power_on(sc);
|
||||
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,0), (sc->sc_internal_adr + y1 * sc->sc_stride + x1));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,1), (sc->sc_internal_adr + y0 * sc->sc_stride + x0));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,0), (sc->sc_internal_adr + y0 * sc->sc_stride + x0));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,1), (sc->sc_internal_adr + y1 * sc->sc_stride + x1));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,0), (w));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,0), (n));
|
||||
/* for (i = 1 ; i < 8 ; i++) { */
|
||||
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,i), 0); */
|
||||
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), 0); */
|
||||
/* } */
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,0), (sc->sc_stride));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,1), (sc->sc_stride));
|
||||
jareth_mpstart_write(sc, program_offset[pidx]);
|
||||
jareth_mplen_write(sc, program_len[pidx]);
|
||||
|
||||
#if 0
|
||||
{
|
||||
uint32_t data[8];
|
||||
int i, j;
|
||||
char buf[512];
|
||||
for (i = 0 ; i < 16 ; i++) {
|
||||
for (j = 0 ; j < 8 ; j++)
|
||||
data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j));
|
||||
snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]);
|
||||
aprint_normal("reg%d : %s\n", i, buf);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
(void)start_job(sc, verbose);
|
||||
delay(1);
|
||||
(void)wait_job(sc, 1, verbose);
|
||||
|
||||
power_off(sc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
jareth_copyrows(void *cookie, int src, int dst, int n)
|
||||
{
|
||||
|
||||
@@ -11,31 +11,48 @@ field_latex = "$\mathbf{{F}}_{{{{2^{{255}}}}-19}}$"
|
||||
|
||||
opcode_bits = 5 # number of bits used to encode the opcode field
|
||||
opcodes = { # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shift a/b/q (star)
|
||||
"UDF" : [-1, "Placeholder for undefined opcodes"],
|
||||
"PSA" : [0, "Wd $\gets$ Ra // pass A"],
|
||||
"PSB" : [1, "Wd $\gets$ Rb // pass B"], # for star version mostly
|
||||
"UDF" : [-1, "Placeholder for undefined opcodes"],
|
||||
"PSA" : [0, "Wd $\gets$ Ra // pass A"],
|
||||
"PSB" : [1, "Wd $\gets$ Rb // pass B"], # for star version mostly
|
||||
"ROP32V" : [2, "Wd $\gets$ ((Rb ROP Ra) & planemask) | (Ra & ~planemask)" ], # replace MSK
|
||||
"XOR" : [3, "Wd $\gets$ Ra ^ Rb // bitwise XOR"],
|
||||
"NOT" : [4, "Wd $\gets$ ~Ra // binary invert"],
|
||||
"XOR" : [3, "Wd $\gets$ Ra ^ Rb // bitwise XOR"],
|
||||
"NOT" : [4, "Wd $\gets$ ~Ra // binary invert"],
|
||||
"ADD32V" : [5, "Wd[x..x+32] $\gets$ Ra[x..x+32] + Rb[x..x+32] // vector 32-bit binary add"],
|
||||
"SUB32V" : [6, "Wd[x..x+32] $\gets$ Ra[x..x+32] - Rb[x..x+32] // vector 32-bit binary sub"],
|
||||
"AND" : [7, "Wd $\gets$ Ra & Rb // bitwise AND"], # replace MUL
|
||||
"AND" : [7, "Wd $\gets$ Ra & Rb // bitwise AND"], # replace MUL
|
||||
"BRNZ32" : [8, "If Ra[0:32] != 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if non-zero (32-bits)"], # replace TRD
|
||||
"BRZ32" : [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (32-bits)"],
|
||||
"FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"],
|
||||
"SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"],
|
||||
"SROP" : [12, "set planemask & rop from Ra[0:32] and Ra[32:36]" ], # was XBT
|
||||
"BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"],
|
||||
"BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"],
|
||||
"BRZ32" : [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (32-bits)"],
|
||||
"FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"],
|
||||
"SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"],
|
||||
"SROP" : [12, "set planemask & rop from Ra[0:32] and Ra[32:36]" ], # was XBT
|
||||
"BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"],
|
||||
"BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"],
|
||||
"MIN32V" : [15, "Wd[x..x+32] $\gets$ umin(Ra[x..x+32], Rb[x..x+32]) // vector 32-bit umin"],
|
||||
"BCAST32" : [16, "Wd[x..x+32] $\gets$ Ra[0..32]"],
|
||||
"MANIP32": [16, "imm[0..1] == 0 BCAST32 = Wd[x..x+32] $\gets$ Ra[0..32], imm[0..1] == 1 SWAP32, imm[0..1] == 2 ROTR32V"],
|
||||
"GETM": [17, "GETM: getmask" ],
|
||||
"ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
|
||||
# for MEM, bit #31 (imm[8]) indicates both lanes are needed; imm[31] == 0 faster as the second access is not done ;
|
||||
"GETM": [17, "GETM: getmask" ],
|
||||
"ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
|
||||
"MEM" : [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc in imm[6], address in addr[imm[0...]]" ],
|
||||
"SETM" : [20, "SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ],
|
||||
"LOADH" : [21, "LOADH: imm[7] == 0 for LOAD, address in addr[imm[0...]], high->low & load a+16 into high" ],
|
||||
"MAX" : [22, "Maximum opcode number (for bounds checking)"],
|
||||
# "MEM" imm:
|
||||
# imm[8]: 256 bits mode
|
||||
# imm[7]: L/S
|
||||
# imm[6]: post-inc
|
||||
# imm[5]: post-dec
|
||||
# imm[4]:
|
||||
# imm[3]:
|
||||
# imm[0..2]: adr reg
|
||||
"MEM" : [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc in imm[6], address in addr[imm[0...]]" ],
|
||||
"SETM" : [20, "SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ],
|
||||
# "LOADH/L" imm:
|
||||
# imm[8]: 0
|
||||
# imm[7]: 0
|
||||
# imm[6]: post-inc
|
||||
# imm[5]: post-dec
|
||||
# imm[4]:
|
||||
# imm[3]:
|
||||
# imm[0..2]: adr reg
|
||||
"LOADH" : [21, "LOADH: high->low & load *Adr into high" ],
|
||||
"LOADL" : [22, "LOADL: low->high & load *Adr into low" ],
|
||||
"MAX" : [23, "Maximum opcode number (for bounds checking)"],
|
||||
}
|
||||
|
||||
num_registers = 32
|
||||
@@ -334,7 +351,7 @@ passthrough.
|
||||
|
||||
class ExecAddSub(ExecUnit, AutoDoc):
|
||||
def __init__(self, width=256):
|
||||
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "BCAST32" ])
|
||||
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "MANIP32" ])
|
||||
self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f"""
|
||||
""")
|
||||
|
||||
@@ -347,12 +364,25 @@ class ExecAddSub(ExecUnit, AutoDoc):
|
||||
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] + self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
|
||||
).Elif(self.instruction.opcode == opcodes["SUB32V"][0],
|
||||
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] - self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
|
||||
).Elif(self.instruction.opcode == opcodes["BCAST32"][0],
|
||||
[ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ],
|
||||
).Elif(self.instruction.opcode == opcodes["MIN32V"][0],
|
||||
[ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]),
|
||||
self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32])
|
||||
).Else(self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ],
|
||||
).Else(
|
||||
self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32]))
|
||||
for x in range(0, width//32) ],
|
||||
).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 0), # BCAST32
|
||||
[ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ],
|
||||
).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 1), # SWAP32
|
||||
[ self.q[x*32:(x+1)*32].eq(self.a[(x^1)*32:((x^1)+1)*32]) for x in range(0, width//32) ],
|
||||
).Elif((self.instruction.opcode == opcodes["MANIP32"][0]) & (self.instruction.immediate[0:2] == 2), # ROTR32V
|
||||
Case(self.b[0:2], {
|
||||
0: [ self.q[x*32:(x+1)*32].eq( self.a[x*32 :(x+1)*32]) for x in range(0, width//32) ],
|
||||
1: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+ 8:(x+1)*32], self.a[x*32:x*32+ 8])) for x in range(0, width//32) ],
|
||||
2: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+16:(x+1)*32], self.a[x*32:x*32+16])) for x in range(0, width//32) ],
|
||||
3: [ self.q[x*32:(x+1)*32].eq(Cat(self.a[x*32+24:(x+1)*32], self.a[x*32:x*32+24])) for x in range(0, width//32) ],
|
||||
}),
|
||||
).Else(
|
||||
[ self.q[x*32:(x+1)*32].eq(0xDEADBEEF) for x in range(0, width//32) ]
|
||||
)
|
||||
]
|
||||
|
||||
@@ -418,7 +448,7 @@ class ExecRop(ExecUnit, AutoDoc):
|
||||
|
||||
class ExecLS(ExecUnit, AutoDoc):
|
||||
def __init__(self, width=256, interface=None, memoryport=None, r_dat_f=None, r_dat_m=None, granule=0):
|
||||
ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "GETM"])
|
||||
ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "LOADL", "GETM"])
|
||||
|
||||
self.notes = ModuleDoc(title=f"Load/Store ExecUnit Subclass", body=f"""
|
||||
""")
|
||||
@@ -470,7 +500,7 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
|
||||
lsseq.act("IDLE",
|
||||
If(start_pipe,
|
||||
If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]),
|
||||
If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]),
|
||||
NextValue(cpar, 0),
|
||||
NextValue(address, addresses[self.instruction.immediate[0:log2_int(width//32)]]),
|
||||
NextValue(wishbone, ~(addresses[self.instruction.immediate[0:log2_int(width//32)]] == 0x8)),
|
||||
@@ -554,6 +584,26 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
NextState("MEMh")
|
||||
)
|
||||
)
|
||||
).Elif(self.instruction.opcode == opcodes["LOADL"][0],
|
||||
NextValue(self.has_timeout, 0),
|
||||
NextValue(self.has_failure, 0),
|
||||
NextValue(timeout, 2047),
|
||||
NextValue(lbuf[128:256], self.b[0:128]),
|
||||
If(wishbone,
|
||||
NextValue(interface.cyc, 1),
|
||||
NextValue(interface.stb, 1),
|
||||
NextValue(interface.sel, 2**len(interface.sel)-1),
|
||||
NextValue(interface.adr, address),
|
||||
NextValue(interface.we, self.instruction.immediate[7]),
|
||||
NextState("MEMl")
|
||||
).Else(
|
||||
memoryport.cmd.we.eq(self.instruction.immediate[7]),
|
||||
memoryport.cmd.addr.eq(address[0:]),
|
||||
memoryport.cmd.valid.eq(1),
|
||||
If(memoryport.cmd.ready,
|
||||
NextState("MEMl")
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
for X in range(0, granule_num):
|
||||
@@ -615,6 +665,8 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
If(wishbone & ~interface.ack,
|
||||
If(self.instruction.immediate[6], # post-inc
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
|
||||
).Elif(self.instruction.immediate[5], # post-dec
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
|
||||
),
|
||||
If(self.instruction.immediate[8],
|
||||
NextValue(interface.cyc, 1),
|
||||
@@ -627,7 +679,9 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
NextValue(interface.dat_w, self.b[128:256])),
|
||||
NextState("MEMh")
|
||||
).Else(
|
||||
NextValue(lbuf[128:256], 0),
|
||||
If(self.instruction.opcode == opcodes["MEM"][0],
|
||||
NextValue(lbuf[128:256], 0),
|
||||
),
|
||||
If(cpar, ## checkme
|
||||
NextState("MEM_ODD")
|
||||
).Else(
|
||||
@@ -643,12 +697,16 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
If(memoryport.cmd.ready,
|
||||
If(self.instruction.immediate[6], # post-inc
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
|
||||
).Elif(self.instruction.immediate[5], # post-dec
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
|
||||
),
|
||||
NextState("MEMh"),
|
||||
)
|
||||
).Else( # no high
|
||||
If(self.instruction.immediate[6], # post-inc
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
|
||||
).Elif(self.instruction.immediate[5], # post-dec
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
|
||||
),
|
||||
NextValue(lbuf[128:256], 0),
|
||||
If(cpar, ## checkme
|
||||
@@ -691,6 +749,8 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
If(wishbone & ~interface.ack,
|
||||
If(self.instruction.immediate[6], # post-inc
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
|
||||
).Elif(self.instruction.immediate[5], # post-dec
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
|
||||
),
|
||||
#NextValue(tries, 0),
|
||||
If(cpar, ## checkme
|
||||
@@ -701,6 +761,8 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
).Elif(~wishbone,
|
||||
If(self.instruction.immediate[6], # post-inc
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
|
||||
).Elif(self.instruction.immediate[5], # post-dec
|
||||
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
|
||||
),
|
||||
If(cpar, ## checkme
|
||||
NextState("MEM_ODD")
|
||||
@@ -733,7 +795,7 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
self.sync.mul_clk += [
|
||||
If(lsseq.ongoing("MEM_EVEN1") | lsseq.ongoing("MEM_EVEN2"),
|
||||
self.q_valid.eq(1),
|
||||
If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]),
|
||||
If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]),
|
||||
If(~self.instruction.immediate[7],
|
||||
self.q.eq(lbuf),
|
||||
).Else(
|
||||
|
||||
@@ -212,6 +212,7 @@ fn main() -> std::io::Result<()> {
|
||||
fin
|
||||
);
|
||||
|
||||
// FILL ********************************************************************************************************
|
||||
let mcode_fill = assemble_jareth!(
|
||||
// x..x / $DST in %0
|
||||
// 128-bit pattern in %1 [assumed to be alignment-homogeneous]
|
||||
@@ -269,15 +270,17 @@ fn main() -> std::io::Result<()> {
|
||||
|
||||
loopX_done:
|
||||
// how much did we do (#6 is 31, #5 is 32)
|
||||
and %5, %0, #6
|
||||
and %8, %0, #6
|
||||
// compute 32-(x&31)
|
||||
sub32v %5, #5, %5
|
||||
sub32v %8, #5, %8
|
||||
// compute the proper value
|
||||
min32v %5, %5, %2
|
||||
min32v %8, %8, %2
|
||||
// add that to the address, which will now be aligned
|
||||
add32v %0, %0, %5
|
||||
add32v %0, %0, %8
|
||||
// remove from X, as we have done it
|
||||
sub32v %2, %2, %5
|
||||
sub32v %2, %2, %8
|
||||
// rotate the pattern to match
|
||||
rotr32v %1, %1, %8
|
||||
// fall through the aligned loop if not 0
|
||||
brz32 done256, %2
|
||||
|
||||
@@ -329,6 +332,7 @@ fn main() -> std::io::Result<()> {
|
||||
fin
|
||||
);
|
||||
|
||||
// FILL ROP ********************************************************************************************************
|
||||
let mcode_fillrop = assemble_jareth!(
|
||||
// x..x / $DST in %0
|
||||
// 128-bit pattern in %1 [assumed to be alignment-homogeneous]
|
||||
@@ -398,6 +402,8 @@ fn main() -> std::io::Result<()> {
|
||||
add32v %0, %0, %8
|
||||
// remove from X, as we have done it
|
||||
sub32v %2, %2, %8
|
||||
// rotate the pattern to match
|
||||
rotr32v %1, %1, %8
|
||||
// fall through the aligned loop if not 0, otherwise done
|
||||
brz32 done256, %2
|
||||
|
||||
@@ -455,6 +461,7 @@ fn main() -> std::io::Result<()> {
|
||||
|
||||
|
||||
|
||||
// COPY ********************************************************************************************************
|
||||
let mcode_copy = assemble_jareth!(
|
||||
// x..x / $SRC / $DST in %0
|
||||
// x..x / $DST / $SRC in %1
|
||||
@@ -490,10 +497,8 @@ fn main() -> std::io::Result<()> {
|
||||
// do the first column to align $DST
|
||||
startX:
|
||||
// set alignement; we shift by the addr offset
|
||||
//and %14, %2, #15
|
||||
setmq %15, %0, #16
|
||||
setma %15, %1, #16
|
||||
getm %14
|
||||
// copy Y
|
||||
psa %9, %3
|
||||
// copy $SRC / $DST
|
||||
@@ -531,6 +536,7 @@ fn main() -> std::io::Result<()> {
|
||||
sub32v %2, %2, %9
|
||||
// fall through to the aligned loop if not 0
|
||||
brz32 done128, %2
|
||||
|
||||
// reset q mask (we will be aligned from now on)
|
||||
setmq %15, #0, #16
|
||||
// add the count to the addresses, ^1 will have the proper shift for masking
|
||||
@@ -596,6 +602,187 @@ fn main() -> std::io::Result<()> {
|
||||
fin
|
||||
);
|
||||
|
||||
// COPYREV ********************************************************************************************************
|
||||
let mcode_copyrev = assemble_jareth!(
|
||||
// x..x / $SRC / $DST in %0
|
||||
// x..x / $DST / $SRC in %1
|
||||
// x..x / X size in %2
|
||||
// x..x / Y size in %3,
|
||||
// x..x src_stride / dst_stride in %4 (screen width?)
|
||||
// -----
|
||||
// main loop:
|
||||
// leftover X in %6
|
||||
// data in %7
|
||||
// masked data in %7
|
||||
// live X count in %9
|
||||
// $SRC / $DST in %10
|
||||
// $DST / $SRC in %11
|
||||
// live Y count in %12, also scratch in header
|
||||
// todo X count in %13
|
||||
// amount of work in tail in %14
|
||||
// 0/scrap in %15
|
||||
// -----
|
||||
// tail loop:
|
||||
// $SRC / $DST in %0
|
||||
// dst data in %7
|
||||
// src data in %8
|
||||
// live Y count in %9
|
||||
// 0/scrap in %15
|
||||
|
||||
|
||||
start:
|
||||
// if number of line or element in line is 0, exit early
|
||||
brz32 done128, %2
|
||||
brz32 done128, %3
|
||||
// reset masks
|
||||
resm %15
|
||||
// compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16)
|
||||
and %14, %0, #15
|
||||
// copy addresses
|
||||
psa %10, %0
|
||||
psa %11, %1
|
||||
// set todo X
|
||||
psa %13, %2
|
||||
// if 0, then we don't need a tail loop, so skip extra computation (that would wrongly give 16)
|
||||
brz32 skip, %14
|
||||
|
||||
// it is at most 16-($DST & 15)
|
||||
sub32v %14, #16, %14
|
||||
// compute the proper value by bounding to Xsize
|
||||
min32v %14, %14, %2
|
||||
// more than one address to increment
|
||||
bcast32 %14, %14
|
||||
// add the count to the addresses, SRC will now be aligned
|
||||
add32v %10, %10, %14
|
||||
// add the count to the addresses, DST will have the proper alignment to shift input in the aligned loop
|
||||
add32v %11, %11, %14
|
||||
// so, do we do everything there ?
|
||||
sub32v %13, %2, %14
|
||||
// if 0, we do everything in the tail, so skip the aligned loop
|
||||
brz32 startX, %13
|
||||
|
||||
skip:
|
||||
// reset q mask (we will be aligned from now on)
|
||||
setmq %15, #0, #16
|
||||
// reset a mask to the proper shifting
|
||||
setma %15, %11, #16
|
||||
|
||||
// now we need to figure out where we start to go backward
|
||||
// currently we have the number of 'tail' (first column) elements in %14 (0 for aligned), number of 'loop' elements in %13,
|
||||
// and $SRC+%14 & $DST+%14 in %10/%11, with $SRC+%14 aligned.
|
||||
// compute X leftovers (%13 modulo 16 -> #15 is 15) in %6, we will have to start with those
|
||||
and %6, %13, #15
|
||||
// compute the 'aligned' number of elements
|
||||
sub32v %15, %13, %6
|
||||
// if 0, jump to the main loop as we already have the proper addresses
|
||||
brz32 loop128_y, %15
|
||||
|
||||
bcast32 %15, %15
|
||||
// add the aligned number of element to $SRC+%14 & $DST+%14
|
||||
add32v %10, %10, %15
|
||||
add32v %11, %11, %15
|
||||
|
||||
// if %6 is 0 (no leftovers), then $DST is pointing after the last element so need to remove 16 from $DST
|
||||
brnz32 skip2, %6
|
||||
sub32v %10, %10, #16
|
||||
skip2: // if $SRC is not aligned, we also need to add 16 (for prefetch)
|
||||
and %15, %11, #15
|
||||
brz32 skip3, %15
|
||||
add32v %11, %11, #16
|
||||
psa %15, #16
|
||||
swap32 %15, %15
|
||||
add32v %10, %10, %15
|
||||
|
||||
skip3:
|
||||
// copy Y count
|
||||
psa %12, %3
|
||||
|
||||
loop128_y:
|
||||
// set source and destination addresses for current Y // FIXME : +X, -1?
|
||||
setadr %15, %10
|
||||
// then the rounded value in X
|
||||
sub32v %9, %13, %6
|
||||
// prefetch data
|
||||
|
||||
// prefetch data
|
||||
load128dec %8, ^1
|
||||
|
||||
// check for line leftovers
|
||||
loop128_x_begin:
|
||||
brz4 loop128_x, %6
|
||||
|
||||
// set the leftovers mask (offset is 0 as we are aligned)
|
||||
// IMPROVE ME
|
||||
setmq %15, #0, %6
|
||||
// prefetch data
|
||||
loadl128dec %8, ^1, %8
|
||||
// load old data
|
||||
load128 %7, ^0
|
||||
// insert data
|
||||
psa* %7, %8
|
||||
// rewrite data
|
||||
store128dec %15, ^0, %7
|
||||
// reset the Q mask
|
||||
// IMPROVE ME
|
||||
setmq %15, #0, #16
|
||||
|
||||
loop128_x:
|
||||
// already 0, bypass aligned stuff
|
||||
brz32 loop128_x_end, %9
|
||||
// prefetch data
|
||||
loadl128dec %8, ^1, %8
|
||||
// insert data
|
||||
psa* %7, %8
|
||||
// write data
|
||||
store128dec %15, ^0, %7
|
||||
// sub 16 (#16 is 16) from live rounded X count
|
||||
sub32v %9, %9, #16
|
||||
// if X count is not 0, keep looping
|
||||
brnz32 loop128_x, %9
|
||||
|
||||
loop128_x_end:
|
||||
// decrement Y count
|
||||
sub32v %12, %12, #1
|
||||
// if 0, finished
|
||||
brz32 startX, %12
|
||||
|
||||
// add strides to initial addresses
|
||||
add32v %10, %10, %4
|
||||
// loop128 to do next line
|
||||
brz32 loop128_y, #0
|
||||
|
||||
startX:
|
||||
// do the first column if we need to
|
||||
brz32 done128, %14
|
||||
// set alignement; we shift by the addr offset
|
||||
setmq %15, %0, #16
|
||||
setma %15, %1, #16
|
||||
// copy Y
|
||||
psa %9, %3
|
||||
loopX_y:
|
||||
// setadr from the start
|
||||
setadr %15, %0
|
||||
// load src
|
||||
load256 %8, ^1
|
||||
// load old data
|
||||
load128 %7, ^0
|
||||
// insert data
|
||||
psa* %7, %8
|
||||
// rewrite data
|
||||
store128 %15, ^0, %7
|
||||
// increment $SRC / $DST by stride
|
||||
add32v %0, %0, %4
|
||||
// decrement copied Y count
|
||||
sub32v %9, %9, #1
|
||||
// if not zero, continue
|
||||
brnz32 loopX_y, %9
|
||||
|
||||
done128:
|
||||
fin
|
||||
fin
|
||||
);
|
||||
|
||||
// ****** ********************************************************************************************************
|
||||
|
||||
let mut pos;
|
||||
|
||||
@@ -671,5 +858,14 @@ fn main() -> std::io::Result<()> {
|
||||
println!("");
|
||||
println!("-> {}", mcode_copy.len());
|
||||
|
||||
pos = 0;
|
||||
println!("copyrev:");
|
||||
while pos < mcode_copyrev.len() {
|
||||
print!("0x{:08x},", mcode_copyrev[pos]);
|
||||
pos = pos + 1;
|
||||
}
|
||||
println!("");
|
||||
println!("-> {}", mcode_copyrev.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user