From 938d931e51347fe8e6b871f6d7cfc1b5e8a628cf Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Fri, 18 Mar 2022 23:00:03 +0100 Subject: [PATCH] more jareth --- NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c | 53 +++++++-- .../mit/xf86-video-goblin/dist/src/goblin.h | 14 +++ .../xf86-video-goblin/dist/src/goblin_accel.c | 45 +++++--- .../dist/src/goblin_driver.c | 37 +++++++ .../xf86-video-goblin/dist/src/goblin_regs.h | 1 + sbus-to-ztex-gateware-migen/jareth.py | 103 +++++++++++------- .../jareth_code/jareth_code.rs | 52 +++++---- 7 files changed, 221 insertions(+), 84 deletions(-) diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c b/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c index c96a69b..5824d54 100644 --- a/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c @@ -120,12 +120,24 @@ struct scrolltest { int pm; int rop; }; + +/* debug only, to remove */ #define GOBLIN_SCROLL _IOW('X', 0, struct scrolltest) #define GOBLIN_FILL _IOW('X', 1, struct scrolltest) #define GOBLIN_FILLROP _IOW('X', 2, struct scrolltest) #define GOBLIN_COPY _IOW('X', 3, struct scrolltest) #define GOBLIN_COPYREV _IOW('X', 4, struct scrolltest) +#define JARETH_FN_NUM_FILL 0 +#define JARETH_FN_NUM_FILLROP 1 +#define JARETH_FN_NUM_COPY 2 +#define JARETH_FN_NUM_COPYREV 3 +struct jareth_fn { + int off; + int len; +}; +#define JARETH_FN _IOWR('j', 0, struct jareth_fn) + static int goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *); static paddr_t goblin_mmap(void *, void *, off_t, int); static void goblin_init_screen(void *, struct vcons_screen *, int, long *); @@ -168,14 +180,14 @@ static const uint32_t program_fillrop[42] = { 0x13800089,0x130000c9,0x01bc0014 0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809, 0x0000000a,0x0000000a }; -static const uint32_t program_copy[48] = { 0x16800089,0x160000c9,0x01bc0014,0x0b00000d,0x013f0014,0x003f0054,0x002400c0,0x00180000, - 0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248, - 0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814, - 0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220, - 0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220, - 0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a }; +static const uint32_t program_copy[48] = { 0x16800089,0x160000c9,0x01bc0014,0x013c2014,0x003f0054,0x0a00000d,0x002400c0,0x00180000, + 0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248, + 0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814, + 0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220, + 0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220, + 0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a }; -static const uint32_t program_copyrev[66] = { 0x1f800089,0x1f0000c9,0x01bc0014,0x003af007,0x00280000,0x002c0040,0x00340080,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x060003c9,0x003c03d0,0x0028f285,0x002cf2c5,0x00800188,0x002b0286,0x003ef2c7,0x020003c9,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013f0014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a }; +static const uint32_t program_copyrev[66] = { 0x1f800089,0x1f0000c9,0x01bc0014,0x00280000,0x002c0040,0x00340080,0x003af007,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x003c03d0,0x0028f285,0x002cf2c5,0x02000188,0x003c0c00,0x003c03d0,0x0028f286,0x002cf2c6,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013c2014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a }; static const uint32_t* programs[8] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, program_copy, program_copyrev, NULL }; @@ -430,6 +442,28 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l) } break; + case JARETH_FN: { + struct jareth_fn *fn = (struct jareth_fn *)data; + int pidx = -1; + if (!sc->sc_has_jareth) { + return ENXIO; + } + switch (fn->off) { + case JARETH_FN_NUM_FILL: pidx = 3; break; + case JARETH_FN_NUM_FILLROP: pidx = 4; break; + case JARETH_FN_NUM_COPY: pidx = 5; break; + case JARETH_FN_NUM_COPYREV: pidx = 6; break; + } + if (pidx != -1) { + fn->off = program_offset[pidx]; + fn->len = program_len[pidx]; + } else { + fn->off = -1; + fn->len = -1; + } + } + break; + default: return (ENOTTY); } @@ -1143,7 +1177,10 @@ static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosi } else { //aprint_normal_dev(sc->sc_dev, "WAIT - Jareth status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, jareth_ls_status_read(sc)); } - + +#if 1 + device_printf(sc->sc_dev, "last run took %d cycle (eng_clk)\n", jareth_cyc_counter_read(sc)); +#endif return 0; } diff --git a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin.h b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin.h index 07fe083..02b6db0 100644 --- a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin.h +++ b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin.h @@ -76,6 +76,10 @@ typedef struct { uint32_t fg; int xdir, ydir; uint32_t srcoff, srcpitch; + int fill_off, fill_len; + int fillrop_off, fillrop_len; + int copy_off, copy_len; + int copyrev_off, copyrev_len; } GoblinRec, *GoblinPtr; extern int GoblinScreenPrivateIndex; @@ -106,8 +110,18 @@ int GOBLINEXAInit(ScreenPtr); #include #include #define GOBLIN_SET_PIXELMODE _IOW('M', 3, int) +#define JARETH_FN_NUM_FILL 0 +#define JARETH_FN_NUM_FILLROP 1 +#define JARETH_FN_NUM_COPY 2 +#define JARETH_FN_NUM_COPYREV 3 +struct jareth_fn { + int off; + int len; +}; +#define JARETH_FN _IOWR('j', 0, struct jareth_fn) #else #define GOBLIN_SET_PIXELMODE (('M' << 8) | 3) +#error "toto" #endif #endif /* GOBLIN_H */ diff --git a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_accel.c b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_accel.c index 72cec2a..6dc1a05 100644 --- a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_accel.c +++ b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_accel.c @@ -35,7 +35,7 @@ /* DGA stuff */ -#define DEBUG_GOBLIN 1 +//#define DEBUG_GOBLIN 1 #ifdef DEBUG_GOBLIN #define ENTER xf86Msg(X_ERROR, "%s>\n", __func__); @@ -249,6 +249,8 @@ GoblinWait(GoblinPtr pGoblin) if (status & 1) { xf86Msg(X_ERROR, "Jareth wait for idle timed out %08x %08x\n", status); + } else { + xf86Msg(X_INFO, "Jareth: last operation took %d cycles (eng_clk)\n", pGoblin->jreg->cyc_counter); } } @@ -348,12 +350,12 @@ GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg) if ((alu == 0x3) && // GCcopy (planemask == 0xFFFFFFFF)) { // full pattern // fill - pGoblin->jreg->mpstart = 37; // FIXME - pGoblin->jreg->mplen = 38; + pGoblin->jreg->mpstart = pGoblin->fill_off; + pGoblin->jreg->mplen = pGoblin->fill_len; } else { // fillrop - pGoblin->jreg->mpstart = 75; // FIXME - pGoblin->jreg->mplen = 41; + pGoblin->jreg->mpstart = pGoblin->fillrop_off; + pGoblin->jreg->mplen = pGoblin->fillrop_len; } return TRUE; } @@ -429,18 +431,31 @@ GoblinPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap, pGoblin->last_mask = planemask; pGoblin->last_rop = alu; - if ((alu == 0x3) && // GCcopy - (planemask == 0xFFFFFFFF)) { // full pattern - // fill - pGoblin->jreg->mpstart = 116; // FIXME - pGoblin->jreg->mplen = 49; + if (pGoblin->xdir > 0) { + if ((alu == 0x3) && // GCcopy + (planemask == 0xFFFFFFFF)) { // full pattern + // fill + pGoblin->jreg->mpstart = pGoblin->copy_off; + pGoblin->jreg->mplen = pGoblin->copy_len; + } else { + // fillrop + pGoblin->jreg->mpstart = pGoblin->copy_off; // FIXME + pGoblin->jreg->mplen = pGoblin->copy_len; + } } else { - // fillrop - pGoblin->jreg->mpstart = 116; // FIXME FIXME FIXME - pGoblin->jreg->mplen = 49; + if ((alu == 0x3) && // GCcopy + (planemask == 0xFFFFFFFF)) { // full pattern + // fill + pGoblin->jreg->mpstart = pGoblin->copyrev_off; + pGoblin->jreg->mplen = pGoblin->copyrev_len; + } else { + // fillrop + pGoblin->jreg->mpstart = pGoblin->copyrev_off; // FIXME + pGoblin->jreg->mplen = pGoblin->copyrev_len; + } } - DPRINTF(X_ERROR, "PrepareCopy: alu %d, pm 0x%08\n", alu, planemask); + DPRINTF(X_ERROR, "PrepareCopy: alu %d, pm 0x%08x, xdir/ydir %d/%d\n", alu, planemask, xdir, ydir); return TRUE; } @@ -471,8 +486,6 @@ GoblinCopy(PixmapPtr pDstPixmap, dstpitch = -dstpitch; } - // FIXME: xdir < 0 - // 32 bits w = w*4; diff --git a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_driver.c b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_driver.c index 10648e9..7010542 100644 --- a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_driver.c +++ b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_driver.c @@ -554,7 +554,44 @@ GOBLINScreenInit(SCREEN_INIT_ARGS_DECL) xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "xf86MapSbusMem failed for Jareth\n"); pGoblin->has_accel = FALSE; } else { + struct jareth_fn jfn; xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Jareth successfully mapped\n"); + // get some functions + jfn.off = JARETH_FN_NUM_FILL; + if (ioctl (pGoblin->psdp->fd, JARETH_FN, &jfn) || (jfn.off == -1)) { + xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Fill function retrieval failed for Jareth\n"); + pGoblin->has_accel = FALSE; + } else { + pGoblin->fill_off = jfn.off; + pGoblin->fill_len = jfn.len; + } + jfn.off = JARETH_FN_NUM_FILLROP; + if (ioctl (pGoblin->psdp->fd, JARETH_FN, &jfn) || (jfn.off == -1)) { + xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Fillrop function retrieval failed for Jareth\n"); + pGoblin->has_accel = FALSE; + } else { + pGoblin->fillrop_off = jfn.off; + pGoblin->fillrop_len = jfn.len; + } + jfn.off = JARETH_FN_NUM_COPY; + if (ioctl (pGoblin->psdp->fd, JARETH_FN, &jfn) || (jfn.off == -1)) { + xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Copy function retrieval failed for Jareth\n"); + pGoblin->has_accel = FALSE; + } else { + pGoblin->copy_off = jfn.off; + pGoblin->copy_len = jfn.len; + } + jfn.off = JARETH_FN_NUM_COPYREV; + if (ioctl (pGoblin->psdp->fd, JARETH_FN, &jfn) || (jfn.off == -1)) { + xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Copyrev function retrieval failed for Jareth\n"); + pGoblin->has_accel = FALSE; + } else { + pGoblin->copyrev_off = jfn.off; + pGoblin->copyrev_len = jfn.len; + } + xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Jareth functions: fill %d %d, fillrop %d %d, copy %d %d, copyrev %d %d\n", + pGoblin->fill_off, pGoblin->fill_len, pGoblin->fillrop_off, pGoblin->fillrop_len, + pGoblin->copy_off, pGoblin->copy_len, pGoblin->copyrev_off, pGoblin->copyrev_len); } } diff --git a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_regs.h b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_regs.h index 133b8ce..ba378e1 100644 --- a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_regs.h +++ b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_regs.h @@ -72,6 +72,7 @@ typedef struct jareth_reg { volatile uint32_t ev_enable; volatile uint32_t instruction; volatile uint32_t ls_status; + volatile uint32_t cyc_counter; } JarethReg, *JarethRegPtr; typedef struct jareth_microcode { diff --git a/sbus-to-ztex-gateware-migen/jareth.py b/sbus-to-ztex-gateware-migen/jareth.py index c59fe02..80c5cbd 100644 --- a/sbus-to-ztex-gateware-migen/jareth.py +++ b/sbus-to-ztex-gateware-migen/jareth.py @@ -503,7 +503,7 @@ class ExecLS(ExecUnit, AutoDoc): If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]), NextValue(cpar, 0), NextValue(address, addresses[self.instruction.immediate[0:log2_int(width//32)]]), - NextValue(wishbone, ~(addresses[self.instruction.immediate[0:log2_int(width//32)]] == 0x8)), + NextValue(wishbone, ~(addresses[self.instruction.immediate[0:log2_int(width//32)]][24:28] == 0x8)), NextState("DOMEM"), ).Elif(self.instruction.opcode == opcodes["SETM"][0], Case(self.instruction.immediate[0:2], @@ -516,7 +516,8 @@ class ExecLS(ExecUnit, AutoDoc): NextState("MEM_ODD") ], 0x2 : [ NextValue(r_dat_f[2], self.a[(granule_bits-3):len(r_dat_f[2])]), NextValue(offset, self.a[(granule_bits-3):len(r_dat_f[2])]), - NextValue(offsetpsize, self.b[0:max_size_bits] + ((self.a[(granule_bits-3):len(r_dat_f[2])]) << (granule_bits-3)) ), + #NextValue(offsetpsize, self.b[0:max_size_bits] + ((self.a[(granule_bits-3):len(r_dat_f[2])]) << (granule_bits-3)) ), + NextValue(offsetpsize, self.b[0:max_size_bits]), NextState("GENMASK_R0"), ], 0x1 : [ NextValue(r_dat_f[1], self.a[(granule_bits-3):len(r_dat_f[1])]), @@ -606,27 +607,37 @@ class ExecLS(ExecUnit, AutoDoc): ) ) ) - for X in range(0, granule_num): - lsseq.act("GENMASK_R" + str(X), - NextValue(cpar, cpar ^ 1), - If((offsetpsize > X) & (X >= offset), - NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 1), - ).Else( - NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 0), - ), - If(X == (granule_num-1), - If(cpar, ## checkme - NextState("MEM_ODD") - ).Else( - NextState("MEM_EVEN1") - ) - ).Else( - NextState("GENMASK_R" + str(X+1)), - ), - ) - lsseq.act("GENMASK_R"+str(granule_num), # avoids MiGen complaining, unreachable + #for X in range(0, granule_num): + # lsseq.act("GENMASK_R" + str(X), + # NextValue(cpar, cpar ^ 1), + # If((offsetpsize > X) & (X >= offset), + # NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 1), + # ).Else( + # NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 0), + # ), + # If(X == (granule_num-1), + # If(cpar, + # NextState("MEM_ODD") + # ).Else( + # NextState("MEM_EVEN1") + # ) + # ).Else( + # NextState("GENMASK_R" + str(X+1)), + # ), + # ) + #lsseq.act("GENMASK_R"+str(granule_num), # avoids MiGen complaining, unreachable + # NextValue(cpar, cpar ^ 1), + # If(cpar, + # NextState("MEM_ODD") + # ).Else( + # NextState("MEM_EVEN1") + # ) + #) + lsseq.act("GENMASK_R0", NextValue(cpar, cpar ^ 1), - If(cpar, ## checkme + NextValue(r_dat_m[self.instruction.immediate[0:2]], + (((Signal(33, reset=1) << offsetpsize) - 1) << (offset))), + If(cpar, NextState("MEM_ODD") ).Else( NextState("MEM_EVEN1") @@ -665,7 +676,7 @@ class ExecLS(ExecUnit, AutoDoc): If(wishbone & ~interface.ack, If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), - ).Elif(self.instruction.immediate[5], # post-inc + ).Elif(self.instruction.immediate[5], # post-dec NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), If(self.instruction.immediate[8], @@ -682,7 +693,7 @@ class ExecLS(ExecUnit, AutoDoc): If(self.instruction.opcode == opcodes["MEM"][0], NextValue(lbuf[128:256], 0), ), - If(cpar, ## checkme + If(cpar, NextState("MEM_ODD") ).Else( NextState("MEM_EVEN1") @@ -697,7 +708,7 @@ class ExecLS(ExecUnit, AutoDoc): If(memoryport.cmd.ready, If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), - ).Elif(self.instruction.immediate[5], # post-inc + ).Elif(self.instruction.immediate[5], # post-dec NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), NextState("MEMh"), @@ -705,11 +716,13 @@ class ExecLS(ExecUnit, AutoDoc): ).Else( # no high If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), - ).Elif(self.instruction.immediate[5], # post-inc + ).Elif(self.instruction.immediate[5], # post-dec NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), - NextValue(lbuf[128:256], 0), - If(cpar, ## checkme + If(self.instruction.opcode == opcodes["MEM"][0], + NextValue(lbuf[128:256], 0), + ), + If(cpar, NextState("MEM_ODD") ).Else( NextState("MEM_EVEN1") @@ -749,11 +762,11 @@ class ExecLS(ExecUnit, AutoDoc): If(wishbone & ~interface.ack, If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), - ).Elif(self.instruction.immediate[5], # post-inc + ).Elif(self.instruction.immediate[5], # post-dec NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), #NextValue(tries, 0), - If(cpar, ## checkme + If(cpar, NextState("MEM_ODD") ).Else( NextState("MEM_EVEN1") @@ -761,10 +774,10 @@ class ExecLS(ExecUnit, AutoDoc): ).Elif(~wishbone, If(self.instruction.immediate[6], # post-inc NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), - ).Elif(self.instruction.immediate[5], # post-inc + ).Elif(self.instruction.immediate[5], # post-dec NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1), ), - If(cpar, ## checkme + If(cpar, NextState("MEM_ODD") ).Else( NextState("MEM_EVEN1") @@ -785,7 +798,7 @@ class ExecLS(ExecUnit, AutoDoc): # NextValue(tries, 1), # NextState("IDLE") #).Else(NextValue(tries, 0), # no third attempt, give up - If(cpar, ## checkme + If(cpar, NextState("MEM_ODD") ).Else( NextState("MEM_EVEN1") @@ -802,21 +815,21 @@ class ExecLS(ExecUnit, AutoDoc): self.q.eq(0), #self.a ) ).Elif(self.instruction.opcode == opcodes["SETM"][0], - self.q.eq(0), #self.a + self.q.eq(0), #self.a ).Elif(self.instruction.opcode == opcodes["ADR"][0], - If(~self.instruction.immediate[7], + If(~self.instruction.immediate[7], # getadr [ self.q[x*32:(x+1)*32].eq(Cat(Signal(4, reset = 0), addresses[x])) for x in range(width//32) ], ).Else( self.q.eq(0), ) ).Elif(self.instruction.opcode == opcodes["GETM"][0], - self.q.eq(Cat(Cat(r_dat_f[0], Signal(28, reset = 0)), + self.q.eq(Cat(Cat(r_dat_f[0], Signal(32-len(r_dat_f[0]), reset = 0)), r_dat_m[0], - Cat(r_dat_f[1], Signal(28, reset = 0)), + Cat(r_dat_f[1], Signal(32-len(r_dat_f[1]), reset = 0)), r_dat_m[1], - Cat(r_dat_f[2], Signal(28, reset = 0)), + Cat(r_dat_f[2], Signal(32-len(r_dat_f[2]), reset = 0)), r_dat_m[2], - Cat(r_dat_f[3], Signal(28, reset = 0)), + Cat(r_dat_f[3], Signal(32-len(r_dat_f[3]), reset = 0)), r_dat_m[3])), ).Else( self.q.eq(0xBADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000), @@ -1047,6 +1060,7 @@ Here are the currently implemented opcodes for The Engine: ] self.ls_status = CSRStatus(32, description="Status of the L/S unit") + self.cyc_counter = CSRStatus(32, description="Cycle counter for each run") ### wishbone bus interface: decode the two address spaces and dispatch accordingly self.bus = bus = wishbone.Interface() @@ -1495,6 +1509,17 @@ Here are the currently implemented opcodes for The Engine: self.sync += abort.eq((abort & ~engine_go) | (self.exec_ls.has_failure[0] | self.exec_ls.has_failure[1] | self.exec_ls.has_timeout[0] | self.exec_ls.has_timeout[1])) self.comb += self.ls_status.status.eq(self.exec_ls.state) + cycctr = Signal(32) + engine_go_old = Signal() + self.sync.eng_clk += [ + engine_go_old.eq(engine_go), + If(running, + cycctr.eq(cycctr + 1)), + If(engine_go & ~engine_go_old, # pos edge + cycctr.eq(0)), + ] + self.comb += self.cyc_counter.status.eq(cycctr) + ##### TIMING CONSTRAINTS -- you want these. Trust me. clk50 = "clk50" diff --git a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs index 792cfd2..01c4b18 100644 --- a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs +++ b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs @@ -491,14 +491,14 @@ fn main() -> std::io::Result<()> { brz32 done128, %3 // reset masks resm %15 + // set alignement; we shift by the addr offset + setmq %15, %0, %2 + setma %15, %1, #16 // if $DST is aligned on 128 bits, jump to aligned loop brz4 start128, %0 // do the first column to align $DST startX: - // set alignement; we shift by the addr offset - setmq %15, %0, #16 - setma %15, %1, #16 // copy Y psa %9, %3 // copy $SRC / $DST @@ -614,6 +614,7 @@ fn main() -> std::io::Result<()> { // leftover X in %6 // data in %7 // masked data in %7 + // src data in %8 // live X count in %9 // $SRC / $DST in %10 // $DST / $SRC in %11 @@ -636,13 +637,13 @@ fn main() -> std::io::Result<()> { brz32 done128, %3 // reset masks resm %15 - // compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16) - and %14, %0, #15 // copy addresses psa %10, %0 psa %11, %1 // set todo X psa %13, %2 + // compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16), first the offset + and %14, %0, #15 // if 0, then we don't need a tail loop, so skip extra computation (that would wrongly give 16) brz32 skip, %14 @@ -652,9 +653,9 @@ fn main() -> std::io::Result<()> { min32v %14, %14, %2 // more than one address to increment bcast32 %14, %14 - // add the count to the addresses, SRC will now be aligned + // add the count to the addresses, DST will now be aligned add32v %10, %10, %14 - // add the count to the addresses, DST will have the proper alignment to shift input in the aligned loop + // add the count to the addresses, SRC will have the proper alignment to shift input in the aligned loop add32v %11, %11, %14 // so, do we do everything there ? sub32v %13, %2, %14 @@ -664,41 +665,50 @@ fn main() -> std::io::Result<()> { skip: // reset q mask (we will be aligned from now on) setmq %15, #0, #16 - // reset a mask to the proper shifting + // set a mask to the proper shifting setma %15, %11, #16 // now we need to figure out where we start to go backward - // currently we have the number of 'tail' (first column) elements in %14 (0 for aligned), number of 'loop' elements in %13, - // and $SRC+%14 & $DST+%14 in $10/$11 we $SRC+%14 aligned. + // currently we have the number of 'tail' (first column...) elements in %14 (0 for aligned), number of 'loop' elements in %13, + // and $SRC+%14 & $DST+%14 in $10/$11 with $SRC+%14 aligned. // compute X leftovers (%13 modulo 16 -> #15 is 15) in %6, we will have to start with those and %6, %13, #15 // compute the 'aligned' number of elements sub32v %15, %13, %6 - // if 0, jump to the main loop as we already have the proper addresses - brz32 loop128_y, %15 - bcast32 %15, %15 + // add the aligned number of element to $SRC+%14 & $DST+%14 add32v %10, %10, %15 add32v %11, %11, %15 - // if %6 is 0 (no leftovers), then $DST is pointing after the last element so need to remove 16 from $DST + // if %6 is 0 (no leftovers), then $DST is pointing after the last element so need to remove 16 from $DST and $SRC brnz32 skip2, %6 - sub32v %10, %10, #16 - skip2: // if $SRC is not aligned, we also need to add 16 (for prefetch) - and %15, %11, #15 - brz32 skip3, %15 + psa %15, #16 + bcast32 %15, %15 + sub32v %10, %10, %15 + sub32v %11, %11, %15 + + skip2: // // if $SRC+%13 is not aligned, we also need to add 16 (for prefetch) + // add32v %15, %11, %6 + // and %15, %15, #15 + // brz32 skip3, %15 + add32v %11, %11, #16 psa %15, #16 swap32 %15, %15 add32v %10, %10, %15 + + // add32v %15, %6, #16 + // add32v %11, %11, %15 + // swap32 %15, %15 + // add32v %10, %10, %15 skip3: // copy Y count psa %12, %3 loop128_y: - // set source and destination addresses for current Y // FIXME : +X, -1? + // set source and destination addresses for current Y setadr %15, %10 // then the rounded value in X sub32v %9, %13, %6 @@ -755,7 +765,7 @@ fn main() -> std::io::Result<()> { // do the first column if we need to brz32 done128, %14 // set alignement; we shift by the addr offset - setmq %15, %0, #16 + setmq %15, %0, %2 setma %15, %1, #16 // copy Y psa %9, %3 @@ -777,7 +787,7 @@ fn main() -> std::io::Result<()> { // if not zero, continue brnz32 loopX_y, %9 - done128: + done128: fin fin );