1
0
mirror of synced 2026-03-06 10:43:38 +00:00

more jareth

This commit is contained in:
Romain Dolbeau
2022-03-18 23:00:03 +01:00
parent d8504c8713
commit 938d931e51
7 changed files with 221 additions and 84 deletions

View File

@@ -120,12 +120,24 @@ struct scrolltest {
int pm;
int rop;
};
/* debug only, to remove */
#define GOBLIN_SCROLL _IOW('X', 0, struct scrolltest)
#define GOBLIN_FILL _IOW('X', 1, struct scrolltest)
#define GOBLIN_FILLROP _IOW('X', 2, struct scrolltest)
#define GOBLIN_COPY _IOW('X', 3, struct scrolltest)
#define GOBLIN_COPYREV _IOW('X', 4, struct scrolltest)
/*
 * Selectors for the JARETH_FN ioctl: the caller stores one of these in
 * jareth_fn.off to name the microcode routine it is asking about.
 */
#define JARETH_FN_NUM_FILL 0
#define JARETH_FN_NUM_FILLROP 1
#define JARETH_FN_NUM_COPY 2
#define JARETH_FN_NUM_COPYREV 3
/*
 * Argument of the JARETH_FN ioctl (copied in and back out, see _IOWR below).
 * The handler in goblinioctl() overwrites both fields: with the
 * program_offset[]/program_len[] entries of the selected routine, or with -1
 * in both fields when the selector is not recognised.
 * NOTE(review): the unit of off/len (presumably microcode words) is not
 * visible here -- confirm against the program_offset/program_len setup.
 */
struct jareth_fn {
int off; /* in: JARETH_FN_NUM_* selector; out: routine offset, or -1 */
int len; /* out: routine length, or -1 */
};
#define JARETH_FN _IOWR('j', 0, struct jareth_fn)
static int goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *);
static paddr_t goblin_mmap(void *, void *, off_t, int);
static void goblin_init_screen(void *, struct vcons_screen *, int, long *);
@@ -168,14 +180,14 @@ static const uint32_t program_fillrop[42] = { 0x13800089,0x130000c9,0x01bc0014
0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,
0x0000000a,0x0000000a };
static const uint32_t program_copy[48] = { 0x16800089,0x160000c9,0x01bc0014,0x0b00000d,0x013f0014,0x003f0054,0x002400c0,0x00180000,
0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248,
0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814,
0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220,
0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220,
0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
static const uint32_t program_copy[48] = { 0x16800089,0x160000c9,0x01bc0014,0x013c2014,0x003f0054,0x0a00000d,0x002400c0,0x00180000,
0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248,
0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814,
0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220,
0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220,
0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
static const uint32_t program_copyrev[66] = { 0x1f800089,0x1f0000c9,0x01bc0014,0x003af007,0x00280000,0x002c0040,0x00340080,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x060003c9,0x003c03d0,0x0028f285,0x002cf2c5,0x00800188,0x002b0286,0x003ef2c7,0x020003c9,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013f0014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a };
static const uint32_t program_copyrev[66] = { 0x1f800089,0x1f0000c9,0x01bc0014,0x00280000,0x002c0040,0x00340080,0x003af007,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x003c03d0,0x0028f285,0x002cf2c5,0x02000188,0x003c0c00,0x003c03d0,0x0028f286,0x002cf2c6,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013c2014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a };
static const uint32_t* programs[8] = { program_scroll128, program_fill128, program_fill256, program_fill,
program_fillrop, program_copy, program_copyrev, NULL };
@@ -430,6 +442,28 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
}
break;
/*
 * JARETH_FN: tell userland where a microcode routine lives.
 * On entry fn->off holds a JARETH_FN_NUM_* selector; on exit fn->off/fn->len
 * hold the matching program_offset[]/program_len[] entries.
 */
case JARETH_FN: {
struct jareth_fn *fn = (struct jareth_fn *)data;
int pidx = -1; /* -1 == selector not recognised */
/* Nothing to report when the Jareth engine is absent. */
if (!sc->sc_has_jareth) {
return ENXIO;
}
/*
 * Map the selector to an index in the programs[] table declared in this
 * file (3 = program_fill, 4 = program_fillrop, 5 = program_copy,
 * 6 = program_copyrev).
 * NOTE(review): assumes program_offset[]/program_len[] are indexed the
 * same way as programs[] -- confirm where they are filled in.
 */
switch (fn->off) {
case JARETH_FN_NUM_FILL: pidx = 3; break;
case JARETH_FN_NUM_FILLROP: pidx = 4; break;
case JARETH_FN_NUM_COPY: pidx = 5; break;
case JARETH_FN_NUM_COPYREV: pidx = 6; break;
}
if (pidx != -1) {
fn->off = program_offset[pidx];
fn->len = program_len[pidx];
} else {
/*
 * Unknown selector: this arm does not return an error code, it only
 * marks both fields with -1, so the caller has to check for that.
 */
fn->off = -1;
fn->len = -1;
}
}
break;
default:
return (ENOTTY);
}
@@ -1143,7 +1177,10 @@ static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosi
} else {
//aprint_normal_dev(sc->sc_dev, "WAIT - Jareth status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, jareth_ls_status_read(sc));
}
#if 1
device_printf(sc->sc_dev, "last run took %d cycle (eng_clk)\n", jareth_cyc_counter_read(sc));
#endif
return 0;
}

View File

@@ -76,6 +76,10 @@ typedef struct {
uint32_t fg;
int xdir, ydir;
uint32_t srcoff, srcpitch;
int fill_off, fill_len;
int fillrop_off, fillrop_len;
int copy_off, copy_len;
int copyrev_off, copyrev_len;
} GoblinRec, *GoblinPtr;
extern int GoblinScreenPrivateIndex;
@@ -106,8 +110,18 @@ int GOBLINEXAInit(ScreenPtr);
#include <dev/sun/fbio.h>
#include <sys/ioccom.h>
#define GOBLIN_SET_PIXELMODE _IOW('M', 3, int)
/*
 * Userland copy of the JARETH_FN ioctl interface.
 * NOTE(review): these values and the struct layout must stay identical to
 * the definitions used by the kernel driver -- keep both in sync by hand.
 *
 * Selectors: stored in jareth_fn.off before the call to name the microcode
 * routine being queried.
 */
#define JARETH_FN_NUM_FILL 0
#define JARETH_FN_NUM_FILLROP 1
#define JARETH_FN_NUM_COPY 2
#define JARETH_FN_NUM_COPYREV 3
/*
 * Argument of the JARETH_FN ioctl. Callers in this driver treat off == -1
 * after the call as "routine not available".
 */
struct jareth_fn {
int off; /* in: JARETH_FN_NUM_* selector; out: routine offset, or -1 */
int len; /* out: routine length */
};
#define JARETH_FN _IOWR('j', 0, struct jareth_fn)
#else
#define GOBLIN_SET_PIXELMODE (('M' << 8) | 3)
#error "toto"
#endif
#endif /* GOBLIN_H */

View File

@@ -35,7 +35,7 @@
/* DGA stuff */
#define DEBUG_GOBLIN 1
//#define DEBUG_GOBLIN 1
#ifdef DEBUG_GOBLIN
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
@@ -249,6 +249,8 @@ GoblinWait(GoblinPtr pGoblin)
/*
 * Bit 0 of the status word still set here means the engine did not go idle
 * in time; otherwise report how long the last run took.
 */
if (status & 1) {
/* One conversion per argument: the former second "%08x" had no matching
 * argument, which is undefined behaviour in a printf-style call. */
xf86Msg(X_ERROR, "Jareth wait for idle timed out %08x\n", status);
} else {
/* cyc_counter is an unsigned 32-bit register, so print it as unsigned. */
xf86Msg(X_INFO, "Jareth: last operation took %u cycles (eng_clk)\n", (unsigned int)pGoblin->jreg->cyc_counter);
}
}
@@ -348,12 +350,12 @@ GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
if ((alu == 0x3) && // GCcopy
(planemask == 0xFFFFFFFF)) { // full pattern
// fill
pGoblin->jreg->mpstart = 37; // FIXME
pGoblin->jreg->mplen = 38;
pGoblin->jreg->mpstart = pGoblin->fill_off;
pGoblin->jreg->mplen = pGoblin->fill_len;
} else {
// fillrop
pGoblin->jreg->mpstart = 75; // FIXME
pGoblin->jreg->mplen = 41;
pGoblin->jreg->mpstart = pGoblin->fillrop_off;
pGoblin->jreg->mplen = pGoblin->fillrop_len;
}
return TRUE;
}
@@ -429,18 +431,31 @@ GoblinPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
pGoblin->last_mask = planemask;
pGoblin->last_rop = alu;
if ((alu == 0x3) && // GCcopy
(planemask == 0xFFFFFFFF)) { // full pattern
// fill
pGoblin->jreg->mpstart = 116; // FIXME
pGoblin->jreg->mplen = 49;
if (pGoblin->xdir > 0) {
if ((alu == 0x3) && // GCcopy
(planemask == 0xFFFFFFFF)) { // full pattern
// fill
pGoblin->jreg->mpstart = pGoblin->copy_off;
pGoblin->jreg->mplen = pGoblin->copy_len;
} else {
// fillrop
pGoblin->jreg->mpstart = pGoblin->copy_off; // FIXME
pGoblin->jreg->mplen = pGoblin->copy_len;
}
} else {
// fillrop
pGoblin->jreg->mpstart = 116; // FIXME FIXME FIXME
pGoblin->jreg->mplen = 49;
if ((alu == 0x3) && // GCcopy
(planemask == 0xFFFFFFFF)) { // full pattern
// fill
pGoblin->jreg->mpstart = pGoblin->copyrev_off;
pGoblin->jreg->mplen = pGoblin->copyrev_len;
} else {
// fillrop
pGoblin->jreg->mpstart = pGoblin->copyrev_off; // FIXME
pGoblin->jreg->mplen = pGoblin->copyrev_len;
}
}
DPRINTF(X_ERROR, "PrepareCopy: alu %d, pm 0x%08\n", alu, planemask);
DPRINTF(X_ERROR, "PrepareCopy: alu %d, pm 0x%08x, xdir/ydir %d/%d\n", alu, planemask, xdir, ydir);
return TRUE;
}
@@ -471,8 +486,6 @@ GoblinCopy(PixmapPtr pDstPixmap,
dstpitch = -dstpitch;
}
// FIXME: xdir < 0
// 32 bits
w = w*4;

View File

@@ -554,7 +554,44 @@ GOBLINScreenInit(SCREEN_INIT_ARGS_DECL)
xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "xf86MapSbusMem failed for Jareth\n");
pGoblin->has_accel = FALSE;
} else {
struct jareth_fn jfn;
xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Jareth successfully mapped\n");
// get some functions
jfn.off = JARETH_FN_NUM_FILL;
if (ioctl (pGoblin->psdp->fd, JARETH_FN, &jfn) || (jfn.off == -1)) {
xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Fill function retrieval failed for Jareth\n");
pGoblin->has_accel = FALSE;
} else {
pGoblin->fill_off = jfn.off;
pGoblin->fill_len = jfn.len;
}
jfn.off = JARETH_FN_NUM_FILLROP;
if (ioctl (pGoblin->psdp->fd, JARETH_FN, &jfn) || (jfn.off == -1)) {
xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Fillrop function retrieval failed for Jareth\n");
pGoblin->has_accel = FALSE;
} else {
pGoblin->fillrop_off = jfn.off;
pGoblin->fillrop_len = jfn.len;
}
jfn.off = JARETH_FN_NUM_COPY;
if (ioctl (pGoblin->psdp->fd, JARETH_FN, &jfn) || (jfn.off == -1)) {
xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Copy function retrieval failed for Jareth\n");
pGoblin->has_accel = FALSE;
} else {
pGoblin->copy_off = jfn.off;
pGoblin->copy_len = jfn.len;
}
jfn.off = JARETH_FN_NUM_COPYREV;
if (ioctl (pGoblin->psdp->fd, JARETH_FN, &jfn) || (jfn.off == -1)) {
xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Copyrev function retrieval failed for Jareth\n");
pGoblin->has_accel = FALSE;
} else {
pGoblin->copyrev_off = jfn.off;
pGoblin->copyrev_len = jfn.len;
}
xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Jareth functions: fill %d %d, fillrop %d %d, copy %d %d, copyrev %d %d\n",
pGoblin->fill_off, pGoblin->fill_len, pGoblin->fillrop_off, pGoblin->fillrop_len,
pGoblin->copy_off, pGoblin->copy_len, pGoblin->copyrev_off, pGoblin->copyrev_len);
}
}

View File

@@ -72,6 +72,7 @@ typedef struct jareth_reg {
volatile uint32_t ev_enable;
volatile uint32_t instruction;
volatile uint32_t ls_status;
volatile uint32_t cyc_counter;
} JarethReg, *JarethRegPtr;
typedef struct jareth_microcode {

View File

@@ -503,7 +503,7 @@ class ExecLS(ExecUnit, AutoDoc):
If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]) | (self.instruction.opcode == opcodes["LOADL"][0]),
NextValue(cpar, 0),
NextValue(address, addresses[self.instruction.immediate[0:log2_int(width//32)]]),
NextValue(wishbone, ~(addresses[self.instruction.immediate[0:log2_int(width//32)]] == 0x8)),
NextValue(wishbone, ~(addresses[self.instruction.immediate[0:log2_int(width//32)]][24:28] == 0x8)),
NextState("DOMEM"),
).Elif(self.instruction.opcode == opcodes["SETM"][0],
Case(self.instruction.immediate[0:2],
@@ -516,7 +516,8 @@ class ExecLS(ExecUnit, AutoDoc):
NextState("MEM_ODD") ],
0x2 : [ NextValue(r_dat_f[2], self.a[(granule_bits-3):len(r_dat_f[2])]),
NextValue(offset, self.a[(granule_bits-3):len(r_dat_f[2])]),
NextValue(offsetpsize, self.b[0:max_size_bits] + ((self.a[(granule_bits-3):len(r_dat_f[2])]) << (granule_bits-3)) ),
#NextValue(offsetpsize, self.b[0:max_size_bits] + ((self.a[(granule_bits-3):len(r_dat_f[2])]) << (granule_bits-3)) ),
NextValue(offsetpsize, self.b[0:max_size_bits]),
NextState("GENMASK_R0"),
],
0x1 : [ NextValue(r_dat_f[1], self.a[(granule_bits-3):len(r_dat_f[1])]),
@@ -606,27 +607,37 @@ class ExecLS(ExecUnit, AutoDoc):
)
)
)
for X in range(0, granule_num):
lsseq.act("GENMASK_R" + str(X),
NextValue(cpar, cpar ^ 1),
If((offsetpsize > X) & (X >= offset),
NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 1),
).Else(
NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 0),
),
If(X == (granule_num-1),
If(cpar, ## checkme
NextState("MEM_ODD")
).Else(
NextState("MEM_EVEN1")
)
).Else(
NextState("GENMASK_R" + str(X+1)),
),
)
lsseq.act("GENMASK_R"+str(granule_num), # avoids MiGen complaining, unreachable
#for X in range(0, granule_num):
# lsseq.act("GENMASK_R" + str(X),
# NextValue(cpar, cpar ^ 1),
# If((offsetpsize > X) & (X >= offset),
# NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 1),
# ).Else(
# NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 0),
# ),
# If(X == (granule_num-1),
# If(cpar,
# NextState("MEM_ODD")
# ).Else(
# NextState("MEM_EVEN1")
# )
# ).Else(
# NextState("GENMASK_R" + str(X+1)),
# ),
# )
#lsseq.act("GENMASK_R"+str(granule_num), # avoids MiGen complaining, unreachable
# NextValue(cpar, cpar ^ 1),
# If(cpar,
# NextState("MEM_ODD")
# ).Else(
# NextState("MEM_EVEN1")
# )
#)
lsseq.act("GENMASK_R0",
NextValue(cpar, cpar ^ 1),
If(cpar, ## checkme
NextValue(r_dat_m[self.instruction.immediate[0:2]],
(((Signal(33, reset=1) << offsetpsize) - 1) << (offset))),
If(cpar,
NextState("MEM_ODD")
).Else(
NextState("MEM_EVEN1")
@@ -665,7 +676,7 @@ class ExecLS(ExecUnit, AutoDoc):
If(wishbone & ~interface.ack,
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-inc
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
If(self.instruction.immediate[8],
@@ -682,7 +693,7 @@ class ExecLS(ExecUnit, AutoDoc):
If(self.instruction.opcode == opcodes["MEM"][0],
NextValue(lbuf[128:256], 0),
),
If(cpar, ## checkme
If(cpar,
NextState("MEM_ODD")
).Else(
NextState("MEM_EVEN1")
@@ -697,7 +708,7 @@ class ExecLS(ExecUnit, AutoDoc):
If(memoryport.cmd.ready,
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-inc
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
NextState("MEMh"),
@@ -705,11 +716,13 @@ class ExecLS(ExecUnit, AutoDoc):
).Else( # no high
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-inc
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
NextValue(lbuf[128:256], 0),
If(cpar, ## checkme
If(self.instruction.opcode == opcodes["MEM"][0],
NextValue(lbuf[128:256], 0),
),
If(cpar,
NextState("MEM_ODD")
).Else(
NextState("MEM_EVEN1")
@@ -749,11 +762,11 @@ class ExecLS(ExecUnit, AutoDoc):
If(wishbone & ~interface.ack,
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-inc
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
#NextValue(tries, 0),
If(cpar, ## checkme
If(cpar,
NextState("MEM_ODD")
).Else(
NextState("MEM_EVEN1")
@@ -761,10 +774,10 @@ class ExecLS(ExecUnit, AutoDoc):
).Elif(~wishbone,
If(self.instruction.immediate[6], # post-inc
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
).Elif(self.instruction.immediate[5], # post-inc
).Elif(self.instruction.immediate[5], # post-dec
NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] - 1),
),
If(cpar, ## checkme
If(cpar,
NextState("MEM_ODD")
).Else(
NextState("MEM_EVEN1")
@@ -785,7 +798,7 @@ class ExecLS(ExecUnit, AutoDoc):
# NextValue(tries, 1),
# NextState("IDLE")
#).Else(NextValue(tries, 0), # no third attempt, give up
If(cpar, ## checkme
If(cpar,
NextState("MEM_ODD")
).Else(
NextState("MEM_EVEN1")
@@ -802,21 +815,21 @@ class ExecLS(ExecUnit, AutoDoc):
self.q.eq(0), #self.a
)
).Elif(self.instruction.opcode == opcodes["SETM"][0],
self.q.eq(0), #self.a
self.q.eq(0), #self.a
).Elif(self.instruction.opcode == opcodes["ADR"][0],
If(~self.instruction.immediate[7],
If(~self.instruction.immediate[7], # getadr
[ self.q[x*32:(x+1)*32].eq(Cat(Signal(4, reset = 0), addresses[x])) for x in range(width//32) ],
).Else(
self.q.eq(0),
)
).Elif(self.instruction.opcode == opcodes["GETM"][0],
self.q.eq(Cat(Cat(r_dat_f[0], Signal(28, reset = 0)),
self.q.eq(Cat(Cat(r_dat_f[0], Signal(32-len(r_dat_f[0]), reset = 0)),
r_dat_m[0],
Cat(r_dat_f[1], Signal(28, reset = 0)),
Cat(r_dat_f[1], Signal(32-len(r_dat_f[1]), reset = 0)),
r_dat_m[1],
Cat(r_dat_f[2], Signal(28, reset = 0)),
Cat(r_dat_f[2], Signal(32-len(r_dat_f[2]), reset = 0)),
r_dat_m[2],
Cat(r_dat_f[3], Signal(28, reset = 0)),
Cat(r_dat_f[3], Signal(32-len(r_dat_f[3]), reset = 0)),
r_dat_m[3])),
).Else(
self.q.eq(0xBADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000),
@@ -1047,6 +1060,7 @@ Here are the currently implemented opcodes for The Engine:
]
self.ls_status = CSRStatus(32, description="Status of the L/S unit")
self.cyc_counter = CSRStatus(32, description="Cycle counter for each run")
### wishbone bus interface: decode the two address spaces and dispatch accordingly
self.bus = bus = wishbone.Interface()
@@ -1495,6 +1509,17 @@ Here are the currently implemented opcodes for The Engine:
self.sync += abort.eq((abort & ~engine_go) | (self.exec_ls.has_failure[0] | self.exec_ls.has_failure[1] | self.exec_ls.has_timeout[0] | self.exec_ls.has_timeout[1]))
self.comb += self.ls_status.status.eq(self.exec_ls.state)
# Per-run cycle counter, exposed read-only through the cyc_counter CSR.
cycctr = Signal(32)
# engine_go delayed by one eng_clk cycle, used to detect its rising edge.
engine_go_old = Signal()
self.sync.eng_clk += [
engine_go_old.eq(engine_go),
# Count eng_clk cycles while the engine is running.
If(running,
cycctr.eq(cycctr + 1)),
# Restart from zero when a new run is launched.
# NOTE(review): if both conditions hold in the same cycle, this later
# assignment is presumably the one that takes effect -- confirm.
If(engine_go & ~engine_go_old, # pos edge
cycctr.eq(0)),
]
# NOTE(review): cycctr lives in the eng_clk domain while the CSR is read from
# the bus side -- confirm whether a clock-domain crossing is needed here.
self.comb += self.cyc_counter.status.eq(cycctr)
##### TIMING CONSTRAINTS -- you want these. Trust me.
clk50 = "clk50"

View File

@@ -491,14 +491,14 @@ fn main() -> std::io::Result<()> {
brz32 done128, %3
// reset masks
resm %15
// set alignment; we shift by the addr offset
setmq %15, %0, %2
setma %15, %1, #16
// if $DST is aligned on 128 bits, jump to aligned loop
brz4 start128, %0
// do the first column to align $DST
startX:
// set alignment; we shift by the addr offset
setmq %15, %0, #16
setma %15, %1, #16
// copy Y
psa %9, %3
// copy $SRC / $DST
@@ -614,6 +614,7 @@ fn main() -> std::io::Result<()> {
// leftover X in %6
// data in %7
// masked data in %7
// src data in %8
// live X count in %9
// $SRC / $DST in %10
// $DST / $SRC in %11
@@ -636,13 +637,13 @@ fn main() -> std::io::Result<()> {
brz32 done128, %3
// reset masks
resm %15
// compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16)
and %14, %0, #15
// copy addresses
psa %10, %0
psa %11, %1
// set todo X
psa %13, %2
// compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16), first the offset
and %14, %0, #15
// if 0, then we don't need a tail loop, so skip extra computation (that would wrongly give 16)
brz32 skip, %14
@@ -652,9 +653,9 @@ fn main() -> std::io::Result<()> {
min32v %14, %14, %2
// more than one address to increment
bcast32 %14, %14
// add the count to the addresses, SRC will now be aligned
// add the count to the addresses, DST will now be aligned
add32v %10, %10, %14
// add the count to the addresses, DST will have the proper alignment to shift input in the aligned loop
// add the count to the addresses, SRC will have the proper alignment to shift input in the aligned loop
add32v %11, %11, %14
// so, do we do everything there ?
sub32v %13, %2, %14
@@ -664,41 +665,50 @@ fn main() -> std::io::Result<()> {
skip:
// reset q mask (we will be aligned from now on)
setmq %15, #0, #16
// reset a mask to the proper shifting
// set a mask to the proper shifting
setma %15, %11, #16
// now we need to figure out where we start to go backward
// currently we have the number of 'tail' (first column) elements in %14 (0 for aligned), number of 'loop' elements in %13,
// and $SRC+%14 & $DST+%14 in $10/$11 we $SRC+%14 aligned.
// currently we have the number of 'tail' (first column...) elements in %14 (0 for aligned), number of 'loop' elements in %13,
// and $SRC+%14 & $DST+%14 in $10/$11 with $SRC+%14 aligned.
// compute X leftovers (%13 modulo 16 -> #15 is 15) in %6, we will have to start with those
and %6, %13, #15
// compute the 'aligned' number of elements
sub32v %15, %13, %6
// if 0, jump to the main loop as we already have the proper addresses
brz32 loop128_y, %15
bcast32 %15, %15
// add the aligned number of element to $SRC+%14 & $DST+%14
add32v %10, %10, %15
add32v %11, %11, %15
// if %6 is 0 (no leftovers), then $DST is pointing after the last element so need to remove 16 from $DST
// if %6 is 0 (no leftovers), then $DST is pointing after the last element so need to remove 16 from $DST and $SRC
brnz32 skip2, %6
sub32v %10, %10, #16
skip2: // if $SRC is not aligned, we also need to add 16 (for prefetch)
and %15, %11, #15
brz32 skip3, %15
psa %15, #16
bcast32 %15, %15
sub32v %10, %10, %15
sub32v %11, %11, %15
skip2: // // if $SRC+%13 is not aligned, we also need to add 16 (for prefetch)
// add32v %15, %11, %6
// and %15, %15, #15
// brz32 skip3, %15
add32v %11, %11, #16
psa %15, #16
swap32 %15, %15
add32v %10, %10, %15
// add32v %15, %6, #16
// add32v %11, %11, %15
// swap32 %15, %15
// add32v %10, %10, %15
skip3:
// copy Y count
psa %12, %3
loop128_y:
// set source and destination addresses for current Y // FIXME : +X, -1?
// set source and destination addresses for current Y
setadr %15, %10
// then the rounded value in X
sub32v %9, %13, %6
@@ -755,7 +765,7 @@ fn main() -> std::io::Result<()> {
// do the first column if we need to
brz32 done128, %14
// set alignment; we shift by the addr offset
setmq %15, %0, #16
setmq %15, %0, %2
setma %15, %1, #16
// copy Y
psa %9, %3
@@ -777,7 +787,7 @@ fn main() -> std::io::Result<()> {
// if not zero, continue
brnz32 loopX_y, %9
done128:
done128:
fin
fin
);