From a75b2a2d186981b8fdf736a90b802f43eb33304d Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Mon, 14 Mar 2022 23:13:46 +0100 Subject: [PATCH] more EXA --- NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c | 79 ++++++++- .../xf86-video-goblin/dist/src/goblin_accel.c | 106 +++++++----- sbus-to-ztex-gateware-migen/jareth.py | 12 +- .../jareth_code/jareth_code.rs | 159 +++++++++++++++++- 4 files changed, 307 insertions(+), 49 deletions(-) diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c b/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c index 3c3934e..41a14ad 100644 --- a/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/goblin.c @@ -123,6 +123,7 @@ struct scrolltest { #define GOBLIN_SCROLL _IOW('X', 0, struct scrolltest) #define GOBLIN_FILL _IOW('X', 1, struct scrolltest) #define GOBLIN_FILLROP _IOW('X', 2, struct scrolltest) +#define GOBLIN_COPY _IOW('X', 3, struct scrolltest) static int goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *); static paddr_t goblin_mmap(void *, void *, off_t, int); @@ -144,6 +145,7 @@ static int power_off(struct goblin_softc *sc); static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n); static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n); static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop); +static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop); static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9, 0x00004005,0xfb000809,0x0000000a,0x0000000a }; static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005, @@ -164,9 +166,11 @@ static const uint32_t 
program_fillrop[41] = { 0x13000089,0x128000c9,0x01bc0014 0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a, 0x0000000a}; -static const uint32_t* programs[6] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, NULL }; -static const uint32_t program_len[6] = { 12, 11, 14, 38, 41, 0 }; -static uint32_t program_offset[6]; +static const uint32_t program_copy[49] = { 0x17000089,0x168000c9,0x01bc0014,0x0b80000d,0x013f0014,0x003f0054,0x00380011,0x001400c0,0x00180000,0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00161146,0xfc000148,0x0016f007,0x00145c06,0x0014214f,0x00140150,0x00005005,0x00085086,0x0b800089,0x013f0814,0x00045045,0x003f0054,0x001af087,0x403c0012,0x00146086,0xa0a00013,0x02800149,0x001c0220,0x603c7013,0x00170146,0x20a08015,0xfd800148,0x0280018d,0x013c6814,0x001c0013,0x001c0220,0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a }; + +static const uint32_t* programs[7] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, program_copy, NULL }; +static const uint32_t program_len[7] = { 12, 11, 14, 38, 41, 49, 0 }; +static uint32_t program_offset[7]; static void goblin_set_depth(struct goblin_softc *, int); @@ -403,6 +407,12 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l) } break; + case GOBLIN_COPY: { + struct scrolltest *st = (struct scrolltest *)data; + jareth_copy(sc, jareth_verbose, st->y0, st->y1, st->x0, st->w, st->n, /* x1 */ st->pm, st->rop); + } + break; + default: return (ENOTTY); } @@ -881,6 +891,53 @@ static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose return 0; } +static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) { + const uint32_t base = 0; + const int pidx = 5; // copy + /* int i; */ + + /* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", 
__PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */ + + power_on(sc); + + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,0), (sc->sc_internal_adr + y1 * sc->sc_stride + x1)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,1), (sc->sc_internal_adr + y0 * sc->sc_stride + x0)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,0), (sc->sc_internal_adr + y0 * sc->sc_stride + x0)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,1), (sc->sc_internal_adr + y1 * sc->sc_stride + x1)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,0), (w)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,0), (n)); + /* for (i = 1 ; i < 8 ; i++) { */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,i), 0); */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), 0); */ + /* } */ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,0), (sc->sc_stride)); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,1), (sc->sc_stride)); + jareth_mpstart_write(sc, program_offset[pidx]); + jareth_mplen_write(sc, program_len[pidx]); + +#if 0 + { + uint32_t data[8]; + int i, j; + char buf[512]; + for (i = 0 ; i < 16 ; i++) { + for (j = 0 ; j < 8 ; j++) + data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j)); + snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]); + aprint_normal("reg%d : %s\n", i, buf); + } + } +#endif + + (void)start_job(sc, verbose); + delay(1); + (void)wait_job(sc, 1, verbose); + + power_off(sc); + + return 0; +} + static void jareth_copyrows(void *cookie, int src, int dst, int n) { @@ -990,6 +1047,21 @@ static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosi if (verbose == jareth_verbose) 
aprint_normal_dev(sc->sc_dev, "WAIT - new max count %d with %d delay (param was %u)\n", max_cnt_seen, del, param); } + +#if 0 + { + const uint32_t base = 0; + uint32_t data[8]; + int i, j; + char buf[512]; + for (i = 0 ; i < 16 ; i++) { + for (j = 0 ; j < 8 ; j++) + data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j)); + snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]); + aprint_normal("reg%d : %s\n", i, buf); + } + } +#endif //jareth_control_write(sc, 0); if (status & (1<sc_dev, "WAIT - Jareth status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, jareth_ls_status_read(sc)); } + return 0; } diff --git a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_accel.c b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_accel.c index 93a5ec2..72cec2a 100644 --- a/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_accel.c +++ b/NetBSD/9.0/usr/xsrc/external/mit/xf86-video-goblin/dist/src/goblin_accel.c @@ -328,8 +328,7 @@ GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg) int i; ENTER; - DPRINTF(X_ERROR, "bits per pixel: %d\n", - pPixmap->drawable.bitsPerPixel); + DPRINTF(X_ERROR, "PrepareSolid bpp: %d, alu %d, pm 0x%08x, Fg 0x%08x\n", pPixmap->drawable.bitsPerPixel, alu, planemask, fg); if ((pGoblin->jreg->power & 1) != 1) pGoblin->jreg->power = 1; @@ -347,17 +346,15 @@ GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg) pGoblin->last_rop = alu; if ((alu == 0x3) && // GCcopy - (planemask == 0xFFFFFFFF)) { + (planemask == 0xFFFFFFFF)) { // full pattern // fill - pGoblin->jreg->mpstart = 37; + pGoblin->jreg->mpstart = 37; // FIXME pGoblin->jreg->mplen = 38; } else { // fillrop - pGoblin->jreg->mpstart = 75; + pGoblin->jreg->mpstart = 75; // FIXME pGoblin->jreg->mplen = 41; } - - DPRINTF(X_ERROR, "%s: %x; %x\n", __func__, alu, planemask); 
return TRUE; } @@ -423,6 +420,27 @@ GoblinPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap, pGoblin->srcoff = exaGetPixmapOffset(pSrcPixmap); pGoblin->xdir = xdir; pGoblin->ydir = ydir; + + GoblinWait(pGoblin); + + pGoblin->jregfile->reg[5][0] = planemask; + pGoblin->jregfile->reg[5][1] = alu; + + pGoblin->last_mask = planemask; + pGoblin->last_rop = alu; + + if ((alu == 0x3) && // GCcopy + (planemask == 0xFFFFFFFF)) { // full pattern + // plain copy program + pGoblin->jreg->mpstart = 116; // FIXME + pGoblin->jreg->mplen = 49; + } else { + /* FIXME: the copy program does not apply alu/planemask yet, */ + /* so return FALSE and let EXA fall back to software for this case */ + return FALSE; + } + + DPRINTF(X_ERROR, "PrepareCopy: alu %d, pm 0x%08x\n", alu, planemask); return TRUE; } @@ -439,50 +457,60 @@ GoblinCopy(PixmapPtr pDstPixmap, char *src, *dst; int i, j; ENTER; - - DPRINTF(X_ERROR, "Copy %d %d -> %d %d [%d x %d]\n", srcX, srcY, dstX, dstY, w, h); srcstart = (srcX << 2) + (pGoblin->srcpitch * srcY) + pGoblin->srcoff; dststart = (dstX << 2) + ( dstpitch * dstY) + dstoff; +#if 1 + src = (char*)0x8f000000 + srcstart; // fixme + dst = (char*)0x8f000000 + dststart; + + if (pGoblin->ydir < 0) { + src += pGoblin->srcpitch * (h-1); + dst += dstpitch * (h-1); + /* keep pGoblin->srcpitch unchanged: it is read again at the top of every Copy call */ + dstpitch = -dstpitch; + } + + // FIXME: xdir < 0 + + // 32 bits + w = w*4; + + GoblinWait(pGoblin); + + pGoblin->jregfile->reg[0][0] = (uint32_t)dst; + pGoblin->jregfile->reg[0][1] = (uint32_t)src; + pGoblin->jregfile->reg[1][0] = (uint32_t)src; + pGoblin->jregfile->reg[1][1] = (uint32_t)dst; + pGoblin->jregfile->reg[2][0] = w; + pGoblin->jregfile->reg[3][0] = h; + pGoblin->jregfile->reg[4][0] = dstpitch; + pGoblin->jregfile->reg[4][1] = (pGoblin->ydir < 0) ? -pGoblin->srcpitch : pGoblin->srcpitch; + + DPRINTF(X_ERROR, "Copy %d %d -> %d %d [%d x %d, %d %d] ; %d -> %d \n", srcX, srcY, dstX, dstY, w, h, pGoblin->xdir, pGoblin->ydir, srcstart, dststart); + + pGoblin->jreg->control = 1; // start + exaMarkSync(pDstPixmap->drawable.pScreen); + +#else src = pGoblin->fb + srcstart; dst = 
pGoblin->fb + dststart; - if (ydir > 0 && xdir > 0) { + if (pGoblin->ydir > 0) { for (j = 0 ; j < h ; j++) { - for (i = 0 ; i < w; i ++) { - *(src+i) = *(dst+i); - } - src += srcpitch; + memcpy(dst, src, w*4); + src += pGoblin->srcpitch; dst += dstpitch; } - } else if (ydir > 0 && xdir < 0) { - for (j = 0 ; j < h ; j++) { - for (i = w - 1 ; i >= 0 ; i --) { - *(src+i) = *(dst+i); - } - src += srcpitch; - dst += dstpitch; - } - } else if (ydir < 0 && xdir > 0) { - src += srcpitch * h; + } else if (pGoblin->ydir < 0 ) { + src += pGoblin->srcpitch * h; dst += dstpitch * h; for (j = 0 ; j < h ; j++) { - src -= srcpitch; + src -= pGoblin->srcpitch; dst -= dstpitch; - for (i = 0 ; i < w; i ++) { - *(src+i) = *(dst+i); - } + memcpy(dst, src, w*4); } - } else if (ydir < 0 && xdir < 0) { - src += srcpitch * h; - dst += dstpitch * h; - for (j = 0 ; j < h ; j++) { - src -= srcpitch; - dst -= dstpitch; - for (i = w - 1 ; i >= 0 ; i --) { - *(src+i) = *(dst+i); - } - } - } + } +#endif } diff --git a/sbus-to-ztex-gateware-migen/jareth.py b/sbus-to-ztex-gateware-migen/jareth.py index a73796e..bbd9506 100644 --- a/sbus-to-ztex-gateware-migen/jareth.py +++ b/sbus-to-ztex-gateware-migen/jareth.py @@ -28,6 +28,7 @@ opcodes = { # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shif "BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"], "BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"], "MIN32V" : [15, "Wd[x..x+32] $\gets$ umin(Ra[x..x+32], Rb[x..x+32]) // vector 32-bit umin"], + "BCAST32" : [16, "Wd[x..x+32] $\gets$ Ra[0..32]"], # for MEM, bit #31 (imm[8]) indicates both lanes are needed; imm[31] == 0 faster as the second access is not done ; "GETM": [17, "GETM: getmask" ], "ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ], @@ -328,12 +329,12 @@ 
passthrough. self.q.eq(Cat(0, self.a[:255])), ).Elif(self.instruction.opcode == opcodes["AND"][0], self.q.eq(self.a & self.b), - ), + ) ] class ExecAddSub(ExecUnit, AutoDoc): def __init__(self, width=256): - ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V" ]) + ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "BCAST32" ]) self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f""" """) @@ -346,8 +347,12 @@ class ExecAddSub(ExecUnit, AutoDoc): [ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] + self.b[x*32:(x+1)*32]) for x in range(0, width//32) ], ).Elif(self.instruction.opcode == opcodes["SUB32V"][0], [ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] - self.b[x*32:(x+1)*32]) for x in range(0, width//32) ], + ).Elif(self.instruction.opcode == opcodes["BCAST32"][0], + [ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ], ).Elif(self.instruction.opcode == opcodes["MIN32V"][0], - [ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]), self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32]), self.q.eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ], + [ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]), + self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32]) + ).Else(self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ], ) ] @@ -530,7 +535,6 @@ class ExecLS(ExecUnit, AutoDoc): ) ), ).Elif(self.instruction.opcode == opcodes["LOADH"][0], - NextValue(cpar, 0), NextValue(self.has_timeout, 0), NextValue(self.has_failure, 0), NextValue(timeout, 2047), diff --git a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs index 3e33475..8d0bd81 100644 --- a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs +++ b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs @@ -213,7 +213,7 @@ fn main() -> std::io::Result<()> { ); let mcode_fill = assemble_jareth!( - // x..x / $DST in %0, 128 bits + // x..x / $DST in %0 // 128-bits pattern in %1 [assumed to 
be alignement-homogneous] // x..x / X size in %2 // x..x / Y size in %3, @@ -330,8 +330,8 @@ fn main() -> std::io::Result<()> { ); let mcode_fillrop = assemble_jareth!( - // x..x / $DST in %0, 128 bits - // 128-bits pattern in %1 [assumed to be alignement-homogneous] + // x..x / $DST in %0 + // 128-bits pattern in %1 [assumed to be alignement-homogeneous] // x..x / X size in %2 // x..x / Y size in %3, // x..x / dst_stride in %4 (screen width?) @@ -453,6 +453,150 @@ fn main() -> std::io::Result<()> { fin ); + + + let mcode_copy = assemble_jareth!( + // x..x / $SRC / $DST in %0 + // x..x / $DST / $SRC in %1 + // x..x / X size in %2 + // x..x / Y size in %3, + // x..x src_stride / dst_stride in %4 (screen width?) + // ----- + // main loop: + // live X count in %9 + // leftover X in %6 + // // live Y count in %3 + // data in %7 + // masked data in %7 + // 0/scrap in %15 + // ----- + // header loop: + // live Y count in %9 + // $SRC / $DST in %6 + // dst data in %7 + // src data in %8 + // 0/scrap in %15 + + + start: + // if number of line or element in line is 0, exit early + brz32 done128, %2 + brz32 done128, %3 + // reset masks + resm %15 + // if $DST is aligned on 128 bits, jump to aligned loop + brz4 start128, %0 + + // do the first column to align $DST + startX: + // set alignement; we shift by the addr offset + //and %14, %2, #15 + setmq %15, %0, #16 + setma %15, %1, #16 + getm %14 + // copy Y + psa %9, %3 + // copy $SRC / $DST + psa %6, %0 + loopX_y: + // setadr + setadr %15, %6 + // load src + load256 %8, ^1 + // load old data + load128 %7, ^0 + // insert data + psa* %7, %8 + // rewrite data + store128 %15, ^0, %7 + // increment copied $SRC / $DST by stride + add32v %6, %6, %4 + // decrement copied Y count + sub32v %9, %9, #1 + // if not zero, continue + brnz32 loopX_y, %9 + + loopX_done: + // how much did we do (#15 is 15, #16 is 16) + and %9, %0, #15 + // compute 16-(x&15) + sub32v %9, #16, %9 + // compute the proper value + min32v %9, %9, %2 + // more than 
one address to increment + bcast32 %9, %9 + // add the count to the addresses, ^0 will now be aligned + add32v %0, %0, %9 + // remove from X, as we have done it + sub32v %2, %2, %9 + // fall through to the aligned loop if not 0 + brz32 done128, %2 + // reset q mask (we will be aligned from now on) + setmq %15, #0, #16 + // add the count to the addresses, ^1 will have the proper shift for masking + add32v %1, %1, %9 + // reset a mask to the proper shifting + setma %15, %1, #16 + + start128: + // compute X leftovers (modulo 16 -> #15 is 15) + and %6, %2, #15 + + loop128_y: + // set source and destination addresses for current Y + setadr %15, %0 + // then the rounded value in X + sub32v %9, %2, %6 + // prefetch data + load256inc %8, ^1 + // already 0, bypass aligned stuff + brz32 loop128_x_end, %9 + + loop128_x: + // merge data from input + psa* %7, %8 + // store to DST w/ post-increment + store128inc %15, ^0, %7 + // sub 16 (#16 is 16) from live rounded X count + sub32v %9, %9, #16 + // prefetch data + loadh128inc %8, ^1, %8 + // if X count is not 0, keep looping + brnz32 loop128_x, %9 + // check for line leftovers + loop128_x_end: + brz4 done128_x, %6 + + // set the leftovers mask (offset is 0 as we are aligned) + // IMPROVE ME + setmq %15, #0, %6 + // load old data + load128 %7, ^0 + // insert pattern + psa* %7, %8 + // rewrite data + store128 %15, ^0, %7 + // reset the Q mask + // IMPROVE ME + setmq %15, #0, #16 + + done128_x: + // decrement Y count + sub32v %3, %3, #1 + // if 0, finished + brz32 done128, %3 + + // add strides to initial addresses + add32v %0, %0, %4 + // loop128 to do next line + brz32 loop128_y, #0 + + done128: + fin + fin + ); + + let mut pos; pos = 0; @@ -518,5 +662,14 @@ fn main() -> std::io::Result<()> { println!(""); println!("-> {}", mcode_fillrop.len()); + pos = 0; + println!("copy:"); + while pos < mcode_copy.len() { + print!("0x{:08x},", mcode_copy[pos]); + pos = pos + 1; + } + println!(""); + println!("-> {}", mcode_copy.len()); + 
Ok(()) }