1
0
mirror of synced 2026-03-03 09:55:39 +00:00
This commit is contained in:
Romain Dolbeau
2022-03-14 23:13:46 +01:00
parent be6101d39d
commit a75b2a2d18
4 changed files with 307 additions and 49 deletions

View File

@@ -123,6 +123,7 @@ struct scrolltest {
#define GOBLIN_SCROLL _IOW('X', 0, struct scrolltest)
#define GOBLIN_FILL _IOW('X', 1, struct scrolltest)
#define GOBLIN_FILLROP _IOW('X', 2, struct scrolltest)
#define GOBLIN_COPY _IOW('X', 3, struct scrolltest)
static int goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *);
static paddr_t goblin_mmap(void *, void *, off_t, int);
@@ -144,6 +145,7 @@ static int power_off(struct goblin_softc *sc);
static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n);
static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n);
static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop);
static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9,
0x00004005,0xfb000809,0x0000000a,0x0000000a };
static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005,
@@ -164,9 +166,11 @@ static const uint32_t program_fillrop[41] = { 0x13000089,0x128000c9,0x01bc0014
0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,
0x0000000a};
static const uint32_t* programs[6] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, NULL };
static const uint32_t program_len[6] = { 12, 11, 14, 38, 41, 0 };
static uint32_t program_offset[6];
static const uint32_t program_copy[49] = { 0x17000089,0x168000c9,0x01bc0014,0x0b80000d,0x013f0014,0x003f0054,0x00380011,0x001400c0,0x00180000,0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00161146,0xfc000148,0x0016f007,0x00145c06,0x0014214f,0x00140150,0x00005005,0x00085086,0x0b800089,0x013f0814,0x00045045,0x003f0054,0x001af087,0x403c0012,0x00146086,0xa0a00013,0x02800149,0x001c0220,0x603c7013,0x00170146,0x20a08015,0xfd800148,0x0280018d,0x013c6814,0x001c0013,0x001c0220,0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
static const uint32_t* programs[7] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, program_copy, NULL };
static const uint32_t program_len[7] = { 12, 11, 14, 38, 41, 49, 0 };
static uint32_t program_offset[7];
static void goblin_set_depth(struct goblin_softc *, int);
@@ -403,6 +407,12 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
}
break;
/*
 * GOBLIN_COPY: run the Jareth copy program with the parameters found in the
 * caller-supplied struct scrolltest. The x1 argument of jareth_copy() is
 * carried in the st->pm field, and st->rop is forwarded as-is.
 * The return value of jareth_copy() is not checked here.
 */
case GOBLIN_COPY: {
struct scrolltest *st = (struct scrolltest *)data;
jareth_copy(sc, jareth_verbose, st->y0, st->y1, st->x0, st->w, st->n, /* x1 */ st->pm, st->rop);
}
break;
default:
return (ENOTTY);
}
@@ -881,6 +891,53 @@ static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose
return 0;
}
static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) {
const uint32_t base = 0;
const int pidx = 5; // copy
/* int i; */
/* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */
power_on(sc);
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,0), (sc->sc_internal_adr + y1 * sc->sc_stride + x1));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,1), (sc->sc_internal_adr + y0 * sc->sc_stride + x0));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,0), (sc->sc_internal_adr + y0 * sc->sc_stride + x0));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,1), (sc->sc_internal_adr + y1 * sc->sc_stride + x1));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,0), (w));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,0), (n));
/* for (i = 1 ; i < 8 ; i++) { */
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,i), 0); */
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), 0); */
/* } */
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,0), (sc->sc_stride));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,1), (sc->sc_stride));
jareth_mpstart_write(sc, program_offset[pidx]);
jareth_mplen_write(sc, program_len[pidx]);
#if 0
{
uint32_t data[8];
int i, j;
char buf[512];
for (i = 0 ; i < 16 ; i++) {
for (j = 0 ; j < 8 ; j++)
data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j));
snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]);
aprint_normal("reg%d : %s\n", i, buf);
}
}
#endif
(void)start_job(sc, verbose);
delay(1);
(void)wait_job(sc, 1, verbose);
power_off(sc);
return 0;
}
static void
jareth_copyrows(void *cookie, int src, int dst, int n)
{
@@ -990,6 +1047,21 @@ static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosi
if (verbose == jareth_verbose)
aprint_normal_dev(sc->sc_dev, "WAIT - new max count %d with %d delay (param was %u)\n", max_cnt_seen, del, param);
}
#if 0
{
const uint32_t base = 0;
uint32_t data[8];
int i, j;
char buf[512];
for (i = 0 ; i < 16 ; i++) {
for (j = 0 ; j < 8 ; j++)
data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j));
snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]);
aprint_normal("reg%d : %s\n", i, buf);
}
}
#endif
//jareth_control_write(sc, 0);
if (status & (1<<CSR_JARETH_STATUS_RUNNING_OFFSET)) {
@@ -1007,6 +1079,7 @@ static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosi
} else {
//aprint_normal_dev(sc->sc_dev, "WAIT - Jareth status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, jareth_ls_status_read(sc));
}
return 0;
}

View File

@@ -328,8 +328,7 @@ GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
int i;
ENTER;
DPRINTF(X_ERROR, "bits per pixel: %d\n",
pPixmap->drawable.bitsPerPixel);
DPRINTF(X_ERROR, "PrepareSolid bpp: %d, alu %d, pm 0x%08x, Fg 0x%08x\n", pPixmap->drawable.bitsPerPixel, alu, planemask, fg);
if ((pGoblin->jreg->power & 1) != 1)
pGoblin->jreg->power = 1;
@@ -347,17 +346,15 @@ GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
pGoblin->last_rop = alu;
if ((alu == 0x3) && // GCcopy
(planemask == 0xFFFFFFFF)) {
(planemask == 0xFFFFFFFF)) { // full pattern
// fill
pGoblin->jreg->mpstart = 37;
pGoblin->jreg->mpstart = 37; // FIXME
pGoblin->jreg->mplen = 38;
} else {
// fillrop
pGoblin->jreg->mpstart = 75;
pGoblin->jreg->mpstart = 75; // FIXME
pGoblin->jreg->mplen = 41;
}
DPRINTF(X_ERROR, "%s: %x; %x\n", __func__, alu, planemask);
return TRUE;
}
@@ -423,6 +420,27 @@ GoblinPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
pGoblin->srcoff = exaGetPixmapOffset(pSrcPixmap);
pGoblin->xdir = xdir;
pGoblin->ydir = ydir;
// GoblinWait() is called before the register file is modified, as in GoblinCopy()
GoblinWait(pGoblin);
// planemask and raster op go to lanes 0 and 1 of register 5 ...
pGoblin->jregfile->reg[5][0] = planemask;
pGoblin->jregfile->reg[5][1] = alu;
// ... and are also remembered in the driver state
pGoblin->last_mask = planemask;
pGoblin->last_rop = alu;
if ((alu == 0x3) && // GCcopy
(planemask == 0xFFFFFFFF)) { // full planemask
// plain copy
// FIXME: hard-coded program offset; 116 == 12+11+14+38+41, which matches the
// summed lengths of the programs preceding the copy program in the kernel
// driver's program_len[] table (presumably the same layout - confirm)
pGoblin->jreg->mpstart = 116; // FIXME
pGoblin->jreg->mplen = 49;
} else {
// copy with a raster op and/or partial planemask: no dedicated program is
// selected here yet, so this branch starts the same copy program as above
pGoblin->jreg->mpstart = 116; // FIXME FIXME FIXME
pGoblin->jreg->mplen = 49;
}
// The planemask conversion was "0x%08" with no conversion specifier, which is
// an invalid printf format and left the planemask argument unconsumed.
DPRINTF(X_ERROR, "PrepareCopy: alu %d, pm 0x%08x\n", alu, (unsigned int)planemask);
return TRUE;
}
@@ -439,50 +457,60 @@ GoblinCopy(PixmapPtr pDstPixmap,
char *src, *dst;
int i, j;
ENTER;
DPRINTF(X_ERROR, "Copy %d %d -> %d %d [%d x %d]\n", srcX, srcY, dstX, dstY, w, h);
srcstart = (srcX << 2) + (pGoblin->srcpitch * srcY) + pGoblin->srcoff;
dststart = (dstX << 2) + ( dstpitch * dstY) + dstoff;
#if 1
// Hardware path: load the copy parameters into the Jareth register file and
// start the program selected earlier through mpstart/mplen.
// NOTE(review): 0x8f000000 is a hard-coded base added to the pixmap offsets;
// the "fixme" below suggests it should be obtained from the device - confirm.
src = (char*)0x8f000000 + srcstart; // fixme
dst = (char*)0x8f000000 + dststart;
// Bottom-up copy: start on the last line and walk backwards by negating both pitches.
if (pGoblin->ydir < 0) {
src += pGoblin->srcpitch * (h-1);
dst += dstpitch * (h-1);
// NOTE(review): this negates the pitch stored in pGoblin rather than a local
// copy. If this function runs again before srcpitch is reset, the sign seen
// here and in the srcstart computation above will have flipped - confirm
// that srcpitch is re-initialised before every call.
pGoblin->srcpitch = -pGoblin->srcpitch;
dstpitch = -dstpitch;
}
// FIXME: xdir < 0
// 32 bits per pixel: convert the width from pixels to bytes
w = w*4;
GoblinWait(pGoblin);
// Registers 0 and 1 hold the same two addresses in opposite lane order.
// NOTE(review): the pointers are cast to uint32_t, which assumes they fit in
// 32 bits - confirm for the targets this driver is built for.
pGoblin->jregfile->reg[0][0] = (uint32_t)dst;
pGoblin->jregfile->reg[0][1] = (uint32_t)src;
pGoblin->jregfile->reg[1][0] = (uint32_t)src;
pGoblin->jregfile->reg[1][1] = (uint32_t)dst;
// Width (in bytes at this point) and height of the rectangle
pGoblin->jregfile->reg[2][0] = w;
pGoblin->jregfile->reg[3][0] = h;
// Destination pitch in lane 0, source pitch in lane 1 (both possibly negated above)
pGoblin->jregfile->reg[4][0] = dstpitch;
pGoblin->jregfile->reg[4][1] = pGoblin->srcpitch;
// Note that w is printed after the conversion to bytes
DPRINTF(X_ERROR, "Copy %d %d -> %d %d [%d x %d, %d %d] ; %d -> %d \n", srcX, srcY, dstX, dstY, w, h, pGoblin->xdir, pGoblin->ydir, srcstart, dststart);
pGoblin->jreg->control = 1; // start
// The job is only started here, not waited for; mark the screen for a later sync
exaMarkSync(pDstPixmap->drawable.pScreen);
#else
src = pGoblin->fb + srcstart;
dst = pGoblin->fb + dststart;
if (ydir > 0 && xdir > 0) {
if (pGoblin->ydir > 0) {
for (j = 0 ; j < h ; j++) {
for (i = 0 ; i < w; i ++) {
*(src+i) = *(dst+i);
}
src += srcpitch;
memcpy(dst, src, w*4);
src += pGoblin->srcpitch;
dst += dstpitch;
}
} else if (ydir > 0 && xdir < 0) {
for (j = 0 ; j < h ; j++) {
for (i = w - 1 ; i >= 0 ; i --) {
*(src+i) = *(dst+i);
}
src += srcpitch;
dst += dstpitch;
}
} else if (ydir < 0 && xdir > 0) {
src += srcpitch * h;
} else if (pGoblin->ydir < 0 ) {
src += pGoblin->srcpitch * h;
dst += dstpitch * h;
for (j = 0 ; j < h ; j++) {
src -= srcpitch;
src -= pGoblin->srcpitch;
dst -= dstpitch;
for (i = 0 ; i < w; i ++) {
*(src+i) = *(dst+i);
}
memcpy(dst, src, w*4);
}
} else if (ydir < 0 && xdir < 0) {
src += srcpitch * h;
dst += dstpitch * h;
for (j = 0 ; j < h ; j++) {
src -= srcpitch;
dst -= dstpitch;
for (i = w - 1 ; i >= 0 ; i --) {
*(src+i) = *(dst+i);
}
}
}
}
#endif
}

View File

@@ -28,6 +28,7 @@ opcodes = { # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shif
"BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"],
"BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"],
"MIN32V" : [15, "Wd[x..x+32] $\gets$ umin(Ra[x..x+32], Rb[x..x+32]) // vector 32-bit umin"],
"BCAST32" : [16, "Wd[x..x+32] $\gets$ Ra[0..32]"],
# for MEM, bit #31 (imm[8]) indicates both lanes are needed; imm[31] == 0 faster as the second access is not done ;
"GETM": [17, "GETM: getmask" ],
"ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
@@ -328,12 +329,12 @@ passthrough.
self.q.eq(Cat(0, self.a[:255])),
).Elif(self.instruction.opcode == opcodes["AND"][0],
self.q.eq(self.a & self.b),
),
)
]
class ExecAddSub(ExecUnit, AutoDoc):
def __init__(self, width=256):
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V" ])
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "BCAST32" ])
self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f"""
""")
@@ -346,8 +347,12 @@ class ExecAddSub(ExecUnit, AutoDoc):
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] + self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
).Elif(self.instruction.opcode == opcodes["SUB32V"][0],
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] - self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
).Elif(self.instruction.opcode == opcodes["BCAST32"][0],
[ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ],
).Elif(self.instruction.opcode == opcodes["MIN32V"][0],
[ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]), self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32]), self.q.eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ],
[ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]),
self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32])
).Else(self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ],
)
]
@@ -530,7 +535,6 @@ class ExecLS(ExecUnit, AutoDoc):
)
),
).Elif(self.instruction.opcode == opcodes["LOADH"][0],
NextValue(cpar, 0),
NextValue(self.has_timeout, 0),
NextValue(self.has_failure, 0),
NextValue(timeout, 2047),

View File

@@ -213,7 +213,7 @@ fn main() -> std::io::Result<()> {
);
let mcode_fill = assemble_jareth!(
// x..x / $DST in %0, 128 bits
// x..x / $DST in %0
// 128-bits pattern in %1 [assumed to be alignment-homogeneous]
// x..x / X size in %2
// x..x / Y size in %3,
@@ -330,8 +330,8 @@ fn main() -> std::io::Result<()> {
);
let mcode_fillrop = assemble_jareth!(
// x..x / $DST in %0, 128 bits
// 128-bits pattern in %1 [assumed to be alignement-homogneous]
// x..x / $DST in %0
// 128-bits pattern in %1 [assumed to be alignement-homogeneous]
// x..x / X size in %2
// x..x / Y size in %3,
// x..x / dst_stride in %4 (screen width?)
@@ -453,6 +453,150 @@ fn main() -> std::io::Result<()> {
fin
);
// Microcode for the rectangle copy program: copies an X-size by Y-size area
// from $SRC to $DST, first handling an unaligned leading column of $DST and
// then looping over 128-bit aligned stores with a masked tail for leftovers.
let mcode_copy = assemble_jareth!(
// Inputs (each line lists the lanes of a register, lowest lane last;
// presumably "x..x" stands for the unused upper lanes - confirm):
// x..x / $SRC / $DST in %0
// x..x / $DST / $SRC in %1
// x..x / X size in %2
// x..x / Y size in %3,
// x..x src_stride / dst_stride in %4 (screen width?)
// -----
// main loop:
// live X count in %9
// leftover X in %6
// live Y count in %3
// data in %7
// masked data in %7
// 0/scrap in %15
// -----
// header loop:
// live Y count in %9
// $SRC / $DST in %6
// dst data in %7
// src data in %8
// 0/scrap in %15
start:
// if the number of lines or of elements per line is 0, exit early
brz32 done128, %2
brz32 done128, %3
// reset masks
resm %15
// if $DST is aligned on 128 bits, jump to aligned loop
brz4 start128, %0
// do the first column to align $DST
startX:
// set alignment; we shift by the addr offset
// (disabled instruction kept for reference)
//and %14, %2, #15
setmq %15, %0, #16
setma %15, %1, #16
getm %14
// copy the Y count, so that %3 is preserved for the main loop
psa %9, %3
// copy $SRC / $DST, so that %0 is preserved for the main loop
psa %6, %0
loopX_y:
// set the addresses from the working copy
setadr %15, %6
// load src
load256 %8, ^1
// load old data
load128 %7, ^0
// insert data
psa* %7, %8
// rewrite data
store128 %15, ^0, %7
// increment copied $SRC / $DST by stride
add32v %6, %6, %4
// decrement copied Y count
sub32v %9, %9, #1
// if not zero, continue
brnz32 loopX_y, %9
loopX_done:
// how much did we do (#15 is 15, #16 is 16)
and %9, %0, #15
// compute 16-(x&15)
sub32v %9, #16, %9
// clamp to the X size (unsigned min): never account for more than was requested
min32v %9, %9, %2
// broadcast the count to every 32-bit lane, as there is more than one address to increment
bcast32 %9, %9
// add the count to the addresses, ^0 will now be aligned
add32v %0, %0, %9
// remove from X, as we have done it
sub32v %2, %2, %9
// if nothing is left in X we are done; otherwise fall through to the aligned loop
brz32 done128, %2
// reset q mask (we will be aligned from now on)
setmq %15, #0, #16
// add the count to the addresses, ^1 will have the proper shift for masking
add32v %1, %1, %9
// reset a mask to the proper shifting
setma %15, %1, #16
start128:
// compute X leftovers (modulo 16 -> #15 is 15)
and %6, %2, #15
loop128_y:
// set source and destination addresses for current Y
setadr %15, %0
// then the rounded value in X (X size minus leftovers)
sub32v %9, %2, %6
// prefetch data
load256inc %8, ^1
// already 0, bypass aligned stuff
brz32 loop128_x_end, %9
loop128_x:
// merge data from input
psa* %7, %8
// store to DST w/ post-increment
store128inc %15, ^0, %7
// sub 16 (#16 is 16) from live rounded X count
sub32v %9, %9, #16
// prefetch data
loadh128inc %8, ^1, %8
// if X count is not 0, keep looping
brnz32 loop128_x, %9
// check for line leftovers; skip the tail if there are none
loop128_x_end:
brz4 done128_x, %6
// set the leftovers mask (offset is 0 as we are aligned)
// IMPROVE ME
setmq %15, #0, %6
// load old data
load128 %7, ^0
// insert data (same merge as in the aligned loop, limited by the leftovers mask)
psa* %7, %8
// rewrite data
store128 %15, ^0, %7
// reset the Q mask
// IMPROVE ME
setmq %15, #0, #16
done128_x:
// decrement Y count
sub32v %3, %3, #1
// if 0, finished
brz32 done128, %3
// add strides to initial addresses
add32v %0, %0, %4
// branch back to loop128_y for the next line (brz32 on #0 is always taken)
brz32 loop128_y, #0
done128:
// NOTE(review): two fin instructions; presumably intentional padding at the end of the program - confirm
fin
fin
);
let mut pos;
pos = 0;
@@ -518,5 +662,14 @@ fn main() -> std::io::Result<()> {
println!("");
println!("-> {}", mcode_fillrop.len());
// Dump the copy microcode: a "copy:" header line, every word as a
// comma-terminated 0x-prefixed 8-digit hex value on a single line, then the
// number of words.
pos = 0;
println!("copy:");
while pos < mcode_copy.len() {
print!("0x{:08x},", mcode_copy[pos]);
pos = pos + 1;
}
println!("");
println!("-> {}", mcode_copy.len());
Ok(())
}