more EXA
This commit is contained in:
@@ -123,6 +123,7 @@ struct scrolltest {
|
||||
#define GOBLIN_SCROLL _IOW('X', 0, struct scrolltest)
|
||||
#define GOBLIN_FILL _IOW('X', 1, struct scrolltest)
|
||||
#define GOBLIN_FILLROP _IOW('X', 2, struct scrolltest)
|
||||
#define GOBLIN_COPY _IOW('X', 3, struct scrolltest)
|
||||
|
||||
static int goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *);
|
||||
static paddr_t goblin_mmap(void *, void *, off_t, int);
|
||||
@@ -144,6 +145,7 @@ static int power_off(struct goblin_softc *sc);
|
||||
static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n);
|
||||
static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n);
|
||||
static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop);
|
||||
static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
|
||||
static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9,
|
||||
0x00004005,0xfb000809,0x0000000a,0x0000000a };
|
||||
static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005,
|
||||
@@ -164,9 +166,11 @@ static const uint32_t program_fillrop[41] = { 0x13000089,0x128000c9,0x01bc0014
|
||||
0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,
|
||||
0x0000000a};
|
||||
|
||||
static const uint32_t* programs[6] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, NULL };
|
||||
static const uint32_t program_len[6] = { 12, 11, 14, 38, 41, 0 };
|
||||
static uint32_t program_offset[6];
|
||||
static const uint32_t program_copy[49] = { 0x17000089,0x168000c9,0x01bc0014,0x0b80000d,0x013f0014,0x003f0054,0x00380011,0x001400c0,0x00180000,0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00161146,0xfc000148,0x0016f007,0x00145c06,0x0014214f,0x00140150,0x00005005,0x00085086,0x0b800089,0x013f0814,0x00045045,0x003f0054,0x001af087,0x403c0012,0x00146086,0xa0a00013,0x02800149,0x001c0220,0x603c7013,0x00170146,0x20a08015,0xfd800148,0x0280018d,0x013c6814,0x001c0013,0x001c0220,0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t* programs[7] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, program_copy, NULL };
|
||||
static const uint32_t program_len[7] = { 12, 11, 14, 38, 41, 49, 0 };
|
||||
static uint32_t program_offset[7];
|
||||
|
||||
static void goblin_set_depth(struct goblin_softc *, int);
|
||||
|
||||
@@ -403,6 +407,12 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
|
||||
}
|
||||
break;
|
||||
|
||||
case GOBLIN_COPY: {
|
||||
struct scrolltest *st = (struct scrolltest *)data;
|
||||
jareth_copy(sc, jareth_verbose, st->y0, st->y1, st->x0, st->w, st->n, /* x1 */ st->pm, st->rop);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
return (ENOTTY);
|
||||
}
|
||||
@@ -881,6 +891,53 @@ static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) {
|
||||
const uint32_t base = 0;
|
||||
const int pidx = 5; // copy
|
||||
/* int i; */
|
||||
|
||||
/* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */
|
||||
|
||||
power_on(sc);
|
||||
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,0), (sc->sc_internal_adr + y1 * sc->sc_stride + x1));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,1), (sc->sc_internal_adr + y0 * sc->sc_stride + x0));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,0), (sc->sc_internal_adr + y0 * sc->sc_stride + x0));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,1), (sc->sc_internal_adr + y1 * sc->sc_stride + x1));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,0), (w));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,0), (n));
|
||||
/* for (i = 1 ; i < 8 ; i++) { */
|
||||
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,i), 0); */
|
||||
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), 0); */
|
||||
/* } */
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,0), (sc->sc_stride));
|
||||
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,1), (sc->sc_stride));
|
||||
jareth_mpstart_write(sc, program_offset[pidx]);
|
||||
jareth_mplen_write(sc, program_len[pidx]);
|
||||
|
||||
#if 0
|
||||
{
|
||||
uint32_t data[8];
|
||||
int i, j;
|
||||
char buf[512];
|
||||
for (i = 0 ; i < 16 ; i++) {
|
||||
for (j = 0 ; j < 8 ; j++)
|
||||
data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j));
|
||||
snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]);
|
||||
aprint_normal("reg%d : %s\n", i, buf);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
(void)start_job(sc, verbose);
|
||||
delay(1);
|
||||
(void)wait_job(sc, 1, verbose);
|
||||
|
||||
power_off(sc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
jareth_copyrows(void *cookie, int src, int dst, int n)
|
||||
{
|
||||
@@ -990,6 +1047,21 @@ static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosi
|
||||
if (verbose == jareth_verbose)
|
||||
aprint_normal_dev(sc->sc_dev, "WAIT - new max count %d with %d delay (param was %u)\n", max_cnt_seen, del, param);
|
||||
}
|
||||
|
||||
#if 0
|
||||
{
|
||||
const uint32_t base = 0;
|
||||
uint32_t data[8];
|
||||
int i, j;
|
||||
char buf[512];
|
||||
for (i = 0 ; i < 16 ; i++) {
|
||||
for (j = 0 ; j < 8 ; j++)
|
||||
data[j] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(i,j));
|
||||
snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", data[7-0], data[7-1], data[7-2], data[7-3], data[7-4], data[7-5], data[7-6], data[7-7]);
|
||||
aprint_normal("reg%d : %s\n", i, buf);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
//jareth_control_write(sc, 0);
|
||||
if (status & (1<<CSR_JARETH_STATUS_RUNNING_OFFSET)) {
|
||||
@@ -1007,6 +1079,7 @@ static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosi
|
||||
} else {
|
||||
//aprint_normal_dev(sc->sc_dev, "WAIT - Jareth status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, jareth_ls_status_read(sc));
|
||||
}
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -328,8 +328,7 @@ GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
|
||||
int i;
|
||||
|
||||
ENTER;
|
||||
DPRINTF(X_ERROR, "bits per pixel: %d\n",
|
||||
pPixmap->drawable.bitsPerPixel);
|
||||
DPRINTF(X_ERROR, "PrepareSolid bpp: %d, alu %d, pm 0x%08x, Fg 0x%08x\n", pPixmap->drawable.bitsPerPixel, alu, planemask, fg);
|
||||
|
||||
if ((pGoblin->jreg->power & 1) != 1)
|
||||
pGoblin->jreg->power = 1;
|
||||
@@ -347,17 +346,15 @@ GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
|
||||
pGoblin->last_rop = alu;
|
||||
|
||||
if ((alu == 0x3) && // GCcopy
|
||||
(planemask == 0xFFFFFFFF)) {
|
||||
(planemask == 0xFFFFFFFF)) { // full pattern
|
||||
// fill
|
||||
pGoblin->jreg->mpstart = 37;
|
||||
pGoblin->jreg->mpstart = 37; // FIXME
|
||||
pGoblin->jreg->mplen = 38;
|
||||
} else {
|
||||
// fillrop
|
||||
pGoblin->jreg->mpstart = 75;
|
||||
pGoblin->jreg->mpstart = 75; // FIXME
|
||||
pGoblin->jreg->mplen = 41;
|
||||
}
|
||||
|
||||
DPRINTF(X_ERROR, "%s: %x; %x\n", __func__, alu, planemask);
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@@ -423,6 +420,27 @@ GoblinPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
|
||||
pGoblin->srcoff = exaGetPixmapOffset(pSrcPixmap);
|
||||
pGoblin->xdir = xdir;
|
||||
pGoblin->ydir = ydir;
|
||||
|
||||
GoblinWait(pGoblin);
|
||||
|
||||
pGoblin->jregfile->reg[5][0] = planemask;
|
||||
pGoblin->jregfile->reg[5][1] = alu;
|
||||
|
||||
pGoblin->last_mask = planemask;
|
||||
pGoblin->last_rop = alu;
|
||||
|
||||
if ((alu == 0x3) && // GCcopy
|
||||
(planemask == 0xFFFFFFFF)) { // full pattern
|
||||
// plain copy (GXcopy, full planemask)
|
||||
pGoblin->jreg->mpstart = 116; // FIXME
|
||||
pGoblin->jreg->mplen = 49;
|
||||
} else {
|
||||
// copy with rop and/or planemask (FIXME: currently starts the same program as the plain copy)
|
||||
pGoblin->jreg->mpstart = 116; // FIXME FIXME FIXME
|
||||
pGoblin->jreg->mplen = 49;
|
||||
}
|
||||
|
||||
DPRINTF(X_ERROR, "PrepareCopy: alu %d, pm 0x%08\n", alu, planemask);
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
@@ -439,50 +457,60 @@ GoblinCopy(PixmapPtr pDstPixmap,
|
||||
char *src, *dst;
|
||||
int i, j;
|
||||
ENTER;
|
||||
|
||||
DPRINTF(X_ERROR, "Copy %d %d -> %d %d [%d x %d]\n", srcX, srcY, dstX, dstY, w, h);
|
||||
|
||||
srcstart = (srcX << 2) + (pGoblin->srcpitch * srcY) + pGoblin->srcoff;
|
||||
dststart = (dstX << 2) + ( dstpitch * dstY) + dstoff;
|
||||
#if 1
|
||||
src = (char*)0x8f000000 + srcstart; // fixme
|
||||
dst = (char*)0x8f000000 + dststart;
|
||||
|
||||
if (pGoblin->ydir < 0) {
|
||||
src += pGoblin->srcpitch * (h-1);
|
||||
dst += dstpitch * (h-1);
|
||||
pGoblin->srcpitch = -pGoblin->srcpitch;
|
||||
dstpitch = -dstpitch;
|
||||
}
|
||||
|
||||
// FIXME: xdir < 0
|
||||
|
||||
// 32 bits
|
||||
w = w*4;
|
||||
|
||||
GoblinWait(pGoblin);
|
||||
|
||||
pGoblin->jregfile->reg[0][0] = (uint32_t)dst;
|
||||
pGoblin->jregfile->reg[0][1] = (uint32_t)src;
|
||||
pGoblin->jregfile->reg[1][0] = (uint32_t)src;
|
||||
pGoblin->jregfile->reg[1][1] = (uint32_t)dst;
|
||||
pGoblin->jregfile->reg[2][0] = w;
|
||||
pGoblin->jregfile->reg[3][0] = h;
|
||||
pGoblin->jregfile->reg[4][0] = dstpitch;
|
||||
pGoblin->jregfile->reg[4][1] = pGoblin->srcpitch;
|
||||
|
||||
DPRINTF(X_ERROR, "Copy %d %d -> %d %d [%d x %d, %d %d] ; %d -> %d \n", srcX, srcY, dstX, dstY, w, h, pGoblin->xdir, pGoblin->ydir, srcstart, dststart);
|
||||
|
||||
pGoblin->jreg->control = 1; // start
|
||||
|
||||
exaMarkSync(pDstPixmap->drawable.pScreen);
|
||||
|
||||
#else
|
||||
src = pGoblin->fb + srcstart;
|
||||
dst = pGoblin->fb + dststart;
|
||||
|
||||
if (ydir > 0 && xdir > 0) {
|
||||
if (pGoblin->ydir > 0) {
|
||||
for (j = 0 ; j < h ; j++) {
|
||||
for (i = 0 ; i < w; i ++) {
|
||||
*(src+i) = *(dst+i);
|
||||
}
|
||||
src += srcpitch;
|
||||
memcpy(dst, src, w*4);
|
||||
src += pGoblin->srcpitch;
|
||||
dst += dstpitch;
|
||||
}
|
||||
} else if (ydir > 0 && xdir < 0) {
|
||||
for (j = 0 ; j < h ; j++) {
|
||||
for (i = w - 1 ; i >= 0 ; i --) {
|
||||
*(src+i) = *(dst+i);
|
||||
}
|
||||
src += srcpitch;
|
||||
dst += dstpitch;
|
||||
}
|
||||
} else if (ydir < 0 && xdir > 0) {
|
||||
src += srcpitch * h;
|
||||
} else if (pGoblin->ydir < 0 ) {
|
||||
src += pGoblin->srcpitch * h;
|
||||
dst += dstpitch * h;
|
||||
for (j = 0 ; j < h ; j++) {
|
||||
src -= srcpitch;
|
||||
src -= pGoblin->srcpitch;
|
||||
dst -= dstpitch;
|
||||
for (i = 0 ; i < w; i ++) {
|
||||
*(src+i) = *(dst+i);
|
||||
}
|
||||
memcpy(dst, src, w*4);
|
||||
}
|
||||
} else if (ydir < 0 && xdir < 0) {
|
||||
src += srcpitch * h;
|
||||
dst += dstpitch * h;
|
||||
for (j = 0 ; j < h ; j++) {
|
||||
src -= srcpitch;
|
||||
dst -= dstpitch;
|
||||
for (i = w - 1 ; i >= 0 ; i --) {
|
||||
*(src+i) = *(dst+i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@ opcodes = { # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shif
|
||||
"BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"],
|
||||
"BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"],
|
||||
"MIN32V" : [15, "Wd[x..x+32] $\gets$ umin(Ra[x..x+32], Rb[x..x+32]) // vector 32-bit umin"],
|
||||
"BCAST32" : [16, "Wd[x..x+32] $\gets$ Ra[0..32]"],
|
||||
# for MEM, bit #31 (imm[8]) indicates both lanes are needed; imm[31] == 0 faster as the second access is not done ;
|
||||
"GETM": [17, "GETM: getmask" ],
|
||||
"ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
|
||||
@@ -328,12 +329,12 @@ passthrough.
|
||||
self.q.eq(Cat(0, self.a[:255])),
|
||||
).Elif(self.instruction.opcode == opcodes["AND"][0],
|
||||
self.q.eq(self.a & self.b),
|
||||
),
|
||||
)
|
||||
]
|
||||
|
||||
class ExecAddSub(ExecUnit, AutoDoc):
|
||||
def __init__(self, width=256):
|
||||
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V" ])
|
||||
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V", "BCAST32" ])
|
||||
self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f"""
|
||||
""")
|
||||
|
||||
@@ -346,8 +347,12 @@ class ExecAddSub(ExecUnit, AutoDoc):
|
||||
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] + self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
|
||||
).Elif(self.instruction.opcode == opcodes["SUB32V"][0],
|
||||
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] - self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
|
||||
).Elif(self.instruction.opcode == opcodes["BCAST32"][0],
|
||||
[ self.q[x*32:(x+1)*32].eq(self.a[0:32]) for x in range(0, width//32) ],
|
||||
).Elif(self.instruction.opcode == opcodes["MIN32V"][0],
|
||||
[ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]), self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32]), self.q.eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ],
|
||||
[ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]),
|
||||
self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32])
|
||||
).Else(self.q[x*32:(x+1)*32].eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ],
|
||||
)
|
||||
]
|
||||
|
||||
@@ -530,7 +535,6 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
)
|
||||
),
|
||||
).Elif(self.instruction.opcode == opcodes["LOADH"][0],
|
||||
NextValue(cpar, 0),
|
||||
NextValue(self.has_timeout, 0),
|
||||
NextValue(self.has_failure, 0),
|
||||
NextValue(timeout, 2047),
|
||||
|
||||
@@ -213,7 +213,7 @@ fn main() -> std::io::Result<()> {
|
||||
);
|
||||
|
||||
let mcode_fill = assemble_jareth!(
|
||||
// x..x / $DST in %0, 128 bits
|
||||
// x..x / $DST in %0
|
||||
// 128-bit pattern in %1 [assumed to be alignment-homogeneous]
|
||||
// x..x / X size in %2
|
||||
// x..x / Y size in %3,
|
||||
@@ -330,8 +330,8 @@ fn main() -> std::io::Result<()> {
|
||||
);
|
||||
|
||||
let mcode_fillrop = assemble_jareth!(
|
||||
// x..x / $DST in %0, 128 bits
|
||||
// 128-bits pattern in %1 [assumed to be alignement-homogneous]
|
||||
// x..x / $DST in %0
|
||||
// 128-bit pattern in %1 [assumed to be alignment-homogeneous]
|
||||
// x..x / X size in %2
|
||||
// x..x / Y size in %3,
|
||||
// x..x / dst_stride in %4 (screen width?)
|
||||
@@ -453,6 +453,150 @@ fn main() -> std::io::Result<()> {
|
||||
fin
|
||||
);
|
||||
|
||||
|
||||
|
||||
let mcode_copy = assemble_jareth!(
|
||||
// x..x / $SRC / $DST in %0
|
||||
// x..x / $DST / $SRC in %1
|
||||
// x..x / X size in %2
|
||||
// x..x / Y size in %3,
|
||||
// x..x src_stride / dst_stride in %4 (screen width?)
|
||||
// -----
|
||||
// main loop:
|
||||
// live X count in %9
|
||||
// leftover X in %6
|
||||
// // live Y count in %3
|
||||
// data in %7
|
||||
// masked data in %7
|
||||
// 0/scrap in %15
|
||||
// -----
|
||||
// header loop:
|
||||
// live Y count in %9
|
||||
// $SRC / $DST in %6
|
||||
// dst data in %7
|
||||
// src data in %8
|
||||
// 0/scrap in %15
|
||||
|
||||
|
||||
start:
|
||||
// if the number of lines or the number of elements per line is 0, exit early
|
||||
brz32 done128, %2
|
||||
brz32 done128, %3
|
||||
// reset masks
|
||||
resm %15
|
||||
// if $DST is aligned on 128 bits, jump to aligned loop
|
||||
brz4 start128, %0
|
||||
|
||||
// do the first column to align $DST
|
||||
startX:
|
||||
// set alignment; we shift by the address offset
|
||||
//and %14, %2, #15
|
||||
setmq %15, %0, #16
|
||||
setma %15, %1, #16
|
||||
getm %14
|
||||
// copy Y
|
||||
psa %9, %3
|
||||
// copy $SRC / $DST
|
||||
psa %6, %0
|
||||
loopX_y:
|
||||
// setadr
|
||||
setadr %15, %6
|
||||
// load src
|
||||
load256 %8, ^1
|
||||
// load old data
|
||||
load128 %7, ^0
|
||||
// insert data
|
||||
psa* %7, %8
|
||||
// rewrite data
|
||||
store128 %15, ^0, %7
|
||||
// increment copied $SRC / $DST by stride
|
||||
add32v %6, %6, %4
|
||||
// decrement copied Y count
|
||||
sub32v %9, %9, #1
|
||||
// if not zero, continue
|
||||
brnz32 loopX_y, %9
|
||||
|
||||
loopX_done:
|
||||
// how much did we do (#15 is 15, #16 is 16)
|
||||
and %9, %0, #15
|
||||
// compute 16-(x&15)
|
||||
sub32v %9, #16, %9
|
||||
// compute the proper value
|
||||
min32v %9, %9, %2
|
||||
// more than one address to increment
|
||||
bcast32 %9, %9
|
||||
// add the count to the addresses, ^0 will now be aligned
|
||||
add32v %0, %0, %9
|
||||
// remove from X, as we have done it
|
||||
sub32v %2, %2, %9
|
||||
// fall through to the aligned loop if not 0
|
||||
brz32 done128, %2
|
||||
// reset q mask (we will be aligned from now on)
|
||||
setmq %15, #0, #16
|
||||
// add the count to the addresses, ^1 will have the proper shift for masking
|
||||
add32v %1, %1, %9
|
||||
// reset a mask to the proper shifting
|
||||
setma %15, %1, #16
|
||||
|
||||
start128:
|
||||
// compute X leftovers (modulo 16 -> #15 is 15)
|
||||
and %6, %2, #15
|
||||
|
||||
loop128_y:
|
||||
// set source and destination addresses for current Y
|
||||
setadr %15, %0
|
||||
// then the rounded value in X
|
||||
sub32v %9, %2, %6
|
||||
// prefetch data
|
||||
load256inc %8, ^1
|
||||
// already 0, bypass aligned stuff
|
||||
brz32 loop128_x_end, %9
|
||||
|
||||
loop128_x:
|
||||
// merge data from input
|
||||
psa* %7, %8
|
||||
// store to DST w/ post-increment
|
||||
store128inc %15, ^0, %7
|
||||
// sub 16 (#16 is 16) from live rounded X count
|
||||
sub32v %9, %9, #16
|
||||
// prefetch data
|
||||
loadh128inc %8, ^1, %8
|
||||
// if X count is not 0, keep looping
|
||||
brnz32 loop128_x, %9
|
||||
// check for line leftovers
|
||||
loop128_x_end:
|
||||
brz4 done128_x, %6
|
||||
|
||||
// set the leftovers mask (offset is 0 as we are aligned)
|
||||
// IMPROVE ME
|
||||
setmq %15, #0, %6
|
||||
// load old data
|
||||
load128 %7, ^0
|
||||
// insert pattern
|
||||
psa* %7, %8
|
||||
// rewrite data
|
||||
store128 %15, ^0, %7
|
||||
// reset the Q mask
|
||||
// IMPROVE ME
|
||||
setmq %15, #0, #16
|
||||
|
||||
done128_x:
|
||||
// decrement Y count
|
||||
sub32v %3, %3, #1
|
||||
// if 0, finished
|
||||
brz32 done128, %3
|
||||
|
||||
// add strides to initial addresses
|
||||
add32v %0, %0, %4
|
||||
// loop128 to do next line
|
||||
brz32 loop128_y, #0
|
||||
|
||||
done128:
|
||||
fin
|
||||
fin
|
||||
);
|
||||
|
||||
|
||||
let mut pos;
|
||||
|
||||
pos = 0;
|
||||
@@ -518,5 +662,14 @@ fn main() -> std::io::Result<()> {
|
||||
println!("");
|
||||
println!("-> {}", mcode_fillrop.len());
|
||||
|
||||
pos = 0;
|
||||
println!("copy:");
|
||||
while pos < mcode_copy.len() {
|
||||
print!("0x{:08x},", mcode_copy[pos]);
|
||||
pos = pos + 1;
|
||||
}
|
||||
println!("");
|
||||
println!("-> {}", mcode_copy.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user