1
0
mirror of synced 2026-01-11 23:42:59 +00:00

more jareth-with-goblin

This commit is contained in:
Romain Dolbeau 2022-03-13 14:44:42 +01:00
parent 936736d35a
commit be6101d39d
9 changed files with 1009 additions and 100 deletions

View File

@ -117,9 +117,12 @@ struct scrolltest {
int x0;
int w;
int n;
int pm;
int rop;
};
#define GOBLIN_SCROLL _IOW('X', 0, struct scrolltest)
#define GOBLIN_FILL _IOW('X', 1, struct scrolltest)
#define GOBLIN_FILLROP _IOW('X', 2, struct scrolltest)
static int goblin_ioctl(void *, void *, u_long, void *, int, struct lwp *);
static paddr_t goblin_mmap(void *, void *, off_t, int);
@ -140,12 +143,30 @@ static int power_on(struct goblin_softc *sc);
static int power_off(struct goblin_softc *sc);
static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n);
static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n);
static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9,0x00004005,0xfb000809,0x0000000a,0x0000000a };
static const uint32_t program_fill128[12] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005,0xfb800809,0x0000000a,0x0000000a,0x0000000a };
static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop);
static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9,
0x00004005,0xfb000809,0x0000000a,0x0000000a };
static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005,
0xfb800809,0x0000000a,0x0000000a };
static const uint32_t program_fill256[14] = { 0x01bc0014,0x001a6087,0x013c6814,0x403c0012,0x00146086,0xe03c1013,0x00165146,0xfe800148,
0x000e10c6,0x010000c9,0x00004005,0xfb800809,0x0000000a,0x0000000a };
static const uint32_t* programs[3] = { program_scroll128, program_fill128, NULL };
static const uint32_t program_len[3] = { 12, 12, 0 };
static uint32_t program_offset[3];
static const uint32_t program_fill[38] = { 0x11800089,0x110000c9,0x01bc0014,0x0800000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,
0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00166007,0x00145946,
0x0014214f,0x00005005,0x00085086,0x08000089,0x001a6087,0x013c6814,0x403c0012,0x00146086,
0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060,0xc03c7013,
0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a };
static const uint32_t program_fillrop[41] = { 0x13000089,0x128000c9,0x01bc0014,0x003c014c,0x0800000d,0x013c2014,0x002000c0,0x00180000,
0x403c0192,0x801c0013,0x001c11e2,0xc03c7013,0x00184185,0x00221206,0xfc800208,0x00226007,
0x00208946,0x0020220f,0x00008005,0x00088086,0x09000089,0x001a6087,0x013c6814,0x403c0012,
0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,0x0180018d,
0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,
0x0000000a};
static const uint32_t* programs[6] = { program_scroll128, program_fill128, program_fill256, program_fill, program_fillrop, NULL };
static const uint32_t program_len[6] = { 12, 11, 14, 38, 41, 0 };
static uint32_t program_offset[6];
static void goblin_set_depth(struct goblin_softc *, int);
@ -376,6 +397,12 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
}
break;
case GOBLIN_FILLROP: {
struct scrolltest *st = (struct scrolltest *)data;
jareth_fillrop(sc, jareth_verbose, st->y0, st->y1, st->x0, st->w, st->n, st->pm, st->rop);
}
break;
default:
return (ENOTTY);
}
@ -447,11 +474,17 @@ goblinloadcmap(struct goblin_softc *sc, int start, int ncolors)
* offset, allowing for the given protection, or return -1 for error.
* 'inspired' by the cg6 code
*/
#define GOBLIN_USER_FBC 0x70000000
#define GOBLIN_USER_RAM 0x70016000
#define GOBLIN_USER_FBC 0x70000000
#define JARETH_USER_REG 0x70001000
#define JARETH_USER_MICROCODE 0x70002000
#define JARETH_USER_REGFILE 0x70003000
#define GOBLIN_USER_RAM 0x70016000
typedef enum {
goblin_bank_fbc,
goblin_bank_fb
goblin_bank_fb,
jareth_bank_reg,
jareth_bank_microcode,
jareth_bank_regfile
} gobo_reg_bank;
struct mmo {
u_long mo_uaddr; /* user (virtual) address */
@ -466,8 +499,11 @@ goblinmmap(dev_t dev, off_t off, int prot)
struct mmo *mo;
u_int u, sz, flags;
static struct mmo mmo[] = {
{ GOBLIN_USER_RAM, 0, goblin_bank_fb },
{ GOBLIN_USER_FBC, 1, goblin_bank_fbc },
{ GOBLIN_USER_RAM, 0, goblin_bank_fb },
{ GOBLIN_USER_FBC, 1, goblin_bank_fbc },
{ JARETH_USER_REG, 1, jareth_bank_reg },
{ JARETH_USER_MICROCODE, 4096, jareth_bank_microcode },
{ JARETH_USER_REGFILE, 1024, jareth_bank_regfile },
};
/* device_printf(sc->sc_dev, "requiesting %llx with %d\n", off, prot); */
@ -506,6 +542,18 @@ goblinmmap(dev_t dev, off_t off, int prot)
return (bus_space_mmap(sc->sc_bustag,
sc->sc_reg_fbc_paddr, u,
prot, flags));
case jareth_bank_reg:
return (bus_space_mmap(sc->sc_bustag,
sc->sc_jareth_reg_paddr, u,
prot, flags));
case jareth_bank_microcode:
return (bus_space_mmap(sc->sc_bustag,
sc->sc_jareth_microcode_paddr, u,
prot, flags));
case jareth_bank_regfile:
return (bus_space_mmap(sc->sc_bustag,
sc->sc_jareth_regfile_paddr, u,
prot, flags));
}
}
}
@ -773,7 +821,7 @@ static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose,
static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n) {
const uint32_t base = 0;
const int pidx = 1;
const int pidx = 3; // fill
int i;
power_on(sc);
@ -801,6 +849,38 @@ static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, i
return 0;
}
/*
 * Run the "fillrop" microcode program (index 4 in programs[]) on Jareth.
 *
 * The parameters are loaded into the Jareth register file as follows:
 *   reg 0, word 0     : sc_internal_adr + y0 * sc_stride + x0
 *   reg 1, words 0..7 : 'pat', replicated in all eight words
 *   reg 2, word 0     : w
 *   reg 3, word 0     : n
 *   reg 4, word 0     : sc_stride
 *   reg 5, word 0     : pm  (presumably the planemask -- confirm against the microcode)
 *   reg 5, word 1     : rop (presumably the raster-op code -- confirm against the microcode)
 *
 * The engine is powered on before the register writes and powered off once
 * wait_job() has returned.  The results of start_job() and wait_job() are
 * deliberately discarded, so this function always returns 0.
 */
static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop) {
/* NOTE(review): 'base' is not referenced by name below; presumably SUBREG_ADDR() picks it up -- confirm */
const uint32_t base = 0;
const int pidx = 4; // fillrop
int i;
power_on(sc);
/* start address of the area, as seen by the engine */
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,0), (sc->sc_internal_adr + y0 * sc->sc_stride + x0));
/* same pattern value in every 32-bit word of register 1 */
for (i = 0 ; i < 8 ; i++) {
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,i), pat);
}
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,0), (w));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,0), (n));
/* for (i = 1 ; i < 8 ; i++) { */
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(2,i), 0); */
/* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), 0); */
/* } */
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,0), (sc->sc_stride));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(5,0), (pm));
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(5,1), (rop));
/* select the program: its start offset and its length in microcode words */
jareth_mpstart_write(sc, program_offset[pidx]);
jareth_mplen_write(sc, program_len[pidx]);
(void)start_job(sc, verbose);
delay(1);
(void)wait_job(sc, 1, verbose);
power_off(sc);
return 0;
}
static void
jareth_copyrows(void *cookie, int src, int dst, int n)
{
@ -886,7 +966,7 @@ static int start_job(struct goblin_softc *sc, enum jareth_verbosity verbose) {
static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosity verbose) {
uint32_t status = jareth_status_read(sc);
int count = 0;
int max_count = 3000;
int max_count = 5000;
int del = 1;
const int max_del = 64;
static int max_del_seen = 1;

View File

@ -157,6 +157,7 @@ goblinattach_sbus(device_t parent, device_t self, void *args)
sc->sc_has_jareth = prom_getpropint(node, "goblin-has-jareth", 0);
sc->sc_internal_adr = prom_getpropint(node, "goblin-internal-fb", 0x8f000000);
aprint_normal_dev(self, "Goblin framebuffer internally @ %p\n", (void*)sc->sc_internal_adr);
if (sc->sc_has_jareth) {
if (sa->sa_nreg < 5) {
@ -173,6 +174,7 @@ goblinattach_sbus(device_t parent, device_t self, void *args)
aprint_error(": cannot map Jareth registers\n");
sc->sc_has_jareth = 0;
} else {
sc->sc_jareth_reg_paddr = sbus_bus_addr(sa->sa_bustag, sa->sa_reg[2].oa_space, sa->sa_reg[2].oa_base);
aprint_normal_dev(self, "Jareth registers @ %p\n", (void*)sc->sc_bhregs_jareth);
/* map microcode */
if (sbus_bus_map(sc->sc_bustag,
@ -184,6 +186,7 @@ goblinattach_sbus(device_t parent, device_t self, void *args)
aprint_error(": cannot map Jareth microcode\n");
sc->sc_has_jareth = 0;
} else {
sc->sc_jareth_microcode_paddr = sbus_bus_addr(sa->sa_bustag, sa->sa_reg[3].oa_space, sa->sa_reg[3].oa_base);
aprint_normal_dev(self, "Jareth microcode @ %p\n", (void*)sc->sc_bhregs_microcode);
/* map register file */
if (sbus_bus_map(sc->sc_bustag,
@ -195,6 +198,7 @@ goblinattach_sbus(device_t parent, device_t self, void *args)
aprint_error(": cannot map Jareth regfile\n");
sc->sc_has_jareth = 0;
} else {
sc->sc_jareth_regfile_paddr = sbus_bus_addr(sa->sa_bustag, sa->sa_reg[4].oa_space, sa->sa_reg[4].oa_base);
aprint_normal_dev(self, "Jareth regfile @ %p\n", (void*)sc->sc_bhregs_regfile);
}
}

View File

@ -50,6 +50,9 @@ struct goblin_softc {
bus_space_tag_t sc_bustag;
bus_addr_t sc_reg_fbc_paddr; /* phys address for device mmap() */
bus_addr_t sc_fb_paddr; /* phys address for device mmap() */
bus_addr_t sc_jareth_reg_paddr; /* phys address for device mmap() */
bus_addr_t sc_jareth_microcode_paddr; /* phys address for device mmap() */
bus_addr_t sc_jareth_regfile_paddr; /* phys address for device mmap() */
uint32_t sc_size; /* full memory size */
int sc_opens; /* number of open() to track 8/24 bits */
int sc_has_jareth; /* whether we have a Jareth vector engine available */

View File

@ -36,8 +36,11 @@
#include "exa.h"
/* Various offsets in virtual (ie. mmap()) spaces Linux and Solaris support. */
#define GOBLIN_FBC_VOFF 0x70000000
#define GOBLIN_RAM_VOFF 0x70016000
#define GOBLIN_FBC_VOFF 0x70000000
#define JARETH_REG_VOFF 0x70001000
#define JARETH_MICROCODE_VOFF 0x70002000
#define JARETH_REGFILE_VOFF 0x70003000
#define GOBLIN_RAM_VOFF 0x70016000
typedef struct {
unsigned int fg, bg; /* FG/BG colors for stipple */
@ -54,20 +57,25 @@ typedef struct {
typedef struct {
unsigned char *fb;
GoblinFbcPtr fbc;
int vclipmax;
JarethRegPtr jreg;
JarethMicrocodePtr jmicrocode;
JarethRegfilePtr jregfile;
int width;
int height;
int maxheight;
int vidmem;
sbusDevicePtr psdp;
Bool NoAccel;
CloseScreenProcPtr CloseScreen;
OptionInfoPtr Options;
Bool has_accel;
int clipxa, clipxe;
ExaDriverPtr pExa;
int srcoff, fg;
uint32_t last_mask;
uint32_t last_rop;
uint32_t fg;
int xdir, ydir;
uint32_t srcoff, srcpitch;
} GoblinRec, *GoblinPtr;
extern int GoblinScreenPrivateIndex;

View File

@ -31,14 +31,34 @@
#include "goblin_regs.h"
#include "dgaproc.h"
#include <unistd.h>
/* DGA stuff */
#define DEBUG_GOBLIN 1
#ifdef DEBUG_GOBLIN
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif
static Bool Goblin_OpenFramebuffer(ScrnInfoPtr pScrn, char **, unsigned char **mem,
int *, int *, int *);
static Bool Goblin_SetMode(ScrnInfoPtr, DGAModePtr);
static void Goblin_SetViewport(ScrnInfoPtr, int, int, int);
static int Goblin_GetViewport(ScrnInfoPtr);
static void GoblinWaitMarker(ScreenPtr pScreen, int Marker);
static Bool GoblinUploadToScreen(PixmapPtr pDst, int x, int y, int w, int h, char *src, int src_pitch);
static Bool GoblinDownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h, char *dst, int dst_pitch);
static Bool GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg);
static void GoblinSolid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2);
static void GoblinDone(PixmapPtr pDstPixmap);
static Bool GoblinPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap, int xdir, int ydir, int alu, Pixel planemask);
static void GoblinCopy(PixmapPtr pDstPixmap, int srcX, int srcY, int dstX, int dstY, int w, int h);
static void GoblinSync(ScrnInfoPtr);
static DGAFunctionRec Goblin_DGAFuncs = {
@ -153,3 +173,316 @@ Goblin_GetViewport(ScrnInfoPtr pScrn)
/* No viewports, none pending... */
return 0;
}
/*
 * Allocate an EXA driver record, describe the Goblin framebuffer and the
 * Jareth-backed hooks in it, and register it with EXA.
 *
 * Returns FALSE when the record cannot be allocated, otherwise whatever
 * exaDriverInit() returns.
 */
int
GOBLINEXAInit(ScreenPtr pScreen)
{
    ScrnInfoPtr scrn = xf86Screens[pScreen->myNum];
    GoblinPtr goblin = GET_GOBLIN_FROM_SCRN(scrn);
    ExaDriverPtr exa = exaDriverAlloc();

    if (exa == NULL) {
        return FALSE;
    }
    goblin->pExa = exa;

    exa->exa_major = EXA_VERSION_MAJOR;
    exa->exa_minor = EXA_VERSION_MINOR;

    /* memory layout: visible screen first (4 bytes per pixel), then off-screen */
    exa->memoryBase = goblin->fb;
    exa->memorySize = goblin->vidmem - 32;
    exa->offScreenBase = goblin->width * goblin->height * 4;

    /* Jareth accesses memory 128 bits at a time, hence 16-byte alignment */
    exa->pixmapOffsetAlign = 16;
    exa->pixmapPitchAlign = 16;

    /* EXA_MIXED_PIXMAPS and EXA_SUPPORTS_OFFSCREEN_OVERLAPS are not enabled */
    exa->flags = EXA_OFFSCREEN_PIXMAPS;

    /*
     * Jareth does not work with coordinates, so it has no real limit;
     * EXA still wants one, so these values are arbitrary.
     */
    exa->maxX = 4096;
    exa->maxY = 4096;

    /* acceleration hooks */
    exa->WaitMarker = GoblinWaitMarker;

    exa->PrepareSolid = GoblinPrepareSolid;
    exa->Solid = GoblinSolid;
    exa->DoneSolid = GoblinDone;

    exa->PrepareCopy = GoblinPrepareCopy;
    exa->Copy = GoblinCopy;
    exa->DoneCopy = GoblinDone;

    exa->UploadToScreen = GoblinUploadToScreen;
    exa->DownloadFromScreen = GoblinDownloadFromScreen;

    return exaDriverInit(pScreen, exa);
}
/*
 * Poll the Jareth status register until bit 0 clears (bit 0 is presumably
 * the "busy" flag -- the timeout message below talks about waiting for idle).
 *
 * The sleep between two polls starts at 1 microsecond and doubles after each
 * poll while it is below 32; at most 1000 polls are made.  When the bit is
 * still set afterwards an error is logged and the function returns anyway.
 */
static inline void
GoblinWait(GoblinPtr pGoblin)
{
    const int max_count = 1000;
    const int max_del = 32;
    const int param = 1;
    uint32_t status = pGoblin->jreg->status;
    int count = 0;
    int del = 1;

    ENTER;

    while ((status & 1) && (count < max_count)) {
        count++;
        usleep(del * param);
        del = del < max_del ? 2 * del : del;
        status = pGoblin->jreg->status;
    }

    if (status & 1) {
        /* exactly one conversion for the single argument passed */
        xf86Msg(X_ERROR, "Jareth wait for idle timed out %08x\n", status);
    }
}
/*
 * EXA WaitMarker hook: the marker value is ignored, we simply wait for the
 * Jareth engine of this screen with GoblinWait().
 */
static void
GoblinWaitMarker(ScreenPtr pScreen, int Marker)
{
    ScrnInfoPtr scrn = xf86Screens[pScreen->myNum];

    GoblinWait(GET_GOBLIN_FROM_SCRN(scrn));
}
/*
 * UploadToScreen hook, done with one memcpy() per row.
 *
 * Copies a w x h pixel rectangle from 'src' (rows 'src_pitch' bytes apart)
 * to position (x, y) of pDst, after waiting for Jareth.  Always returns TRUE.
 */
static Bool
GoblinUploadToScreen(PixmapPtr pDst, int x, int y, int w, int h, char *src, int src_pitch)
{
    ScrnInfoPtr scrn = xf86Screens[pDst->drawable.pScreen->myNum];
    GoblinPtr goblin = GET_GOBLIN_FROM_SCRN(scrn);
    char *row = goblin->fb + exaGetPixmapOffset(pDst);
    int row_pitch = exaGetPixmapPitch(pDst);
    int bits = pDst->drawable.bitsPerPixel;
    int bytes_per_pixel = (bits + 7) >> 3;
    int row_bytes = w * bytes_per_pixel;

    ENTER;
    DPRINTF(X_ERROR, "%s depth %d\n", __func__, bits);

    /* first destination byte of the rectangle */
    row += (x * bytes_per_pixel) + (y * row_pitch);

    GoblinWait(goblin);

    for (; h != 0; h--) {
        memcpy(row, src, row_bytes);
        src += src_pitch;
        row += row_pitch;
    }

    /* SPARC store barrier after the framebuffer writes */
    __asm("stbar;");

    return TRUE;
}
/*
 * DownloadFromScreen hook, done with one memcpy() per row.
 *
 * Copies a w x h pixel rectangle starting at (x, y) of pSrc into 'dst'
 * (rows 'dst_pitch' bytes apart), after waiting for Jareth.
 * Always returns TRUE.
 */
static Bool
GoblinDownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h, char *dst, int dst_pitch)
{
    ScrnInfoPtr scrn = xf86Screens[pSrc->drawable.pScreen->myNum];
    GoblinPtr goblin = GET_GOBLIN_FROM_SCRN(scrn);
    char *row = goblin->fb + exaGetPixmapOffset(pSrc);
    int row_pitch = exaGetPixmapPitch(pSrc);

    ENTER;

    int bits = pSrc->drawable.bitsPerPixel;
    int bytes_per_pixel = (bits + 7) >> 3;
    int row_bytes = w * bytes_per_pixel;

    /* first source byte of the rectangle */
    row += (x * bytes_per_pixel) + (y * row_pitch);

    GoblinWait(goblin);

    for (; h != 0; h--) {
        memcpy(dst, row, row_bytes);
        row += row_pitch;
        dst += dst_pitch;
    }

    return TRUE;
}
/*
 * EXA PrepareSolid hook: load the state shared by the following
 * GoblinSolid() calls into Jareth.
 *
 * Powers the engine on if bit 0 of the power register is clear, waits for
 * it, then writes the colour to all eight words of register 1 and the
 * planemask / alu to words 0 / 1 of register 5.  Finally selects the
 * microcode program: the plain "fill" when alu is 0x3 with an all-ones
 * planemask, "fillrop" otherwise.  Always returns TRUE.
 */
static Bool
GoblinPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planemask, Pixel fg)
{
ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
GoblinPtr pGoblin = GET_GOBLIN_FROM_SCRN(pScrn);
int i;
ENTER;
DPRINTF(X_ERROR, "bits per pixel: %d\n",
pPixmap->drawable.bitsPerPixel);
if ((pGoblin->jreg->power & 1) != 1)
pGoblin->jreg->power = 1;
GoblinWait(pGoblin);
pGoblin->fg = fg;
for (i = 0 ; i < 8; i++)
pGoblin->jregfile->reg[1][i] = fg;
pGoblin->jregfile->reg[5][0] = planemask;
pGoblin->jregfile->reg[5][1] = alu;
/* remembered so that GoblinSolid() can recognise a no-op rop */
pGoblin->last_mask = planemask;
pGoblin->last_rop = alu;
/*
 * NOTE(review): 37/38 and 75/41 are hard-coded start offsets and lengths;
 * they presumably have to match where the kernel driver loaded the "fill"
 * and "fillrop" programs in microcode memory -- confirm when programs change.
 */
if ((alu == 0x3) && // GCcopy
(planemask == 0xFFFFFFFF)) {
// fill
pGoblin->jreg->mpstart = 37;
pGoblin->jreg->mplen = 38;
} else {
// fillrop
pGoblin->jreg->mpstart = 75;
pGoblin->jreg->mplen = 41;
}
DPRINTF(X_ERROR, "%s: %x; %x\n", __func__, alu, planemask);
return TRUE;
}
/*
 * EXA Solid hook: fill the rectangle [x1, x2) x [y1, y2) of pPixmap using
 * the program and colour selected by GoblinPrepareSolid().
 *
 * The rectangle is handed to Jareth as a start address (register 0), a
 * width in bytes (register 2), a height in lines (register 3) and the
 * pixmap pitch (register 4); the job is then started and EXA is told a
 * sync is pending.
 *
 * Only 8 and 32 bits per pixel are handled.  Any other depth is ignored:
 * there is no byte offset that could be computed for it, and starting the
 * engine on an indeterminate address must be avoided.
 */
static void
GoblinSolid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
{
    ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
    GoblinPtr pGoblin = GET_GOBLIN_FROM_SCRN(pScrn);
    int w = x2 - x1, h = y2 - y1, dstoff, dstpitch;
    int start, depth;
    uint32_t ptr;

    ENTER;

    if (pGoblin->last_rop == 5) // GXnoop
        return;

    dstpitch = exaGetPixmapPitch(pPixmap);
    dstoff = exaGetPixmapOffset(pPixmap);
    depth = pPixmap->drawable.bitsPerPixel;

    switch (depth) {
    case 32:
        start = dstoff + (y1 * dstpitch) + (x1 << 2);
        /* we work in bytes not pixels */
        w = w * 4;
        break;
    case 8:
        start = dstoff + (y1 * dstpitch) + x1;
        break;
    default:
        /* 'start' cannot be computed: do not touch the engine */
        DPRINTF(X_ERROR, "%s: unsupported depth %d\n", __func__, depth);
        return;
    }

    ptr = 0x8f000000; // fixme
    ptr += start;

    GoblinWait(pGoblin);

    pGoblin->jregfile->reg[0][0] = ptr;
    pGoblin->jregfile->reg[2][0] = w;
    pGoblin->jregfile->reg[3][0] = h;
    pGoblin->jregfile->reg[4][0] = dstpitch;

    /* 'start' and 'ptr' are integers, so print them as such (not with %p) */
    DPRINTF(X_ERROR, "Solid %d %d %d %d [%d %d], %d %d -> %d (0x%x: 0x%08x)\n", x1, y1, x2, y2,
            w, h, dstpitch, dstoff, start, (unsigned int)start, (unsigned int)ptr);

    pGoblin->jreg->control = 1; // start

    exaMarkSync(pPixmap->drawable.pScreen);
}
/*
 * Shared DoneSolid / DoneCopy hook (see GOBLINEXAInit); intentionally empty.
 */
static void GoblinDone(PixmapPtr pDstPixmap) {
}
/*
 * EXA PrepareCopy hook: record the source pixmap geometry and the copy
 * directions for the GoblinCopy() calls that follow.
 *
 * GoblinCopy() is a plain CPU copy that addresses pixels as 4 bytes each;
 * it applies neither a raster operation nor a planemask.  Anything other
 * than a GXcopy with an all-ones planemask between 32 bits-per-pixel
 * pixmaps is therefore declined by returning FALSE, so that EXA does the
 * operation in software instead of getting a wrong result.
 *
 * Returns TRUE when the copy is accepted, FALSE otherwise.
 */
static Bool
GoblinPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap,
                  int xdir, int ydir, int alu, Pixel planemask)
{
    ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
    GoblinPtr pGoblin = GET_GOBLIN_FROM_SCRN(pScrn);

    ENTER;

    /* same "plain copy" test as the one GoblinPrepareSolid() uses */
    if ((alu != 0x3) || // GXcopy
        (planemask != 0xFFFFFFFF))
        return FALSE;

    if ((pSrcPixmap->drawable.bitsPerPixel != 32) ||
        (pDstPixmap->drawable.bitsPerPixel != 32))
        return FALSE;

    pGoblin->srcpitch = exaGetPixmapPitch(pSrcPixmap);
    pGoblin->srcoff = exaGetPixmapOffset(pSrcPixmap);
    pGoblin->xdir = xdir;
    pGoblin->ydir = ydir;

    return TRUE;
}
/*
 * EXA Copy hook: CPU copy of a w x h pixel rectangle from (srcX, srcY) in
 * the source pixmap recorded by GoblinPrepareCopy() to (dstX, dstY) in
 * pDstPixmap.  Pixels are assumed to be 4 bytes wide.
 *
 * Source and destination may overlap.  Rows are walked bottom-up when the
 * recorded ydir is negative and top-down otherwise, and each row is moved
 * with memmove(), which is overlap-safe on its own; the recorded xdir is
 * therefore not needed here.
 */
static void
GoblinCopy(PixmapPtr pDstPixmap,
           int srcX, int srcY, int dstX, int dstY, int w, int h)
{
    ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
    GoblinPtr pGoblin = GET_GOBLIN_FROM_SCRN(pScrn);
    int dstoff = exaGetPixmapOffset(pDstPixmap);
    int dstpitch = exaGetPixmapPitch(pDstPixmap);
    /* signed copy, so that stepping backwards through rows is well defined */
    int srcpitch = pGoblin->srcpitch;
    int srcstart, dststart;
    int wBytes;
    unsigned char *src, *dst;
    int j;

    ENTER;
    DPRINTF(X_ERROR, "Copy %d %d -> %d %d [%d x %d]\n", srcX, srcY, dstX, dstY, w, h);

    if (w <= 0 || h <= 0)
        return;

    srcstart = (srcX << 2) + (srcpitch * srcY) + pGoblin->srcoff;
    dststart = (dstX << 2) + (dstpitch * dstY) + dstoff;
    src = pGoblin->fb + srcstart;
    dst = pGoblin->fb + dststart;
    /* w is in pixels, the copy works in bytes */
    wBytes = w << 2;

    /* an accelerated fill may still be writing to the framebuffer */
    GoblinWait(pGoblin);

    if (pGoblin->ydir < 0) {
        /* start on the last row and walk upwards */
        src += srcpitch * (h - 1);
        dst += dstpitch * (h - 1);
        for (j = 0; j < h; j++) {
            memmove(dst, src, wBytes);
            src -= srcpitch;
            dst -= dstpitch;
        }
    } else {
        for (j = 0; j < h; j++) {
            memmove(dst, src, wBytes);
            src += srcpitch;
            dst += dstpitch;
        }
    }
}

View File

@ -40,9 +40,11 @@
#include "compat-api.h"
#ifndef SBUS_DEVICE_GOBLIN
#define SBUS_DEVICE_GOBLIN 0x0010
#endif
/*
0011 src
0101 dst
GXclear 0x0 0 0000
@ -309,8 +311,8 @@ GOBLINPreInit(ScrnInfoPtr pScrn, int flags)
{
GoblinPtr pGoblin;
sbusDevicePtr psdp;
MessageType from;
int i;
int i, prom, len;
char *ptr;
if (flags & PROBE_DETECT) return FALSE;
@ -350,6 +352,24 @@ GOBLINPreInit(ScrnInfoPtr pScrn, int flags)
} else
return FALSE;
}
prom = sparcPromInit();
len = 4;
if ((ptr = sparcPromGetProperty(&psdp->node, "goblin-has-jareth", &len))) {
if (len >= 1) {
/* if (ptr[0]) */
/* pGoblin->has_accel = TRUE; */
/* else */
/* pGoblin->has_accel = FALSE; */
pGoblin->has_accel = TRUE;
} else {
pGoblin->has_accel = FALSE;
}
}
if (pGoblin->has_accel)
xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Jareth found\n");
else
xf86DrvMsg(pScrn->scrnIndex, X_INFO, "no Jareth (%p)\n", ptr);
/*********************
deal with depth
@ -432,12 +452,9 @@ GOBLINPreInit(ScrnInfoPtr pScrn, int flags)
}
}
/* Set the bits per RGB for 8bpp mode */
from = X_DEFAULT;
if (xf86ReturnOptValBool(pGoblin->Options, OPTION_NOACCEL, FALSE)) {
pGoblin->NoAccel = TRUE;
xf86DrvMsg(pScrn->scrnIndex, X_CONFIG, "Acceleration disabled\n");
pGoblin->NoAccel = TRUE;
xf86DrvMsg(pScrn->scrnIndex, X_CONFIG, "Acceleration disabled\n");
}
char *optstr;
@ -526,6 +543,21 @@ GOBLINScreenInit(SCREEN_INIT_ARGS_DECL)
return FALSE;
}
if (pGoblin->has_accel && !pGoblin->NoAccel) {
// map Jareth registers
pGoblin->jreg = xf86MapSbusMem(psdp, JARETH_REG_VOFF, sizeof(JarethReg));
pGoblin->jmicrocode = xf86MapSbusMem(psdp, JARETH_MICROCODE_VOFF, sizeof(JarethReg));
pGoblin->jregfile = xf86MapSbusMem(psdp, JARETH_REGFILE_VOFF, sizeof(JarethMicrocode));
if ((pGoblin->jreg == NULL) ||
(pGoblin->jmicrocode == NULL) ||
(pGoblin->jregfile == NULL)) {
xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "xf86MapSbusMem failed for Jareth\n");
pGoblin->has_accel = FALSE;
} else {
xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Jareth successfully mapped\n");
}
}
/* Darken the screen for aesthetic reasons and set the viewport */
GOBLINSaveScreen(pScreen, SCREEN_SAVER_ON);
@ -595,26 +627,24 @@ GOBLINScreenInit(SCREEN_INIT_ARGS_DECL)
}
}
if (!pGoblin->NoAccel) {
#if 0
{
/* EXA */
XF86ModReqInfo req;
int errmaj, errmin;
memset(&req, 0, sizeof(XF86ModReqInfo));
req.majorversion = EXA_VERSION_MAJOR;
req.minorversion = EXA_VERSION_MINOR;
if (!LoadSubModule(pScrn->module, "exa", NULL, NULL, NULL, &req,
&errmaj, &errmin)) {
LoaderErrorMsg(NULL, "exa", errmaj, errmin);
return FALSE;
}
if (!GOBLINEXAInit(pScreen))
return FALSE;
xf86Msg(X_INFO, "%s: Using EXA acceleration\n", pGoblin->psdp->device);
}
#endif
if (!pGoblin->NoAccel && pGoblin->has_accel) {
{
/* EXA */
XF86ModReqInfo req;
int errmaj, errmin;
memset(&req, 0, sizeof(XF86ModReqInfo));
req.majorversion = EXA_VERSION_MAJOR;
req.minorversion = EXA_VERSION_MINOR;
if (!LoadSubModule(pScrn->module, "exa", NULL, NULL, NULL, &req,
&errmaj, &errmin)) {
LoaderErrorMsg(NULL, "exa", errmaj, errmin);
return FALSE;
}
if (!GOBLINEXAInit(pScreen))
return FALSE;
xf86Msg(X_INFO, "%s: Using EXA acceleration\n", pGoblin->psdp->device);
}
}
/* setup DGA */

View File

@ -50,13 +50,36 @@
#define GOBOFB_INTR_CLEAR_CLEAR 0x0
typedef struct goblin_fbc {
uint32_t mode;
uint32_t vbl_mask;
uint32_t videoctrl;
uint32_t intr_clear;
uint32_t reset;
uint32_t lut_addr;
uint32_t lut;
volatile uint32_t mode;
volatile uint32_t vbl_mask;
volatile uint32_t videoctrl;
volatile uint32_t intr_clear;
volatile uint32_t reset;
volatile uint32_t lut_addr;
volatile uint32_t lut;
} GoblinFbc, *GoblinFbcPtr;
/*
 * Layout of the Jareth control registers: twelve consecutive 32-bit words,
 * all volatile because they are accessed through a device mapping.
 * NOTE(review): 'ev_prending' looks like a typo for "pending"; it is a field
 * name, so renaming it would have to be done in every user at once.
 */
typedef struct jareth_reg {
volatile uint32_t window;
volatile uint32_t mpstart; /* the X driver writes the start offset of the microcode program here */
volatile uint32_t mplen; /* the X driver writes the length of the microcode program here */
volatile uint32_t control; /* the X driver writes 1 here to start a job */
volatile uint32_t mpresume;
volatile uint32_t power; /* the X driver sets bit 0 before using the engine */
volatile uint32_t status; /* the X driver polls bit 0 until it clears */
volatile uint32_t ev_status;
volatile uint32_t ev_prending;
volatile uint32_t ev_enable;
volatile uint32_t instruction;
volatile uint32_t ls_status;
} JarethReg, *JarethRegPtr;
/* Jareth microcode memory seen as 1024 32-bit words (4096 bytes). */
typedef struct jareth_microcode {
volatile uint32_t mc[1024];
} JarethMicrocode, *JarethMicrocodePtr;
/*
 * Jareth register file seen as 32 registers of eight 32-bit words each
 * (1024 bytes); reg[r][i] is 32-bit word i of register r.
 */
typedef struct jareth_regfile {
volatile uint32_t reg[32][8];
} JarethRegfile, *JarethRegfilePtr;
#endif /* GOBLIN_REGS_H */

View File

@ -14,17 +14,20 @@ opcodes = { # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shif
"UDF" : [-1, "Placeholder for undefined opcodes"],
"PSA" : [0, "Wd $\gets$ Ra // pass A"],
"PSB" : [1, "Wd $\gets$ Rb // pass B"], # for star version mostly
# 2 MSK
"ROP32V" : [2, "Wd $\gets$ ((Rb ROP Ra) & planemask) | (Ra & ~planemask)" ], # replace MSK
"XOR" : [3, "Wd $\gets$ Ra ^ Rb // bitwise XOR"],
"NOT" : [4, "Wd $\gets$ ~Ra // binary invert"],
"ADD32V" : [5, "Wd[x..x+32] $\gets$ Ra[x..x+32] + Rb[x..x+32] // vector 32-bit binary add"],
"SUB32V" : [6, "Wd[x..x+32] $\gets$ Ra[x..x+32] - Rb[x..x+32] // vector 32-bit binary add"],
"SUB32V" : [6, "Wd[x..x+32] $\gets$ Ra[x..x+32] - Rb[x..x+32] // vector 32-bit binary sub"],
"AND" : [7, "Wd $\gets$ Ra & Rb // bitwise AND"], # replace MUL
"BRNZ32" : [8, "If Ra[0:32] != 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if non-zero"], # replace TRD
"BRZ32" : [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero"],
"BRNZ32" : [8, "If Ra[0:32] != 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if non-zero (32-bits)"], # replace TRD
"BRZ32" : [9, "If Ra[0:32] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (32-bits)"],
"FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"],
"SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"],
# 12 XBT
"SROP" : [12, "set planemask & rop from Ra[0:32] and Ra[32:36]" ], # was XBT
"BRZ4" : [13, "If Ra[0:4] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (4-bits)"],
"BRZ5" : [14, "If Ra[0:5] == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero (5-bits)"],
"MIN32V" : [15, "Wd[x..x+32] $\gets$ umin(Ra[x..x+32], Rb[x..x+32]) // vector 32-bit umin"],
# for MEM, bit #31 (imm[8]) indicates both lanes are needed; imm[31] == 0 faster as the second access is not done ;
"GETM": [17, "GETM: getmask" ],
"ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
@ -230,12 +233,13 @@ class JarethConst(Module, AutoDoc):
2: [2, "two", "The number two"],
#3: [3, "three", "The number three"],
#4: [4, "four", "The number four"],
#5: [5, "five", "The number five"],
###5: [5, "five", "The number five"],
5: [32, "thirty-two", "The number thirty-two"],
#6: [6, "six", "The number six"],
###6: [6, "six", "The number six"],
6: [31, "thirty-one", "The number thirty-one"],
#7: [7, "seven", "The number seven"],
#8: [8, "eight", "The number eight"],
15: [15, "sixteen", "The number fifteen"],
15: [15, "fifteen", "The number fifteen"],
16: [16, "sixteen", "The number sixteen"],
}
self.adr = Signal(5)
@ -293,16 +297,16 @@ class ExecUnit(Module, AutoDoc):
class ExecLogic(ExecUnit):
def __init__(self, width=256):
ExecUnit.__init__(self, width, ["XOR", "NOT", "PSA", "SHL", "AND"])
ExecUnit.__init__(self, width, ["XOR", "NOT", "PSA", "SHL", "AND" ])
self.intro = ModuleDoc(title="Logic ExecUnit Subclass", body=f"""
This execution unit implements bit-wise logic operations: XOR, NOT, and
passthrough.
* XOR returns the result of A^sB
* XOR returns the result of A^B
* NOT returns the result of !A
* PSA returns the value of A
* SHL returns A << 1
* AND returns the result of A&sB
* AND returns the result of A&B
""")
@ -329,7 +333,7 @@ passthrough.
class ExecAddSub(ExecUnit, AutoDoc):
def __init__(self, width=256):
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V"])
ExecUnit.__init__(self, width, ["ADD32V", "SUB32V", "MIN32V" ])
self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f"""
""")
@ -342,8 +346,70 @@ class ExecAddSub(ExecUnit, AutoDoc):
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] + self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
).Elif(self.instruction.opcode == opcodes["SUB32V"][0],
[ self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32] - self.b[x*32:(x+1)*32]) for x in range(0, width//32) ],
),
).Elif(self.instruction.opcode == opcodes["MIN32V"][0],
[ If((self.a[x*32:(x+1)*32] <= self.b[x*32:(x+1)*32]), self.q[x*32:(x+1)*32].eq(self.a[x*32:(x+1)*32]), self.q.eq(self.b[x*32:(x+1)*32])) for x in range(0, width//32) ],
)
]
# Execution unit for the two raster-op opcodes:
#  * SROP latches a 4-entry ROP truth table and a 32-bit planemask from Ra;
#  * ROP32V combines Ra and Rb bit by bit through that table, under the planemask.
# The 256-bit operands are processed as two 128-bit lanes, one after the other.
class ExecRop(ExecUnit, AutoDoc):
def __init__(self, width=256):
ExecUnit.__init__(self, width, ["ROP32V", "SROP"])
self.notes = ModuleDoc(title="Rop ExecUnit Subclass", body=f"""
""")
# rop[] is the truth table, indexed by the 2-bit value {b, a} of one bit position
rop = Array(Signal() for x in range(4))
planemask = Signal(32)
lanewidth = 128
nlane = width // lanewidth
assert(nlane == 2) ## fixme
# per-bit operand pair for the lane being processed: bit 0 comes from A, bit 1 from B
rop_in = Array(Signal(2) for x in range(lanewidth))
# NOTE(review): 128 is hard-coded here although it is the same value as lanewidth
rop_out = Signal(128)
rop_buf = Signal(nlane * lanewidth)
lanec = Signal(log2_int(nlane, False))
# registered every eng_clk cycle: table lookup where the planemask bit is set, A's bit where it is clear
# (the planemask repeats every 32 bits)
self.sync.eng_clk += [ rop_out[x].eq((rop[(rop_in[x])] & planemask[x%32]) | (rop_in[x][0] & ~planemask[x%32])) for x in range(lanewidth) ]
self.sync.eng_clk += [
#self.q_valid.eq(self.start),
self.instruction_out.eq(self.instruction_in),
]
self.submodules.seq = seq = ClockDomainsRenamer("eng_clk")(FSM(reset_state="IDLE"))
seq.act("IDLE",
If(self.start,
If(self.instruction.opcode == opcodes["ROP32V"][0],
# latch the low 128 bits of both operands
NextValue(lanec, 0),
[ NextValue(rop_in[x][0], self.a[x]) for x in range(0, lanewidth) ],
[ NextValue(rop_in[x][1], self.b[x]) for x in range(0, lanewidth) ],
NextState("NEXT")
).Elif(self.instruction.opcode == opcodes["SROP"][0],
# the result buffer is cleared, so the value presented in OUT for SROP is 0
NextValue(rop_buf, 0),
# table entries come from A[32:36] in reverse bit order, the planemask from A[0:32]
NextValue(rop[0], self.a[35]),
NextValue(rop[1], self.a[34]),
NextValue(rop[2], self.a[33]),
NextValue(rop[3], self.a[32]),
NextValue(planemask, self.a[0:32]),
NextState("OUT"),
)
))
seq.act("NEXT",
# latch the high 128 bits of both operands
[ NextValue(rop_in[x][0], self.a[128+x]) for x in range(0, lanewidth) ],
[ NextValue(rop_in[x][1], self.b[128+x]) for x in range(0, lanewidth) ],
NextState("WRITE"))
seq.act("WRITE",
# stays in this state for two cycles: lanec 0 stores the low half, lanec 1 the high half
Case(lanec, {
0: [ NextValue(rop_buf[0:128], rop_out),
NextValue(lanec, 1),
],
1: [ NextValue(rop_buf[128:256], rop_out),
NextState("OUT"),
],
}))
seq.act("OUT",
# present the buffered result for one cycle
self.q_valid.eq(1),
self.q.eq(rop_buf),
NextState("IDLE"));
class ExecLS(ExecUnit, AutoDoc):
def __init__(self, width=256, interface=None, memoryport=None, r_dat_f=None, r_dat_m=None, granule=0):
@ -439,6 +505,7 @@ class ExecLS(ExecUnit, AutoDoc):
)
)
)
# having this extra stage and a registered 'address' help with timings, apparently
lsseq.act("DOMEM",
NextValue(cpar, cpar ^ 1),
If(self.instruction.opcode == opcodes["MEM"][0],
@ -706,7 +773,6 @@ class ExecLS(ExecUnit, AutoDoc):
self.sync.mul_clk += self.state[28:30].eq((self.state[28:30] & Replicate(~start_pipe, 2)) | self.has_timeout)
self.sync.mul_clk += self.state[30:32].eq((self.state[30:32] & Replicate(~start_pipe, 2)) | self.has_failure)
class Jareth(Module, AutoCSR, AutoDoc):
def __init__(self, platform, prefix, memoryport, sim=False, build_prefix=""):
opdoc = "\n"
@ -1148,6 +1214,10 @@ Here are the currently implemented opcodes for The Engine:
NextState("DO_BRZ32"),
).Elif(instruction.opcode == opcodes["BRNZ32"][0],
NextState("DO_BRNZ32"),
).Elif(instruction.opcode == opcodes["BRZ4"][0],
NextState("DO_BRZ4"),
).Elif(instruction.opcode == opcodes["BRZ5"][0],
NextState("DO_BRZ5"),
).Elif(instruction.opcode == opcodes["FIN"][0],
NextState("IDLE"),
NextValue(running, 0),
@ -1218,6 +1288,50 @@ Here are the currently implemented opcodes for The Engine:
)
),
)
seq.act("DO_BRZ4",
If(ra_dat[0:4] == 0,
If( (sext_immediate + mpc + 1 < mpc_stop) & (sext_immediate + mpc + 1 >= self.mpstart.fields.mpstart), # validate new PC is in range
NextState("FETCH"),
NextValue(mpc, sext_immediate + mpc + 1),
).Else(
NextState("IDLE"),
NextValue(running, 0),
)
).Else(
If(abort,
NextState("IDLE"),
NextValue(running, 0),
).Elif(mpc < mpc_stop,
NextState("FETCH"),
NextValue(mpc, mpc + 1),
).Else(
NextState("IDLE"),
NextValue(running, 0),
)
),
)
seq.act("DO_BRZ5",
If(ra_dat[0:5] == 0,
If( (sext_immediate + mpc + 1 < mpc_stop) & (sext_immediate + mpc + 1 >= self.mpstart.fields.mpstart), # validate new PC is in range
NextState("FETCH"),
NextValue(mpc, sext_immediate + mpc + 1),
).Else(
NextState("IDLE"),
NextValue(running, 0),
)
).Else(
If(abort,
NextState("IDLE"),
NextValue(running, 0),
).Elif(mpc < mpc_stop,
NextState("FETCH"),
NextValue(mpc, mpc + 1),
).Else(
NextState("IDLE"),
NextValue(running, 0),
)
),
)
seq.act("PAUSED",
If(~pause_req,
NextValue(pause_gnt, 0),
@ -1232,11 +1346,13 @@ Here are the currently implemented opcodes for The Engine:
exec_units = {
"exec_logic" : ExecLogic(width=rf_width_raw),
"exec_addsub" : ExecAddSub(width=rf_width_raw),
"exec_rop" : ExecRop(width=rf_width_raw),
"exec_ls" : ExecLS(width=rf_width_raw, interface=self.busls, memoryport=memoryport, r_dat_f=r_dat_f, r_dat_m=r_dat_m, granule=granule),
}
exec_units_shift = {
"exec_logic": True,
"exec_addsub": False,
"exec_rop": True,
"exec_ls": False,
}
exec_unit_shift_num = { }

View File

@ -12,36 +12,36 @@ fn main() -> std::io::Result<()> {
// -----
// size & 7 in %5
// size rounded down in %6
// input in %16
// output in %17
// 0 in %31
// input in %7
// output in %8
// 0 in %15
start:
resm %31
setadr %31, %0
load256inc %16, %0
load256inc %17, %1
resm %15
setadr %15, %0
load256inc %7, ^0
load256inc %8, ^1
// slow
setma %31, %0, #16
setma %15, %0, #16
// slow
setmq %31, %1, #16
setmq %15, %1, #16
and %5, %2, #15
sub32v %6, %2, %5
brz32 done, %6
loop:
psa %18, %16
psa %19, %17
psa* %17, %16
psa %20, %17
store128inc %31, %2, %17
psa %18, %7
psa %19, %8
psa* %8, %7
psa %20, %8
store128inc %15, ^2, %8
sub32v %6, %6, #16
brz32 last, %6
loadh128inc %16, %0, %16
loadh128inc %17, %1, %17
loadh128inc %7, ^0, %7
loadh128inc %8, ^1, %8
brz32 loop, #0
last:
// FIXME: not if Q is aligned
loadh128inc %17, %1, %17
store128inc %31, %2, %17
loadh128inc %8, ^1, %8
store128inc %15, ^2, %8
done:
getadr %3
getm %2
@ -58,19 +58,19 @@ fn main() -> std::io::Result<()> {
// live X count in %5
// // live Y count in %3
// data in %7
// 0/scrap in %31
// 0/scrap in %15
start:
// reset masks (probably not necessary with the starred-instruction)
// resm %31
// resm %15
loop_y:
// set source and destination addresses for current Y, X=first
setadr %31, %0
setadr %15, %0
psa %5, %2
loop_x:
// load from SRC w/ post-increment
load256inc %7, %0
load256inc %7, ^0
// store to DST w/ post-increment
store256inc %31, %1, %7
store256inc %15, ^1, %7
// sub 32 (#5 is 32...) from live X count
sub32v %5, %5, #5
// if X count is not 0, keep looping
@ -98,19 +98,19 @@ fn main() -> std::io::Result<()> {
// live X count in %5
// // live Y count in %3
// data in %7
// 0/scrap in %31
// 0/scrap in %15
start:
// reset masks (probably not necessary with the starred-instruction)
// resm %31
// resm %15
loop_y:
// set source and destination addresses for current Y, X=first
setadr %31, %0
setadr %15, %0
psa %5, %2
loop_x:
// load from SRC w/ post-increment
load128inc %7, %0
load128inc %7, ^0
// store to DST w/ post-increment
store128inc %31, %1, %7
store128inc %15, ^1, %7
// sub 16 (#16 is 16) from live X count
sub32v %5, %5, #16
// if X count is not 0, keep looping
@ -139,17 +139,17 @@ fn main() -> std::io::Result<()> {
// live X count in %5
// // live Y count in %3
// data in %7
// 0/scrap in %31
// 0/scrap in %15
start:
// reset masks (probably not necessary with the starred-instruction)
// resm %31
// resm %15
loop_y:
// set source and destination addresses for current Y, X=first
setadr %31, %0
setadr %15, %0
psa %5, %2
loop_x:
// store to DST w/ post-increment
store128inc %31, %0, %1
store128inc %15, ^0, %1
// sub 16 (#16 is 16) from live X count
sub32v %5, %5, #16
// if X count is not 0, keep looping
@ -167,6 +167,291 @@ fn main() -> std::io::Result<()> {
fin
fin
);
let mcode_fill256 = assemble_jareth!(
// Rectangle fill using 256-bit (32-byte) stores, one scanline per outer iteration.
// Inputs:
// x..x / $DST in %0, aligned on 128 bits
// 128-bits pattern in %1
// x..x / X size in %2, multiple of 128 bits (16 bytes)
// x..x / Y size in %3, arbitrary
// x..x / dst_stride in %4 (screen width)
// NOTE(review): the alignment / size requirements above look inherited from the
// 128-bit variant; the loop below advances by 32 bytes per store -- confirm.
// -----
// live X count in %5
// // live Y count in %3
// data in %7 (not referenced by this routine)
// 0/scrap in %15
start:
// reset masks (probably not necessary with the starred-instruction)
resm %15
// compute X leftovers (modulo 32 -> #6 is 31)
and %6, %2, #6
// set the leftovers mask (offset is 0 as we are aligned)
setmq %15, #0, %6
loop_y:
// set the destination address for current Y, X=first
setadr %15, %0
// then the rounded value in X (X size minus the leftovers)
sub32v %5, %2, %6
// NOTE(review): loop_x is entered without testing %5 for zero, and nothing after the
// loop stores the %6 leftover bytes; mcode_fill guards both cases -- confirm that
// callers only pass X sizes that are non-zero multiples of 32.
loop_x:
// store to DST w/ post-increment
store256inc %15, ^0, %1
// sub 32 (#5 is 32) from live X count
sub32v %5, %5, #5
// if X count is not 0, keep looping
brnz32 loop_x, %5
// decrement Y count
sub32v %3, %3, #1
// if 0, finished
brz32 done, %3
// add the stride to the line start address
add32v %0, %0, %4
// loop to do next line
brz32 loop_y, #0
done:
fin
fin
);
let mcode_fill = assemble_jareth!(
// General rectangle fill: an optional first column for an unaligned $DST (header
// loop), then per line a run of full 256-bit stores plus an optional partial store
// for the leftover bytes (main loop).
// Inputs:
// x..x / $DST in %0, 128 bits
// 128-bits pattern in %1 [assumed to be alignment-homogeneous]
// x..x / X size in %2
// x..x / Y size in %3,
// x..x / dst_stride in %4 (screen width?)
// -----
// main loop:
// live X count in %5
// leftover X in %6
// // live Y count in %3
// data (old contents with the pattern inserted) in %7
// 0/scrap in %15
// -----
// header loop:
// live Y count in %5
// $DST in %6
// data in %7
// 0/scrap in %15
start:
// if the number of lines or of elements per line is 0, exit early
brz32 done256, %2
brz32 done256, %3
// reset masks
resm %15
// if $DST is aligned on 128 bits (low 4 address bits are 0), jump to aligned loop
brz4 start256, %0
// do the first column
startX:
// set alignment; we shift by the addr offset, and we mask whatever data is needed in the first 32 bytes
setmq %15, %0, %2
// copy Y
psa %5, %3
// copy $DST
psa %6, %0
loopX_y:
// setadr
setadr %15, %6
// load old data
load256 %7, ^0
// insert pattern (starred form; presumably honours the mask set by setmq -- confirm)
psa* %7, %1
// rewrite data
store256 %15, ^0, %7
// increment copied $DST by stride
add32v %6, %6, %4
// decrement copied Y count
sub32v %5, %5, #1
// if not zero, continue
brnz32 loopX_y, %5
loopX_done:
// how much did we do (#6 is 31, #5 is 32)
and %5, %0, #6
// compute 32-(x&31)
sub32v %5, #5, %5
// compute the proper value (never more than the X size)
min32v %5, %5, %2
// add that to the address, which will now be aligned
add32v %0, %0, %5
// remove from X, as we have done it
sub32v %2, %2, %5
// done if nothing is left, otherwise fall through to the aligned loop
brz32 done256, %2
start256:
// compute X leftovers (modulo 32 -> #6 is 31)
and %6, %2, #6
// set the leftovers mask (offset is 0 as we are aligned)
setmq %15, #0, %6
loop256_y:
// set the destination address for current Y
setadr %15, %0
// then the rounded value in X
sub32v %5, %2, %6
// rounded count already 0: bypass the full-width stores
brz32 loop256_x_end, %5
loop256_x:
// store to DST w/ post-increment
store256inc %15, ^0, %1
// sub 32 (#5 is 32) from live rounded X count
sub32v %5, %5, #5
// if X count is not 0, keep looping
brnz32 loop256_x, %5
// check for line leftovers
loop256_x_end:
// NOTE(review): %6 is X modulo 32 but brz4 presumably tests only the low 4 bits, so a
// leftover of exactly 16 would skip the partial store below; brz5 may be what is
// intended here -- confirm.
brz4 done256_x, %6
// load old data
load256 %7, ^0
// insert pattern (starred form; presumably honours the leftovers mask -- confirm)
psa* %7, %1
// rewrite data
store256 %15, ^0, %7
done256_x:
// decrement Y count
sub32v %3, %3, #1
// if 0, finished
brz32 done256, %3
// add the stride to the line start address
add32v %0, %0, %4
// loop256 to do next line
brz32 loop256_y, #0
done256:
fin
fin
);
let mcode_fillrop = assemble_jareth!(
// Rectangle fill combined with a raster operation: same structure as the plain fill
// (optional unaligned first column, then full 256-bit chunks plus an optional
// partial chunk per line), but every chunk is read, passed through rop32v with the
// pattern, and written back.
// Inputs:
// x..x / $DST in %0, 128 bits
// 128-bits pattern in %1 [assumed to be alignment-homogeneous]
// x..x / X size in %2
// x..x / Y size in %3,
// x..x / dst_stride in %4 (screen width?)
// x..x / rop / planemask in %5 [assumed to be alignment-homogeneous]
// -----
// main loop:
// live X count in %8
// leftover X in %6
// // live Y count in %3
// data (old contents combined with the pattern) in %7
// 0/scrap in %15
// -----
// header loop:
// live Y count in %8
// $DST in %6
// data in %7
// 0/scrap in %15
start:
// if the number of lines or of elements per line is 0, exit early
brz32 done256, %2
brz32 done256, %3
// reset masks
resm %15
// set planemask / rop
srop %15, %5
// if $DST is aligned on 128 bits (low 4 address bits are 0), jump to aligned loop
brz4 start256, %0
// do the first column(s)
startX:
// set alignment; we shift by the addr offset, and we mask whatever data is needed in the first 32 bytes
setmq %15, %0, %2
// copy Y
psa %8, %3
// copy $DST
psa %6, %0
loopX_y:
// setadr
setadr %15, %6
// load old data
load256 %7, ^0
// rop & insert (starred form; presumably honours the mask set by setmq -- confirm)
rop32v* %7, %7, %1
// rewrite data
store256 %15, ^0, %7
// increment copied $DST by stride
add32v %6, %6, %4
// decrement copied Y count
sub32v %8, %8, #1
// if not zero, continue
brnz32 loopX_y, %8
loopX_done:
// how much did we do (#6 is 31, #5 is 32)
and %8, %0, #6
// compute 32-(x&31) - upper bound
sub32v %8, #5, %8
// compute the proper value (never more than the X size)
min32v %8, %8, %2
// add that to the address, which will now be aligned if there's stuff left to do
add32v %0, %0, %8
// remove from X, as we have done it
sub32v %2, %2, %8
// done if nothing is left, otherwise fall through to the aligned loop
brz32 done256, %2
start256:
// compute X leftovers (modulo 32 -> #6 is 31)
and %6, %2, #6
// set the leftovers mask (offset is 0 as we are aligned)
setmq %15, #0, %6
loop256_y:
// set the destination address for current Y
setadr %15, %0
// then the rounded value in X
sub32v %8, %2, %6
// rounded count already 0: bypass the full-width chunks
brz32 loop256_x_end, %8
loop256_x:
// load data
load256 %7, ^0
// rop
rop32v %7, %7, %1
// store to DST w/ post-increment
store256inc %15, ^0, %7
// sub 32 (#5 is 32) from live rounded X count
sub32v %8, %8, #5
// if X count is not 0, keep looping
brnz32 loop256_x, %8
// check for line leftovers
loop256_x_end:
// NOTE(review): %6 is X modulo 32 but brz4 presumably tests only the low 4 bits, so a
// leftover of exactly 16 would skip the partial chunk below; brz5 may be what is
// intended here -- confirm.
brz4 done256_x, %6
// load old data
load256 %7, ^0
// rop & insert (starred form; presumably honours the leftovers mask -- confirm)
rop32v* %7, %7, %1
// rewrite data
store256 %15, ^0, %7
done256_x:
// decrement Y count
sub32v %3, %3, #1
// if 0, finished
brz32 done256, %3
// add the stride to the line start address
add32v %0, %0, %4
// loop256 to do next line
brz32 loop256_y, #0
done256:
fin
fin
);
let mut pos;
@ -206,5 +491,32 @@ fn main() -> std::io::Result<()> {
println!("");
println!("-> {}", mcode_fill128.len());
pos = 0;
println!("fill256:");
while pos < mcode_fill256.len() {
print!("0x{:08x},", mcode_fill256[pos]);
pos = pos + 1;
}
println!("");
println!("-> {}", mcode_fill256.len());
pos = 0;
println!("fill:");
while pos < mcode_fill.len() {
print!("0x{:08x},", mcode_fill[pos]);
pos = pos + 1;
}
println!("");
println!("-> {}", mcode_fill.len());
pos = 0;
println!("fillrop:");
while pos < mcode_fillrop.len() {
print!("0x{:08x},", mcode_fillrop[pos]);
pos = pos + 1;
}
println!("");
println!("-> {}", mcode_fillrop.len());
Ok(())
}