From 68b81d91b8819ca1abbb7d5db6a0cebb9c8a393c Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain@dolbeau.org>
Date: Sun, 6 Mar 2022 15:17:02 +0100
Subject: [PATCH] first go at jareth vector engine

---
 NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c      |  654 ++++++++
 NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h      |   62 +
 sbus-to-ztex-gateware-migen/jareth.py         | 1321 +++++++++++++++++
 .../jareth_code/Cargo.lock                    |   13 +
 .../jareth_code/Cargo.toml                    |   23 +
 .../jareth_code/jareth_code.rs                |  108 ++
 .../sbus_to_fpga_fsm.py                       |    6 +
 .../sbus_to_fpga_prom.py                      |   28 +-
 .../sbus_to_fpga_soc.py                       |   36 +-
 9 files changed, 2232 insertions(+), 19 deletions(-)
 create mode 100644 NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c
 create mode 100644 NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h
 create mode 100644 sbus-to-ztex-gateware-migen/jareth.py
 create mode 100644 sbus-to-ztex-gateware-migen/jareth_code/Cargo.lock
 create mode 100644 sbus-to-ztex-gateware-migen/jareth_code/Cargo.toml
 create mode 100644 sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs

diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c b/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c
new file mode 100644
index 0000000..80e535e
--- /dev/null
+++ b/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c
@@ -0,0 +1,654 @@
+/*	$NetBSD$ */
+
+/*-
+ * Copyright (c) 2022 Romain Dolbeau <romain@dolbeau.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/errno.h>
+#include <sys/device.h>
+#include <sys/malloc.h>
+
+#include <sys/bus.h>
+#include <machine/autoconf.h>
+#include <sys/cpu.h>
+#include <sys/conf.h>
+#include <sys/ioccom.h>
+
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <uvm/uvm_extern.h>
+#include <sys/kmem.h>
+
+#include <dev/sbus/sbusvar.h>
+
+#include <dev/sbus/jareth.h>
+
+#include <machine/param.h>
+
+int	jareth_print(void *, const char *);
+int	jareth_match(device_t, cfdata_t, void *);
+void	jareth_attach(device_t, device_t, void *);
+/* Autoconf attachment for "jareth": softc size, match and attach hooks; the two trailing hooks are NULL. */
+CFATTACH_DECL_NEW(jareth, sizeof(struct jareth_softc),
+    jareth_match, jareth_attach, NULL, NULL);
+
+dev_type_open(jareth_open);
+dev_type_close(jareth_close);
+dev_type_ioctl(jareth_ioctl);
+dev_type_mmap(jareth_mmap);
+
+/* Character-device switch: open, close, ioctl and mmap are implemented in this file; every other entry is a no* placeholder. */
+
+const struct cdevsw jareth_cdevsw = {
+	.d_open = jareth_open,
+	.d_close = jareth_close,
+	.d_read = noread,
+	.d_write = nowrite,
+	.d_ioctl = jareth_ioctl,
+	.d_stop = nostop,
+	.d_tty = notty,
+	.d_poll = nopoll,
+	.d_mmap = jareth_mmap,
+	.d_kqfilter = nokqfilter,
+	.d_discard = nodiscard,
+	.d_flag = 0
+};
+
+extern struct cfdriver jareth_cd;
+/* Test-job payload: 32 rows of 8 32-bit words; rows 0-3 are written as inputs, all 32 rows are read back. */
+struct jareth_testjob {
+	uint32_t data[32][8];
+};
+/* File-local helpers, defined further down in this file. */
+static int init_programs(struct jareth_softc *sc);
+static int write_inputs(struct jareth_softc *sc, struct jareth_testjob *job, const int window);
+static int start_job(struct jareth_softc *sc);
+static int wait_job(struct jareth_softc *sc, uint32_t param);
+static int read_outputs(struct jareth_softc *sc, struct jareth_testjob *job, const int window);
+static int dma_init(struct jareth_softc *sc);
+/* Engine power control through the power CSR accessors. */
+static int power_on(struct jareth_softc *sc);
+static int power_off(struct jareth_softc *sc);
+
+/* Open a minor: ENODEV if the device or the session is missing, otherwise power the engine on. */
+int
+jareth_open(dev_t dev, int flags, int mode, struct lwp *l)
+{
+	const int unit = minor(dev) & (MAX_SESSION - 1);	/* session index */
+	/* NOTE(review): unit is already masked, so driver is always 0 -- confirm intent. */
+	const int driver = unit & ~(MAX_SESSION - 1);
+	struct jareth_softc *sc = device_lookup_private(&jareth_cd, driver);
+	if (sc == NULL)
+		return ENODEV;
+	/*
+	 * Minor 0 always opens; any other minor needs its bit in active_sessions.
+	 * The mask is built from 1U: "1 << 31" would overflow a signed int.
+	 */
+	if (unit != 0 && (sc->active_sessions & (1U << unit)) == 0)
+		return ENODEV;
+	power_on(sc);	/* first we need to turn the engine power on ... */
+	return (0);
+}
+
+/* Close a minor: drop a still-active session's bits, then power off once no session is active. */
+int
+jareth_close(dev_t dev, int flags, int mode, struct lwp *l)
+{
+	const int unit = minor(dev) & (MAX_SESSION - 1);	/* session index */
+	/* NOTE(review): unit is already masked, so driver is always 0 -- confirm intent. */
+	const int driver = unit & ~(MAX_SESSION - 1);
+	struct jareth_softc *sc = device_lookup_private(&jareth_cd, driver);
+	const uint32_t bit = 1U << unit;	/* unsigned: "1 << 31" would overflow a signed int */
+	if (sc == NULL)
+		return ENODEV;
+	/* Minor 0 has no session bit; any other minor that is still active is torn down with a warning. */
+	if (unit != 0 && (sc->active_sessions & bit) != 0) {
+		device_printf(sc->sc_dev, "warning: close() on active session\n");
+		sc->active_sessions &= ~bit;
+		sc->mapped_sessions &= ~bit;
+	}
+	if (sc->active_sessions == 0)
+		power_off(sc);	/* nothing left that needs the engine */
+	return (0);
+}
+
+int
+jareth_print(void *aux, const char *busname)
+{
+	/* Let the SBus code print the attach arguments, then report UNCONF. */
+	(void)sbus_print(aux, busname);
+	return UNCONF;
+}
+
+int
+jareth_match(device_t parent, cfdata_t cf, void *aux)
+{
+	const struct sbus_attach_args *args = aux;
+	/* Match (return 1) only when the attach arguments carry the name "jareth". */
+	return (strcmp(args->sa_name, "jareth") == 0) ? 1 : 0;
+}
+
+static const uint32_t program_test0[25] = { 0x01fc0014,0x407c0012,0xa0400013,0xa0c40013,0x007f0014,0x017f0054,0x0016f087,0x00185086,0x06000189,0x00480400,0x004c0440,0x00440420,0x00500440,0x617d1013,0x001b0186,0x01800189,0x20410015,0x20c51015,0xfb000809,0x20c51015,0x617d1013,0x000c0012,0x00080011,0x0000000a,0x0000000a };
+/* NULL-terminated program list with parallel lengths (in words); init_programs() records each upload offset in program_offset[]. */
+static const uint32_t* programs[2] = { program_test0, NULL };
+static const uint32_t program_len[2] = { 25, 0 };
+static       uint32_t program_offset[2];
+/* Self-test driver, defined later in this file. */
+static int do_test(struct jareth_softc *sc, uint32_t pidx);
+
+/*
+ * Map the three register spaces, choose a burst size, upload the microcode, set up DMA and run a self-test.
+ */
+void
+jareth_attach(device_t parent, device_t self, void *aux)
+{
+	struct sbus_attach_args *sa = aux;
+	struct jareth_softc *sc = device_private(self);
+	struct sbus_softc *sbsc = device_private(parent);
+	int node;
+	int sbusburst;
+	/* Keep the bus and DMA tags handed over in the attach arguments. */
+	sc->sc_bustag = sa->sa_bustag;
+	sc->sc_dmatag = sa->sa_dmatag;
+	sc->sc_dev = self;
+
+	aprint_normal("\n");
+	/* Three register entries are used: CSRs [0], microcode [1] and register file [2]. */
+	if (sa->sa_nreg < 3) {
+		aprint_error(": Not enough registers spaces\n");
+		return;
+	}
+	/* NOTE(review): the early returns below leave any mapping made so far in place. */
+	/* map registers */
+	if (sbus_bus_map(sc->sc_bustag,
+					 sa->sa_reg[0].oa_space /* sa_slot */,
+					 sa->sa_reg[0].oa_base /* sa_offset */,
+					 sa->sa_reg[0].oa_size /* sa_size */,
+					 BUS_SPACE_MAP_LINEAR,
+					 &sc->sc_bhregs_jareth) != 0) {
+		aprint_error(": cannot map Jareth registers\n");
+		return;
+	} else {
+		aprint_normal_dev(self, "Jareth registers @ %p\n", (void*)sc->sc_bhregs_jareth);
+	}
+	/* map microcode */
+	if (sbus_bus_map(sc->sc_bustag,
+					 sa->sa_reg[1].oa_space /* sa_slot */,
+					 sa->sa_reg[1].oa_base /* sa_offset */,
+					 sa->sa_reg[1].oa_size /* sa_size */,
+					 BUS_SPACE_MAP_LINEAR,
+					 &sc->sc_bhregs_microcode) != 0) {
+		aprint_error(": cannot map Jareth microcode\n");
+		return;
+	} else {
+		aprint_normal_dev(self, "Jareth microcode @ %p\n", (void*)sc->sc_bhregs_microcode);
+	}
+	/* map register file */
+	if (sbus_bus_map(sc->sc_bustag,
+					 sa->sa_reg[2].oa_space /* sa_slot */,
+					 sa->sa_reg[2].oa_base /* sa_offset */,
+					 sa->sa_reg[2].oa_size /* sa_size */,
+					 BUS_SPACE_MAP_LINEAR,
+					 &sc->sc_bhregs_regfile) != 0) {
+		aprint_error(": cannot map Jareth regfile\n");
+		return;
+	} else {
+		aprint_normal_dev(self, "Jareth regfile @ %p\n", (void*)sc->sc_bhregs_regfile);
+	}
+	sc->sc_bufsiz_jareth = sa->sa_reg[0].oa_size;
+	sc->sc_bufsiz_microcode = sa->sa_reg[1].oa_size;
+	sc->sc_bufsiz_regfile = sa->sa_reg[2].oa_size;
+
+	node = sc->sc_node = sa->sa_node;
+
+	/*
+	 * Get transfer burst size from PROM
+	 */
+	sbusburst = sbsc->sc_burst;
+	if (sbusburst == 0)
+		sbusburst = SBUS_BURST_32 - 1; /* 1->16 */
+
+	sc->sc_burst = prom_getpropint(node, "burst-sizes", -1);
+	if (sc->sc_burst == -1)
+		/* take SBus burst sizes */
+		sc->sc_burst = sbusburst;
+
+	/* Clamp at parent's burst sizes */
+	sc->sc_burst &= sbusburst;
+
+	aprint_normal("\n");
+	aprint_normal_dev(self, "nid 0x%x, bustag %p, burst 0x%x (parent 0x%0x)\n",
+			  sc->sc_node,
+			  sc->sc_bustag,
+			  sc->sc_burst,
+			  sbsc->sc_burst);
+
+	/* first we need to turn the engine power on ... */
+	power_on(sc);
+	/* A failed microcode upload is retried once before the device is flagged as not initialized. */
+	if (init_programs(sc)) {
+		if (init_programs(sc)) {
+			aprint_normal_dev(sc->sc_dev, "INIT - FAILED\n");
+			sc->initialized = 0;
+		} else {
+			sc->initialized = 1;
+		}	
+	} else {
+		sc->initialized = 1;
+	}
+
+	power_off(sc);
+
+	sc->active_sessions = 0;
+	sc->mapped_sessions = 0;
+	/* dma_init() returns 0 on failure: then every session bit is set; otherwise the self-test runs. */
+	if (!dma_init(sc)) {
+		// ouch: DMA setup failed, mark every session bit in both masks
+		sc->active_sessions = 0xFFFFFFFF;
+		sc->mapped_sessions = 0xFFFFFFFF;
+	} else {
+		do_test(sc, 0);
+	}
+}
+
+#define CONFIG_CSR_DATA_WIDTH 32
+#define sbusfpga_jareth_softc jareth_softc /* presumably the included CSR header names the softc sbusfpga_jareth_softc -- TODO confirm */
+#include "dev/sbus/sbusfpga_csr_jareth.h"
+#undef sbusfpga_jareth_softc
+/* Register-file byte offsets: 32 bytes per register, 4 bytes per sub-word; both macros expect a local variable named "base". */
+#define REG_BASE(reg) (base + (reg * 32))
+#define SUBREG_ADDR(reg, off) (REG_BASE(reg) + (off)*4)
+/* ioctl command whose argument is a struct jareth_testjob, declared with _IOWR. */
+#define SBUSFPGA_DO_TESTJOB   _IOWR(0, 0, struct jareth_testjob)
+
+int
+jareth_ioctl (dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
+{
+	int unit = minor(dev) & (MAX_SESSION - 1);
+	int driver = unit & ~(MAX_SESSION - 1); /* NOTE(review): always 0, unit is already masked -- confirm intent */
+	struct jareth_softc *sc = device_lookup_private(&jareth_cd, driver);
+	int err = 0;
+	/* Only SBUSFPGA_DO_TESTJOB is handled, and only on minor 0; any other command yields EINVAL. */
+	if (sc == NULL) {
+		return ENODEV;
+	}
+
+	if (!sc->initialized) { /* the upload did not succeed earlier: try again before doing anything */
+		if (init_programs(sc)) {
+			return ENXIO;
+		} else {
+			sc->initialized = 1;
+		}
+	}
+	switch (cmd) {
+	case SBUSFPGA_DO_TESTJOB: {
+		if (unit != 0)
+			return ENOTTY;
+		/* The job is the ioctl argument itself; read_outputs() stores the results back into it. */
+		struct jareth_testjob* job = (struct jareth_testjob*)data;
+		jareth_mpstart_write(sc, program_offset[0]);
+		jareth_mplen_write(sc, program_len[0]);
+		/* Load inputs, start, wait for completion, read the registers back; stop at the first error. */
+		err = write_inputs(sc, job, 0);
+		if (err)
+			return err;
+		err = start_job(sc);
+		if (err)
+			return err;
+		delay(1);
+		err = wait_job(sc, 1);
+		if (err)
+			return err;
+		err = read_outputs(sc, job, 0);
+		if (err)
+			return err;
+	}
+		break;
+		
+	default:
+		err = EINVAL; /* unknown command */
+		break;
+	}
+
+	return(err);
+}
+
+
+static int power_on(struct jareth_softc *sc) {
+	/* Nothing to do when bit 0 of the power register is already set. */
+	if (jareth_power_read(sc) & 1)
+		return 0;
+	jareth_power_write(sc, 1);
+	delay(1);	/* short pause right after switching on */
+	return 0;
+}
+static int power_off(struct jareth_softc *sc) {
+	/* Clear the power register unconditionally; always reports success. */
+	jareth_power_write(sc, 0);
+	return 0;
+}
+
+static int init_programs(struct jareth_softc *sc) {
+	/* Upload every program back to back from offset 0 of the microcode space, then verify; returns 0 on success, 1 on any mismatch. */
+	int err = 0;
+	uint32_t i, j;
+	uint32_t offset = 0;
+	/* offset counts 32-bit words; a delay(1) follows every 16th word written. */
+	for (j = 0 ; programs[j] != NULL; j ++) {
+		program_offset[j] = offset;
+		for (i = 0 ; i < program_len[j] ; i++) {
+			bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_microcode, ((offset+i)*4), programs[j][i]);
+			if ((i%16)==15)
+				delay(1);
+		}
+		offset += program_len[j];
+	}
+
+	jareth_window_write(sc, 0); /* could use window_window to access fields, but it creates a RMW cycle for nothing */
+	jareth_mpstart_write(sc, 0); /* program 0 starts at offset 0 */
+	jareth_mplen_write(sc, program_len[0]); /* length of program 0 */
+
+	aprint_normal_dev(sc->sc_dev, "INIT - Jareth status: 0x%08x\n", jareth_status_read(sc));
+
+#if 1
+	/* double check: read program 0 and the three CSRs back (at most 10 word mismatches are reported), then probe the register file */
+	u_int32_t x;
+	int count = 0;
+	for (i = 0 ; i < program_len[0] && count < 10; i++) {
+		x = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_microcode, (i*4));
+		if (x != programs[0][i]) {
+			aprint_error_dev(sc->sc_dev, "INIT - Jareth program failure: [%d] 0x%08x <> 0x%08x\n", i, x, programs[0][i]);
+			err = 1;
+			count ++;
+		}
+		if ((i%8)==7)
+			delay(1);
+	}
+	if ((x = jareth_window_read(sc)) != 0) {
+			aprint_error_dev(sc->sc_dev, "INIT - Jareth register failure: window = 0x%08x\n", x);
+			err = 1;
+	}
+	if ((x = jareth_mpstart_read(sc)) != 0) {
+			aprint_error_dev(sc->sc_dev, "INIT - Jareth register failure: mpstart = 0x%08x\n", x);
+			err = 1;
+	}
+	if ((x = jareth_mplen_read(sc)) != program_len[0]) {
+			aprint_error_dev(sc->sc_dev, "INIT - Jareth register failure: mplen = 0x%08x\n", x);
+			err = 1;
+	}
+	const int test_reg_num = 73; /* word index inside the register-file space used for the write/read probe */
+	const uint32_t test_reg_value = 0x0C0FFEE0;
+	bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile, 4*test_reg_num, test_reg_value);
+	delay(1);
+	if ((x = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile, 4*test_reg_num)) != test_reg_value) {
+		aprint_error_dev(sc->sc_dev, "INIT - Jareth register file failure: 0x%08x != 0x%08x\n", x, test_reg_value);
+		err = 1;
+	}
+#endif
+	
+	return err;
+}
+
+static int write_inputs(struct jareth_softc *sc, struct jareth_testjob *job, const int window) {
+	/* SUBREG_ADDR() expands to an expression that uses this local by name. */
+	const uint32_t base = window * 0x400;
+	const uint32_t status = jareth_status_read(sc);
+	int row, word;
+	/* Refuse to touch the register file while the status reports the engine as running. */
+	if ((status & (1 << CSR_JARETH_STATUS_RUNNING_OFFSET)) != 0) {
+		aprint_error_dev(sc->sc_dev, "WRITE - Jareth status: 0x%08x, still running?\n", status);
+		return ENXIO;
+	}
+
+	/* Only the first four rows of the job are written as inputs. */
+	for (row = 0; row < 4; row++)
+		for (word = 0; word < 8; word++)
+			bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile, SUBREG_ADDR(row, word), job->data[row][word]);
+
+	/* Read the same words back; give up with EIO at the first one that differs. */
+	for (row = 0; row < 4; row++) {
+		for (word = 0; word < 8; word++) {
+			if (bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile, SUBREG_ADDR(row, word)) == job->data[row][word])
+				continue;
+			aprint_error_dev(sc->sc_dev, "WRITE - data did not read-write properly\n");
+			return EIO;
+		}
+	}
+	return 0;
+}
+
+static int start_job(struct jareth_softc *sc) {
+	const uint32_t running_bit = 1 << CSR_JARETH_STATUS_RUNNING_OFFSET;
+	const uint32_t status = jareth_status_read(sc);
+	/* Start the job by writing 1 to the control CSR, but only when the engine is not running. */
+	if ((status & running_bit) == 0) {
+		jareth_control_write(sc, 1);
+		return 0;
+	}
+	aprint_error_dev(sc->sc_dev, "START - Jareth status: 0x%08x, still running?\n", status);
+	return ENXIO;
+}
+
+static int wait_job(struct jareth_softc *sc, uint32_t param) {
+	uint32_t status = jareth_status_read(sc);
+	int count = 0;
+	int max_count = 250;
+	int del = 1;
+	const int max_del = 32;
+	static int max_del_seen = 1; /* high-water marks kept across calls; only used for the log lines below */
+	static int max_cnt_seen = 0;
+	/* Poll while the RUNNING bit is set, at most max_count times; the delay doubles after each poll until it reaches max_del. */
+	while ((status & (1<<CSR_JARETH_STATUS_RUNNING_OFFSET)) && (count < max_count)) {
+		//uint32_t ls_status = jareth_ls_status_read(sc);
+		//aprint_normal_dev(sc->sc_dev, "WAIT - ongoing, Jareth status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, ls_status);
+		count ++;
+		delay(del);
+		del = del < max_del ? 2*del : del;
+		status = jareth_status_read(sc);
+	}
+	if (del > max_del_seen) {
+		max_del_seen = del;
+		aprint_normal_dev(sc->sc_dev, "WAIT - new max delay %d after %d count (param was %u)\n", max_del_seen, count, param);
+	}
+	if (count > max_cnt_seen) {
+		max_cnt_seen = count;
+		aprint_normal_dev(sc->sc_dev, "WAIT - new max count %d with %d delay (param was %u)\n", max_cnt_seen, del, param);
+		
+	}
+	/* Final status: still RUNNING, SIGILL or ABORT each return ENXIO; anything else returns 0. param is only echoed in the logs. */
+	//jareth_control_write(sc, 0);
+	if (status & (1<<CSR_JARETH_STATUS_RUNNING_OFFSET)) {
+		aprint_error_dev(sc->sc_dev, "WAIT - Jareth status: 0x%08x (pc 0x%08x), did not finish in time? [inst: 0x%08x ls_status: 0x%08x]\n", status, (status>>1)&0x03ff, jareth_instruction_read(sc),  jareth_ls_status_read(sc));
+		return ENXIO;
+	} else if (status & (1<<CSR_JARETH_STATUS_SIGILL_OFFSET)) {
+		aprint_error_dev(sc->sc_dev, "WAIT - Jareth status: 0x%08x, sigill [inst: 0x%08x ls_status: 0x%08x]\n", status, jareth_instruction_read(sc),  jareth_ls_status_read(sc));
+		return ENXIO;
+	} else if (status & (1<<CSR_JARETH_STATUS_ABORT_OFFSET)) {
+		aprint_error_dev(sc->sc_dev, "WAIT - Jareth status: 0x%08x, aborted [inst: 0x%08x ls_status: 0x%08x]\n", status, jareth_instruction_read(sc),  jareth_ls_status_read(sc));
+		return ENXIO;
+	} else {
+		//aprint_normal_dev(sc->sc_dev, "WAIT - Jareth status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, jareth_ls_status_read(sc));
+	}
+
+	return 0;
+}
+
+static int read_outputs(struct jareth_softc *sc, struct jareth_testjob *job, const int window) {
+	/* SUBREG_ADDR() expands to an expression that uses this local by name. */
+	const uint32_t base = window * 0x400;
+	const uint32_t status = jareth_status_read(sc);
+	int reg, word;
+	if ((status & (1 << CSR_JARETH_STATUS_RUNNING_OFFSET)) != 0) {
+		aprint_error_dev(sc->sc_dev, "READ - Jareth status: 0x%08x, still running?\n", status);
+		return ENXIO;
+	}
+	/* Copy all 32 registers (8 words each) into the job, pausing after each register. */
+	for (reg = 0; reg < 32; reg++) {
+		uint32_t *dst = job->data[reg];
+		for (word = 0; word < 8; word++)
+			dst[word] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile, SUBREG_ADDR(reg, word));
+		delay(1);
+	}
+	return 0;
+}
+
+
+/*
+ * Set up the DMA area: create the map, allocate and map JARETH_VAL_DMA_MAX_SZ
+ * bytes, then load the map. Returns 1 on success and 0 on failure; whatever was
+ * acquired before a failing step is released again before returning.
+ */
+static int
+dma_init(struct jareth_softc *sc) {
+	/* Allocate a dmamap */
+	if (bus_dmamap_create(sc->sc_dmatag, JARETH_VAL_DMA_MAX_SZ, 1, JARETH_VAL_DMA_MAX_SZ, 0, BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW, &sc->sc_dmamap) != 0) {
+		aprint_error_dev(sc->sc_dev, "DMA map create failed\n");
+		return 0;
+	}
+	aprint_normal_dev(sc->sc_dev, "dmamap: %lu %lu %d (%p)\n", sc->sc_dmamap->dm_maxsegsz, sc->sc_dmamap->dm_mapsize, sc->sc_dmamap->dm_nsegs, sc->sc_dmatag->_dmamap_load);
+	if (bus_dmamem_alloc(sc->sc_dmatag, JARETH_VAL_DMA_MAX_SZ, 64, 64, &sc->sc_segs, 1, &sc->sc_rsegs, BUS_DMA_NOWAIT | BUS_DMA_STREAMING)) {
+		aprint_error_dev(sc->sc_dev, "cannot allocate DVMA memory\n");
+		goto fail_destroy;
+	}
+	if (bus_dmamem_map(sc->sc_dmatag, &sc->sc_segs, 1, JARETH_VAL_DMA_MAX_SZ, &sc->sc_dma_kva, BUS_DMA_NOWAIT)) {
+		aprint_error_dev(sc->sc_dev, "cannot allocate DVMA address\n");
+		goto fail_free;
+	}
+	if (bus_dmamap_load(sc->sc_dmatag, sc->sc_dmamap, sc->sc_dma_kva, JARETH_VAL_DMA_MAX_SZ, /* kernel space */ NULL,
+						BUS_DMA_NOWAIT | BUS_DMA_STREAMING | BUS_DMA_WRITE)) {
+		aprint_error_dev(sc->sc_dev, "cannot load dma map\n");
+		/* bus_dmamem_unmap() takes the mapped address itself, not a pointer to the softc field. */
+		bus_dmamem_unmap(sc->sc_dmatag, sc->sc_dma_kva, JARETH_VAL_DMA_MAX_SZ);
+		goto fail_free;
+	}
+
+	aprint_normal_dev(sc->sc_dev, "DMA: SW -> kernel address is %p, dvma address is 0x%08llx, seg %llx / %ld\n", sc->sc_dma_kva, sc->sc_dmamap->dm_segs[0].ds_addr, sc->sc_segs.ds_addr, sc->sc_segs.ds_len);
+	return 1;
+fail_free:
+	bus_dmamem_free(sc->sc_dmatag, &sc->sc_segs, 1);
+fail_destroy:
+	bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap);
+	return 0;
+}
+
+/*
+ * mmap entry point: returns the physical address behind the 4096-byte slice of
+ * the DMA area that belongs to session `unit`. Only offset 0 of an active session
+ * in the range 1..MAX_ACTIVE_SESSION-1 can be mapped, and never with PROT_EXEC.
+ * Every rejected request (including a failed device lookup) returns -1.
+ */
+paddr_t jareth_mmap(dev_t dev, off_t offset, int prot) {
+	int unit = minor(dev) & (MAX_SESSION - 1);
+	int driver = unit & ~(MAX_SESSION - 1);	/* NOTE(review): always 0, unit is already masked */
+	struct jareth_softc *sc = device_lookup_private(&jareth_cd, driver);
+	paddr_t addr = -1;
+	vaddr_t page;
+
+	/* The lookup can fail; bail out before sc is dereferenced by the trace below. */
+	if (sc == NULL)
+		return -1;
+	device_printf(sc->sc_dev, "%s:%d: %lld %d for %d / %d\n", __PRETTY_FUNCTION__, __LINE__, offset, prot, driver, unit);
+
+	if (offset != 0 || (prot & PROT_EXEC) != 0)
+		return -1;
+	/* Check the range first so the shift below is always well defined. */
+	if (unit <= 0 || unit >= MAX_ACTIVE_SESSION)
+		return -1;
+	if ((sc->active_sessions & (1U << unit)) == 0)
+		return -1;
+
+	page = ((vaddr_t)sc->sc_dma_kva) + (unit * 4096);
+	if (!pmap_extract(pmap_kernel(), page, &addr))
+		return -1;
+	device_printf(sc->sc_dev, "mapped page %d to 0x%08lx [0x%08lx], kernel is %p\n", unit, addr, atop(addr), (void*)page);
+	((uint32_t*)page)[0] = 0xDEADBEEF;	/* marker word at the start of the slice */
+	sc->mapped_sessions |= (1U << unit);
+	return addr;
+}
+
+/*
+ * Self-test: runs program `pidx` on a canned job, then prints all 32 registers and
+ * 16 words of the DMA buffer. Returns 0, or the error of the first step that failed.
+ */
+static int do_test(struct jareth_softc *sc, uint32_t pidx) {
+	struct jareth_testjob job;
+	const uint32_t dvma = (uint32_t)((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr);
+	uint32_t *mem = sc->sc_dma_kva;
+	int err = 0, i, j, window = 0;
+	char buf[512];
+
+	/* Rows 4..31 are only filled by read_outputs(); zero the job so a failed run never prints stack garbage. */
+	memset(&job, 0, sizeof(job));
+	power_on(sc);
+
+	for (i = 0 ; i < 8 ; i++)
+		job.data[3][i] = 0x04030201 + 0x04040404 * i;
+	job.data[0][0] = dvma + 3;
+	job.data[0][1] = dvma + 5 + 2048;
+	job.data[0][2] = dvma + 5 + 2048;
+	job.data[1][0] = dvma + 5 + 2048;
+	job.data[1][1] = dvma + 3;
+	job.data[1][2] = dvma + 5 + 2048;
+	job.data[2][0] = 16;
+
+	/* Known patterns in words 0..15 and in words 512..527 (byte offset 2048) of the DMA buffer. */
+	for (i = 0 ; i < 16 ; i++) {
+		mem[i] = 0xDEADBEEF;
+		mem[i+512] = 0x11111111;
+	}
+
+	jareth_mpstart_write(sc, program_offset[pidx]);
+	jareth_mplen_write(sc, program_len[pidx]);
+
+	err = write_inputs(sc, &job, window);
+	if (!err)
+		err = start_job(sc);
+	delay(1);
+	if (!err)
+		err = wait_job(sc, 1);
+	if (!err)
+		err = read_outputs(sc, &job, window);
+
+	/* The dump is printed even after a failure; each register is shown from word 7 down to word 0. */
+	for (j = 0 ; j < 32; j++) {
+		snprintf(buf, sizeof(buf), "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", job.data[j][7], job.data[j][6], job.data[j][5], job.data[j][4], job.data[j][3], job.data[j][2], job.data[j][1], job.data[j][0]);
+		aprint_normal("reg%d : %s\n", j, buf);
+	}
+	snprintf(buf, sizeof(buf), "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x",
+			 mem[512], mem[513], mem[514], mem[515], mem[516], mem[517], mem[518], mem[519]);
+	aprint_normal("mem0_7 : %s\n", buf);
+	snprintf(buf, sizeof(buf), "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x",
+			 mem[520], mem[521], mem[522], mem[523], mem[524], mem[525], mem[526], mem[527]);
+	aprint_normal("mem8_15 : %s\n", buf);
+
+	power_off(sc);
+	return err;
+}
diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h b/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h
new file mode 100644
index 0000000..f07c6e7
--- /dev/null
+++ b/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h
@@ -0,0 +1,62 @@
+/*	$NetBSD$ */
+
+/*-
+ * Copyright (c) 2020 Romain Dolbeau <romain@dolbeau.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _JARETH_H_
+#define _JARETH_H_
+
+#define MAX_SESSION 32 // HW limit; jareth.c masks minors with (MAX_SESSION - 1), so this must stay a power of two
+#define MAX_ACTIVE_SESSION 8 // SW-imposed limit
+// Single 4KiB page of DMA memory per active session
+#define JARETH_VAL_DMA_MAX_SZ (MAX_ACTIVE_SESSION*4*1024)
+
+struct jareth_softc {
+	device_t sc_dev;		/* us as a device */
+	u_int	sc_rev;			/* revision (not assigned in jareth.c) */
+	int	sc_node;		/* PROM node ID */
+	int	sc_burst;		/* DVMA burst size in effect */
+	bus_space_tag_t	sc_bustag;	/* bus tag */
+	bus_space_handle_t sc_bhregs_jareth;	/* bus handle: register entry 0 (CSRs) */
+	bus_space_handle_t sc_bhregs_microcode;	/* bus handle: register entry 1 (microcode) */
+	bus_space_handle_t sc_bhregs_regfile;	/* bus handle: register entry 2 (register file) */
+	//void *	sc_buffer;		/* VA of the registers */
+	int	sc_bufsiz_jareth;		/* Size of the CSR mapping */
+	int	sc_bufsiz_microcode;		/* Size of the microcode mapping */
+	int	sc_bufsiz_regfile;		/* Size of the register-file mapping */
+	int initialized;	/* set to 1 once init_programs() has returned 0 */
+	uint32_t active_sessions;	/* one bit per session; tested by open/close/mmap */
+	uint32_t mapped_sessions;	/* one bit per session; set by a successful mmap */
+	uint32_t sessions_cookies[MAX_ACTIVE_SESSION];	/* not referenced in jareth.c */
+	/* DMA kernel structures */
+	bus_dma_tag_t		sc_dmatag;
+	bus_dmamap_t		sc_dmamap;
+	bus_dma_segment_t       sc_segs;	/* the single segment from bus_dmamem_alloc() */
+	int                     sc_rsegs;	/* segment count returned by bus_dmamem_alloc() */
+	void *              sc_dma_kva;	/* kernel address of the DMA memory */
+};
+
+#endif /* _JARETH_H_ */
diff --git a/sbus-to-ztex-gateware-migen/jareth.py b/sbus-to-ztex-gateware-migen/jareth.py
new file mode 100644
index 0000000..3105410
--- /dev/null
+++ b/sbus-to-ztex-gateware-migen/jareth.py
@@ -0,0 +1,1321 @@
+from migen import *
+from migen.genlib.cdc import MultiReg
+
+from litex.soc.interconnect.csr import *
+from litex.soc.integration.doc import AutoDoc, ModuleDoc
+from litex.soc.interconnect import wishbone
+from litex.soc.interconnect.csr_eventmanager import *
+
+prime_string = "$2^{{255}}-19$"  # 2\ :sup:`255`-19
+field_latex = r"$\mathbf{{F}}_{{{{2^{{255}}}}-19}}$"  # raw string: "\m" is not a valid escape sequence
+
+opcode_bits = 5  # number of bits used to encode the opcode field
+opcodes = {  # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shift a ; raw strings keep "\gets" free of invalid-escape warnings
+    "UDF" : [-1, "Placeholder for undefined opcodes"],
+    "PSA" : [0, r"Wd $\gets$ Ra  // pass A"],
+    "PSB" : [1, r"Wd $\gets$ Rb  // pass B"],
+    # 2 MSK
+    "XOR" : [3, r"Wd $\gets$ Ra ^ Rb  // bitwise XOR"],
+    "NOT" : [4, r"Wd $\gets$ ~Ra   // binary invert"],
+    "ADD" : [5, r"Wd $\gets$ Ra + Rb  // 256-bit binary add"],
+    "SUB" : [6, r"Wd $\gets$ Ra - Rb  // 256-bit binary subtraction"],
+    "AND" : [7, r"Wd $\gets$ Ra & Rb  // bitwise AND"], # replaces MUL
+    "BRNZ" : [8, r"If Ra != 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1  // Branch if non-zero"], # replaces TRD
+    "BRZ" : [9, r"If Ra == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1  // Branch if zero"],
+    "FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"],
+    "SHL" : [11, r"Wd $\gets$ Ra << 1  // shift Ra left by one and store in Wd"],
+    # 12 XBT
+    # for MEM, instruction bit #31 (imm[8]) indicates both lanes are needed; imm[8] == 0 is faster as the second access is not done ;
+    "GETM": [17, "GETM: getmask" ],
+    "ADR": [18, r"ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ],
+    "MEM" : [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc in imm[6], address in addr[imm[0...]]" ],
+    "SETM" : [20, r"SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ],
+    "LOADH" : [21, "LOADH: imm[7] == 0 for LOAD, address in addr[imm[0...]], high->low & load a+16 into high" ],
+    "MAX" : [22, "Maximum opcode number (for bounds checking)"],
+}
+
+num_registers = 32  # registers per window; each register field below is log2_int(32) = 5 bits wide
+instruction_layout = [  # (name, width, description); widths total 5+1+5+1+5+1+5+9 = 32 bits
+    ("opcode", opcode_bits, "opcode to be executed"),
+    ("shift", 1, "should A & Q be shifted"),
+    ("ra", log2_int(num_registers), "operand A read register"),
+    ("ca", 1, "set to substitute constant table value for A"),
+    ("rb", log2_int(num_registers), "operand B read register"),
+    ("cb", 1, "set to substitute constant table value for B"),
+    ("wd", log2_int(num_registers), "write register"),
+    ("immediate", 9, "Used by jumps to load the next PC value")  # also carries the flag/index bits of MEM, SETM, ADR and LOADH
+]
+
+class RegisterFile(Module, AutoDoc):
+    def __init__(self, depth=512, width=256, bypass=False):  # depth: number of entries; width: bits per entry; bypass: forward a pending write to the read ports
+        reset_cycles = 4
+        self.intro = ModuleDoc(title="Register File", body="""
+This implements the register file for the Jareth engine. It's implemented using
+7-series specific block RAMs in order to take advantage of architecture-specific features
+to ensure a compact and performant implementation.
+
+The core primitive is the RAMB36E1. This can be configured as a 64/72-bit wide memory
+but only if used in "SDP" (simple dual port) mode. In SDP, you have one read, one write port.
+However, the register file needs to produce two operands per cycle, while accepting up to
+one operand per cycle.
+
+In order to do this, we stipulate that the RF runs at `rf_clk` (200MHz), but uses four phases
+to produce/consume data. "Engine clock" `eng_clk` (50MHz) runs at a lower rate to accommodate
+large-width arithmetic in a single cycle.
+
+The phasing is defined as follows:
+
+Phase 0:
+  - read from port A
+Phase 1:
+  - read from port B
+Phase 2:
+  - write data
+Phase 3:
+  - quite cycle, used to create extra setup time for next stage (requires multicycle-path constraints)
+
+The writing of data is done in the second phase means that write happen to the same address
+as being read, you get the old value. For pipelined operation, it could be desirable to shift
+the write to happen before the reads, but as of now the implementation is not pipelined.
+
+The register file is unavailable for {} `eng_clk` cycles after reset.
+
+When configured as a 64 bit memory, the depth of the block is 512 bits, corresponding to
+an address width of 9 bits.
+
+        """.format(reset_cycles))
+
+        instruction = Record(instruction_layout)  # only used for its bit length when sizing the pipe signals below
+        phase = Signal(2)  # internal phase
+        self.phase = Signal()  # external phase
+        self.comb += self.phase.eq(phase[1]) # divide down internal phase so slower modules can capture it
+
+        # these are the signals in and out of the register file
+        self.ra_dat = Signal(width) # this is passed in from outside the module because we want to mux with e.g. memory bus
+        self.ra_adr = Signal(log2_int(depth))
+        self.rb_dat = Signal(width)
+        self.rb_adr = Signal(log2_int(depth))
+
+        # register file pipelines the write target address, going to the exec units; also needs the window to be complete
+        # window is assumed to be static and does not change throughout a given program run, so it's not pipelined
+        self.instruction_pipe_in = Signal(len(instruction))
+        self.instruction_pipe_out = Signal(len(instruction))
+        self.window = Signal(max(1, log2_int(depth) - log2_int(num_registers)))
+
+        # this is the immediate data to write in, coming from the exec units
+        self.wd_dat = Signal(width)
+        self.wd_adr = Signal(log2_int(depth))
+        self.wd_bwe = Signal(width//8)  # byte masks for writing
+        self.we = Signal()
+        self.clear = Signal()
+
+        self.running = Signal() # used for activity gating to RAM
+
+        eng_sync = Signal(reset=1)  # while high, the rf_clk phase counter is held at 0 (see the reset counter at the end)
+
+        rf_adr = Signal(log2_int(depth))
+        self.comb += [
+            If(phase == 0,
+                rf_adr.eq(self.ra_adr),
+            ).Elif(phase == 1,
+                rf_adr.eq(self.rb_adr),
+            )
+        ]
+        rf_dat = Signal(width)
+        self.sync.eng_clk += [
+            # TODO: check that this is in sync with expected values
+            self.instruction_pipe_out.eq(self.instruction_pipe_in),
+        ]
+        # unfortunately, -1L speed grade is too slow to support pipeline bypassing of the register file:
+        # bypass path closes at about 5.4ns, which fails to meet the 5ns cycle time target for the four-phase RF
+        if bypass:
+            self.sync.rf_clk += [
+                If(phase == 1,
+                    If((self.wd_adr != self.ra_adr) | ~self.we,
+                        self.ra_dat.eq(rf_dat),
+                       ).Else(
+                        self.ra_dat.eq(self.wd_dat),
+                    ),
+                    self.rb_dat.eq(self.rb_dat),
+                   ).Elif(phase == 2,
+                    self.ra_dat.eq(self.ra_dat),
+                    If((self.wd_adr != self.rb_adr) | ~self.we,
+                        self.rb_dat.eq(rf_dat),
+                       ).Else(
+                        self.rb_dat.eq(self.wd_dat),
+                    )
+                          ).Else(
+                    self.ra_dat.eq(self.ra_dat),
+                    self.rb_dat.eq(self.rb_dat),
+                ),
+            ]
+        else:
+            self.sync.rf_clk += [
+                If(phase == 1,
+                    self.ra_dat.eq(rf_dat),
+                    self.rb_dat.eq(self.rb_dat),
+                ).Elif(phase == 2,
+                    self.ra_dat.eq(self.ra_dat),
+                    self.rb_dat.eq(rf_dat),
+                ).Else(
+                    self.ra_dat.eq(self.ra_dat),
+                    self.rb_dat.eq(self.rb_dat),
+                ),
+            ]
+        wren_pipe = Signal() # do not change this variable name, it is constrained in the XDC
+        self.sync.rf_clk += [
+            If(eng_sync,
+                phase.eq(0),
+            ).Else(
+                phase.eq(phase + 1),
+            ),
+            wren_pipe.eq((phase == 1) & self.we),  # we want wren to hit on phase==2, but we pipeline it to relax timing. so capture the input to the pipe on phase == 1
+        ]
+        wd_bwe_pipe = Signal(width//8)
+        self.sync.rf_clk += [
+            # add a register to relax timing on wd_bwe. This offsets the signal by one rf_clk (clk200) period,
+            # but because write happens on phase 2 and the signal is valid on eng_clk (clk50) edges, this will
+            # not affect the functionality
+            wd_bwe_pipe.eq(self.wd_bwe)
+        ]
+
+        for word in range(int(256/64)):  # NOTE(review): hard-codes 256 rather than `width`; one 64-bit BRAM slice per iteration
+            self.specials += Instance("BRAM_SDP_MACRO", name="RF_RAMB" + str(word),
+                p_BRAM_SIZE = "36Kb",
+                p_DEVICE = "7SERIES",
+                p_WRITE_WIDTH = 64,
+                p_READ_WIDTH = 64,
+                p_DO_REG = 0,
+                p_INIT_FILE = "NONE",
+                p_SIM_COLLISION_CHECK = "ALL", # "WARNING_ONLY", "GENERATE_X_ONLY", "NONE"
+                p_SRVAL = 0,
+                p_WRITE_MODE = "READ_FIRST",
+                i_RDCLK = ClockSignal("rf_clk"),
+                i_WRCLK = ClockSignal("rf_clk"),
+                i_RDADDR = rf_adr,
+                i_WRADDR = self.wd_adr,
+                i_DI = self.wd_dat[word*64 : word*64 + 64],
+                o_DO = rf_dat[word*64 : word*64 + 64],
+                i_RDEN = self.running, # reduce power when not running
+                i_WREN = wren_pipe, # (phase == 2) & self.we, but pipelined one stage
+                i_RST = ResetSignal("rf_clk"),
+                i_WE = wd_bwe_pipe[word*8 : word*8 + 8],
+
+                i_REGCE = 1, # should be ignored, but added to quiet down simulation warnings
+            )
+
+        # create an internal reset signal that synchronizes the "eng" to the "rf" domains
+        # it will also reset the register file on demand
+        reset_counter = Signal(log2_int(reset_cycles), reset=reset_cycles - 1)
+        self.sync.eng_clk += [
+            If(self.clear,
+                reset_counter.eq(reset_cycles - 1),
+                eng_sync.eq(1),
+            ).Else(
+                If(reset_counter != 0,
+                   reset_counter.eq(reset_counter - 1),
+                    eng_sync.eq(1),
+                ).Else(
+                   eng_sync.eq(0)
+                ),
+            )
+        ]
+
+class JarethConst(Module, AutoDoc):
+    def __init__(self, insert_docs=False):
+        # Constant table, code -> [value, short name, description]; insert_docs attaches the generated ModuleDoc
+        constant_defs = {
+            0: [0, "zero", "The number zero"],
+            1: [1, "one", "The number one"],
+            2: [2, "two", "The number two"],
+            #3: [3, "three", "The number three"],
+            #4: [4, "four", "The number four"],
+            #5: [5, "five", "The number five"],
+            #6: [6, "six", "The number six"],
+            #7: [7, "seven", "The number seven"],
+            #8: [8, "eight", "The number eight"],
+            15: [15, "fifteen", "The number fifteen"],
+            16: [16, "sixteen", "The number sixteen"],
+        }
+        self.adr = Signal(5)  # code of the constant to select (compared against the table keys)
+        self.const = Signal(256)  # value of the selected constant; driven only for codes present in the table
+        constant_str = "This module encodes the constants that can be substituted for any register value. Therefore, up to 32 constants can be encoded.\n\n"
+        for code, const in constant_defs.items():
+            self.comb += [
+                If(self.adr == code,
+                    self.const.eq(const[0]),
+                )
+            ]
+            constant_str += """
+**{}**
+
+  Substitute register {} with {}: {}\n""".format(const[1], code, const[2], const[0])
+        if insert_docs:
+            self.constants = ModuleDoc(title="Jareth Constants", body=constant_str)
+
+# ------------------------------------------------------------------------ EXECUTION UNITS
+class ExecUnit(Module, AutoDoc):
+    def __init__(self, width=256, opcode_list=["UDF"], insert_docs=False):  # the default list is shared between calls and is never mutated here
+        if insert_docs:
+            self.intro = ModuleDoc(title="ExecUnit class", body="""
+    ExecUnit is the superclass template for execution units.
+
+    Configuration Arguments:
+      - `opcode_list` is the list of opcodes that an ExecUnit can process
+      - `width` is the bit-width of the execution pathway
+
+    Signal API for an exec unit:
+      - `a` and `b` are the inputs.
+      - `instruction_in` is the instruction corresponding to the currently present `a` and `b` inputs
+      - `start` is a single-clock signal which indicates processing should start
+      - `q` is the output
+      - `instruction_out` is the instruction for the result present at the `q` output
+      - `q_valid` is a single cycle pulse that indicates that the `q` result and `wa_out` value is valid
+
+
+            """)
+        self.instruction = Record(instruction_layout)  # field view of instruction_in, wired up by the comb assignment below
+
+        self.a = Signal(width) # raw or shifted
+        self.b = Signal(width) # shifted
+        self.q = Signal(width) # shifted
+        self.start = Signal()
+        self.q_valid = Signal()
+        # pipeline the instruction
+        self.instruction_in = Signal(len(self.instruction))
+        self.instruction_out = Signal(len(self.instruction))  # not driven here; each subclass registers instruction_in into it
+
+        self.opcode_list = opcode_list  # mnemonics (keys of `opcodes`) handled by this unit
+        self.comb += [
+            self.instruction.raw_bits().eq(self.instruction_in)
+        ]
+
+class ExecLogic(ExecUnit):
+    def __init__(self, width=256):  # combinational bit-wise unit; q_valid follows start by one eng_clk cycle
+        ExecUnit.__init__(self, width, ["XOR", "NOT", "PSA", "PSB", "SHL", "AND"])  # keep in sync with the decode chain below (PSB was missing)
+        self.intro = ModuleDoc(title="Logic ExecUnit Subclass", body=f"""
+This execution unit implements bit-wise logic operations: XOR, NOT, and
+passthrough.
+
+* XOR returns the result of A^sB
+* NOT returns the result of !A
+* PSA returns the value of A
+* SHL returns A << 1
+* AND returns the result of A&sB
+
+""")
+
+        zeros = Signal(255, reset=0)  # NOTE(review): not referenced anywhere in this class
+        self.sync.eng_clk += [
+            self.q_valid.eq(self.start),
+            self.instruction_out.eq(self.instruction_in),
+        ]
+        self.comb += [
+            If(self.instruction.opcode == opcodes["XOR"][0],
+               self.q.eq(self.a ^ self.b)
+            ).Elif(self.instruction.opcode == opcodes["NOT"][0],
+               self.q.eq(~self.a)
+            ).Elif(self.instruction.opcode == opcodes["PSA"][0],
+                self.q.eq(self.a),
+            ).Elif(self.instruction.opcode == opcodes["PSB"][0],
+                self.q.eq(self.b),
+            ).Elif(self.instruction.opcode == opcodes["SHL"][0],
+                self.q.eq(Cat(0, self.a[:255])),  # shift left by one: bit 0 becomes 0, bit 255 of a is dropped
+            ).Elif(self.instruction.opcode == opcodes["AND"][0],
+                self.q.eq(self.a & self.b),
+            ),
+        ]
+
+class ExecAddSub(ExecUnit, AutoDoc):
+    def __init__(self, width=256):  # combinational add/subtract unit; q_valid follows start by one eng_clk cycle
+        ExecUnit.__init__(self, width, ["ADD", "SUB"])
+        self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=rf"""
+This execution module implements 256-bit binary addition and subtraction.
+
+Note that to implement operations in $\mathbf{{F}}_p$, where *p* is $2^{{255}}-19$, this must be compounded
+with other operators as follows:
+
+Addition of Ra + Rb into Rc in {field_latex}:
+
+.. code-block:: c
+
+  ADD Rc, Ra, Rb    // Rc <- Ra + Rb
+  TRD Rd, Rc        // Rd <- ReductionValue(Rc)
+  SUB Rc, Rc, Rd    // Rc <- Rc - Rd
+
+Negation of Ra into Rc in {field_latex}:
+
+.. code-block:: c
+
+  SUB Rc, #FIELDPRIME, Ra   //  Rc <- 2^255-19 - Ra
+
+Note that **#FIELDPRIME** is one of the 32 available hard-coded constants
+that can be substituted for any register in any arithmetic operation, please
+see the section on "Constants" for more details.
+
+Subtraction of Ra - Rb into Rc in {field_latex}:
+
+.. code-block:: c
+
+  SUB Rb, #FIELDPRIME, Rb   //  Rb <- 2^255-19 - Rb
+  ADD Rc, Ra, Rb    // Rc <- Ra + Rb
+  TRD Rd, Rc        // Rd <- ReductionValue(Rc)
+  SUB Rc, Rc, Rd    // Rc <- Rc - Rd
+
+In all the examples above, Ra and Rb must be members of {field_latex}.
+        """)  # rf-string: "\m" is not a valid escape. NOTE(review): the text cites TRD and #FIELDPRIME, neither present in this file's opcode/constant tables
+
+        self.sync.eng_clk += [
+            self.q_valid.eq(self.start),
+            self.instruction_out.eq(self.instruction_in),
+        ]
+        self.comb += [
+            If(self.instruction.opcode == opcodes["ADD"][0],
+               self.q.eq(self.a + self.b),
+            ).Elif(self.instruction.opcode == opcodes["SUB"][0],
+               self.q.eq(self.a - self.b),
+            ),
+        ]
+
+class ExecLS(ExecUnit, AutoDoc):
+    def __init__(self, width=256, interface=None, r_dat_f=None, r_dat_m=None, granule=0):  # interface: 128-bit Wishbone master (asserted below); granule must be a power of two (log2_int)
+        ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "GETM"])
+        
+        self.notes = ModuleDoc(title=f"Load/Store ExecUnit Subclass", body=f"""
+        """)
+
+        self.sync.eng_clk += [ # pipeline the instruction
+            self.instruction_out.eq(self.instruction_in),
+        ]
+
+        assert(width == 256) # fixme
+        assert(len(interface.sel) == 16) # 128 bits Wishbone
+
+        start_pipe = Signal()
+        self.sync.mul_clk += start_pipe.eq(self.start) # break critical path of instruction decode -> SETUP_A state muxes
+        self.submodules.lsseq = lsseq = ClockDomainsRenamer("mul_clk")(FSM(reset_state="IDLE"))
+        cpar = Signal() # to keep track of the odd-ness of our cycle, so we can align 2 mul_clk cycles of output on 1 eng_clk cycle
+        lbuf = Signal(width)
+        timeout = Signal(11)  # reloaded with 2047 for each bus access and decremented every mul_clk cycle
+        #tries = Signal()
+        self.has_failure = Signal(2)  # bit 0: bus error on the low-half access (MEMl), bit 1: on the high-half access (MEMh)
+        self.has_timeout = Signal(2)  # bit 0: timeout on the low-half access (MEMl), bit 1: on the high-half access (MEMh)
+
+        self.sync.mul_clk += If(timeout > 0, timeout.eq(timeout - 1))
+
+        granule_bits = log2_int(granule)
+        granule_num = width//granule
+        granule_num_bits = log2_int(granule_num)
+        
+        offset = Signal(granule_num_bits-1, reset = 0)
+        max_size_bits=28 # 256 MiB
+        offsetpsize = Signal(max_size_bits+1, reset = 0)
+
+        addresses = Array(Signal(28) for x in range(width//32)) # 128-bits chunk, so 16-bytes chunk, so low 4 bits are ignored
+
+        lsseq.act("IDLE",
+                  If(start_pipe,
+                     If(self.instruction.opcode == opcodes["MEM"][0],
+                        NextValue(cpar, 0),
+                        NextValue(self.has_timeout, 0),
+                        NextValue(self.has_failure, 0),
+                        NextValue(interface.cyc, 1),
+                        NextValue(interface.stb, 1),
+                        NextValue(interface.sel, 2**len(interface.sel)-1),
+                        NextValue(interface.adr, addresses[self.instruction.immediate[0:log2_int(width//32)]]),
+                        NextValue(interface.we, self.instruction.immediate[7]),
+                        NextValue(timeout, 2047),
+                        If(self.instruction.immediate[7], # do we need those tests or could we always update dat_w/dat_r ?
+                           NextValue(interface.dat_w, self.b[0:128])),
+                        NextState("MEMl") # MEMl
+                     ).Elif(self.instruction.opcode == opcodes["LOADH"][0],
+                            NextValue(cpar, 0),
+                            NextValue(self.has_timeout, 0),
+                            NextValue(self.has_failure, 0),
+                            NextValue(interface.cyc, 1),
+                            NextValue(interface.stb, 1),
+                            NextValue(interface.sel, 2**len(interface.sel)-1),
+                            NextValue(interface.adr, addresses[self.instruction.immediate[0:log2_int(width//32)]]),
+                            NextValue(interface.we, self.instruction.immediate[7]),
+                            NextValue(timeout, 2047),
+                            NextValue(lbuf[0:128], self.b[128:256]),
+                            NextState("MEMh") # MEMh: old high half of b moved to the low half, only the high half is fetched
+                     ).Elif(self.instruction.opcode == opcodes["SETM"][0],
+                            Case(self.instruction.immediate[0:2],
+                                 { 0x3 : [ NextValue(r_dat_f[0], 0),
+                                           NextValue(r_dat_f[1], 0),
+                                           NextValue(r_dat_f[2], 0),
+                                           NextValue(r_dat_m[0], (1<<len(r_dat_m[0]))-1),
+                                           NextValue(r_dat_m[1], (1<<len(r_dat_m[1]))-1),
+                                           NextValue(r_dat_m[2], (1<<len(r_dat_m[2]))-1),
+                                           NextState("MEM_ODD") ],
+                                   0x2 : [ NextValue(r_dat_f[2],  self.a[(granule_bits-3):len(r_dat_f[2])]),
+                                           NextValue(offset,      self.a[(granule_bits-3):len(r_dat_f[2])]),
+                                           NextValue(offsetpsize, self.b[0:max_size_bits] + ((self.a[(granule_bits-3):len(r_dat_f[2])]) << (granule_bits-3)) ),
+                                           NextState("GENMASK_R0"),
+                                   ],
+                                   0x1 : [ NextValue(r_dat_f[1],        self.a[(granule_bits-3):len(r_dat_f[1])]),
+                                                 NextValue(offset, 0),
+                                                 NextValue(offsetpsize, self.b[0:max_size_bits]),
+                                                 NextState("GENMASK_R0"),
+                                   ],
+                                   0x0 : [ NextValue(r_dat_f[0],        self.a[(granule_bits-3):len(r_dat_f[0])]),
+                                                 NextValue(offset, 0),
+                                                 NextValue(offsetpsize, self.b[0:max_size_bits]),
+                                                 NextState("GENMASK_R0"),
+                                   ],
+                                 }),
+                     ).Elif(self.instruction.opcode == opcodes["ADR"][0],
+                            If(self.instruction.immediate[7],
+                               [ NextValue(addresses[x], self.a[x*32+4:(x+1)*32]) for x in range(width//32) ],
+                            ),
+                            NextState("MEM_ODD")
+                     ).Elif(self.instruction.opcode == opcodes["GETM"][0],
+                            NextState("MEM_ODD")
+                     )
+                  )
+        )
+        for X in range(0, granule_num):
+            lsseq.act("GENMASK_R" + str(X),
+                      NextValue(cpar, cpar ^ 1),
+                      If((offsetpsize > X) & (X >= offset),
+                         NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 1),
+                      ).Else(
+                         NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 0),
+                      ),
+                      If(X == (granule_num-1),
+                         If(cpar, ## checkme
+                            NextState("MEM_ODD")
+                         ).Else(
+                             NextState("MEM_EVEN1")
+                         )
+                      ).Else(
+                          NextState("GENMASK_R" + str(X+1)),
+                      ),
+            )
+        lsseq.act("GENMASK_R"+str(granule_num), # avoids MiGen complaining, unreachable
+                  NextValue(cpar, cpar ^ 1),
+                  If(cpar, ## checkme
+                     NextState("MEM_ODD")
+                  ).Else(
+                      NextState("MEM_EVEN1")
+                  )
+        )
+            
+        lsseq.act("MEMl",
+                  NextValue(cpar, cpar ^ 1),
+                  If(interface.ack,
+                     If(~self.instruction.immediate[7],
+                        NextValue(lbuf[0:128], interface.dat_r)),
+                     NextValue(interface.cyc, 0),
+                     NextValue(interface.stb, 0),
+                     NextState("MEMl2")
+                  ).Elif(interface.err,
+                         NextValue(self.has_failure[0], 1),
+                         NextValue(interface.cyc, 0),
+                         NextValue(interface.stb, 0),
+                         NextState("ERR"),
+                  ).Elif(timeout == 0,
+                         NextValue(self.has_timeout[0], 1),
+                         NextValue(interface.cyc, 0),
+                         NextValue(interface.stb, 0),
+                         NextState("ERR"),
+                  ))
+        lsseq.act("MEMl2",
+                  NextValue(cpar, cpar ^ 1),
+                  If(~interface.ack,
+                     If(self.instruction.immediate[6], # post-inc
+                        NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
+                     ),
+                     If(self.instruction.immediate[8],
+                        NextValue(interface.cyc, 1),
+                        NextValue(interface.stb, 1),
+                        NextValue(interface.sel, 2**len(interface.sel)-1),
+                        NextValue(interface.adr, (addresses[self.instruction.immediate[0:log2_int(width//32)]]) + 1),
+                        NextValue(interface.we, self.instruction.immediate[7]),
+                        NextValue(timeout, 2047),
+                        If(self.instruction.immediate[7],
+                           NextValue(interface.dat_w, self.b[128:256])),
+                        NextState("MEMh")
+                     ).Else(
+                         NextValue(lbuf[128:256], 0),
+                         If(cpar, ## checkme
+                            NextState("MEM_ODD")
+                         ).Else(
+                             NextState("MEM_EVEN1")
+                         )
+                     )
+                  ))
+        lsseq.act("MEMh",
+                  NextValue(cpar, cpar ^ 1),
+                  If(interface.ack,
+                     If(~self.instruction.immediate[7],
+                        NextValue(lbuf[128:256], interface.dat_r)),
+                     NextValue(interface.cyc, 0),
+                     NextValue(interface.stb, 0),
+                     NextState("MEMh2")
+                  ).Elif(interface.err,
+                         NextValue(self.has_failure[1], 1),
+                         NextValue(interface.cyc, 0),
+                         NextValue(interface.stb, 0),
+                         NextState("ERR"),
+                  ).Elif(timeout == 0,
+                         NextValue(self.has_timeout[1], 1),
+                         NextValue(interface.cyc, 0),
+                         NextValue(interface.stb, 0),
+                         NextState("ERR"),
+                  ))
+        lsseq.act("MEMh2",
+                  NextValue(cpar, cpar ^ 1),
+                  If(~interface.ack,
+                     If(self.instruction.immediate[6], # post-inc
+                        NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1),
+                     ),
+                     #NextValue(tries, 0),
+                     If(cpar, ## checkme
+                        NextState("MEM_ODD")
+                     ).Else(
+                        NextState("MEM_EVEN1")
+                     )
+                  ))
+        lsseq.act("MEM_ODD", # clock alignment cycle
+                  NextState("MEM_EVEN1"))
+        lsseq.act("MEM_EVEN1",
+                  NextState("MEM_EVEN2"))
+        lsseq.act("MEM_EVEN2",
+                  NextValue(cpar, 0),
+                  NextValue(self.has_failure, 0),
+                  NextValue(self.has_timeout, 0),
+                  NextState("IDLE"))
+        lsseq.act("ERR",
+                  #If(~tries, # second attempt
+                  #   NextValue(cpar, 0),
+                  #   NextValue(tries, 1),
+                  #   NextState("IDLE")
+                  #).Else(NextValue(tries, 0), # no third attempt, give up
+                         If(cpar, ## checkme
+                            NextState("MEM_ODD")
+                         ).Else(
+                             NextState("MEM_EVEN1")
+                         )
+                  #)
+        )
+        self.sync.mul_clk += [
+            If(lsseq.ongoing("MEM_EVEN1") | lsseq.ongoing("MEM_EVEN2"),
+               self.q_valid.eq(1),
+               If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]),
+                  If(~self.instruction.immediate[7],
+                     self.q.eq(lbuf),
+                  ).Else(
+                      self.q.eq(0), #self.a
+                  )
+               ).Elif(self.instruction.opcode == opcodes["SETM"][0],
+                   self.q.eq(0), #self.a
+               ).Elif(self.instruction.opcode == opcodes["ADR"][0],
+                      If(~self.instruction.immediate[7],
+                         [ self.q[x*32:(x+1)*32].eq(Cat(Signal(4, reset = 0), addresses[x])) for x in range(width//32) ],
+                      ).Else(
+                          self.q.eq(0),
+                      )
+               ).Elif(self.instruction.opcode == opcodes["GETM"][0],
+                      self.q.eq(Cat(Cat(r_dat_f[0], Signal(28, reset = 0)),
+                                    r_dat_m[0],
+                                    Cat(r_dat_f[1], Signal(28, reset = 0)),
+                                    r_dat_m[1],
+                                    Cat(r_dat_f[2], Signal(28, reset = 0)),
+                                    r_dat_m[2],
+                                    Cat(r_dat_f[3], Signal(28, reset = 0)), # NOTE(review): SETM in this class only writes indices 0..2 -- confirm r_dat_f/r_dat_m have a 4th entry
+                                    r_dat_m[3])),
+               ).Else(
+                   self.q.eq(0xBADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000),
+               ),
+            ).Else(
+                self.q_valid.eq(0),
+            )
+        ]
+
+        self.state = Signal(32)  # debug/status word: bits 0..8 mirror FSM states, bits 28..31 are sticky timeout/failure flags
+        self.sync.mul_clk += self.state[0].eq(lsseq.ongoing("IDLE"))
+        self.sync.mul_clk += self.state[1].eq(lsseq.ongoing("MEMl"))
+        self.sync.mul_clk += self.state[2].eq(lsseq.ongoing("MEMl2"))
+        self.sync.mul_clk += self.state[3].eq(lsseq.ongoing("MEMh"))
+        self.sync.mul_clk += self.state[4].eq(lsseq.ongoing("MEMh2"))
+        self.sync.mul_clk += self.state[5].eq(lsseq.ongoing("MEM_ODD"))
+        self.sync.mul_clk += self.state[6].eq(lsseq.ongoing("MEM_EVEN1"))
+        self.sync.mul_clk += self.state[7].eq(lsseq.ongoing("MEM_EVEN2"))
+        self.sync.mul_clk += self.state[8].eq(lsseq.ongoing("ERR"))  # the error state is named "ERR"; "MEM_ERR" was never a state of this FSM
+        self.sync.mul_clk += self.state[28:30].eq((self.state[28:30] & Replicate(~start_pipe, 2)) | self.has_timeout)
+        self.sync.mul_clk += self.state[30:32].eq((self.state[30:32] & Replicate(~start_pipe, 2)) | self.has_failure)
+
+        
+class Jareth(Module, AutoCSR, AutoDoc):
+    def __init__(self, platform, prefix, sim=False, build_prefix=""):
+        opdoc = "\n"
+        for mnemonic, description in opcodes.items():
+            opdoc += f" * **{mnemonic}** ({str(description[0])}) -- {description[1]} \n"
+
+        self.intro = ModuleDoc(title="Jareth", body="""
+Jareth is a vector computational engine based on the Curve25519 Engine.
+
+The Engine loosely resembles a Harvard architecture microcoded CPU, with a single
+512-entry, 256-bit wide 2R1W windowed-register file, a handful of execution units, and a "mailbox"
+unit (like a load/store, but transactional to wishbone). The Engine's microcode is
+contained in a 1k-entry, 32-bit wide microcode block. Microcode procedures are written to
+the block, and execution will start from the `mpstart` offset when the `go` bit is set.
+Execution will stop after either one of two conditions are met: either a `FIN` instruction
+is executed, or the microcode program counter (mpc) goes past the stop threshold, computed
+as `mpstart` + `mplen`.
+
+The register file is "windowed". A single window consists of 32x256-bit wide registers,
+and there are up to 16 windows. The concept behind windows is that core routines, such
+as point doubling and point addition, are codable using no more than 32 intermediate
+registers. The same microcode can be used, then, to serve point operations to up to
+16 different clients, selectable by setting the appropriate window. Note that the register
+file will stripe across four 4kiB pages, which means that memory protection can be
+enforced at page-level boundaries by hardware (with the help of the OS) for up to four
+separate clients, each getting four register windows.
+
+Every register read can be overridden from a constant ROM, by asserting `ca` or `cb` for
+registers a and b respectively. When either of these bits is asserted, the respective
+register address is fed into a "constants" lookup table, and the result of that table lookup is
+substituted for the register value. This means up to 32 commonly used constants may be stored
+in the hardware for quick retrieval.
+
+.. image:: https://raw.githubusercontent.com/betrusted-io/gateware/master/gateware/curve25519/block_diagram.png
+   :alt: High-level block diagram of the Curve25519 engine
+
+Above is a high-level block diagram of the Curve25519 engine. Four clocks are present
+in this microarchitecture, and they are phase-aligned thanks to the 7-Series MMCM
+and low-skew global clock network. `eng_clk` is 50MHz, `mul_clk` is 100MHz, and
+`rf_clk` is 200MHz. The slowest 50MHz `eng_clk` clock controls the `seq` state machine, whose
+state names are listed on the left. A 50MHz base clock is chosen because this allows a
+single-cycle 256-bit add/sub using hardware carry chains in the Spartan7 -1L speed grade,
+greatly simplifying most of the arithmetic blocks. Faster clocks are used to pump the microcode
+RAM (100MHz) and register file (200MHz), so that we are wasting less time fetching instructions
+and operands. In particular, the register file uses four phases because we are emulating
+a three-port register file (2R1W) using a single-port memory primitive, and the microcode RAM
+runs at 100MHz (sysclk) for convenience of reading/writing instructions from the Wishbone bus.
+Not shown in the diagram are the global "window" register bits, or the multiplexers that
+switch off the datapaths when the system is not running allowing Wishbone full access to
+the machine state.
+
+Execution units are subclasses of "ExecUnit", and their instantiation is controlled by
+inclusion in the `exec_units` dictionary. Likewise, opcodes are defined in the `opcodes`
+dictionary, and opcodes are bound to ExecUnits by passing them as the `opcode_list` argument
+to the execution units.
+
+Note that execution units can take an arbitrary amount of time to complete. Most will complete
+in one cycle, but for example, the multiplier takes 52 cycles @ 100MHz, or 26 `eng_clk` cycles.
+The current implementation does not allow pipelined operation; registered stages are provided
+to break combinational paths and bring up the base clock rate, but every instruction must go through
+the entire FETCH-EXEC-WAIT_DONE cycle before the next one can issue.
+
+The design is partially outfitted with registers to facilitate pipelining in the future, but
+the current simplified implementation is expected to provide adequate speedup. It's
+probably not worth the additional resources to do e.g. pipeline bypassing and hazard checking,
+as the target FPGA design is nearly at capacity.
+
+A conservative implementation (no optimization of intermediate values, immediate reduction of
+every add/sub operation) of Montgomery scalar multiplication using Engine25519
+completes one scalar multiply operation in 2.270ms, compared to 103ms in software.
+This does not include the time required to do the final affine inversion (done in software,
+with significant overhead -- about 100ms), or the time to load the microcode and operands (about 5us).
+The affine inversion can also be microcoded, it just hasn't been done yet.
+
+The Engine address space is divided up as follows (expressed as offset from base)::
+
+ 0x0_0000 - 0x0_0fff: microcode (one 4k byte page)
+ 0x1_0000 - 0x1_3fff: memory-mapped register file (4 x 4k pages = 16kbytes)
+
+Here are the currently implemented opcodes for The Engine:
+{}
+        """.format(opdoc))
+
+        microcode_width = 32
+        microcode_depth = 1024
+        running = Signal() # asserted when microcode is running
+
+        instruction = Record(instruction_layout) # current instruction to execute
+        illegal_opcode = Signal()
+        abort = Signal();
+
+        ### register file
+        rf_depth_raw = num_registers * 1 # total # of registers
+        rf_width_raw = 256 # width of a register
+        granule = 8
+        granule_bits = log2_int(granule)
+        granule_num = rf_width_raw//granule
+        granule_num_bits = log2_int(granule_num)
+        
+        self.submodules.rf = rf = RegisterFile(depth=rf_depth_raw, width=rf_width_raw)
+        self.window = CSRStorage(fields=[
+            CSRField("window", size=max(1, log2_int(rf_depth_raw) - log2_int(num_registers)), description="Selects the current register window to use"),
+        ])
+
+        self.mpstart = CSRStorage(fields=[
+            CSRField("mpstart", size=log2_int(microcode_depth), description="Where to start execution")
+        ])
+        self.mplen = CSRStorage(fields=[
+            CSRField("mplen", size=log2_int(microcode_depth), description="Length of the current microcode program. Thus valid code must be in the range of [mpstart, mpstart + mplen]"),
+        ])
+        self.control = CSRStorage(fields=[
+            CSRField("go", size=1, pulse=True, description="Writing to this puts the engine in `run` mode, and it will execute mplen microcode instructions starting at mpstart"),
+        ])
+        self.mpresume = CSRStatus(fields=[
+            CSRField("mpresume", size=log2_int(microcode_depth), description="Where to resume execution after a pause")
+        ])
+
+        self.power = CSRStorage(fields=[
+            CSRField("on", size=1, reset=0,
+                description="Writing `1` turns on the clocks to this block, `0` stops the clocks (for power savings). The handling of the clock gate is in a different module, this is just a flag to that block."),
+            CSRField("pause_req", size=1, description="Writing a `1` to this block will pause execution at the next micro-op, and allow for read-out of data from RF/microcode. Must check pause_gnt to confirm the pause has happened. Used to interrupt flow for suspend/resume."),
+        ])
+        # bring pause into the eng_clk domain
+        pause_req = Signal()
+        self.sync.eng_clk += pause_req.eq(self.power.fields.pause_req)
+        # re-sync the eng_clk phase to the RF phase whenever clocks are re-applied. We don't guarantee that the clocks start exactly
+        # at the same time, so you can get phase shift...
+        power_on_delay = Signal(max=16, reset=15)
+        eng_powered_on = Signal()
+        self.sync += [ # stretch out any power on pulse so we can process a reset in the clk50 domain after its enable has been switched on
+            If(~self.power.fields.on,
+                power_on_delay.eq(15)
+            ).Elif(power_on_delay > 0,
+                power_on_delay.eq(power_on_delay - 1)
+            ).Else(
+                power_on_delay.eq(0)
+            ),
+            eng_powered_on.eq(power_on_delay == 0), # make a signal that specifies that the engine is powered on that happens 16 cycles after the clocks are turned on
+            # note that this signal drops only *after* the power has been toggled, because when the clock is cut,
+            # the downstream "eng_clk" domain signals won't capture the latest state. So, once the power comes on,
+            # eng_powered_on must drop for a few cycles, then come back up again, which properly triggers a synchronization of the RF.
+        ]
+        eng_on_50 = Signal()
+        eng_on_50_r = Signal()
+        self.specials += MultiReg(eng_powered_on, eng_on_50, "eng_clk")
+        self.sync.eng_clk += eng_on_50_r.eq(eng_on_50)
+        rf_reset_clear = Signal()
+        self.specials += MultiReg(ResetSignal("eng_clk"), rf_reset_clear, "eng_clk") # sync up the register file's fast clock to our slow clock
+        self.comb += rf.clear.eq(rf_reset_clear | (eng_on_50 & ~eng_on_50_r))
+
+        self.status = CSRStatus(fields=[
+            CSRField("running", size=1, description="When set, the microcode engine is running. All wishbone access to RF and microcode memory areas will stall until this bit is clear"),
+            CSRField("mpc", size=log2_int(microcode_depth), description="Current location of the microcode program counter. Mostly for debug."),
+            CSRField("pause_gnt", size=1, description="When set, the engine execution has been paused, and the RF & microcode ROM can be read out for suspend/resume"),
+            CSRField("sigill", size=1, description="Illegal Instruction"),
+            CSRField("abort", size=1, description="Abort from failure"),
+            CSRField("finished", size=1, description="Finished"),
+        ])
+        pause_gnt = Signal()
+        mpc = Signal(log2_int(microcode_depth))  # the microcode program counter
+        running_r = Signal()
+        self.sync += [
+            self.status.fields.running.eq(running),
+            self.status.fields.pause_gnt.eq(pause_gnt),
+            self.status.fields.mpc.eq(mpc),
+            self.status.fields.sigill.eq(illegal_opcode),
+            self.status.fields.abort.eq(abort),
+            self.status.fields.finished.eq(((~running & running_r) | self.status.fields.finished) & (~(running & ~running_r))),
+        ]
+
+        self.submodules.ev = EventManager()
+        self.ev.finished = EventSourcePulse(description="Microcode run finished execution")
+        self.ev.illegal_opcode = EventSourcePulse(description="Illegal opcode encountered")
+        self.ev.finalize()
+        ill_op_r = Signal()
+        self.sync += [
+        running_r.eq(running),
+            ill_op_r.eq(illegal_opcode),
+        ]
+        self.comb += [
+            self.ev.finished.trigger.eq(~running & running_r), # falling edge pulse on running
+            self.ev.illegal_opcode.trigger.eq(~ill_op_r & illegal_opcode),
+        ]
+
+        ### microcode memory - 1rd/1wr dedicated to wishbone, 1rd for execution
+        microcode = Memory(microcode_width, microcode_depth)
+        self.specials += microcode
+        micro_wrport = microcode.get_port(write_capable=True, mode=READ_FIRST) # READ_FIRST allows BRAM inference
+        self.specials += micro_wrport
+        micro_rdport = microcode.get_port(mode=READ_FIRST)
+        self.specials += micro_rdport
+        micro_runport = microcode.get_port(mode=READ_FIRST) # , clock_domain="eng_clk"
+        self.specials += micro_runport
+
+        self.comb += [
+            micro_runport.adr.eq(mpc),
+            instruction.raw_bits().eq(micro_runport.dat_r),  # mapping should follow the record definition *exactly*
+            instruction.eq(micro_runport.dat_r),
+        ]
+        instruction_fields = []
+        for opcode, bits, description in instruction_layout:
+            instruction_fields.append(CSRField(opcode, size=bits, description=description))
+        self.instruction = CSRStatus(description="Current instruction being executed by the engine. The format of this register exactly reflects the binary layout of an Engine instruction.", fields=instruction_fields)
+        self.comb += [
+            self.instruction.status.eq(micro_runport.dat_r)
+        ]
+
+        self.ls_status = CSRStatus(32, description="Status of the L/S unit")
+
+        ### wishbone bus interface: decode the two address spaces and dispatch accordingly
+        self.bus = bus = wishbone.Interface()
+        wdata = Signal(32)
+        wadr = Signal(log2_int(rf_depth_raw) + 3) # wishbone bus is 32-bits wide, so 3 extra bits to select the sub-words out of the 256-bit registers
+        wmask = Signal(4)
+        wdata_we = Signal()
+        rdata_re = Signal()
+        rdata_ack = Signal()
+        rdata_req = Signal()
+        radr = Signal(log2_int(rf_depth_raw) + 3)
+
+        micro_rd_waitstates = 2
+        micro_rdack = Signal(max=(micro_rd_waitstates+1))
+        self.sync += [
+            If( ((bus.adr & ((0xFFFF_C000) >> 2)) >= ((prefix | 0x1_0000) >> 2)) & (((bus.adr & ((0xFFFF_C000) >> 2)) < ((prefix | 0x1_4000) >> 2))),
+                # fully decode register file address to avoid aliasing
+                If(bus.cyc & bus.stb & bus.we & ~bus.ack,
+                    If(~running | pause_gnt,
+                        wdata.eq(bus.dat_w),
+                        wadr.eq(bus.adr[:wadr.nbits]),
+                        wmask.eq(bus.sel),
+                        wdata_we.eq(1),
+                        If(rf.phase,
+                            bus.ack.eq(1),
+                        ).Else(
+                            bus.ack.eq(0),
+                        ),
+                    ).Else(
+                        wdata_we.eq(0),
+                        bus.ack.eq(0),
+                    )
+                ).Elif(bus.cyc & bus.stb & ~bus.we & ~bus.ack,
+                    If(~running | pause_gnt,
+                        radr.eq(bus.adr[:radr.nbits]),
+                        rdata_re.eq(1),
+                        bus.dat_r.eq( rf.ra_dat >> ((radr & 0x7) * 32) ),
+                        bus.ack.eq(rdata_ack),
+                        rdata_req.eq(1),
+                    ).Else(
+                        rdata_re.eq(0),
+                        bus.ack.eq(0),
+                        rdata_req.eq(0),
+                    )
+                ).Else(
+                    wdata_we.eq(0),
+                    bus.ack.eq(0),
+                    rdata_req.eq(0),
+                    rdata_re.eq(0),
+                )
+            ).Elif( (bus.adr & ((0xFFFF_F000) >> 2)) == ((0x0 | prefix) >> 2),
+                # fully decode microcode address to avoid aliasing
+                If(bus.cyc & bus.stb & bus.we & ~bus.ack,
+                    micro_wrport.adr.eq(bus.adr),
+                    micro_wrport.dat_w.eq(bus.dat_w),
+                    micro_wrport.we.eq(1),
+                    bus.ack.eq(1),
+                ).Elif(bus.cyc & bus.stb & ~bus.we & ~bus.ack,
+                    micro_wrport.we.eq(0),
+                    micro_rdport.adr.eq(bus.adr),
+                    bus.dat_r.eq(micro_rdport.dat_r),
+
+                    If(micro_rdack == 0, # ack only once the micro_rd_waitstates countdown has expired, so the read has time to occur
+                        bus.ack.eq(1),
+                    ).Else(
+                        bus.ack.eq(0),
+                        micro_rdack.eq(micro_rdack - 1),
+                    )
+                ).Else(
+                    micro_wrport.we.eq(0),
+                    micro_rdack.eq(micro_rd_waitstates),
+                    bus.ack.eq(0),
+                )
+            ).Else(
+                # handle all mis-target reads not explicitly decoded
+                If(bus.cyc & bus.stb & ~bus.we & ~bus.ack,
+                    bus.dat_r.eq(0xC0DE_BADD),
+                    bus.ack.eq(1),
+                ).Elif(bus.cyc & bus.stb & bus.we & ~bus.ack,
+                    bus.ack.eq(1), # ignore writes -- but don't hang the bus
+                ).Else(
+                    bus.ack.eq(0),
+                )
+
+            )
+        ]
+
+        ### execution path signals to register file
+        ra_dat = Signal(rf_width_raw)
+        ra_adr = Signal(log2_int(num_registers))
+        ra_const = Signal()
+        r_shift = Signal()
+        rb_dat = Signal(rf_width_raw)
+        rb_adr = Signal(log2_int(num_registers))
+        rb_const = Signal()
+        wd_dat = Signal(rf_width_raw)
+        wd_adr = Signal(log2_int(num_registers))
+        wd_bwe = Signal(rf_width_raw//8, reset = 0xFFFF_FFFF)
+        rf_write = Signal()
+
+        r_dat_f = Array(Signal(granule_num_bits-1, reset = 0) for x in range(4)) ## FIXME: mem ctrl is 256/2=128 bits so 1 fewer bits
+        r_dat_m = Array(Signal(granule_num, reset = ((1<<(granule_num))-1)) for x in range(4))
+
+        self.submodules.ra_const_rom = JarethConst(insert_docs=True)
+        self.submodules.rb_const_rom = JarethConst()
+
+        ### merge execution path signals with host access paths
+        self.comb += [
+            ra_const.eq(instruction.ca),
+            rb_const.eq(instruction.cb),
+            ra_adr.eq(instruction.ra),
+            rb_adr.eq(instruction.rb),
+            self.ra_const_rom.adr.eq(ra_adr),
+            self.rb_const_rom.adr.eq(rb_adr),
+            rf.window.eq(self.window.fields.window),
+            r_shift.eq(instruction.shift),
+
+            If(running & ~pause_gnt,
+                rf.ra_adr.eq(Cat(ra_adr, self.window.fields.window)),
+                rf.rb_adr.eq(Cat(rb_adr, self.window.fields.window)),
+                rf.instruction_pipe_in.eq(instruction.raw_bits()),
+                rf.wd_adr.eq(Cat(wd_adr, self.window.fields.window)),
+                rf.wd_dat.eq(wd_dat),
+                rf.wd_bwe.eq(wd_bwe),
+                rf.we.eq(rf_write),
+            ).Else(
+                rf.ra_adr.eq(radr >> 3),
+                rf.wd_adr.eq(wadr >> 3),
+                rf.wd_dat.eq(Cat(wdata,wdata,wdata,wdata,wdata,wdata,wdata,wdata)), # replicate; use byte-enable to multiplex
+                rf.wd_bwe.eq(0xF << ((wadr & 0x7) * 4)), # select the byte
+                rf.we.eq(wdata_we),
+            ),
+            If(~ra_const,
+               #ra_dat.eq((rf.ra_dat >> (Cat(Signal(granule_bits, reset = 0), r_dat_f[0]))) & Cat(Replicate(r_dat_m[0][0], 8), Replicate(r_dat_m[0][1], 8), Replicate(r_dat_m[0][2], 8), Replicate(r_dat_m[0][3], 8), Replicate(r_dat_m[0][4], 8), Replicate(r_dat_m[0][5], 8), Replicate(r_dat_m[0][6], 8), Replicate(r_dat_m[0][7], 8), Replicate(r_dat_m[0][8], 8), Replicate(r_dat_m[0][9], 8), Replicate(r_dat_m[0][10], 8), Replicate(r_dat_m[0][11], 8), Replicate(r_dat_m[0][12], 8), Replicate(r_dat_m[0][13], 8), Replicate(r_dat_m[0][14], 8), Replicate(r_dat_m[0][15], 8), Replicate(r_dat_m[0][16], 8), Replicate(r_dat_m[0][17], 8), Replicate(r_dat_m[0][18], 8), Replicate(r_dat_m[0][19], 8), Replicate(r_dat_m[0][20], 8), Replicate(r_dat_m[0][21], 8), Replicate(r_dat_m[0][22], 8), Replicate(r_dat_m[0][23], 8), Replicate(r_dat_m[0][24], 8), Replicate(r_dat_m[0][25], 8), Replicate(r_dat_m[0][26], 8), Replicate(r_dat_m[0][27], 8), Replicate(r_dat_m[0][28], 8), Replicate(r_dat_m[0][29], 8), Replicate(r_dat_m[0][30], 8), Replicate(r_dat_m[0][31], 8)))
+               If(~r_shift,
+                  ra_dat.eq(rf.ra_dat),
+               ).Else(
+                   ra_dat.eq((rf.ra_dat >> (Cat(Signal(granule_bits, reset = 0), r_dat_f[0]))) & Cat([Replicate(r_dat_m[0][x], granule) for x in range(0, granule_num)]))
+               )
+            ).Else(
+                ra_dat.eq(self.ra_const_rom.const),
+            ),
+            If(~rb_const,
+               # rb_dat.eq(rf.rb_dat[8*r_dat_f[1]:8+8*r_dat_l[1]]),
+               #Case(r_dat_f[1],
+                #     {x: Case(r_dat_l[1], { y: rb_dat.eq(rf.rb_dat[x*8:(y+1)*8]) for y in range(x, 32) } ) for x in range(0, 32) }
+               #)
+               #rb_dat.eq((rf.rb_dat >> (Cat(Signal(granule_bits, reset = 0), r_dat_f[1]))) & Cat(Replicate(r_dat_m[1][0], 8), Replicate(r_dat_m[1][1], 8), Replicate(r_dat_m[1][2], 8), Replicate(r_dat_m[1][3], 8), Replicate(r_dat_m[1][4], 8), Replicate(r_dat_m[1][5], 8), Replicate(r_dat_m[1][6], 8), Replicate(r_dat_m[1][7], 8), Replicate(r_dat_m[1][8], 8), Replicate(r_dat_m[1][9], 8), Replicate(r_dat_m[1][10], 8), Replicate(r_dat_m[1][11], 8), Replicate(r_dat_m[1][12], 8), Replicate(r_dat_m[1][13], 8), Replicate(r_dat_m[1][14], 8), Replicate(r_dat_m[1][15], 8), Replicate(r_dat_m[1][16], 8), Replicate(r_dat_m[1][17], 8), Replicate(r_dat_m[1][18], 8), Replicate(r_dat_m[1][19], 8), Replicate(r_dat_m[1][20], 8), Replicate(r_dat_m[1][21], 8), Replicate(r_dat_m[1][22], 8), Replicate(r_dat_m[1][23], 8), Replicate(r_dat_m[1][24], 8), Replicate(r_dat_m[1][25], 8), Replicate(r_dat_m[1][26], 8), Replicate(r_dat_m[1][27], 8), Replicate(r_dat_m[1][28], 8), Replicate(r_dat_m[1][29], 8), Replicate(r_dat_m[1][30], 8), Replicate(r_dat_m[1][31], 8)))
+               If(~r_shift,
+                  rb_dat.eq(rf.rb_dat),
+               ).Else(
+                   rb_dat.eq((rf.rb_dat >> (Cat(Signal(granule_bits, reset = 0), r_dat_f[1]))) & Cat([Replicate(r_dat_m[1][x], granule) for x in range(0, granule_num)])),
+               )
+            ).Else(
+                rb_dat.eq(self.rb_const_rom.const)
+            )
+        ]
+        # simple machine to wait rd_wait_states rf_clk cycles for data to propagate out of the register file and back to the host
+        rd_wait_states=4
+        bus_rd_wait = Signal(max=(rd_wait_states+1))
+        self.sync.rf_clk += [
+            If(rdata_req,
+                If(~running | pause_gnt,
+                    If(bus_rd_wait != 0,
+                        bus_rd_wait.eq(bus_rd_wait-1),
+                    ).Else(
+                        rdata_ack.eq(1),
+                    )
+                )
+            ).Else(
+                rdata_ack.eq(0),
+                bus_rd_wait.eq(rd_wait_states),
+            )
+        ]
+
+        sext_immediate = Signal(log2_int(microcode_depth))
+        self.comb += sext_immediate.eq(Cat(instruction.immediate, instruction.immediate[8])) # migen signed math failed us. so manually sign extend. this breaks the configurability of the code.
+
+        ### Microcode sequencer. Very simple: it can only run linear sections of microcode. Feature not bug;
+        ### constant time operation is a defense against timing attacks.
+
+        # pulse-stretch the go from sys->eng_clk. Don't use Migen CDC primitives, as they add latency; a BlindTransfer
+        # primitive on its own will take about as much time as a couple instructions on The Engine.
+        engine_go = Signal()
+        go_stretch = Signal(2)
+        self.sync += [ # note that we will miss this if the system throttles our clocks when this pulse arrives
+            If(self.control.fields.go,
+                go_stretch.eq(2)
+            ).Else(
+                If(go_stretch != 0,
+                   go_stretch.eq(go_stretch - 1),
+                )
+            )
+        ]
+        self.comb += engine_go.eq(self.control.fields.go | (go_stretch != 0))
+
+        self.submodules.seq = seq = ClockDomainsRenamer("eng_clk")(FSM(reset_state="IDLE"))
+        mpc_stop = Signal(log2_int(microcode_depth))
+        window_latch = Signal(self.window.fields.window.size)
+        exec = Signal()  # indicates to execution units to start running
+        done = Signal()  # indicates when the given execution units are done (as-muxed from subunits)
+        self.comb += rf.running.eq(~seq.ongoing("IDLE") | rdata_re),  # let the RF know when we're not executing, so it can idle to save power
+        seq.act("IDLE",
+            NextValue(pause_gnt, 0),
+            If(engine_go,
+                If(pause_req,
+                    NextValue(mpc, self.mpresume.fields.mpresume)
+                ).Else(
+                    NextValue(mpc, self.mpstart.fields.mpstart)
+                ),
+                NextValue(mpc_stop, self.mpstart.fields.mpstart + self.mplen.fields.mplen - 1),
+                NextValue(window_latch, self.window.fields.window),
+                NextValue(running, 1),
+                NextState("FETCH"),
+            ).Else(
+                NextValue(running, 0),
+            )
+        )
+        seq.act("FETCH",
+            If(pause_req,
+                NextState("PAUSED"),
+                NextValue(pause_gnt, 1),
+            ).Else(
+                # one cycle latency for instruction fetch
+                NextState("EXEC"),
+                NextValue(pause_gnt, 0),
+            )
+        )
+        seq.act("EXEC", # not a great name. This is actually where the register file fetches its contents.
+            If(instruction.opcode == opcodes["BRZ"][0],
+                NextState("DO_BRZ"),
+            ).Elif(instruction.opcode == opcodes["BRNZ"][0],
+                NextState("DO_BRNZ"),
+            ).Elif(instruction.opcode == opcodes["FIN"][0],
+                NextState("IDLE"),
+                NextValue(running, 0),
+            ).Elif(instruction.opcode < opcodes["MAX"][0], # check if the opcode is legal before running it
+                exec.eq(1),
+                NextState("WAIT_DONE"),
+            ).Else(
+                NextState("ILLEGAL_OPCODE"),
+            )
+        )
+        seq.act("WAIT_DONE", # this is where the actual instruction execution happens.
+            If(done, # TODO: for now, we just wait for each instruction to finish; but the foundations are around for pipelining...
+                If(mpc < mpc_stop,
+                   NextState("FETCH"),
+                   NextValue(mpc, mpc + 1),
+                ).Else(
+                    NextState("IDLE"),
+                    NextValue(running, 0),
+                )
+            )
+        )
+        seq.act("ILLEGAL_OPCODE",
+            NextState("IDLE"),
+            NextValue(running, 0),
+            illegal_opcode.eq(1),
+        )
+        seq.act("DO_BRZ",
+            If(ra_dat == 0,
+                If( (sext_immediate + mpc + 1 < mpc_stop) & (sext_immediate + mpc + 1 >= self.mpstart.fields.mpstart), # validate new PC is in range
+                    NextState("FETCH"),
+                    NextValue(mpc, sext_immediate + mpc + 1),
+                ).Else(
+                    NextState("IDLE"),
+                    NextValue(running, 0),
+                )
+            ).Else(
+                If(abort,
+                    NextState("IDLE"),
+                    NextValue(running, 0),
+                ).Elif(mpc < mpc_stop,
+                    NextState("FETCH"),
+                    NextValue(mpc, mpc + 1),
+                ).Else(
+                    NextState("IDLE"),
+                    NextValue(running, 0),
+                )
+            ),
+        )
+        seq.act("DO_BRNZ",
+            If(ra_dat != 0,
+                If( (sext_immediate + mpc + 1 < mpc_stop) & (sext_immediate + mpc + 1 >= self.mpstart.fields.mpstart), # validate new PC is in range
+                    NextState("FETCH"),
+                    NextValue(mpc, sext_immediate + mpc + 1),
+                ).Else(
+                    NextState("IDLE"),
+                    NextValue(running, 0),
+                )
+            ).Else(
+                If(abort,
+                    NextState("IDLE"),
+                    NextValue(running, 0),
+                ).Elif(mpc < mpc_stop,
+                    NextState("FETCH"),
+                    NextValue(mpc, mpc + 1),
+                ).Else(
+                    NextState("IDLE"),
+                    NextValue(running, 0),
+                )
+            ),
+        )
+        seq.act("PAUSED",
+            If(~pause_req,
+                NextValue(pause_gnt, 0),
+                NextState("FETCH"), # could probably go directly to "EXEC", but, this is a minor detail recovering from pause
+            )
+        )
+        
+        #pad_SBUS_DATA_OE_LED = platform.request("SBUS_DATA_OE_LED")
+        #led = Signal(reset = 1)
+        #self.comb += pad_SBUS_DATA_OE_LED.eq(led)
+        self.busls = wishbone.Interface(data_width = 128, adr_width = 28) # FIXME: hardwired (here and elsewhere)
+        exec_units = {
+            "exec_logic"     : ExecLogic(width=rf_width_raw),
+            "exec_addsub"    : ExecAddSub(width=rf_width_raw),
+            "exec_ls"        : ExecLS(width=rf_width_raw, interface=self.busls, r_dat_f=r_dat_f, r_dat_m=r_dat_m, granule=granule),
+        }
+        exec_units_shift = {
+            "exec_logic": True,
+            "exec_addsub": False,
+            "exec_ls": False,
+        }
+        exec_unit_shift_num = { }
+        index = 0
+        
+        for name, unit in exec_units.items():
+            setattr(self.submodules, name, unit);
+            setattr(self, "done" + str(index), Signal(name="done"+str(index)))
+            setattr(self, "unit_q" + str(index), Signal(wd_dat.nbits, name="unit_q"+str(index)))
+            setattr(self, "unit_sel" + str(index), Signal(name="unit_sel"+str(index)))
+            setattr(self, "unit_wd" + str(index), Signal(log2_int(num_registers), name="unit_wd"+str(index)))
+            if (exec_units_shift[name]):
+                setattr(self, "unit_shift" + str(index), Signal(name="unit_shift"+str(index)))
+            subdecode = Signal()
+            for op in unit.opcode_list:
+                self.comb += [
+                    If(instruction.opcode == opcodes[op][0],
+                        subdecode.eq(1)
+                    )
+                ]
+            instruction_out = Record(instruction_layout)
+            self.comb += [
+                instruction_out.raw_bits().eq(unit.instruction_out)
+            ]
+            self.comb += [
+                unit.start.eq(exec & subdecode),
+                getattr(self, "done" + str(index)).eq(unit.q_valid),
+                unit.a.eq(ra_dat),
+                unit.b.eq(rb_dat),
+                unit.instruction_in.eq(instruction.raw_bits()),
+                getattr(self, "unit_q" + str(index)).eq(unit.q),
+                getattr(self, "unit_sel" + str(index)).eq(subdecode),
+                getattr(self, "unit_wd" + str(index)).eq(instruction_out.wd),
+            ]
+            if (exec_units_shift[name]):
+                self.comb += [ getattr(self, "unit_shift" + str(index)).eq(instruction_out.shift), ]
+            exec_unit_shift_num[index] = exec_units_shift[name]
+            index += 1
+
+        for i in range(index):
+            if (exec_unit_shift_num[i]):
+                self.comb += [
+                    If(getattr(self, "done" + str(i)),
+                       done.eq(1),  # TODO: for proper pipelining, handle case of two units done simultaneously!
+                       If(getattr(self, "unit_shift" + str(i)),
+                          wd_dat.eq(getattr(self, "unit_q" + str(i)) << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2]))),
+                          wd_adr.eq(getattr(self, "unit_wd" + str(i))),
+                          wd_bwe.eq(Cat([Replicate(r_dat_m[2][x], granule//8) for x in range(0, granule_num)])),
+                       ).Else(
+                           wd_dat.eq(getattr(self, "unit_q" + str(i))),
+                           wd_adr.eq(getattr(self, "unit_wd" + str(i))),
+                           wd_bwe.eq(0xFFFF_FFFF),
+                       )
+                    ).Elif(seq.ongoing("IDLE"),
+                           done.eq(0),
+                    )
+                ]
+            else:
+                self.comb += [
+                    If(getattr(self, "done" + str(i)),
+                       done.eq(1),  # TODO: for proper pipelining, handle case of two units done simultaneously!
+                           wd_dat.eq(getattr(self, "unit_q" + str(i))),
+                           wd_adr.eq(getattr(self, "unit_wd" + str(i))),
+                           wd_bwe.eq(0xFFFF_FFFF),
+                    ).Elif(seq.ongoing("IDLE"),
+                           done.eq(0),
+                    )
+                ]
+
+        self.comb += [
+            rf_write.eq(done),
+        ]
+        
+        self.sync += abort.eq((abort & ~engine_go) | (self.exec_ls.has_failure[0] | self.exec_ls.has_failure[1] | self.exec_ls.has_timeout[0] | self.exec_ls.has_timeout[1]))
+        self.comb += self.ls_status.status.eq(self.exec_ls.state)
+
+        ##### TIMING CONSTRAINTS -- you want these. Trust me.
+
+        clk50 = "clk50"
+        #clk100 = "clk100"
+        clk100 = "sysclk"
+        clk200 = "clk200"
+        # registered exec units need this set of rules
+        ### clk200->clk50 multi-cycle paths:
+        # we architecturally guarantee extra setup time from the register file to the point of consumption:
+        # read data is stable by the 3rd phase of the RF fetch cycle, and so it is in fact ready even before
+        # the other signals that trigger the execute mode, hence 4+1 cycles total setup time
+        platform.add_platform_command("set_multicycle_path 5 -setup -start -from [get_clocks " + clk200 + "] -to [get_clocks " + clk50 + "] -through [get_cells *rf_r*_dat_reg*]")
+        platform.add_platform_command("set_multicycle_path 4 -hold -end -from [get_clocks " + clk200 + "] -to [get_clocks " + clk50 + "] -through [get_cells *rf_r*_dat_reg*]")
+        ### clk200->clk100 multi-cycle paths:
+        # same as above, but for the multiplier path.
+        platform.add_platform_command("set_multicycle_path 3 -setup -start -from [get_clocks " + clk200 + "] -to [get_clocks " + clk100 + "] -through [get_cells *rf_r*_dat_reg*]")
+        platform.add_platform_command("set_multicycle_path 2 -hold -end -from [get_clocks " + clk200 + "] -to [get_clocks " + clk100 + "] -through [get_cells *rf_r*_dat_reg*]")
+
+        # unregistered exec units need this set of rules
+        ### clk200->clk200 multi-cycle paths:
+        # this is for the case when we don't register the data, and just go straight from RF output to RF input. In the worst case
+        # we have three (? maybe five?) clk200 cycles to compute as we phase through the reads and writes
+        platform.add_platform_command("set_multicycle_path 3 -setup -from [get_clocks " + clk200 + "] -to [get_clocks " + clk200 + "] -through [get_cells *rf_r*_dat_reg*]")
+        platform.add_platform_command("set_multicycle_path 2 -hold -end -from [get_clocks " + clk200 + "] -to [get_clocks " + clk200 + "] -through [get_cells *rf_r*_dat_reg*]")
+
+        # other paths
+        ### sys->clk200 multi-cycle paths:
+        # microcode fetch is stable 10ns before use by the register file, by design
+        platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=ra_const)
+        platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=ra_const)
+        platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=rb_const)
+        platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=rb_const)
+        platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.ra_const_rom.adr)
+        platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.ra_const_rom.adr)
+        platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.rb_const_rom.adr)
+        platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.rb_const_rom.adr)
+        # ignore the clk200 reset path for timing purposes -- there is >1 cycle guaranteed after reset for everything to settle before anything moves on these paths
+        platform.add_platform_command("set_false_path -through [get_nets " + clk200 + "_rst]")
+        # ignore the clk50 reset path for timing purposes -- there is > 1 cycle guaranteed after reset for everything to settle before anything moves on these paths (applies for other crypto engines, (SHA/AES) as well)
+        platform.add_platform_command("set_false_path -through [get_nets " + clk50 + "_rst]")
+        ### sys->clk50 multi-cycle paths:
+        # microcode fetch is guaranteed not to transition in the middle of an exec computation
+        platform.add_platform_command("set_multicycle_path 2 -setup -start -from [get_clocks " + clk100 + "] -to [get_clocks " + clk50 + "] -through [get_cells microcode_reg*]")
+        platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk50 + "] -through [get_cells microcode_reg*]")
+        ### clk50->clk200 multi-cycle paths:
+        # engine running will set up a full eng_clk cycle before any RF accesses need to be valid
+        platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_nets {{ {net1} {net2} {net3} }}]", net1=running, net2=running_r, net3=rf.running)
+        platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_nets {{ {net1} {net2} {net3} }}]", net1=running, net2=running_r, net3=rf.running)
+        # this signal is a combo from clk50+sys
+        platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]")
+        platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]")
+        # data writeback happens on phase==2, and thus is stable for at least two clk200 clocks extra
+        platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]")
+        platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]")
+        platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]")
+        platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]")
+        ### sys->clk200 multi-cycle paths:
+        # data writeback happens on phase==2, and thus is stable for at least two clk200 clocks extra + one full eng_clk (total 25ns)
+        platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]")
+        platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]")
+        platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]")
+        platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]")
+        # this signal is a combo from clk50+sys
+        platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]")
+        platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]")
diff --git a/sbus-to-ztex-gateware-migen/jareth_code/Cargo.lock b/sbus-to-ztex-gateware-migen/jareth_code/Cargo.lock
new file mode 100644
index 0000000..4be67b7
--- /dev/null
+++ b/sbus-to-ztex-gateware-migen/jareth_code/Cargo.lock
@@ -0,0 +1,13 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+[[package]]
+name = "jareth-as"
+version = "0.1.0"
+
+[[package]]
+name = "jareth_code"
+version = "0.1.0"
+dependencies = [
+ "jareth-as 0.1.0",
+]
+
diff --git a/sbus-to-ztex-gateware-migen/jareth_code/Cargo.toml b/sbus-to-ztex-gateware-migen/jareth_code/Cargo.toml
new file mode 100644
index 0000000..ead6a03
--- /dev/null
+++ b/sbus-to-ztex-gateware-migen/jareth_code/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "jareth_code"
+version = "0.1.0"
+authors = ["Romain Dolbeau <romain@dolbeau.org>"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+
+[dependencies.jareth-as]
+#git="https://github.com/betrusted-io/jareth-as.git"
+#rev="6681e73c1fdc4a460b5ef9f9c7c91aef546d00f3"
+path = "/home/dolbeau/jareth-as"
+
+[dev-dependencies.jareth-as]
+#git="https://github.com/betrusted-io/jareth-as.git"
+#rev="6681e73c1fdc4a460b5ef9f9c7c91aef546d00f3"
+path = "/home/dolbeau/jareth-as"
+
+[[bin]]
+name = "jareth_code"
+path = "jareth_code.rs"
diff --git a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs
new file mode 100644
index 0000000..aca9d00
--- /dev/null
+++ b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs
@@ -0,0 +1,108 @@
+#![recursion_limit="768"]
+
+extern crate jareth_as;
+use jareth_as::*;
+
+fn main() -> std::io::Result<()> { // Assembles the Jareth test microcode and prints it to stdout as hex words.
+    let mcode = assemble_jareth!( // the only program that is printed below
+	// Register conventions for this program:
+	// 0..0 $DST / $DST / $SRC in %0 ; 0..0 $DST / $SRC / $DST in %1
+	// size in %2
+	// pattern in %3
+	// ----- (registers below are computed by the program itself)
+	// size AND #15 in %5 -- NOTE(review): presumably the sub-16 remainder; confirm #15 denotes the value 15
+	// size rounded down in %6 (computed as %2 - %5)
+	// input in %16
+	// output in %17
+	// 0 in %31
+         start:
+				resm %31
+				setadr %31, %0
+				load256inc %16, %0
+				load256inc %17, %1
+				// slow
+				setma %31, %0, #16
+				// slow
+				setmq %31, %1, #16
+				and %5, %2, #15
+				sub %6, %2, %5
+				brz done, %6
+		loop:
+			psa %18, %16
+			psa %19, %17
+				psa* %17, %16
+			psa %20, %17
+				store128inc %31, %2, %17
+				sub %6, %6, #16
+				brz last, %6
+				loadh128inc %16, %0, %16
+				loadh128inc %17, %1, %17
+				brz loop, #0
+		last:
+				// FIXME: not if Q is aligned
+				loadh128inc %17, %1, %17
+				store128inc %31, %2, %17
+		done:
+				getadr %3
+				getm %2
+				fin
+				fin
+	);
+    let _mcode3 = assemble_jareth!( // assembled but never printed (underscore-prefixed, unused below)
+	// 0..0 / $DST / $SRC in %0
+	// size in %2
+	// pattern in %3
+               start:
+			   resm %31
+			   psa %31, #0
+			   psa %30, #1
+			   sub %30, %31, %30
+			   psa %29, #2
+			   setmq %31, %29, %2
+			   setma %31, %0, %2
+			   psa* %30, %3
+			   getm %3
+			   resm %31
+			   psa %2, %30
+			   setadr %31 , %0
+			   load256 %1, %0
+			   load128 %0, %0
+			   fin
+			   fin
+	);
+    let _mcode2 = assemble_jareth!( // assembled but never printed (underscore-prefixed, unused below)
+				psa %1, %3
+			    setma %31, %0, %2
+				psa %2, %3
+				getm %3
+				fin
+				fin
+			    resm %31
+				psa %0, %3
+				setmq %31, %1, %2
+				psa %1, %3
+				fin
+				fin
+				fin
+				setma %31, %0, %2
+				setma %31, %0, %2
+			    resm %31
+				fin
+				fin
+				fin
+				fin
+    );
+
+    let mut pos;
+
+	pos = 0;
+	println!("test code:");
+    while pos < mcode.len() { // emit every element of mcode as a zero-padded 8-digit hex literal plus a comma
+		  print!("0x{:08x},", mcode[pos]);
+		  pos = pos + 1;
+    }
+	println!(""); // terminate the comma-separated line
+	println!("-> {}", mcode.len()); // element count of the dumped program
+
+	Ok(())
+}
diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py
index 7e7b199..d605e77 100644
--- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py
+++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py
@@ -31,6 +31,8 @@ USBOHCI_ADDR_PFX =       Signal(12, reset = 0x008)
 SRAM_ADDR_PFX =          Signal(12, reset = 0x009) # unmapped ; LE
 ENGINE_ADDR_PFXA =       Signal(12, reset = 0x00a)
 ENGINE_ADDR_PFXB =       Signal(12, reset = 0x00b)
+JARETH_ADDR_PFXA =       Signal(12, reset = 0x00c)
+JARETH_ADDR_PFXB =       Signal(12, reset = 0x00d)
 CG6_BT_ADDR_PFX =        Signal(12, reset = 0x020)
 CG6_ALT_ADDR_PFX =       Signal(12, reset = 0x028)
 CG6_FHC_ADDR_PFX =       Signal(12, reset = 0x030)
@@ -503,6 +505,8 @@ class SBusFPGABus(Module):
                                  (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX) |
                                  (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXA) |
                                  (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXB) |
+                                 (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == JARETH_ADDR_PFXA) |
+                                 (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == JARETH_ADDR_PFXB) |
                                  (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_BT_ADDR_PFX) |
                                  (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_ALT_ADDR_PFX) |
                                  (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_FHC_ADDR_PFX) |
@@ -704,6 +708,8 @@ class SBusFPGABus(Module):
                                      (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX) |
                                      (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXA) |
                                      (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXB) |
+                                     (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == JARETH_ADDR_PFXA) |
+                                     (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == JARETH_ADDR_PFXB) |
                                      (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_BT_ADDR_PFX) |
                                      (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_ALT_ADDR_PFX) |
                                      (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_FHC_ADDR_PFX) |
diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_prom.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_prom.py
index bb17092..17ca500 100644
--- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_prom.py
+++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_prom.py
@@ -104,7 +104,8 @@ def get_prom(soc,
              cg3=False,
              cg6=False,
              cg3_res=None,
-             sdcard=False):
+             sdcard=False,
+             jareth=False):
 
     framebuffer = (bw2 or cg3 or cg6)
     
@@ -119,7 +120,7 @@ def get_prom(soc,
     r += "\" RDOL,sbusstat\" device-name\n"
     r += get_header_map_stuff("sbus_bus_stat", "sbus_bus_stat", 256)
     
-    if (trng or usb or (sdram or not sdram) or engine or i2c or framebuffer or sdcard):
+    if (trng or usb or (sdram or not sdram) or engine or i2c or framebuffer or sdcard or jareth):
         r += "finish-device\nnew-device\n"
 
     if (trng):
@@ -131,7 +132,7 @@ def get_prom(soc,
         r += "  map-out-trng\n"
         r += ";\n"
         r += "disabletrng!\n"
-        if (usb or (sdram or not sdram) or engine or i2c or framebuffer or sdcard):
+        if (usb or (sdram or not sdram) or engine or i2c or framebuffer or sdcard or jareth):
             r += "finish-device\nnew-device\n"
 
     if (usb):
@@ -151,7 +152,7 @@ def get_prom(soc,
         r += " map-out-usb_host_ctrl\n"
         r += ";\n"
         r += "my-reset!\n"
-        if ((sdram or not sdram) or engine or i2c or framebuffer or sdcard):
+        if ((sdram or not sdram) or engine or i2c or framebuffer or sdcard or jareth):
             r += "finish-device\nnew-device\n"
         
     if (sdram):
@@ -176,15 +177,15 @@ def get_prom(soc,
         r += "\" RDOL,hidden_sdram\" device-name\n"
         r += get_header_mapx_stuff("mregs", [ "ddrphy", "sdram" ], [ 4096, 4096 ], [ "csr", "csr" ])
         r += "fload sdram_init.fth\ninit!\n"
-    if (engine or i2c or framebuffer or sdcard):
+    if (engine or i2c or framebuffer or sdcard or jareth):
         r += "finish-device\nnew-device\n"
     
     if (engine):
         r += "\" betrustedc25519e\" device-name\n"
         r += ": sbusfpga_regionaddr_curve25519engine-microcode sbusfpga_regionaddr_curve25519engine ;\n"
         r += ": sbusfpga_regionaddr_curve25519engine-regfile sbusfpga_regionaddr_curve25519engine h# 10000 + ;\n"
-        r += get_header_mapx_stuff("curve25519engine", [ "curve25519engine-regs", "curve25519engine-microcode", "curve25519engine-regfile" ], [ 4096, 4096, 65536 ] , ["csr", "region", "region" ] )
-        if (i2c or framebuffer or sdcard):
+        r += get_header_mapx_stuff("curve25519engine", [ "curve25519engine", "curve25519engine-microcode", "curve25519engine-regfile" ], [ 4096, 4096, 65536 ] , ["csr", "region", "region" ] )
+        if (i2c or framebuffer or sdcard or jareth):
             r += "finish-device\nnew-device\n"
         
     if (i2c):
@@ -199,7 +200,7 @@ def get_prom(soc,
         r += "  \" lm75\" encode-string \" compatible\" property\n"
         r += "  h# 48 encode-int \" addr\" property\n"
         r += "  finish-device\n"
-        if (framebuffer or sdcard):
+        if (framebuffer or sdcard or jareth):
             r += "finish-device\nnew-device\n"
         
     if (framebuffer):
@@ -233,7 +234,7 @@ def get_prom(soc,
         else:
             r += get_header_map_stuff("cg6extraregs", "cg6", 4096, reg=False)
             r += "fload cg6_init.fth\ncg6_init!\n"
-        if (sdcard):
+        if (sdcard or jareth):
             r += "finish-device\nnew-device\n"
         
     if (sdcard):
@@ -249,6 +250,15 @@ def get_prom(soc,
         r += "sdcard-init!\n"
         r += "fload sdcard.fth\n"
         r += "fload sdcard_access.fth\n"
+        if (jareth):
+            r += "finish-device\nnew-device\n"
+
+    if (jareth):
+        r += "\" jareth\" device-name\n"
+        r += ": sbusfpga_regionaddr_jareth-microcode sbusfpga_regionaddr_jareth ;\n"
+        r += ": sbusfpga_regionaddr_jareth-regfile sbusfpga_regionaddr_jareth h# 10000 + ;\n"
+        r += get_header_mapx_stuff("jareth", [ "jareth", "jareth-microcode", "jareth-regfile" ], [ 4096, 4096, 4096 ] , ["csr", "region", "region" ] )
+
     r += "end0\n"
 
     return r
diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py
index 4ec713f..06c98d6 100644
--- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py
+++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py
@@ -40,7 +40,6 @@ import bw2_fb
 import cg3_fb
 import cg6_fb
 import cg6_accel
-#import cgtrois
 
 # Wishbone stuff
 from sbus_wb import WishboneDomainCrossingMaster
@@ -65,7 +64,7 @@ class _CRG(Module):
 #        self.clock_domains.cd_por       = ClockDomain() # 48 MHz native, reset'ed by SBus, power-on-reset timer
         if (usb):
             self.clock_domains.cd_usb       = ClockDomain() # 48 MHZ PLL, reset'ed by SBus (via pll), for USB controller
-        if (engine): # also used for cgtrois
+        if (engine): # also used for Jareth
             self.clock_domains.cd_clk50     = ClockDomain() # 50 MHz (gated) for curve25519engine  -> eng_clk
             #self.clock_domains.cd_clk100    = ClockDomain() # 100 MHz for curve25519engine -> sys_clk
             self.clock_domains.cd_clk200    = ClockDomain() # 200 MHz (gated) for curve25519engine -> rf_clk
@@ -121,7 +120,7 @@ class _CRG(Module):
         #platform.add_false_path_constraints(self.cd_sys.clk, self.cd_sbus.clk)
         #platform.add_false_path_constraints(self.cd_sbus.clk, self.cd_sys.clk)
         ##platform.add_false_path_constraints(self.cd_native.clk, self.cd_sys.clk)
-        if (engine): # also used for cgtrois
+        if (engine): # also used for Jareth
             pll.create_clkout(self.cd_clk50, sys_clk_freq/2, ce=pll.locked & self.curve25519_on)
             platform.add_platform_command("create_generated_clock -name clk50 [get_pins {{{{MMCME2_ADV/CLKOUT{}}}}}]".format(num_clk))
             num_clk = num_clk + 1
@@ -212,7 +211,7 @@ class SBusFPGA(SoCCore):
         #if self.irq.enabled:
             #self.irq.add(name, use_loc_if_exists=True)
             
-    def __init__(self, variant, version, sys_clk_freq, trng, usb, sdram, engine, i2c, bw2, cg3, cg6, cg3_res, sdcard, **kwargs):
+    def __init__(self, variant, version, sys_clk_freq, trng, usb, sdram, engine, i2c, bw2, cg3, cg6, cg3_res, sdcard, jareth, **kwargs):
         framebuffer = (bw2 or cg3 or cg6)
         
         print(f"Building SBusFPGA for board version {version}")
@@ -275,6 +274,7 @@ class SBusFPGA(SoCCore):
             "usb_host":         0x00080000, # OHCI registers are here, not in CSR
             #"usb_shared_mem":   0x00090000, # unused ATM
             "curve25519engine": 0x000a0000, # includes microcode (4 KiB@0) and registers (16 KiB @ 64 KiB)
+            "jareth":           0x000c0000, # includes microcode (4 KiB@0) and registers (2 KiB @ 64 KiB)
             "cg6_bt":           0x00200000, # required for compatibility, bt_regs for cg6
             #"cg6_dhc":          0x00240000, # required for compatibility, unused
             "cg6_alt":          0x00280000, # required for compatibility
@@ -291,7 +291,9 @@ class SBusFPGA(SoCCore):
             "dvma_bridge":      0xfc000000, # required to match DVMA virtual addresses
         }
         self.mem_map.update(wb_mem_map)
-        self.submodules.crg = _CRG(platform=platform, sys_clk_freq=sys_clk_freq, usb=usb, usb_clk_freq=48e6, engine=engine, framebuffer=framebuffer, pix_clk=litex.soc.cores.video.video_timings[cg3_res]["pix_clk"])
+        self.submodules.crg = _CRG(platform=platform, sys_clk_freq=sys_clk_freq, usb=usb, usb_clk_freq=48e6, engine=(engine or jareth), framebuffer=framebuffer, pix_clk=litex.soc.cores.video.video_timings[cg3_res]["pix_clk"])
+            
+        
         #self.platform.add_period_constraint(self.platform.lookup_request("SBUS_3V3_CLK", loose=True), 1e9/25e6) # SBus max
 
         ## add our custom timings after the clocks have been defined
@@ -484,7 +486,7 @@ class SBusFPGA(SoCCore):
             #self.comb += pad_sdcard_interrupt.eq(sig_sdcard_interrupt)
             #self.comb += sig_sdcard_interrupt.eq(~self.sdirq.irq) ##
 
-        if (usb or engine or sdcard):
+        if (usb or engine or sdcard or jareth): # jareth only for testing
             if (not single_dvma_master):
                 self.bus.add_slave(name="dvma_bridge", slave=self.wishbone_slave_sys, region=SoCRegion(origin=self.mem_map.get("dvma_bridge", None), size=0x03ffffff, cached=False))
 
@@ -502,7 +504,8 @@ class SBusFPGA(SoCCore):
                 self.bus.add_master(name="curve25519engineLS", master=self.curve25519engine.busls)
             else:
                 self.comb += self.curve25519engine.busls.connect(self.wishbone_slave_sys)
-            self.comb += self.crg.curve25519_on.eq(self.curve25519engine.power.fields.on)
+            if (not jareth):
+                self.comb += self.crg.curve25519_on.eq(self.curve25519engine.power.fields.on)
             
         if (i2c):
             self.submodules.i2c = RTLI2C(platform, pads=platform.request("i2c"))
@@ -535,6 +538,16 @@ class SBusFPGA(SoCCore):
                 self.add_ram("cg6_accel_rom", origin=self.mem_map["cg6_accel_rom"], size=rounded_cg6_rom_len, contents=cg6_rom_data, mode="r")
                 self.add_ram("cg6_accel_ram", origin=self.mem_map["cg6_accel_ram"], size=2**12, mode="rw")
 
+        if (jareth):
+            from jareth import Jareth;
+            self.submodules.jareth = ClockDomainsRenamer({"eng_clk":"clk50", "rf_clk":"clk200", "mul_clk":"clk100_gated"})(Jareth(platform=platform,prefix=self.mem_map.get("jareth", None))) # , "sys":"clk100"
+            self.bus.add_slave("jareth", self.jareth.bus, SoCRegion(origin=self.mem_map.get("jareth", None), size=0x20000, cached=False))
+            self.bus.add_master(name="jarethLS", master=self.jareth.busls) # Jareth doesn't need the DVMA
+            if (not engine):
+                self.comb += self.crg.curve25519_on.eq(self.jareth.power.fields.on)
+            else:
+                self.comb += self.crg.curve25519_on.eq(self.jareth.power.fields.on | self.curve25519engine.power.fields.on)
+
         print("IRQ to Device map:\n")
         print(platform.irq_device_map)
         print("Device to IRQ map:\n")
@@ -566,12 +579,13 @@ def main():
     parser.add_argument("--cg3-res", default="1152x900@76Hz", help="Specify the CG3/CG6 resolution")
     parser.add_argument("--cg6", action="store_true", help="add a CG6 framebuffer [V1.2+VGA_RGB222 pmod]")
     parser.add_argument("--sdcard", action="store_true", help="add a sdcard {no SW yet}")
+    parser.add_argument("--jareth", action="store_true", help="add a Jareth vector core [all]")
     builder_args(parser)
     vivado_build_args(parser)
     args = parser.parse_args()
 
     if (args.sdram == False):
-        print(" ***** WARNING ***** : not enablling the SDRAM still adds a controller, but doesn't add the DMA engines\n")
+        print(" ***** WARNING ***** : not enabling the SDRAM still adds a controller, but doesn't add the DMA engines\n")
     if (args.usb and (args.version == "V1.0")):
         print(" ***** WARNING ***** : USB on V1.0 is an ugly hack \n");
     if (args.i2c):
@@ -596,7 +610,8 @@ def main():
                    cg3=args.cg3,
                    cg6=args.cg6,
                    cg3_res=args.cg3_res,
-                   sdcard=args.sdcard)
+                   sdcard=args.sdcard,
+                   jareth=args.jareth)
     #soc.add_uart(name="uart", baudrate=115200, fifo_depth=16)
 
     version_for_filename = args.version.replace(".", "_")
@@ -644,7 +659,8 @@ def main():
                                               cg3=args.cg3,
                                               cg6=args.cg6,
                                               cg3_res=args.cg3_res,
-                                              sdcard=args.sdcard)
+                                              sdcard=args.sdcard,
+                                              jareth=args.jareth)
     write_to_file(os.path.join(f"prom_{version_for_filename}.fth"), prom_content)