From 68b81d91b8819ca1abbb7d5db6a0cebb9c8a393c Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Sun, 6 Mar 2022 15:17:02 +0100 Subject: [PATCH] first go at jareth vector engine --- NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c | 654 ++++++++ NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h | 62 + sbus-to-ztex-gateware-migen/jareth.py | 1321 +++++++++++++++++ .../jareth_code/Cargo.lock | 13 + .../jareth_code/Cargo.toml | 23 + .../jareth_code/jareth_code.rs | 108 ++ .../sbus_to_fpga_fsm.py | 6 + .../sbus_to_fpga_prom.py | 28 +- .../sbus_to_fpga_soc.py | 36 +- 9 files changed, 2232 insertions(+), 19 deletions(-) create mode 100644 NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c create mode 100644 NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h create mode 100644 sbus-to-ztex-gateware-migen/jareth.py create mode 100644 sbus-to-ztex-gateware-migen/jareth_code/Cargo.lock create mode 100644 sbus-to-ztex-gateware-migen/jareth_code/Cargo.toml create mode 100644 sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c b/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c new file mode 100644 index 0000000..80e535e --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.c @@ -0,0 +1,654 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2022 Romain Dolbeau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +int jareth_print(void *, const char *); +int jareth_match(device_t, cfdata_t, void *); +void jareth_attach(device_t, device_t, void *); + +CFATTACH_DECL_NEW(jareth, sizeof(struct jareth_softc), + jareth_match, jareth_attach, NULL, NULL); + +dev_type_open(jareth_open); +dev_type_close(jareth_close); +dev_type_ioctl(jareth_ioctl); +dev_type_mmap(jareth_mmap); + + + +const struct cdevsw jareth_cdevsw = { + .d_open = jareth_open, + .d_close = jareth_close, + .d_read = noread, + .d_write = nowrite, + .d_ioctl = jareth_ioctl, + .d_stop = nostop, + .d_tty = notty, + .d_poll = nopoll, + .d_mmap = jareth_mmap, + .d_kqfilter = nokqfilter, + .d_discard = nodiscard, + .d_flag = 0 +}; + +extern struct cfdriver jareth_cd; + +struct jareth_testjob { + uint32_t data[32][8]; +}; + +static int init_programs(struct jareth_softc *sc); +static int write_inputs(struct jareth_softc *sc, struct jareth_testjob *job, const int window); +static int start_job(struct jareth_softc *sc); +static int wait_job(struct jareth_softc 
*sc, uint32_t param); +static int read_outputs(struct jareth_softc *sc, struct jareth_testjob *job, const int window); +static int dma_init(struct jareth_softc *sc); + +static int power_on(struct jareth_softc *sc); +static int power_off(struct jareth_softc *sc); + +int +jareth_open(dev_t dev, int flags, int mode, struct lwp *l) +{ + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); + struct jareth_softc *sc = device_lookup_private(&jareth_cd, driver); + + if (sc == NULL) + return ENODEV; + + if ((unit != 0) && ((sc->active_sessions & (1 << unit)) == 0)) { + return ENODEV; + } + + /* first we need to turn the engine power on ... */ + power_on(sc); + + return (0); +} + +int +jareth_close(dev_t dev, int flags, int mode, struct lwp *l) +{ + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); + struct jareth_softc *sc = device_lookup_private(&jareth_cd, driver); + + if (sc == NULL) + return ENODEV; + + if ((unit != 0) && (sc->active_sessions & (1 << unit))) { + device_printf(sc->sc_dev, "warning: close() on active session\n"); + sc->active_sessions &= ~(1 << unit); + sc->mapped_sessions &= ~(1 << unit); + } + + if (sc->active_sessions == 0) + power_off(sc); + + return (0); +} + +int +jareth_print(void *aux, const char *busname) +{ + + sbus_print(aux, busname); + return (UNCONF); +} + +int +jareth_match(device_t parent, cfdata_t cf, void *aux) +{ + struct sbus_attach_args *sa = (struct sbus_attach_args *)aux; + + return (strcmp("jareth", sa->sa_name) == 0); +} + +static const uint32_t program_test0[25] = { 0x01fc0014,0x407c0012,0xa0400013,0xa0c40013,0x007f0014,0x017f0054,0x0016f087,0x00185086,0x06000189,0x00480400,0x004c0440,0x00440420,0x00500440,0x617d1013,0x001b0186,0x01800189,0x20410015,0x20c51015,0xfb000809,0x20c51015,0x617d1013,0x000c0012,0x00080011,0x0000000a,0x0000000a }; + +static const uint32_t* programs[2] = { program_test0, NULL }; +static const uint32_t program_len[2] = { 25, 0 }; +static 
uint32_t program_offset[2]; + +static int do_test(struct jareth_softc *sc, uint32_t pidx); + +/* + * Attach all the sub-devices we can find + */ +void +jareth_attach(device_t parent, device_t self, void *aux) +{ + struct sbus_attach_args *sa = aux; + struct jareth_softc *sc = device_private(self); + struct sbus_softc *sbsc = device_private(parent); + int node; + int sbusburst; + + sc->sc_bustag = sa->sa_bustag; + sc->sc_dmatag = sa->sa_dmatag; + sc->sc_dev = self; + + aprint_normal("\n"); + + if (sa->sa_nreg < 3) { + aprint_error(": Not enough registers spaces\n"); + return; + } + + /* map registers */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[0].oa_space /* sa_slot */, + sa->sa_reg[0].oa_base /* sa_offset */, + sa->sa_reg[0].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_jareth) != 0) { + aprint_error(": cannot map Jareth registers\n"); + return; + } else { + aprint_normal_dev(self, "Jareth registers @ %p\n", (void*)sc->sc_bhregs_jareth); + } + /* map microcode */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[1].oa_space /* sa_slot */, + sa->sa_reg[1].oa_base /* sa_offset */, + sa->sa_reg[1].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_microcode) != 0) { + aprint_error(": cannot map Jareth microcode\n"); + return; + } else { + aprint_normal_dev(self, "Jareth microcode @ %p\n", (void*)sc->sc_bhregs_microcode); + } + /* map register file */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[2].oa_space /* sa_slot */, + sa->sa_reg[2].oa_base /* sa_offset */, + sa->sa_reg[2].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_regfile) != 0) { + aprint_error(": cannot map Jareth regfile\n"); + return; + } else { + aprint_normal_dev(self, "Jareth regfile @ %p\n", (void*)sc->sc_bhregs_regfile); + } + sc->sc_bufsiz_jareth = sa->sa_reg[0].oa_size; + sc->sc_bufsiz_microcode = sa->sa_reg[1].oa_size; + sc->sc_bufsiz_regfile = sa->sa_reg[2].oa_size; + + node = sc->sc_node = sa->sa_node; + + /* + * Get transfer burst size from 
PROM + */ + sbusburst = sbsc->sc_burst; + if (sbusburst == 0) + sbusburst = SBUS_BURST_32 - 1; /* 1->16 */ + + sc->sc_burst = prom_getpropint(node, "burst-sizes", -1); + if (sc->sc_burst == -1) + /* take SBus burst sizes */ + sc->sc_burst = sbusburst; + + /* Clamp at parent's burst sizes */ + sc->sc_burst &= sbusburst; + + aprint_normal("\n"); + aprint_normal_dev(self, "nid 0x%x, bustag %p, burst 0x%x (parent 0x%0x)\n", + sc->sc_node, + sc->sc_bustag, + sc->sc_burst, + sbsc->sc_burst); + + /* first we need to turn the engine power on ... */ + power_on(sc); + + if (init_programs(sc)) { + if (init_programs(sc)) { + aprint_normal_dev(sc->sc_dev, "INIT - FAILED\n"); + sc->initialized = 0; + } else { + sc->initialized = 1; + } + } else { + sc->initialized = 1; + } + + power_off(sc); + + sc->active_sessions = 0; + sc->mapped_sessions = 0; + + if (!dma_init(sc)) { + // ouch + sc->active_sessions = 0xFFFFFFFF; + sc->mapped_sessions = 0xFFFFFFFF; + } else { + do_test(sc, 0); + } +} + +#define CONFIG_CSR_DATA_WIDTH 32 +#define sbusfpga_jareth_softc jareth_softc +#include "dev/sbus/sbusfpga_csr_jareth.h" +#undef sbusfpga_jareth_softc + +#define REG_BASE(reg) (base + (reg * 32)) +#define SUBREG_ADDR(reg, off) (REG_BASE(reg) + (off)*4) + +#define SBUSFPGA_DO_TESTJOB _IOWR(0, 0, struct jareth_testjob) + +int +jareth_ioctl (dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) +{ + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); + struct jareth_softc *sc = device_lookup_private(&jareth_cd, driver); + int err = 0; + + if (sc == NULL) { + return ENODEV; + } + + if (!sc->initialized) { + if (init_programs(sc)) { + return ENXIO; + } else { + sc->initialized = 1; + } + } + switch (cmd) { + case SBUSFPGA_DO_TESTJOB: { + if (unit != 0) + return ENOTTY; + + struct jareth_testjob* job = (struct jareth_testjob*)data; + jareth_mpstart_write(sc, program_offset[0]); + jareth_mplen_write(sc, program_len[0]); + + err = write_inputs(sc, job, 0); + if 
(err) + return err; + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, 1); + if (err) + return err; + err = read_outputs(sc, job, 0); + if (err) + return err; + } + break; + + default: + err = EINVAL; + break; + } + + return(err); +} + + +static int power_on(struct jareth_softc *sc) { + int err = 0; + if ((jareth_power_read(sc) & 1) == 0) { + jareth_power_write(sc, 1); + delay(1); + } + return err; +} +static int power_off(struct jareth_softc *sc) { + int err = 0; + jareth_power_write(sc, 0); + return err; +} + +static int init_programs(struct jareth_softc *sc) { + /* the microcode is a the beginning */ + int err = 0; + uint32_t i, j; + uint32_t offset = 0; + + for (j = 0 ; programs[j] != NULL; j ++) { + program_offset[j] = offset; + for (i = 0 ; i < program_len[j] ; i++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_microcode, ((offset+i)*4), programs[j][i]); + if ((i%16)==15) + delay(1); + } + offset += program_len[j]; + } + + jareth_window_write(sc, 0); /* could use window_window to access fields, but it creates a RMW cycle for nothing */ + jareth_mpstart_write(sc, 0); /* EC25519 */ + jareth_mplen_write(sc, program_len[0]); /* EC25519 */ + + aprint_normal_dev(sc->sc_dev, "INIT - Jareth status: 0x%08x\n", jareth_status_read(sc)); + +#if 1 + /* double check */ + u_int32_t x; + int count = 0; + for (i = 0 ; i < program_len[0] && count < 10; i++) { + x = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_microcode, (i*4)); + if (x != programs[0][i]) { + aprint_error_dev(sc->sc_dev, "INIT - Jareth program failure: [%d] 0x%08x <> 0x%08x\n", i, x, programs[0][i]); + err = 1; + count ++; + } + if ((i%8)==7) + delay(1); + } + if ((x = jareth_window_read(sc)) != 0) { + aprint_error_dev(sc->sc_dev, "INIT - Jareth register failure: window = 0x%08x\n", x); + err = 1; + } + if ((x = jareth_mpstart_read(sc)) != 0) { + aprint_error_dev(sc->sc_dev, "INIT - Jareth register failure: mpstart = 0x%08x\n", x); + err = 1; + } + if ((x = 
jareth_mplen_read(sc)) != program_len[0]) { + aprint_error_dev(sc->sc_dev, "INIT - Jareth register failure: mplen = 0x%08x\n", x); + err = 1; + } + const int test_reg_num = 73; + const uint32_t test_reg_value = 0x0C0FFEE0; + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile, 4*test_reg_num, test_reg_value); + delay(1); + if ((x = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile, 4*test_reg_num)) != test_reg_value) { + aprint_error_dev(sc->sc_dev, "INIT - Jareth register file failure: 0x%08x != 0x%08x\n", x, test_reg_value); + err = 1; + } +#endif + + return err; +} + +static int write_inputs(struct jareth_softc *sc, struct jareth_testjob *job, const int window) { + const uint32_t base = window * 0x400; + int i, j; + uint32_t status = jareth_status_read(sc); + int err = 0; + if (status & (1<sc_dev, "WRITE - Jareth status: 0x%08x, still running?\n", status); + return ENXIO; + } + for (j = 0 ; j < 4 ; j++) { + for (i = 0 ; i < 8 ; i++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(j,i), job->data[j][i]); + } + } + +#if 1 + for (j = 0 ; j < 4 ; j++) { + for (i = 0 ; i < 8 && !err; i ++) { + if (job->data[j][i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(j,i))) err = EIO; + /* delay(1); */ + } + } + if (err) aprint_error_dev(sc->sc_dev, "WRITE - data did not read-write properly\n"); +#endif + + return err; +} + +static int start_job(struct jareth_softc *sc) { + uint32_t status = jareth_status_read(sc); + if (status & (1<sc_dev, "START - Jareth status: 0x%08x, still running?\n", status); + return ENXIO; + } + jareth_control_write(sc, 1); + //aprint_normal_dev(sc->sc_dev, "START - Jareth status: 0x%08x\n", jareth_status_read(sc)); + + return 0; +} + +static int wait_job(struct jareth_softc *sc, uint32_t param) { + uint32_t status = jareth_status_read(sc); + int count = 0; + int max_count = 250; + int del = 1; + const int max_del = 32; + static int max_del_seen = 1; + static int max_cnt_seen = 0; + + while 
((status & (1<sc_dev, "WAIT - ongoing, Jareth status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, ls_status); + count ++; + delay(del); + del = del < max_del ? 2*del : del; + status = jareth_status_read(sc); + } + if (del > max_del_seen) { + max_del_seen = del; + aprint_normal_dev(sc->sc_dev, "WAIT - new max delay %d after %d count (param was %u)\n", max_del_seen, count, param); + } + if (count > max_cnt_seen) { + max_cnt_seen = count; + aprint_normal_dev(sc->sc_dev, "WAIT - new max count %d with %d delay (param was %u)\n", max_cnt_seen, del, param); + + } + + //jareth_control_write(sc, 0); + if (status & (1<sc_dev, "WAIT - Jareth status: 0x%08x (pc 0x%08x), did not finish in time? [inst: 0x%08x ls_status: 0x%08x]\n", status, (status>>1)&0x03ff, jareth_instruction_read(sc), jareth_ls_status_read(sc)); + return ENXIO; + } else if (status & (1<sc_dev, "WAIT - Jareth status: 0x%08x, sigill [inst: 0x%08x ls_status: 0x%08x]\n", status, jareth_instruction_read(sc), jareth_ls_status_read(sc)); + return ENXIO; + } else if (status & (1<sc_dev, "WAIT - Jareth status: 0x%08x, aborted [inst: 0x%08x ls_status: 0x%08x]\n", status, jareth_instruction_read(sc), jareth_ls_status_read(sc)); + return ENXIO; + } else { + //aprint_normal_dev(sc->sc_dev, "WAIT - Jareth status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, jareth_ls_status_read(sc)); + } + + return 0; +} + +static int read_outputs(struct jareth_softc *sc, struct jareth_testjob *job, const int window) { + const uint32_t base = window * 0x400; + int i, j; + uint32_t status = jareth_status_read(sc); + if (status & (1<sc_dev, "READ - Jareth status: 0x%08x, still running?\n", status); + return ENXIO; + } + + for (j = 0 ; j < 32 ; j++) { + for (i = 0 ; i < 8 ; i++) { + job->data[j][i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(j,i)); + } + delay(1); + } + + return 0; +} + + +static int +dma_init(struct jareth_softc *sc) { + + /* Allocate a dmamap */ + if (bus_dmamap_create(sc->sc_dmatag, 
JARETH_VAL_DMA_MAX_SZ, 1, JARETH_VAL_DMA_MAX_SZ, 0, BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW, &sc->sc_dmamap) != 0) { + aprint_error_dev(sc->sc_dev, "DMA map create failed\n"); + return 0; + } else { + aprint_normal_dev(sc->sc_dev, "dmamap: %lu %lu %d (%p)\n", sc->sc_dmamap->dm_maxsegsz, sc->sc_dmamap->dm_mapsize, sc->sc_dmamap->dm_nsegs, sc->sc_dmatag->_dmamap_load); + } + + if (bus_dmamem_alloc(sc->sc_dmatag, JARETH_VAL_DMA_MAX_SZ, 64, 64, &sc->sc_segs, 1, &sc->sc_rsegs, BUS_DMA_NOWAIT | BUS_DMA_STREAMING)) { + aprint_error_dev(sc->sc_dev, "cannot allocate DVMA memory"); + bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap); + return 0; + } + + if (bus_dmamem_map(sc->sc_dmatag, &sc->sc_segs, 1, JARETH_VAL_DMA_MAX_SZ, &sc->sc_dma_kva, BUS_DMA_NOWAIT)) { + aprint_error_dev(sc->sc_dev, "cannot allocate DVMA address"); + bus_dmamem_free(sc->sc_dmatag, &sc->sc_segs, 1); + bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap); + return 0; + } + + if (bus_dmamap_load(sc->sc_dmatag, sc->sc_dmamap, sc->sc_dma_kva, JARETH_VAL_DMA_MAX_SZ, /* kernel space */ NULL, + BUS_DMA_NOWAIT | BUS_DMA_STREAMING | BUS_DMA_WRITE)) { + aprint_error_dev(sc->sc_dev, "cannot load dma map"); + bus_dmamem_unmap(sc->sc_dmatag, &sc->sc_dma_kva, JARETH_VAL_DMA_MAX_SZ); + bus_dmamem_free(sc->sc_dmatag, &sc->sc_segs, 1); + bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap); + return 0; + } + + aprint_normal_dev(sc->sc_dev, "DMA: SW -> kernel address is %p, dvma address is 0x%08llx, seg %llx / %ld\n", sc->sc_dma_kva, sc->sc_dmamap->dm_segs[0].ds_addr, sc->sc_segs.ds_addr, sc->sc_segs.ds_len); + + return 1; +} + +paddr_t jareth_mmap(dev_t dev, off_t offset, int prot) { + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); + struct jareth_softc *sc = device_lookup_private(&jareth_cd, driver); + paddr_t addr = -1; + + device_printf(sc->sc_dev, "%s:%d: %lld %d for %d / %d\n", __PRETTY_FUNCTION__, __LINE__, offset, prot, driver, unit); + + if (offset != 0) + return -1; + if (prot & 
PROT_EXEC) + return -1; + /* if (sc->mapped_sessions & (1 << unit)) */ + /* return -1; */ + if ((sc->active_sessions & (1 << unit)) == 0) + return -1; + if (unit >= MAX_ACTIVE_SESSION) + return -1; + if (unit <= 0) + return -1; + + // addr = bus_dmamem_mmap(sc->sc_dmatag, sc->sc_dmamap->dm_segs, 1, (off_t)(4096*unit), prot, BUS_DMA_NOWAIT); + if (pmap_extract(pmap_kernel(), ((vaddr_t)sc->sc_dma_kva) + (unit * 4096), &addr)) { + + device_printf(sc->sc_dev, "mapped page %d to 0x%08lx [0x%08lx], kernel is %p\n", unit, addr, atop(addr), (void*)(((vaddr_t)sc->sc_dma_kva) + (unit * 4096))); + + ((uint32_t*)(((vaddr_t)sc->sc_dma_kva) + (unit * 4096)))[0] = 0xDEADBEEF; + sc->mapped_sessions |= (1 << unit); + + return addr; + } + + return -1; +} + +static int do_test(struct jareth_softc *sc, uint32_t pidx) { + struct jareth_testjob job; + int err = 0, i, j, window = 0; + + power_on(sc); + + for (i = 0 ; i < 8 ; i++) { + job.data[0][i] = 0; + job.data[1][i] = 0; + job.data[2][i] = 0; + job.data[3][i] = 0x04030201 + 0x04040404 * i; + } + job.data[0][0] = (uint32_t)((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + 3; + job.data[0][1] = (uint32_t)((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + 5 + 2048; + job.data[0][2] = (uint32_t)((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + 5 + 2048; + job.data[1][0] = (uint32_t)((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + 5 + 2048; + job.data[1][1] = (uint32_t)((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + 3; + job.data[1][2] = (uint32_t)((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + 5 + 2048; + job.data[2][0] = 16; + + for (i = 0 ; i < 16 ; i++) { + ((uint32_t*)sc->sc_dma_kva)[i] = 0xDEADBEEF; + ((uint32_t*)sc->sc_dma_kva)[i+512] = 0x11111111; + } + + jareth_mpstart_write(sc, program_offset[pidx]); + jareth_mplen_write(sc, program_len[pidx]); + + err = write_inputs(sc, &job, window); + if (!err) err = start_job(sc); + delay(1); + if (!err) + err = wait_job(sc, 1); + if (!err) + err = read_outputs(sc, &job, window); + + char buf[512]; + for 
(j = 0 ; j < 32; j++) { + snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", job.data[j][7-0], job.data[j][7-1], job.data[j][7-2], job.data[j][7-3], job.data[j][7-4], job.data[j][7-5], job.data[j][7-6], job.data[j][7-7]); + aprint_normal("reg%d : %s\n", j, buf); + } + snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", + ((uint32_t*)sc->sc_dma_kva)[0+512], ((uint32_t*)sc->sc_dma_kva)[1+512], + ((uint32_t*)sc->sc_dma_kva)[2+512], ((uint32_t*)sc->sc_dma_kva)[3+512], + ((uint32_t*)sc->sc_dma_kva)[4+512], ((uint32_t*)sc->sc_dma_kva)[5+512], + ((uint32_t*)sc->sc_dma_kva)[6+512], ((uint32_t*)sc->sc_dma_kva)[7+512]); + aprint_normal("mem0_7 : %s\n", buf); + snprintf(buf, 512, "0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x", + ((uint32_t*)sc->sc_dma_kva)[8+512], ((uint32_t*)sc->sc_dma_kva)[9+512], + ((uint32_t*)sc->sc_dma_kva)[10+512], ((uint32_t*)sc->sc_dma_kva)[11+512], + ((uint32_t*)sc->sc_dma_kva)[12+512], ((uint32_t*)sc->sc_dma_kva)[13+512], + ((uint32_t*)sc->sc_dma_kva)[14+512], ((uint32_t*)sc->sc_dma_kva)[15+512]); + aprint_normal("mem8_15 : %s\n", buf); + + power_off(sc); + + return err; +} diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h b/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h new file mode 100644 index 0000000..f07c6e7 --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/jareth.h @@ -0,0 +1,62 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 Romain Dolbeau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _JARETH_H_ +#define _JARETH_H_ + +#define MAX_SESSION 32 // HW limit +#define MAX_ACTIVE_SESSION 8 // SW-imposed limit +// Single 4KiB pages per session +#define JARETH_VAL_DMA_MAX_SZ (MAX_ACTIVE_SESSION*4*1024) + +struct jareth_softc { + device_t sc_dev; /* us as a device */ + u_int sc_rev; /* revision */ + int sc_node; /* PROM node ID */ + int sc_burst; /* DVMA burst size in effect */ + bus_space_tag_t sc_bustag; /* bus tag */ + bus_space_handle_t sc_bhregs_jareth; /* bus handle */ + bus_space_handle_t sc_bhregs_microcode; /* bus handle */ + bus_space_handle_t sc_bhregs_regfile; /* bus handle */ + //void * sc_buffer; /* VA of the registers */ + int sc_bufsiz_jareth; /* Size of buffer */ + int sc_bufsiz_microcode; /* Size of buffer */ + int sc_bufsiz_regfile; /* Size of buffer */ + int initialized; + uint32_t active_sessions; + uint32_t mapped_sessions; + uint32_t sessions_cookies[MAX_ACTIVE_SESSION]; + /* DMA kernel structures */ + bus_dma_tag_t sc_dmatag; + bus_dmamap_t sc_dmamap; + bus_dma_segment_t sc_segs; + int sc_rsegs; + void * sc_dma_kva; +}; + +#endif /* _JARETH_H_ */ diff --git a/sbus-to-ztex-gateware-migen/jareth.py 
b/sbus-to-ztex-gateware-migen/jareth.py new file mode 100644 index 0000000..3105410 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/jareth.py @@ -0,0 +1,1321 @@ +from migen import * +from migen.genlib.cdc import MultiReg + +from litex.soc.interconnect.csr import * +from litex.soc.integration.doc import AutoDoc, ModuleDoc +from litex.soc.interconnect import wishbone +from litex.soc.interconnect.csr_eventmanager import * + +prime_string = "$2^{{255}}-19$" # 2\ :sup:`255`-19 +field_latex = "$\mathbf{{F}}_{{{{2^{{255}}}}-19}}$" + +opcode_bits = 5 # number of bits used to encode the opcode field +opcodes = { # mnemonic : [bit coding, docstring] ; if bit 6 (0x20) is set, shift a + "UDF" : [-1, "Placeholder for undefined opcodes"], + "PSA" : [0, "Wd $\gets$ Ra // pass A"], + "PSB" : [1, "Wd $\gets$ Rb // pass B"], + # 2 MSK + "XOR" : [3, "Wd $\gets$ Ra ^ Rb // bitwise XOR"], + "NOT" : [4, "Wd $\gets$ ~Ra // binary invert"], + "ADD" : [5, "Wd $\gets$ Ra + Rb // 256-bit binary add"], + "SUB" : [6, "Wd $\gets$ Ra - Rb // 256-bit binary subtraction"], + "AND" : [7, "Wd $\gets$ Ra & Rb // bitwise AND"], # replace MUL + "BRNZ" : [8, "If Ra != 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if non-zero"], # relace TRD + "BRZ" : [9, "If Ra == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero"], + "FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"], + "SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"], + # 12 XBT + # for MEM, bit #31 (imm[8]) indicates both lanes are needed; imm[31] == 0 faster as the second access is not done ; + "GETM": [17, "GETM: getmask" ], + "ADR": [18, "ADR: set or recover addresses, Wd $\gets$ ADR (for GETADR) or Wd $\gets$ 0 (for SETADR)" ], + "MEM" : [19, "MEM: imm[8] == 1 for 256 imm[7] == 0 for LOAD, imm[7] == 1 for STORE (beware, store zeroes the output reg); post-inc in imm[6], address in 
addr[imm[0...]]" ], + "SETM" : [20, "SETMx: Wd $\gets$ 0, masking for x = imm[1:0] set to start Ra[0:4], length Rb[0:5] ; using imm[1:0]==3 reset all (alias resm)" ], + "LOADH" : [21, "LOADH: imm[7] == 0 for LOAD, address in addr[imm[0...]], high->low & load a+16 into high" ], + "MAX" : [22, "Maximum opcode number (for bounds checking)"], +} + +num_registers = 32 +instruction_layout = [ + ("opcode", opcode_bits, "opcode to be executed"), + ("shift", 1, "should A & Q be shifted"), + ("ra", log2_int(num_registers), "operand A read register"), + ("ca", 1, "set to substitute constant table value for A"), + ("rb", log2_int(num_registers), "operand B read register"), + ("cb", 1, "set to substitute constant table value for B"), + ("wd", log2_int(num_registers), "write register"), + ("immediate", 9, "Used by jumps to load the next PC value") +] + +class RegisterFile(Module, AutoDoc): + def __init__(self, depth=512, width=256, bypass=False): + reset_cycles = 4 + self.intro = ModuleDoc(title="Register File", body=""" +This implements the register file for the Jareth engine. It's implemented using +7-series specific block RAMs in order to take advantage of architecture-specific features +to ensure a compact and performant implementation. + +The core primitive is the RAMB36E1. This can be configured as a 64/72-bit wide memory +but only if used in "SDP" (simple dual port) mode. In SDP, you have one read, one write port. +However, the register file needs to produce two operands per cycle, while accepting up to +one operand per cycle. + +In order to do this, we stipulate that the RF runs at `rf_clk` (200MHz), but uses four phases +to produce/consume data. "Engine clock" `eng_clk` (50MHz) runs at a lower rate to accommodate +large-width arithmetic in a single cycle. 
+ +The phasing is defined as follows: + +Phase 0: + - read from port A +Phase 1: + - read from port B +Phase 2: + - write data +Phase 3: + - quiet cycle, used to create extra setup time for next stage (requires multicycle-path constraints) + +Because data is written in Phase 2, after both reads, a write to the same address +as one being read still returns the old value to the reader. For pipelined operation, it could be desirable to shift +the write to happen before the reads, but as of now the implementation is not pipelined. + +The register file is unavailable for {} `eng_clk` cycles after reset. + +When configured as a 64 bit memory, the depth of the block is 512 entries, corresponding to +an address width of 9 bits. + + """.format(reset_cycles)) + + instruction = Record(instruction_layout) + phase = Signal(2) # internal phase + self.phase = Signal() # external phase + self.comb += self.phase.eq(phase[1]) # divide down internal phase so slower modules can capture it + + # these are the signals in and out of the register file + self.ra_dat = Signal(width) # this is passed in from outside the module because we want to mux with e.g. 
memory bus + self.ra_adr = Signal(log2_int(depth)) + self.rb_dat = Signal(width) + self.rb_adr = Signal(log2_int(depth)) + + # register file pipelines the write target address, going to the exec units; also needs the window to be complete + # window is assumed to be static and does not change throughout a give program run, so it's not pipelined + self.instruction_pipe_in = Signal(len(instruction)) + self.instruction_pipe_out = Signal(len(instruction)) + self.window = Signal(max(1, log2_int(depth) - log2_int(num_registers))) + + # this is the immediate data to write in, coming from the exec units + self.wd_dat = Signal(width) + self.wd_adr = Signal(log2_int(depth)) + self.wd_bwe = Signal(width//8) # byte masks for writing + self.we = Signal() + self.clear = Signal() + + self.running = Signal() # used for activity gating to RAM + + eng_sync = Signal(reset=1) + + rf_adr = Signal(log2_int(depth)) + self.comb += [ + If(phase == 0, + rf_adr.eq(self.ra_adr), + ).Elif(phase == 1, + rf_adr.eq(self.rb_adr), + ) + ] + rf_dat = Signal(width) + self.sync.eng_clk += [ + # TODO: check that this is in sync with expected values + self.instruction_pipe_out.eq(self.instruction_pipe_in), + ] + # unfortunately, -1L speed grade is too slow to support pipeline bypassing of the register file: + # bypass path closes at about 5.4ns, which fails to meet the 5ns cycle time target for the four-phase RF + if bypass: + self.sync.rf_clk += [ + If(phase == 1, + If((self.wd_adr != self.ra_adr) | ~self.we, + self.ra_dat.eq(rf_dat), + ).Else( + self.ra_dat.eq(self.wd_dat), + ), + self.rb_dat.eq(self.rb_dat), + ).Elif(phase == 2, + self.ra_dat.eq(self.ra_dat), + If((self.wd_adr != self.rb_adr) | ~self.we, + self.rb_dat.eq(rf_dat), + ).Else( + self.rb_dat.eq(self.wd_dat), + ) + ).Else( + self.ra_dat.eq(self.ra_dat), + self.rb_dat.eq(self.rb_dat), + ), + ] + else: + self.sync.rf_clk += [ + If(phase == 1, + self.ra_dat.eq(rf_dat), + self.rb_dat.eq(self.rb_dat), + ).Elif(phase == 2, + 
self.ra_dat.eq(self.ra_dat), + self.rb_dat.eq(rf_dat), + ).Else( + self.ra_dat.eq(self.ra_dat), + self.rb_dat.eq(self.rb_dat), + ), + ] + wren_pipe = Signal() # do not change this variable name, it is constrained in the XDC + self.sync.rf_clk += [ + If(eng_sync, + phase.eq(0), + ).Else( + phase.eq(phase + 1), + ), + wren_pipe.eq((phase == 1) & self.we), # we want wren to hit on phase==2, but we pipeline it to relax timing. so capture the input to the pipe on phase == 1 + ] + wd_bwe_pipe = Signal(width//8) + self.sync.rf_clk += [ + # add a register to relax timing on wd_bwe. This offsets the signal by one rf_clk (clk200) period, + # but because write happens on phase 2 and the signal is valid on eng_clk (clk50) edges, this will + # not affect the functionality + wd_bwe_pipe.eq(self.wd_bwe) + ] + + for word in range(int(256/64)): + self.specials += Instance("BRAM_SDP_MACRO", name="RF_RAMB" + str(word), + p_BRAM_SIZE = "36Kb", + p_DEVICE = "7SERIES", + p_WRITE_WIDTH = 64, + p_READ_WIDTH = 64, + p_DO_REG = 0, + p_INIT_FILE = "NONE", + p_SIM_COLLISION_CHECK = "ALL", # "WARNING_ONLY", "GENERATE_X_ONLY", "NONE" + p_SRVAL = 0, + p_WRITE_MODE = "READ_FIRST", + i_RDCLK = ClockSignal("rf_clk"), + i_WRCLK = ClockSignal("rf_clk"), + i_RDADDR = rf_adr, + i_WRADDR = self.wd_adr, + i_DI = self.wd_dat[word*64 : word*64 + 64], + o_DO = rf_dat[word*64 : word*64 + 64], + i_RDEN = self.running, # reduce power when not running + i_WREN = wren_pipe, # (phase == 2) & self.we, but pipelined one stage + i_RST = ResetSignal("rf_clk"), + i_WE = wd_bwe_pipe[word*8 : word*8 + 8], + + i_REGCE = 1, # should be ignored, but added to quiet down simulation warnings + ) + + # create an internal reset signal that synchronizes the "eng" to the "rf" domains + # it will also reset the register file on demand + reset_counter = Signal(log2_int(reset_cycles), reset=reset_cycles - 1) + self.sync.eng_clk += [ + If(self.clear, + reset_counter.eq(reset_cycles - 1), + eng_sync.eq(1), + ).Else( + If(reset_counter 
!= 0, + reset_counter.eq(reset_counter - 1), + eng_sync.eq(1), + ).Else( + eng_sync.eq(0) + ), + ) + ] + +class JarethConst(Module, AutoDoc): + def __init__(self, insert_docs=False): + global did_const_doc + constant_defs = { + 0: [0, "zero", "The number zero"], + 1: [1, "one", "The number one"], + 2: [2, "two", "The number two"], + #3: [3, "three", "The number three"], + #4: [4, "four", "The number four"], + #5: [5, "five", "The number five"], + #6: [6, "six", "The number six"], + #7: [7, "seven", "The number seven"], + #8: [8, "eight", "The number eight"], + 15: [15, "sixteen", "The number fifteen"], + 16: [16, "sixteen", "The number sixteen"], + } + self.adr = Signal(5) + self.const = Signal(256) + constant_str = "This module encodes the constants that can be substituted for any register value. Therefore, up to 32 constants can be encoded.\n\n" + for code, const in constant_defs.items(): + self.comb += [ + If(self.adr == code, + self.const.eq(const[0]), + ) + ] + constant_str += """ +**{}** + + Substitute register {} with {}: {}\n""".format(const[1], code, const[2], const[0]) + if insert_docs: + self.constants = ModuleDoc(title="Jareth Constants", body=constant_str) + +# ------------------------------------------------------------------------ EXECUTION UNITS +class ExecUnit(Module, AutoDoc): + def __init__(self, width=256, opcode_list=["UDF"], insert_docs=False): + if insert_docs: + self.intro = ModuleDoc(title="ExecUnit class", body=""" + ExecUnit is the superclass template for execution units. + + Configuration Arguments: + - `opcode_list` is the list of opcodes that an ExecUnit can process + - `width` is the bit-width of the execution pathway + + Signal API for an exec unit: + - `a` and `b` are the inputs. 
+ - `instruction_in` is the instruction corresponding to the currently present `a` and `b` inputs + - `start` is a single-clock signal which indicates processing should start + - `q` is the output + - `instruction_out` is the instruction for the result present at the `q` output + - `q_valid` is a single cycle pulse that indicates that the `q` result and `wa_out` value is valid + + + """) + self.instruction = Record(instruction_layout) + + self.a = Signal(width) # raw or shifted + self.b = Signal(width) # shifted + self.q = Signal(width) # shifted + self.start = Signal() + self.q_valid = Signal() + # pipeline the instruction + self.instruction_in = Signal(len(self.instruction)) + self.instruction_out = Signal(len(self.instruction)) + + self.opcode_list = opcode_list + self.comb += [ + self.instruction.raw_bits().eq(self.instruction_in) + ] + +class ExecLogic(ExecUnit): + def __init__(self, width=256): + ExecUnit.__init__(self, width, ["XOR", "NOT", "PSA", "SHL", "AND"]) + self.intro = ModuleDoc(title="Logic ExecUnit Subclass", body=f""" +This execution unit implements bit-wise logic operations: XOR, NOT, and +passthrough. 
+ +* XOR returns the result of A^sB +* NOT returns the result of !A +* PSA returns the value of A +* SHL returns A << 1 +* AND returns the result of A&sB + +""") + + zeros = Signal(255, reset=0) + self.sync.eng_clk += [ + self.q_valid.eq(self.start), + self.instruction_out.eq(self.instruction_in), + ] + self.comb += [ + If(self.instruction.opcode == opcodes["XOR"][0], + self.q.eq(self.a ^ self.b) + ).Elif(self.instruction.opcode == opcodes["NOT"][0], + self.q.eq(~self.a) + ).Elif(self.instruction.opcode == opcodes["PSA"][0], + self.q.eq(self.a), + ).Elif(self.instruction.opcode == opcodes["PSB"][0], + self.q.eq(self.b), + ).Elif(self.instruction.opcode == opcodes["SHL"][0], + self.q.eq(Cat(0, self.a[:255])), + ).Elif(self.instruction.opcode == opcodes["AND"][0], + self.q.eq(self.a & self.b), + ), + ] + +class ExecAddSub(ExecUnit, AutoDoc): + def __init__(self, width=256): + ExecUnit.__init__(self, width, ["ADD", "SUB"]) + self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f""" +This execution module implements 256-bit binary addition and subtraction. + +Note that to implement operations in $\mathbf{{F}}_p$, where *p* is $2^{{255}}-19$, this must be compounded +with other operators as follows: + +Addition of Ra + Rb into Rc in {field_latex}: + +.. code-block:: c + + ADD Rc, Ra, Rb // Rc <- Ra + Rb + TRD Rd, Rc // Rd <- ReductionValue(Rc) + SUB Rc, Rc, Rd // Rc <- Rc - Rd + +Negation of Ra into Rc in {field_latex}: + +.. code-block:: c + + SUB Rc, #FIELDPRIME, Ra // Rc <- 2^255-19 - Ra + +Note that **#FIELDPRIME** is one of the 32 available hard-coded constants +that can be substituted for any register in any arithmetic operation, please +see the section on "Constants" for more details. + +Subtraction of Ra - Rb into Rc in {field_latex}: + +.. 
code-block:: c + + SUB Rb, #FIELDPRIME, Rb // Rb <- 2^255-19 - Rb + ADD Rc, Ra, Rb // Rc <- Ra + Rb + TRD Rd, Rc // Rd <- ReductionValue(Rc) + SUB Rc, Rc, Rd // Rc <- Rc - Rd + +In all the examples above, Ra and Rb must be members of {field_latex}. + """) + + self.sync.eng_clk += [ + self.q_valid.eq(self.start), + self.instruction_out.eq(self.instruction_in), + ] + self.comb += [ + If(self.instruction.opcode == opcodes["ADD"][0], + self.q.eq(self.a + self.b), + ).Elif(self.instruction.opcode == opcodes["SUB"][0], + self.q.eq(self.a - self.b), + ), + ] + +class ExecLS(ExecUnit, AutoDoc): + def __init__(self, width=256, interface=None, r_dat_f=None, r_dat_m=None, granule=0): + ExecUnit.__init__(self, width, ["MEM", "SETM", "ADR", "LOADH", "GETM"]) + + self.notes = ModuleDoc(title=f"Load/Store ExecUnit Subclass", body=f""" + """) + + self.sync.eng_clk += [ # pipeline the instruction + self.instruction_out.eq(self.instruction_in), + ] + + assert(width == 256) # fixme + assert(len(interface.sel) == 16) # 128 bits Wishbone + + start_pipe = Signal() + self.sync.mul_clk += start_pipe.eq(self.start) # break critical path of instruction decode -> SETUP_A state muxes + self.submodules.lsseq = lsseq = ClockDomainsRenamer("mul_clk")(FSM(reset_state="IDLE")) + cpar = Signal() # to keep track of the odd-ness of our cycle, so we can align 2 mul_clk cycles of output on 1 eng_clk cycle + lbuf = Signal(width) + timeout = Signal(11) + #tries = Signal() + self.has_failure = Signal(2) + self.has_timeout = Signal(2) + + self.sync.mul_clk += If(timeout > 0, timeout.eq(timeout - 1)) + + granule_bits = log2_int(granule) + granule_num = width//granule + granule_num_bits = log2_int(granule_num) + + offset = Signal(granule_num_bits-1, reset = 0) + max_size_bits=28 # 256 MiB + offsetpsize = Signal(max_size_bits+1, reset = 0) + + addresses = Array(Signal(28) for x in range(width//32)) # 128-bits chunk, so 16-bytes chunk, so low 4 bits are ignored + + lsseq.act("IDLE", + If(start_pipe, + 
If(self.instruction.opcode == opcodes["MEM"][0], + NextValue(cpar, 0), + NextValue(self.has_timeout, 0), + NextValue(self.has_failure, 0), + NextValue(interface.cyc, 1), + NextValue(interface.stb, 1), + NextValue(interface.sel, 2**len(interface.sel)-1), + NextValue(interface.adr, addresses[self.instruction.immediate[0:log2_int(width//32)]]), + NextValue(interface.we, self.instruction.immediate[7]), + NextValue(timeout, 2047), + If(self.instruction.immediate[7], # do we need those tests or could we always update dat_w/dat_r ? + NextValue(interface.dat_w, self.b[0:128])), + NextState("MEMl") # MEMl + ).Elif(self.instruction.opcode == opcodes["LOADH"][0], + NextValue(cpar, 0), + NextValue(self.has_timeout, 0), + NextValue(self.has_failure, 0), + NextValue(interface.cyc, 1), + NextValue(interface.stb, 1), + NextValue(interface.sel, 2**len(interface.sel)-1), + NextValue(interface.adr, addresses[self.instruction.immediate[0:log2_int(width//32)]]), + NextValue(interface.we, self.instruction.immediate[7]), + NextValue(timeout, 2047), + NextValue(lbuf[0:128], self.b[128:256]), + NextState("MEMh") # MEMl + ).Elif(self.instruction.opcode == opcodes["SETM"][0], + Case(self.instruction.immediate[0:2], + { 0x3 : [ NextValue(r_dat_f[0], 0), + NextValue(r_dat_f[1], 0), + NextValue(r_dat_f[2], 0), + NextValue(r_dat_m[0], (1< X) & (X >= offset), + NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 1), + ).Else( + NextValue(r_dat_m[self.instruction.immediate[0:2]][X], 0), + ), + If(X == (granule_num-1), + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + ).Else( + NextState("GENMASK_R" + str(X+1)), + ), + ) + lsseq.act("GENMASK_R"+str(granule_num), # avoids MiGen complaining, unreachable + NextValue(cpar, cpar ^ 1), + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + ) + + lsseq.act("MEMl", + NextValue(cpar, cpar ^ 1), + If(interface.ack, + If(~self.instruction.immediate[7], + NextValue(lbuf[0:128], 
interface.dat_r)), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("MEMl2") + ).Elif(interface.err, + NextValue(self.has_failure[0], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + ).Elif(timeout == 0, + NextValue(self.has_timeout[0], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + )) + lsseq.act("MEMl2", + NextValue(cpar, cpar ^ 1), + If(~interface.ack, + If(self.instruction.immediate[6], # post-inc + NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), + ), + If(self.instruction.immediate[8], + NextValue(interface.cyc, 1), + NextValue(interface.stb, 1), + NextValue(interface.sel, 2**len(interface.sel)-1), + NextValue(interface.adr, (addresses[self.instruction.immediate[0:log2_int(width//32)]]) + 1), + NextValue(interface.we, self.instruction.immediate[7]), + NextValue(timeout, 2047), + If(self.instruction.immediate[7], + NextValue(interface.dat_w, self.b[128:256])), + NextState("MEMh") + ).Else( + NextValue(lbuf[128:256], 0), + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + ) + )) + lsseq.act("MEMh", + NextValue(cpar, cpar ^ 1), + If(interface.ack, + If(~self.instruction.immediate[7], + NextValue(lbuf[128:256], interface.dat_r)), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("MEMh2") + ).Elif(interface.err, + NextValue(self.has_failure[1], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + ).Elif(timeout == 0, + NextValue(self.has_timeout[1], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + )) + lsseq.act("MEMh2", + NextValue(cpar, cpar ^ 1), + If(~interface.ack, + If(self.instruction.immediate[6], # post-inc + NextValue(addresses[self.instruction.immediate[0:log2_int(width//32)]], 
addresses[self.instruction.immediate[0:log2_int(width//32)]] + 1), + ), + #NextValue(tries, 0), + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + )) + lsseq.act("MEM_ODD", # clock alignement cycle + NextState("MEM_EVEN1")) + lsseq.act("MEM_EVEN1", + NextState("MEM_EVEN2")) + lsseq.act("MEM_EVEN2", + NextValue(cpar, 0), + NextValue(self.has_failure, 0), + NextValue(self.has_timeout, 0), + NextState("IDLE")) + lsseq.act("ERR", + #If(~tries, # second attempt + # NextValue(cpar, 0), + # NextValue(tries, 1), + # NextState("IDLE") + #).Else(NextValue(tries, 0), # no third attempt, give up + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + #) + ) + self.sync.mul_clk += [ + If(lsseq.ongoing("MEM_EVEN1") | lsseq.ongoing("MEM_EVEN2"), + self.q_valid.eq(1), + If((self.instruction.opcode == opcodes["MEM"][0]) | (self.instruction.opcode == opcodes["LOADH"][0]), + If(~self.instruction.immediate[7], + self.q.eq(lbuf), + ).Else( + self.q.eq(0), #self.a + ) + ).Elif(self.instruction.opcode == opcodes["SETM"][0], + self.q.eq(0), #self.a + ).Elif(self.instruction.opcode == opcodes["ADR"][0], + If(~self.instruction.immediate[7], + [ self.q[x*32:(x+1)*32].eq(Cat(Signal(4, reset = 0), addresses[x])) for x in range(width//32) ], + ).Else( + self.q.eq(0), + ) + ).Elif(self.instruction.opcode == opcodes["GETM"][0], + self.q.eq(Cat(Cat(r_dat_f[0], Signal(28, reset = 0)), + r_dat_m[0], + Cat(r_dat_f[1], Signal(28, reset = 0)), + r_dat_m[1], + Cat(r_dat_f[2], Signal(28, reset = 0)), + r_dat_m[2], + Cat(r_dat_f[3], Signal(28, reset = 0)), + r_dat_m[3])), + ).Else( + self.q.eq(0xBADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000_BADD0000), + ), + ).Else( + self.q_valid.eq(0), + ) + ] + + self.state = Signal(32) + self.sync.mul_clk += self.state[0].eq(lsseq.ongoing("IDLE")) + self.sync.mul_clk += self.state[1].eq(lsseq.ongoing("MEMl")) + self.sync.mul_clk += self.state[2].eq(lsseq.ongoing("MEMl2")) + 
self.sync.mul_clk += self.state[3].eq(lsseq.ongoing("MEMh")) + self.sync.mul_clk += self.state[4].eq(lsseq.ongoing("MEMh2")) + self.sync.mul_clk += self.state[5].eq(lsseq.ongoing("MEM_ODD")) + self.sync.mul_clk += self.state[6].eq(lsseq.ongoing("MEM_EVEN1")) + self.sync.mul_clk += self.state[7].eq(lsseq.ongoing("MEM_EVEN2")) + self.sync.mul_clk += self.state[8].eq(lsseq.ongoing("MEM_ERR")) + self.sync.mul_clk += self.state[28:30].eq((self.state[28:30] & Replicate(~start_pipe, 2)) | self.has_timeout) + self.sync.mul_clk += self.state[30:32].eq((self.state[30:32] & Replicate(~start_pipe, 2)) | self.has_failure) + + +class Jareth(Module, AutoCSR, AutoDoc): + def __init__(self, platform, prefix, sim=False, build_prefix=""): + opdoc = "\n" + for mnemonic, description in opcodes.items(): + opdoc += f" * **{mnemonic}** ({str(description[0])}) -- {description[1]} \n" + + self.intro = ModuleDoc(title="Jareth", body=""" +Jareth is a vector computational engine based on the Curve25519 Engine. + +The Engine loosely resembles a Harvard architecture microcoded CPU, with a single +512-entry, 256-bit wide 2R1W windowed-register file, a handful of execution units, and a "mailbox" +unit (like a load/store, but transactional to wishbone). The Engine's microcode is +contained in a 1k-entry, 32-bit wide microcode block. Microcode procedures are written to +the block, and execution will start from the `mpstart` offset when the `go` bit is set. +Execution will stop after either one of two conditions are met: either a `FIN` instruction +is executed, or the microcode program counter (mpc) goes past the stop threshold, computed +as `mpstart` + `mplen`. + +The register file is "windowed". A single window consists of 32x256-bit wide registers, +and there are up to 16 windows. The concept behind windows is that core routines, such +as point doubling and point addition, are codable using no more than 32 intermediate +registers. 
The same microcode can be used, then, to serve point operations to up to +16 different clients, selectable by setting the appropriate window. Note that the register +file will stripe across four 4kiB pages, which means that memory protection can be +enforced at page-level boundaries by hardware (with the help of the OS) for up to four +separate clients, each getting four register windows. + +Every register read can be overridden from a constant ROM, by asserting `ca` or `cb` for +registers a and b respectively. When either of these bits are asserted, the respective +register address is fed into a "constants" lookup table, and the result of that table lookup is +replaced for the constant value. This means up to 32 commonly used constants may be stored +in the hardware for quick retrieval. + +.. image:: https://raw.githubusercontent.com/betrusted-io/gateware/master/gateware/curve25519/block_diagram.png + :alt: High-level block diagram of the Curev25519 engine + +Above is a high-level block diagram of the Curve25519 engine. Four clocks are present +in this microarchitecture, and they are phase-aligned thanks to the 7-Series MMCM +and low-skew global clock network. `eng_clk` is 50MHz, `mul_clk` is 100MHz, and +`rf_clk` is 200MHz. The slowest 50MHz `eng_clk` clock controls the `seq` state machine, whose +state names are listed on the left. A 50MHz base clock is chosen because this allows a +single-cycle 256-bit add/sub using hardware carry chains in the Spartan7 -1L speed grade, +greatly simplifying most of the arithmetic blocks. Faster clocks are used to pump the microcode +RAM (100MHz) and register file (200MHz), so that we are wasting less time fetching instructions +and operands. In particular, the register file uses four phases because we are emulating +a three-port register file (2R1W) using a single-port memory primitive, and the microcode RAM +runs at 100MHz (sysclk) for convenience of reading/writing instructions from the Wishbone bus. 
+Not shown in the diagram are the global "window" register bits, or the multiplexers that +switch off the datapaths when the system is not running allowing Wishbone full access to +the machine state. + +Execution units are subclasses of "ExecUnit", and their instantiation is controlled by +inclusion in the `exec_units` dictionary. Likewise, opcodes are defined in the `opcodes`, +dictionary, and opcodes are bound to ExecUnits by passing them as the `opcode_list` argument +to the execution units. + +Note that execution units can take an arbitrary amount of time to complete. Most will complete +in one cycle, but for example, the multiplier takes 52 cycles @ 100MHz, or 26 `eng_clk` cycles. +The current implementation does not allow pipelined operation; registered stages are provided +to break combinational paths and bring up the base clock rate, but every instruction must go through +the entire FETCH-EXEC-WAIT_DONE cycle before the next one can issue. + +The design is partially outfitted with registers to facilitate pipelining in the future, but +the current simplified implementation is expected to provide adequate speedup. It's +probably not worth the additional resources to do e.g. pipeline bypassing and hazard checking, +as the target FPGA design is nearly at capacity. + +A conservative implementation (no optimization of intermediate values, immediate reduction of +every add/sub operation) of Montgomery scalar multiplication using Engine25519 +completes one scalar multiply operation in 2.270ms, compared to 103ms in software. +This does not include the time required to do the final affine inversion (done in software, +with significant overhead -- about 100ms), or the time to load the microcode and operands (about 5us). +The affine inversion can also be microcoded, it just hasn't been done yet. 
+ +The Engine address space is divided up as follows (expressed as offset from base):: + + 0x0_0000 - 0x0_0fff: microcode (one 4k byte page) + 0x1_0000 - 0x1_3fff: memory-mapped register file (4 x 4k pages = 16kbytes) + +Here are the currently implemented opcodes for The Engine: +{} + """.format(opdoc)) + + microcode_width = 32 + microcode_depth = 1024 + running = Signal() # asserted when microcode is running + + instruction = Record(instruction_layout) # current instruction to execute + illegal_opcode = Signal() + abort = Signal(); + + ### register file + rf_depth_raw = num_registers * 1 # total # or registers + rf_width_raw = 256 # width of a register + granule = 8 + granule_bits = log2_int(granule) + granule_num = rf_width_raw//granule + granule_num_bits = log2_int(granule_num) + + self.submodules.rf = rf = RegisterFile(depth=rf_depth_raw, width=rf_width_raw) + self.window = CSRStorage(fields=[ + CSRField("window", size=max(1, log2_int(rf_depth_raw) - log2_int(num_registers)), description="Selects the current register window to use"), + ]) + + self.mpstart = CSRStorage(fields=[ + CSRField("mpstart", size=log2_int(microcode_depth), description="Where to start execution") + ]) + self.mplen = CSRStorage(fields=[ + CSRField("mplen", size=log2_int(microcode_depth), description="Length of the current microcode program. Thus valid code must be in the range of [mpstart, mpstart + mplen]"), + ]) + self.control = CSRStorage(fields=[ + CSRField("go", size=1, pulse=True, description="Writing to this puts the engine in `run` mode, and it will execute mplen microcode instructions starting at mpstart"), + ]) + self.mpresume = CSRStatus(fields=[ + CSRField("mpresume", size=log2_int(microcode_depth), description="Where to resume execution after a pause") + ]) + + self.power = CSRStorage(fields=[ + CSRField("on", size=1, reset=0, + description="Writing `1` turns on the clocks to this block, `0` stops the clocks (for power savings). 
The handling of the clock gate is in a different module, this is just a flag to that block."), + CSRField("pause_req", size=1, description="Writing a `1` to this block will pause execution at the next micro-op, and allow for read-out of data from RF/microcode. Must check pause_gnt to confirm the pause has happened. Used to interrupt flow for suspend/resume."), + ]) + # bring pause into the eng_clk domain + pause_req = Signal() + self.sync.eng_clk += pause_req.eq(self.power.fields.pause_req) + # re-sync the eng_clk phase to the RF phase whenever clocks are re-applied. We don't guarantee that the clocks start exactly + # at the same time, so you can get phase shift... + power_on_delay = Signal(max=16, reset=15) + eng_powered_on = Signal() + self.sync += [ # stretch out any power on pulse so we can process a reset in the clk50 domain after its enable has been switched on + If(~self.power.fields.on, + power_on_delay.eq(15) + ).Elif(power_on_delay > 0, + power_on_delay.eq(power_on_delay - 1) + ).Else( + power_on_delay.eq(0) + ), + eng_powered_on.eq(power_on_delay == 0), # make a signal that specifies that the engine is powered on that happens 16 cycles after the clocks are turned on + # note that this signal drops only *after* the power has been toggled, because when the clock is cut, + # the downstream "eng_clk" domain signals won't capture the latest state. So, once the power comes on, + # eng_powered_on must drop for a few cycles, then come back up again, which properly triggers a synchronization of the RF. 
+ ] + eng_on_50 = Signal() + eng_on_50_r = Signal() + self.specials += MultiReg(eng_powered_on, eng_on_50, "eng_clk") + self.sync.eng_clk += eng_on_50_r.eq(eng_on_50) + rf_reset_clear = Signal() + self.specials += MultiReg(ResetSignal("eng_clk"), rf_reset_clear, "eng_clk") # sync up the register file's fast clock to our slow clock + self.comb += rf.clear.eq(rf_reset_clear | (eng_on_50 & ~eng_on_50_r)) + + self.status = CSRStatus(fields=[ + CSRField("running", size=1, description="When set, the microcode engine is running. All wishbone access to RF and microcode memory areas will stall until this bit is clear"), + CSRField("mpc", size=log2_int(microcode_depth), description="Current location of the microcode program counter. Mostly for debug."), + CSRField("pause_gnt", size=1, description="When set, the engine execution has been paused, and the RF & microcode ROM can be read out for suspend/resume"), + CSRField("sigill", size=1, description="Illegal Instruction"), + CSRField("abort", size=1, description="Abort from failure"), + CSRField("finished", size=1, description="Finished"), + ]) + pause_gnt = Signal() + mpc = Signal(log2_int(microcode_depth)) # the microcode program counter + running_r = Signal() + self.sync += [ + self.status.fields.running.eq(running), + self.status.fields.pause_gnt.eq(pause_gnt), + self.status.fields.mpc.eq(mpc), + self.status.fields.sigill.eq(illegal_opcode), + self.status.fields.abort.eq(abort), + self.status.fields.finished.eq(((~running & running_r) | self.status.fields.finished) & (~(running & ~running_r))), + ] + + self.submodules.ev = EventManager() + self.ev.finished = EventSourcePulse(description="Microcode run finished execution") + self.ev.illegal_opcode = EventSourcePulse(description="Illegal opcode encountered") + self.ev.finalize() + ill_op_r = Signal() + self.sync += [ + running_r.eq(running), + ill_op_r.eq(illegal_opcode), + ] + self.comb += [ + self.ev.finished.trigger.eq(~running & running_r), # falling edge pulse on 
running + self.ev.illegal_opcode.trigger.eq(~ill_op_r & illegal_opcode), + ] + + ### microcode memory - 1rd/1wr dedicated to wishbone, 1rd for execution + microcode = Memory(microcode_width, microcode_depth) + self.specials += microcode + micro_wrport = microcode.get_port(write_capable=True, mode=READ_FIRST) # READ_FIRST allows BRAM inference + self.specials += micro_wrport + micro_rdport = microcode.get_port(mode=READ_FIRST) + self.specials += micro_rdport + micro_runport = microcode.get_port(mode=READ_FIRST) # , clock_domain="eng_clk" + self.specials += micro_runport + + self.comb += [ + micro_runport.adr.eq(mpc), + instruction.raw_bits().eq(micro_runport.dat_r), # mapping should follow the record definition *exactly* + instruction.eq(micro_runport.dat_r), + ] + instruction_fields = [] + for opcode, bits, description in instruction_layout: + instruction_fields.append(CSRField(opcode, size=bits, description=description)) + self.instruction = CSRStatus(description="Current instruction being executed by the engine. 
The format of this register exactly reflects the binary layout of an Engine instruction.", fields=instruction_fields) + self.comb += [ + self.instruction.status.eq(micro_runport.dat_r) + ] + + self.ls_status = CSRStatus(32, description="Status of the L/S unit") + + ### wishbone bus interface: decode the two address spaces and dispatch accordingly + self.bus = bus = wishbone.Interface() + wdata = Signal(32) + wadr = Signal(log2_int(rf_depth_raw) + 3) # wishbone bus is 32-bits wide, so 3 extra bits to select the sub-words out of the 256-bit registers + wmask = Signal(4) + wdata_we = Signal() + rdata_re = Signal() + rdata_ack = Signal() + rdata_req = Signal() + radr = Signal(log2_int(rf_depth_raw) + 3) + + micro_rd_waitstates = 2 + micro_rdack = Signal(max=(micro_rd_waitstates+1)) + self.sync += [ + If( ((bus.adr & ((0xFFFF_C000) >> 2)) >= ((prefix | 0x1_0000) >> 2)) & (((bus.adr & ((0xFFFF_C000) >> 2)) < ((prefix | 0x1_4000) >> 2))), + # fully decode register file address to avoid aliasing + If(bus.cyc & bus.stb & bus.we & ~bus.ack, + If(~running | pause_gnt, + wdata.eq(bus.dat_w), + wadr.eq(bus.adr[:wadr.nbits]), + wmask.eq(bus.sel), + wdata_we.eq(1), + If(rf.phase, + bus.ack.eq(1), + ).Else( + bus.ack.eq(0), + ), + ).Else( + wdata_we.eq(0), + bus.ack.eq(0), + ) + ).Elif(bus.cyc & bus.stb & ~bus.we & ~bus.ack, + If(~running | pause_gnt, + radr.eq(bus.adr[:radr.nbits]), + rdata_re.eq(1), + bus.dat_r.eq( rf.ra_dat >> ((radr & 0x7) * 32) ), + bus.ack.eq(rdata_ack), + rdata_req.eq(1), + ).Else( + rdata_re.eq(0), + bus.ack.eq(0), + rdata_req.eq(0), + ) + ).Else( + wdata_we.eq(0), + bus.ack.eq(0), + rdata_req.eq(0), + rdata_re.eq(0), + ) + ).Elif( (bus.adr & ((0xFFFF_F000) >> 2)) == ((0x0 | prefix) >> 2), + # fully decode microcode address to avoid aliasing + If(bus.cyc & bus.stb & bus.we & ~bus.ack, + micro_wrport.adr.eq(bus.adr), + micro_wrport.dat_w.eq(bus.dat_w), + micro_wrport.we.eq(1), + bus.ack.eq(1), + ).Elif(bus.cyc & bus.stb & ~bus.we & ~bus.ack, + 
micro_wrport.we.eq(0), + micro_rdport.adr.eq(bus.adr), + bus.dat_r.eq(micro_rdport.dat_r), + + If(micro_rdack == 0, # 1 cycle delay for read to occur + bus.ack.eq(1), + ).Else( + bus.ack.eq(0), + micro_rdack.eq(micro_rdack - 1), + ) + ).Else( + micro_wrport.we.eq(0), + micro_rdack.eq(micro_rd_waitstates), + bus.ack.eq(0), + ) + ).Else( + # handle all mis-target reads not explicitly decoded + If(bus.cyc & bus.stb & ~bus.we & ~bus.ack, + bus.dat_r.eq(0xC0DE_BADD), + bus.ack.eq(1), + ).Elif(bus.cyc & bus.stb & bus.we & ~bus.ack, + bus.ack.eq(1), # ignore writes -- but don't hang the bus + ).Else( + bus.ack.eq(0), + ) + + ) + ] + + ### execution path signals to register file + ra_dat = Signal(rf_width_raw) + ra_adr = Signal(log2_int(num_registers)) + ra_const = Signal() + r_shift = Signal() + rb_dat = Signal(rf_width_raw) + rb_adr = Signal(log2_int(num_registers)) + rb_const = Signal() + wd_dat = Signal(rf_width_raw) + wd_adr = Signal(log2_int(num_registers)) + wd_bwe = Signal(rf_width_raw//8, reset = 0xFFFF_FFFF) + rf_write = Signal() + + r_dat_f = Array(Signal(granule_num_bits-1, reset = 0) for x in range(4)) ## FIXME: mem ctrl is 256/2=128 bits so 1 fewer bits + r_dat_m = Array(Signal(granule_num, reset = ((1<<(granule_num))-1)) for x in range(4)) + + self.submodules.ra_const_rom = JarethConst(insert_docs=True) + self.submodules.rb_const_rom = JarethConst() + + ### merge execution path signals with host access paths + self.comb += [ + ra_const.eq(instruction.ca), + rb_const.eq(instruction.cb), + ra_adr.eq(instruction.ra), + rb_adr.eq(instruction.rb), + self.ra_const_rom.adr.eq(ra_adr), + self.rb_const_rom.adr.eq(rb_adr), + rf.window.eq(self.window.fields.window), + r_shift.eq(instruction.shift), + + If(running & ~pause_gnt, + rf.ra_adr.eq(Cat(ra_adr, self.window.fields.window)), + rf.rb_adr.eq(Cat(rb_adr, self.window.fields.window)), + rf.instruction_pipe_in.eq(instruction.raw_bits()), + rf.wd_adr.eq(Cat(wd_adr, self.window.fields.window)), + rf.wd_dat.eq(wd_dat), + 
rf.wd_bwe.eq(wd_bwe), + rf.we.eq(rf_write), + ).Else( + rf.ra_adr.eq(radr >> 3), + rf.wd_adr.eq(wadr >> 3), + rf.wd_dat.eq(Cat(wdata,wdata,wdata,wdata,wdata,wdata,wdata,wdata)), # replicate; use byte-enable to multiplex + rf.wd_bwe.eq(0xF << ((wadr & 0x7) * 4)), # select the byte + rf.we.eq(wdata_we), + ), + If(~ra_const, + #ra_dat.eq((rf.ra_dat >> (Cat(Signal(granule_bits, reset = 0), r_dat_f[0]))) & Cat(Replicate(r_dat_m[0][0], 8), Replicate(r_dat_m[0][1], 8), Replicate(r_dat_m[0][2], 8), Replicate(r_dat_m[0][3], 8), Replicate(r_dat_m[0][4], 8), Replicate(r_dat_m[0][5], 8), Replicate(r_dat_m[0][6], 8), Replicate(r_dat_m[0][7], 8), Replicate(r_dat_m[0][8], 8), Replicate(r_dat_m[0][9], 8), Replicate(r_dat_m[0][10], 8), Replicate(r_dat_m[0][11], 8), Replicate(r_dat_m[0][12], 8), Replicate(r_dat_m[0][13], 8), Replicate(r_dat_m[0][14], 8), Replicate(r_dat_m[0][15], 8), Replicate(r_dat_m[0][16], 8), Replicate(r_dat_m[0][17], 8), Replicate(r_dat_m[0][18], 8), Replicate(r_dat_m[0][19], 8), Replicate(r_dat_m[0][20], 8), Replicate(r_dat_m[0][21], 8), Replicate(r_dat_m[0][22], 8), Replicate(r_dat_m[0][23], 8), Replicate(r_dat_m[0][24], 8), Replicate(r_dat_m[0][25], 8), Replicate(r_dat_m[0][26], 8), Replicate(r_dat_m[0][27], 8), Replicate(r_dat_m[0][28], 8), Replicate(r_dat_m[0][29], 8), Replicate(r_dat_m[0][30], 8), Replicate(r_dat_m[0][31], 8))) + If(~r_shift, + ra_dat.eq(rf.ra_dat), + ).Else( + ra_dat.eq((rf.ra_dat >> (Cat(Signal(granule_bits, reset = 0), r_dat_f[0]))) & Cat([Replicate(r_dat_m[0][x], granule) for x in range(0, granule_num)])) + ) + ).Else( + ra_dat.eq(self.ra_const_rom.const), + ), + If(~rb_const, + # rb_dat.eq(rf.rb_dat[8*r_dat_f[1]:8+8*r_dat_l[1]]), + #Case(r_dat_f[1], + # {x: Case(r_dat_l[1], { y: rb_dat.eq(rf.rb_dat[x*8:(y+1)*8]) for y in range(x, 32) } ) for x in range(0, 32) } + #) + #rb_dat.eq((rf.rb_dat >> (Cat(Signal(granule_bits, reset = 0), r_dat_f[1]))) & Cat(Replicate(r_dat_m[1][0], 8), Replicate(r_dat_m[1][1], 8), Replicate(r_dat_m[1][2], 
8), Replicate(r_dat_m[1][3], 8), Replicate(r_dat_m[1][4], 8), Replicate(r_dat_m[1][5], 8), Replicate(r_dat_m[1][6], 8), Replicate(r_dat_m[1][7], 8), Replicate(r_dat_m[1][8], 8), Replicate(r_dat_m[1][9], 8), Replicate(r_dat_m[1][10], 8), Replicate(r_dat_m[1][11], 8), Replicate(r_dat_m[1][12], 8), Replicate(r_dat_m[1][13], 8), Replicate(r_dat_m[1][14], 8), Replicate(r_dat_m[1][15], 8), Replicate(r_dat_m[1][16], 8), Replicate(r_dat_m[1][17], 8), Replicate(r_dat_m[1][18], 8), Replicate(r_dat_m[1][19], 8), Replicate(r_dat_m[1][20], 8), Replicate(r_dat_m[1][21], 8), Replicate(r_dat_m[1][22], 8), Replicate(r_dat_m[1][23], 8), Replicate(r_dat_m[1][24], 8), Replicate(r_dat_m[1][25], 8), Replicate(r_dat_m[1][26], 8), Replicate(r_dat_m[1][27], 8), Replicate(r_dat_m[1][28], 8), Replicate(r_dat_m[1][29], 8), Replicate(r_dat_m[1][30], 8), Replicate(r_dat_m[1][31], 8))) + If(~r_shift, + rb_dat.eq(rf.rb_dat), + ).Else( + rb_dat.eq((rf.rb_dat >> (Cat(Signal(granule_bits, reset = 0), r_dat_f[1]))) & Cat([Replicate(r_dat_m[1][x], granule) for x in range(0, granule_num)])), + ) + ).Else( + rb_dat.eq(self.rb_const_rom.const) + ) + ] + # simple machine to wait 2 RF clock cycles for data to propagate out of the register file and back to the host + rd_wait_states=4 + bus_rd_wait = Signal(max=(rd_wait_states+1)) + self.sync.rf_clk += [ + If(rdata_req, + If(~running | pause_gnt, + If(bus_rd_wait != 0, + bus_rd_wait.eq(bus_rd_wait-1), + ).Else( + rdata_ack.eq(1), + ) + ) + ).Else( + rdata_ack.eq(0), + bus_rd_wait.eq(rd_wait_states), + ) + ] + + sext_immediate = Signal(log2_int(microcode_depth)) + self.comb += sext_immediate.eq(Cat(instruction.immediate, instruction.immediate[8])) # migen signed math failed us. so manually sign extend. this breaks the configurability of the code. + + ### Microcode sequencer. Very simple: it can only run linear sections of microcode. Feature not bug; + ### constant time operation is a defense against timing attacks. 
+ + # pulse-stretch the go from sys->eng_clk. Don't use Migen CDC primitives, as they add latency; a BlindTransfer + # primitive on its own will take about as much time as a couple instructions on The Engine. + engine_go = Signal() + go_stretch = Signal(2) + self.sync += [ # note that we will miss this if the system throttles our clocks when this pulse arrives + If(self.control.fields.go, + go_stretch.eq(2) + ).Else( + If(go_stretch != 0, + go_stretch.eq(go_stretch - 1), + ) + ) + ] + self.comb += engine_go.eq(self.control.fields.go | (go_stretch != 0)) + + self.submodules.seq = seq = ClockDomainsRenamer("eng_clk")(FSM(reset_state="IDLE")) + mpc_stop = Signal(log2_int(microcode_depth)) + window_latch = Signal(self.window.fields.window.size) + exec = Signal() # indicates to execution units to start running + done = Signal() # indicates when the given execution units are done (as-muxed from subunits) + self.comb += rf.running.eq(~seq.ongoing("IDLE") | rdata_re), # let the RF know when we're not executing, so it can idle to save power + seq.act("IDLE", + NextValue(pause_gnt, 0), + If(engine_go, + If(pause_req, + NextValue(mpc, self.mpresume.fields.mpresume) + ).Else( + NextValue(mpc, self.mpstart.fields.mpstart) + ), + NextValue(mpc_stop, self.mpstart.fields.mpstart + self.mplen.fields.mplen - 1), + NextValue(window_latch, self.window.fields.window), + NextValue(running, 1), + NextState("FETCH"), + ).Else( + NextValue(running, 0), + ) + ) + seq.act("FETCH", + If(pause_req, + NextState("PAUSED"), + NextValue(pause_gnt, 1), + ).Else( + # one cycle latency for instruction fetch + NextState("EXEC"), + NextValue(pause_gnt, 0), + ) + ) + seq.act("EXEC", # not a great name. This is actually where the register file fetches its contents. 
+ If(instruction.opcode == opcodes["BRZ"][0], + NextState("DO_BRZ"), + ).Elif(instruction.opcode == opcodes["BRNZ"][0], + NextState("DO_BRNZ"), + ).Elif(instruction.opcode == opcodes["FIN"][0], + NextState("IDLE"), + NextValue(running, 0), + ).Elif(instruction.opcode < opcodes["MAX"][0], # check if the opcode is legal before running it + exec.eq(1), + NextState("WAIT_DONE"), + ).Else( + NextState("ILLEGAL_OPCODE"), + ) + ) + seq.act("WAIT_DONE", # this is where the actual instruction execution happens. + If(done, # TODO: for now, we just wait for each instruction to finish; but the foundations are around for pipelining... + If(mpc < mpc_stop, + NextState("FETCH"), + NextValue(mpc, mpc + 1), + ).Else( + NextState("IDLE"), + NextValue(running, 0), + ) + ) + ) + seq.act("ILLEGAL_OPCODE", + NextState("IDLE"), + NextValue(running, 0), + illegal_opcode.eq(1), + ) + seq.act("DO_BRZ", + If(ra_dat == 0, + If( (sext_immediate + mpc + 1 < mpc_stop) & (sext_immediate + mpc + 1 >= self.mpstart.fields.mpstart), # validate new PC is in range + NextState("FETCH"), + NextValue(mpc, sext_immediate + mpc + 1), + ).Else( + NextState("IDLE"), + NextValue(running, 0), + ) + ).Else( + If(abort, + NextState("IDLE"), + NextValue(running, 0), + ).Elif(mpc < mpc_stop, + NextState("FETCH"), + NextValue(mpc, mpc + 1), + ).Else( + NextState("IDLE"), + NextValue(running, 0), + ) + ), + ) + seq.act("DO_BRNZ", + If(ra_dat != 0, + If( (sext_immediate + mpc + 1 < mpc_stop) & (sext_immediate + mpc + 1 >= self.mpstart.fields.mpstart), # validate new PC is in range + NextState("FETCH"), + NextValue(mpc, sext_immediate + mpc + 1), + ).Else( + NextState("IDLE"), + NextValue(running, 0), + ) + ).Else( + If(abort, + NextState("IDLE"), + NextValue(running, 0), + ).Elif(mpc < mpc_stop, + NextState("FETCH"), + NextValue(mpc, mpc + 1), + ).Else( + NextState("IDLE"), + NextValue(running, 0), + ) + ), + ) + seq.act("PAUSED", + If(~pause_req, + NextValue(pause_gnt, 0), + NextState("FETCH"), # could probably go 
directly to "EXEC", but, this is a minor detail recovering from pause + ) + ) + + #pad_SBUS_DATA_OE_LED = platform.request("SBUS_DATA_OE_LED") + #led = Signal(reset = 1) + #self.comb += pad_SBUS_DATA_OE_LED.eq(led) + self.busls = wishbone.Interface(data_width = 128, adr_width = 28) # FIXME: hardwired (here and elsewhere) + exec_units = { + "exec_logic" : ExecLogic(width=rf_width_raw), + "exec_addsub" : ExecAddSub(width=rf_width_raw), + "exec_ls" : ExecLS(width=rf_width_raw, interface=self.busls, r_dat_f=r_dat_f, r_dat_m=r_dat_m, granule=granule), + } + exec_units_shift = { + "exec_logic": True, + "exec_addsub": False, + "exec_ls": False, + } + exec_unit_shift_num = { } + index = 0 + + for name, unit in exec_units.items(): + setattr(self.submodules, name, unit); + setattr(self, "done" + str(index), Signal(name="done"+str(index))) + setattr(self, "unit_q" + str(index), Signal(wd_dat.nbits, name="unit_q"+str(index))) + setattr(self, "unit_sel" + str(index), Signal(name="unit_sel"+str(index))) + setattr(self, "unit_wd" + str(index), Signal(log2_int(num_registers), name="unit_wd"+str(index))) + if (exec_units_shift[name]): + setattr(self, "unit_shift" + str(index), Signal(name="unit_shift"+str(index))) + subdecode = Signal() + for op in unit.opcode_list: + self.comb += [ + If(instruction.opcode == opcodes[op][0], + subdecode.eq(1) + ) + ] + instruction_out = Record(instruction_layout) + self.comb += [ + instruction_out.raw_bits().eq(unit.instruction_out) + ] + self.comb += [ + unit.start.eq(exec & subdecode), + getattr(self, "done" + str(index)).eq(unit.q_valid), + unit.a.eq(ra_dat), + unit.b.eq(rb_dat), + unit.instruction_in.eq(instruction.raw_bits()), + getattr(self, "unit_q" + str(index)).eq(unit.q), + getattr(self, "unit_sel" + str(index)).eq(subdecode), + getattr(self, "unit_wd" + str(index)).eq(instruction_out.wd), + ] + if (exec_units_shift[name]): + self.comb += [ getattr(self, "unit_shift" + str(index)).eq(instruction_out.shift), ] + exec_unit_shift_num[index] 
= exec_units_shift[name] + index += 1 + + for i in range(index): + if (exec_unit_shift_num[i]): + self.comb += [ + If(getattr(self, "done" + str(i)), + done.eq(1), # TODO: for proper pipelining, handle case of two units done simultaneously! + If(getattr(self, "unit_shift" + str(i)), + wd_dat.eq(getattr(self, "unit_q" + str(i)) << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2]))), + wd_adr.eq(getattr(self, "unit_wd" + str(i))), + wd_bwe.eq(Cat([Replicate(r_dat_m[2][x], granule//8) for x in range(0, granule_num)])), + ).Else( + wd_dat.eq(getattr(self, "unit_q" + str(i))), + wd_adr.eq(getattr(self, "unit_wd" + str(i))), + wd_bwe.eq(0xFFFF_FFFF), + ) + ).Elif(seq.ongoing("IDLE"), + done.eq(0), + ) + ] + else: + self.comb += [ + If(getattr(self, "done" + str(i)), + done.eq(1), # TODO: for proper pipelining, handle case of two units done simultaneously! + wd_dat.eq(getattr(self, "unit_q" + str(i))), + wd_adr.eq(getattr(self, "unit_wd" + str(i))), + wd_bwe.eq(0xFFFF_FFFF), + ).Elif(seq.ongoing("IDLE"), + done.eq(0), + ) + ] + + self.comb += [ + rf_write.eq(done), + ] + + self.sync += abort.eq((abort & ~engine_go) | (self.exec_ls.has_failure[0] | self.exec_ls.has_failure[1] | self.exec_ls.has_timeout[0] | self.exec_ls.has_timeout[1])) + self.comb += self.ls_status.status.eq(self.exec_ls.state) + + ##### TIMING CONSTRAINTS -- you want these. Trust me. 
+ + clk50 = "clk50" + #clk100 = "clk100" + clk100 = "sysclk" + clk200 = "clk200" + # registered exec units need this set of rules + ### clk200->clk50 multi-cycle paths: + # we architecturally guarantee extra setup time from the register file to the point of consumption: + # read data is stable by the 3rd phase of the RF fetch cycle, and so it is in fact ready even before + # the other signals that trigger the execute mode, hence 4+1 cycles total setup time + platform.add_platform_command("set_multicycle_path 5 -setup -start -from [get_clocks " + clk200 + "] -to [get_clocks " + clk50 + "] -through [get_cells *rf_r*_dat_reg*]") + platform.add_platform_command("set_multicycle_path 4 -hold -end -from [get_clocks " + clk200 + "] -to [get_clocks " + clk50 + "] -through [get_cells *rf_r*_dat_reg*]") + ### clk200->clk100 multi-cycle paths: + # same as above, but for the multiplier path. + platform.add_platform_command("set_multicycle_path 3 -setup -start -from [get_clocks " + clk200 + "] -to [get_clocks " + clk100 + "] -through [get_cells *rf_r*_dat_reg*]") + platform.add_platform_command("set_multicycle_path 2 -hold -end -from [get_clocks " + clk200 + "] -to [get_clocks " + clk100 + "] -through [get_cells *rf_r*_dat_reg*]") + + # unregistered exec units need this set of rules + ### clk200->clk200 multi-cycle paths: + # this is for the case when we don't register the data, and just go straight from RF output to RF input. In the worst case + # we have three (? maybe five?)
clk200 cycles to compute as we phase through the reads and writes + platform.add_platform_command("set_multicycle_path 3 -setup -from [get_clocks " + clk200 + "] -to [get_clocks " + clk200 + "] -through [get_cells *rf_r*_dat_reg*]") + platform.add_platform_command("set_multicycle_path 2 -hold -end -from [get_clocks " + clk200 + "] -to [get_clocks " + clk200 + "] -through [get_cells *rf_r*_dat_reg*]") + + # other paths + ### sys->clk200 multi-cycle paths: + # microcode fetch is stable 10ns before use by the register file, by design + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=ra_const) + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=ra_const) + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=rb_const) + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=rb_const) + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.ra_const_rom.adr) + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.ra_const_rom.adr) + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.rb_const_rom.adr) + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.rb_const_rom.adr) + # ignore the clk200 
reset path for timing purposes -- there is >1 cycle guaranteed after reset for everything to settle before anything moves on these paths + platform.add_platform_command("set_false_path -through [get_nets " + clk200 + "_rst]") + # ignore the clk50 reset path for timing purposes -- there is > 1 cycle guaranteed after reset for everything to settle before anything moves on these paths (applies for other crypto engines, (SHA/AES) as well) + platform.add_platform_command("set_false_path -through [get_nets " + clk50 + "_rst]") + ### sys->clk50 multi-cycle paths: + # microcode fetch is guaranteed not to transition in the middle of an exec computation + platform.add_platform_command("set_multicycle_path 2 -setup -start -from [get_clocks " + clk100 + "] -to [get_clocks " + clk50 + "] -through [get_cells microcode_reg*]") + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk50 + "] -through [get_cells microcode_reg*]") + ### clk50->clk200 multi-cycle paths: + # engine running will set up a full eng_clk cycle before any RF accesses need to be valid + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_nets {{ {net1} {net2} {net3} }}]", net1=running, net2=running_r, net3=rf.running) + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_nets {{ {net1} {net2} {net3} }}]", net1=running, net2=running_r, net3=rf.running) + # this signal is a combo from clk50+sys + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]") + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]") + # data writeback happens on phase==2, 
and thus is stable for at least two clk200 clocks extra + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]") + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]") + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]") + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]") + ### sys->clk200 multi-cycle paths: + # data writeback happens on phase==2, and thus is stable for at least two clk200 clocks extra + one full eng_clk (total 25ns) + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]") + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]") + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]") + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]") + # this signal is a combo from clk50+sys + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]") + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]") diff --git 
a/sbus-to-ztex-gateware-migen/jareth_code/Cargo.lock b/sbus-to-ztex-gateware-migen/jareth_code/Cargo.lock new file mode 100644 index 0000000..4be67b7 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/jareth_code/Cargo.lock @@ -0,0 +1,13 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "jareth-as" +version = "0.1.0" + +[[package]] +name = "jareth_code" +version = "0.1.0" +dependencies = [ + "jareth-as 0.1.0", +] + diff --git a/sbus-to-ztex-gateware-migen/jareth_code/Cargo.toml b/sbus-to-ztex-gateware-migen/jareth_code/Cargo.toml new file mode 100644 index 0000000..ead6a03 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/jareth_code/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "jareth_code" +version = "0.1.0" +authors = ["Romain Dolbeau "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + +[dependencies.jareth-as] +#git="https://github.com/betrusted-io/jareth-as.git" +#rev="6681e73c1fdc4a460b5ef9f9c7c91aef546d00f3" +path = "/home/dolbeau/jareth-as" + +[dev-dependencies.jareth-as] +#git="https://github.com/betrusted-io/jareth-as.git" +#rev="6681e73c1fdc4a460b5ef9f9c7c91aef546d00f3" +path = "/home/dolbeau/jareth-as" + +[[bin]] +name = "jareth_code" +path = "jareth_code.rs" diff --git a/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs new file mode 100644 index 0000000..aca9d00 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/jareth_code/jareth_code.rs @@ -0,0 +1,108 @@ +#![recursion_limit="768"] + +extern crate jareth_as; +use jareth_as::*; + +fn main() -> std::io::Result<()> { + let mcode = assemble_jareth!( + // 0..0 $DST / $DST / $SRC in %0 + // 0..0 $DST / $SRC / $DST in %1 + // size in %2 + // pattern in %3 + // ----- + // size & 7 in %5 + // size rounded down in %6 + // input in %16 + // output in %17 + // 0 in %31 + start: + resm %31 + setadr %31, 
%0 + load256inc %16, %0 + load256inc %17, %1 + // slow + setma %31, %0, #16 + // slow + setmq %31, %1, #16 + and %5, %2, #15 + sub %6, %2, %5 + brz done, %6 + loop: + psa %18, %16 + psa %19, %17 + psa* %17, %16 + psa %20, %17 + store128inc %31, %2, %17 + sub %6, %6, #16 + brz last, %6 + loadh128inc %16, %0, %16 + loadh128inc %17, %1, %17 + brz loop, #0 + last: + // FIXME: not if Q is aligned + loadh128inc %17, %1, %17 + store128inc %31, %2, %17 + done: + getadr %3 + getm %2 + fin + fin + ); + let _mcode3 = assemble_jareth!( + // 0..0 / $DST / $SRC in %0 + // size in %2 + // pattern in %3 + start: + resm %31 + psa %31, #0 + psa %30, #1 + sub %30, %31, %30 + psa %29, #2 + setmq %31, %29, %2 + setma %31, %0, %2 + psa* %30, %3 + getm %3 + resm %31 + psa %2, %30 + setadr %31 , %0 + load256 %1, %0 + load128 %0, %0 + fin + fin + ); + let _mcode2 = assemble_jareth!( + psa %1, %3 + setma %31, %0, %2 + psa %2, %3 + getm %3 + fin + fin + resm %31 + psa %0, %3 + setmq %31, %1, %2 + psa %1, %3 + fin + fin + fin + setma %31, %0, %2 + setma %31, %0, %2 + resm %31 + fin + fin + fin + fin + ); + + let mut pos; + + pos = 0; + println!("test code:"); + while pos < mcode.len() { + print!("0x{:08x},", mcode[pos]); + pos = pos + 1; + } + println!(""); + println!("-> {}", mcode.len()); + + Ok(()) +} diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py index 7e7b199..d605e77 100644 --- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py @@ -31,6 +31,8 @@ USBOHCI_ADDR_PFX = Signal(12, reset = 0x008) SRAM_ADDR_PFX = Signal(12, reset = 0x009) # unmapped ; LE ENGINE_ADDR_PFXA = Signal(12, reset = 0x00a) ENGINE_ADDR_PFXB = Signal(12, reset = 0x00b) +JARETH_ADDR_PFXA = Signal(12, reset = 0x00c) +JARETH_ADDR_PFXB = Signal(12, reset = 0x00d) CG6_BT_ADDR_PFX = Signal(12, reset = 0x020) CG6_ALT_ADDR_PFX = Signal(12, reset = 0x028) CG6_FHC_ADDR_PFX = Signal(12, reset = 0x030) @@ -503,6 
+505,8 @@ class SBusFPGABus(Module): (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXA) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXB) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == JARETH_ADDR_PFXA) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == JARETH_ADDR_PFXB) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_BT_ADDR_PFX) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_ALT_ADDR_PFX) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_FHC_ADDR_PFX) | @@ -704,6 +708,8 @@ class SBusFPGABus(Module): (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXA) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXB) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == JARETH_ADDR_PFXA) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == JARETH_ADDR_PFXB) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_BT_ADDR_PFX) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_ALT_ADDR_PFX) | (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == CG6_FHC_ADDR_PFX) | diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_prom.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_prom.py index bb17092..17ca500 100644 --- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_prom.py +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_prom.py @@ -104,7 +104,8 @@ def get_prom(soc, cg3=False, cg6=False, cg3_res=None, - sdcard=False): + sdcard=False, + jareth=False): framebuffer = (bw2 or cg3 or cg6) @@ -119,7 +120,7 @@ def get_prom(soc, r += "\" RDOL,sbusstat\" device-name\n" r += get_header_map_stuff("sbus_bus_stat", "sbus_bus_stat", 256) - if (trng or usb or (sdram or not sdram) or engine or i2c or 
framebuffer or sdcard): + if (trng or usb or (sdram or not sdram) or engine or i2c or framebuffer or sdcard or jareth): r += "finish-device\nnew-device\n" if (trng): @@ -131,7 +132,7 @@ def get_prom(soc, r += " map-out-trng\n" r += ";\n" r += "disabletrng!\n" - if (usb or (sdram or not sdram) or engine or i2c or framebuffer or sdcard): + if (usb or (sdram or not sdram) or engine or i2c or framebuffer or sdcard or jareth): r += "finish-device\nnew-device\n" if (usb): @@ -151,7 +152,7 @@ def get_prom(soc, r += " map-out-usb_host_ctrl\n" r += ";\n" r += "my-reset!\n" - if ((sdram or not sdram) or engine or i2c or framebuffer or sdcard): + if ((sdram or not sdram) or engine or i2c or framebuffer or sdcard or jareth): r += "finish-device\nnew-device\n" if (sdram): @@ -176,15 +177,15 @@ def get_prom(soc, r += "\" RDOL,hidden_sdram\" device-name\n" r += get_header_mapx_stuff("mregs", [ "ddrphy", "sdram" ], [ 4096, 4096 ], [ "csr", "csr" ]) r += "fload sdram_init.fth\ninit!\n" - if (engine or i2c or framebuffer or sdcard): + if (engine or i2c or framebuffer or sdcard or jareth): r += "finish-device\nnew-device\n" if (engine): r += "\" betrustedc25519e\" device-name\n" r += ": sbusfpga_regionaddr_curve25519engine-microcode sbusfpga_regionaddr_curve25519engine ;\n" r += ": sbusfpga_regionaddr_curve25519engine-regfile sbusfpga_regionaddr_curve25519engine h# 10000 + ;\n" - r += get_header_mapx_stuff("curve25519engine", [ "curve25519engine-regs", "curve25519engine-microcode", "curve25519engine-regfile" ], [ 4096, 4096, 65536 ] , ["csr", "region", "region" ] ) - if (i2c or framebuffer or sdcard): + r += get_header_mapx_stuff("curve25519engine", [ "curve25519engine", "curve25519engine-microcode", "curve25519engine-regfile" ], [ 4096, 4096, 65536 ] , ["csr", "region", "region" ] ) + if (i2c or framebuffer or sdcard or jareth): r += "finish-device\nnew-device\n" if (i2c): @@ -199,7 +200,7 @@ def get_prom(soc, r += " \" lm75\" encode-string \" compatible\" property\n" r += " h# 48 
encode-int \" addr\" property\n" r += " finish-device\n" - if (framebuffer or sdcard): + if (framebuffer or sdcard or jareth): r += "finish-device\nnew-device\n" if (framebuffer): @@ -233,7 +234,7 @@ def get_prom(soc, else: r += get_header_map_stuff("cg6extraregs", "cg6", 4096, reg=False) r += "fload cg6_init.fth\ncg6_init!\n" - if (sdcard): + if (sdcard or jareth): r += "finish-device\nnew-device\n" if (sdcard): @@ -249,6 +250,15 @@ def get_prom(soc, r += "sdcard-init!\n" r += "fload sdcard.fth\n" r += "fload sdcard_access.fth\n" + if (jareth): + r += "finish-device\nnew-device\n" + + if (jareth): + r += "\" jareth\" device-name\n" + r += ": sbusfpga_regionaddr_jareth-microcode sbusfpga_regionaddr_jareth ;\n" + r += ": sbusfpga_regionaddr_jareth-regfile sbusfpga_regionaddr_jareth h# 10000 + ;\n" + r += get_header_mapx_stuff("jareth", [ "jareth", "jareth-microcode", "jareth-regfile" ], [ 4096, 4096, 4096 ] , ["csr", "region", "region" ] ) + r += "end0\n" return r diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py index 4ec713f..06c98d6 100644 --- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py @@ -40,7 +40,6 @@ import bw2_fb import cg3_fb import cg6_fb import cg6_accel -#import cgtrois # Wishbone stuff from sbus_wb import WishboneDomainCrossingMaster @@ -65,7 +64,7 @@ class _CRG(Module): # self.clock_domains.cd_por = ClockDomain() # 48 MHz native, reset'ed by SBus, power-on-reset timer if (usb): self.clock_domains.cd_usb = ClockDomain() # 48 MHZ PLL, reset'ed by SBus (via pll), for USB controller - if (engine): # also used for cgtrois + if (engine): # also used for Jareth self.clock_domains.cd_clk50 = ClockDomain() # 50 MHz (gated) for curve25519engine -> eng_clk #self.clock_domains.cd_clk100 = ClockDomain() # 100 MHz for curve25519engine -> sys_clk self.clock_domains.cd_clk200 = ClockDomain() # 200 MHz (gated) for curve25519engine -> rf_clk @@ -121,7 
+120,7 @@ class _CRG(Module): #platform.add_false_path_constraints(self.cd_sys.clk, self.cd_sbus.clk) #platform.add_false_path_constraints(self.cd_sbus.clk, self.cd_sys.clk) ##platform.add_false_path_constraints(self.cd_native.clk, self.cd_sys.clk) - if (engine): # also used for cgtrois + if (engine): # also used for Jareth pll.create_clkout(self.cd_clk50, sys_clk_freq/2, ce=pll.locked & self.curve25519_on) platform.add_platform_command("create_generated_clock -name clk50 [get_pins {{{{MMCME2_ADV/CLKOUT{}}}}}]".format(num_clk)) num_clk = num_clk + 1 @@ -212,7 +211,7 @@ class SBusFPGA(SoCCore): #if self.irq.enabled: #self.irq.add(name, use_loc_if_exists=True) - def __init__(self, variant, version, sys_clk_freq, trng, usb, sdram, engine, i2c, bw2, cg3, cg6, cg3_res, sdcard, **kwargs): + def __init__(self, variant, version, sys_clk_freq, trng, usb, sdram, engine, i2c, bw2, cg3, cg6, cg3_res, sdcard, jareth, **kwargs): framebuffer = (bw2 or cg3 or cg6) print(f"Building SBusFPGA for board version {version}") @@ -275,6 +274,7 @@ class SBusFPGA(SoCCore): "usb_host": 0x00080000, # OHCI registers are here, not in CSR #"usb_shared_mem": 0x00090000, # unused ATM "curve25519engine": 0x000a0000, # includes microcode (4 KiB@0) and registers (16 KiB @ 64 KiB) + "jareth": 0x000c0000, # includes microcode (4 KiB@0) and registers (2 KiB @ 64 KiB) "cg6_bt": 0x00200000, # required for compatibility, bt_regs for cg6 #"cg6_dhc": 0x00240000, # required for compatibility, unused "cg6_alt": 0x00280000, # required for compatibility @@ -291,7 +291,9 @@ class SBusFPGA(SoCCore): "dvma_bridge": 0xfc000000, # required to match DVMA virtual addresses } self.mem_map.update(wb_mem_map) - self.submodules.crg = _CRG(platform=platform, sys_clk_freq=sys_clk_freq, usb=usb, usb_clk_freq=48e6, engine=engine, framebuffer=framebuffer, pix_clk=litex.soc.cores.video.video_timings[cg3_res]["pix_clk"]) + self.submodules.crg = _CRG(platform=platform, sys_clk_freq=sys_clk_freq, usb=usb, usb_clk_freq=48e6, 
engine=(engine or jareth), framebuffer=framebuffer, pix_clk=litex.soc.cores.video.video_timings[cg3_res]["pix_clk"]) + + #self.platform.add_period_constraint(self.platform.lookup_request("SBUS_3V3_CLK", loose=True), 1e9/25e6) # SBus max ## add our custom timings after the clocks have been defined @@ -484,7 +486,7 @@ class SBusFPGA(SoCCore): #self.comb += pad_sdcard_interrupt.eq(sig_sdcard_interrupt) #self.comb += sig_sdcard_interrupt.eq(~self.sdirq.irq) ## - if (usb or engine or sdcard): + if (usb or engine or sdcard or jareth): # jareth only for testing if (not single_dvma_master): self.bus.add_slave(name="dvma_bridge", slave=self.wishbone_slave_sys, region=SoCRegion(origin=self.mem_map.get("dvma_bridge", None), size=0x03ffffff, cached=False)) @@ -502,7 +504,8 @@ class SBusFPGA(SoCCore): self.bus.add_master(name="curve25519engineLS", master=self.curve25519engine.busls) else: self.comb += self.curve25519engine.busls.connect(self.wishbone_slave_sys) - self.comb += self.crg.curve25519_on.eq(self.curve25519engine.power.fields.on) + if (not jareth): + self.comb += self.crg.curve25519_on.eq(self.curve25519engine.power.fields.on) if (i2c): self.submodules.i2c = RTLI2C(platform, pads=platform.request("i2c")) @@ -535,6 +538,16 @@ class SBusFPGA(SoCCore): self.add_ram("cg6_accel_rom", origin=self.mem_map["cg6_accel_rom"], size=rounded_cg6_rom_len, contents=cg6_rom_data, mode="r") self.add_ram("cg6_accel_ram", origin=self.mem_map["cg6_accel_ram"], size=2**12, mode="rw") + if (jareth): + from jareth import Jareth; + self.submodules.jareth = ClockDomainsRenamer({"eng_clk":"clk50", "rf_clk":"clk200", "mul_clk":"clk100_gated"})(Jareth(platform=platform,prefix=self.mem_map.get("jareth", None))) # , "sys":"clk100" + self.bus.add_slave("jareth", self.jareth.bus, SoCRegion(origin=self.mem_map.get("jareth", None), size=0x20000, cached=False)) + self.bus.add_master(name="jarethLS", master=self.jareth.busls) # Jareth doesn't need the DVMA + if (not engine): + self.comb += 
self.crg.curve25519_on.eq(self.jareth.power.fields.on) + else: + self.comb += self.crg.curve25519_on.eq(self.jareth.power.fields.on | self.curve25519engine.power.fields.on) + print("IRQ to Device map:\n") print(platform.irq_device_map) print("Device to IRQ map:\n") @@ -566,12 +579,13 @@ def main(): parser.add_argument("--cg3-res", default="1152x900@76Hz", help="Specify the CG3/CG6 resolution") parser.add_argument("--cg6", action="store_true", help="add a CG6 framebuffer [V1.2+VGA_RGB222 pmod]") parser.add_argument("--sdcard", action="store_true", help="add a sdcard {no SW yet}") + parser.add_argument("--jareth", action="store_true", help="add a Jareth vector core [all]") builder_args(parser) vivado_build_args(parser) args = parser.parse_args() if (args.sdram == False): - print(" ***** WARNING ***** : not enablling the SDRAM still adds a controller, but doesn't add the DMA engines\n") + print(" ***** WARNING ***** : not enabling the SDRAM still adds a controller, but doesn't add the DMA engines\n") if (args.usb and (args.version == "V1.0")): print(" ***** WARNING ***** : USB on V1.0 is an ugly hack \n"); if (args.i2c): @@ -596,7 +610,8 @@ def main(): cg3=args.cg3, cg6=args.cg6, cg3_res=args.cg3_res, - sdcard=args.sdcard) + sdcard=args.sdcard, + jareth=args.jareth) #soc.add_uart(name="uart", baudrate=115200, fifo_depth=16) version_for_filename = args.version.replace(".", "_") @@ -644,7 +659,8 @@ def main(): cg3=args.cg3, cg6=args.cg6, cg3_res=args.cg3_res, - sdcard=args.sdcard) + sdcard=args.sdcard, + jareth=args.jareth) write_to_file(os.path.join(f"prom_{version_for_filename}.fth"), prom_content)