diff --git a/NetBSD/9.0/etc/disktab.sbusfpga b/NetBSD/9.0/etc/disktab.sbusfpga new file mode 100644 index 0000000..31dd600 --- /dev/null +++ b/NetBSD/9.0/etc/disktab.sbusfpga @@ -0,0 +1,4 @@ +sbusfpga256|SBusFPGA with 256 MiB SDRAM: \ + :ns#2:nt#4:nc#65536:se#512: \ + :oa#0:pa#524288:ta=4.2BSD: \ + :oc#0:pc#524288: diff --git a/NetBSD/9.0/etc/rc.d/sbusfpga_sdram b/NetBSD/9.0/etc/rc.d/sbusfpga_sdram new file mode 100644 index 0000000..fe60532 --- /dev/null +++ b/NetBSD/9.0/etc/rc.d/sbusfpga_sdram @@ -0,0 +1,13 @@ +#!/bin/sh +# +# $NetBSD$ +# + +# PROVIDE: SBUSFPGA_SDRAM + +if test -b /dev/sbusfpga_sdram0; then + for DEVICE in /dev/sbusfpga_sdram[0-9]; do + test -b ${DEVICE} && disklabel -w ${DEVICE} sbusfpga256 || return + test -b ${DEVICE}a && newfs ${DEVICE}a || return + done +fi diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/ohci_sbus.c b/NetBSD/9.0/usr/src/sys/dev/sbus/ohci_sbus.c new file mode 100644 index 0000000..aaeb78a --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/ohci_sbus.c @@ -0,0 +1,197 @@ +/* $NetBSD$ */ + +/* + * Copyright (c) 1998, 2021 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Lennart Augustsson (lennart@augustsson.net) at + * Carlstedt Research & Technology. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +struct ohci_sbus_softc { + ohci_softc_t sc; + void *sc_ih; + int sc_node; + int sc_burst; +}; + +static int +ohci_sbus_match(device_t parent, cfdata_t match, void *aux) +{ + struct sbus_attach_args *sa = (struct sbus_attach_args *)aux; + /* generic-ohci is the default name, from device-tree */ + if (strcmp("generic-ohci", sa->sa_name) == 0) + return 1; + /* usb is the OFW name, qualified by device-type */ + const char* type = prom_getpropstring(sa->sa_node, "device-type"); + if (type != NULL && (strcmp("ohci", type) == 0)) + return 1; + return 0; +} + +static void +ohci_sbus_attach(device_t parent, device_t self, void *aux) +{ + struct ohci_sbus_softc *sc = device_private(self); + struct sbus_attach_args *sa = (struct sbus_attach_args *)aux; + struct sbus_softc *sbsc = device_private(parent); + int sbusburst; + + sc->sc.sc_dev = self; + sc->sc.sc_bus.ub_hcpriv = sc; + sc->sc.iot = sa->sa_bustag; + sc->sc.sc_size = sa->sa_size; + + /* **** SBus specific */ + sc->sc_node = sa->sa_node; + /* + * Get transfer burst size from PROM + */ 
+ sbusburst = sbsc->sc_burst; + if (sbusburst == 0) + sbusburst = SBUS_BURST_32 - 1; /* 1->16 */ + sc->sc_burst = prom_getpropint(sc->sc_node, "burst-sizes", -1); + if (sc->sc_burst == -1) + /* take SBus burst sizes */ + sc->sc_burst = sbusburst; + /* Clamp at parent's burst sizes */ + sc->sc_burst &= sbusburst; + + if (0) { /* in PCI there's a test for some specific controller */ + sc->sc.sc_flags = OHCIF_SUPERIO; + } + + /* check if memory space access is enabled */ + /* CHECKME: not needed ? */ + + /* Map I/O registers */ + if (sbus_bus_map(sc->sc.iot, sa->sa_slot, sa->sa_offset, sc->sc.sc_size, + BUS_SPACE_MAP_LINEAR, &sc->sc.ioh) != 0) { + aprint_error_dev(self, ": cannot map registers\n"); + return; + } + + aprint_normal_dev(self, "nid 0x%x, bustag %p (0x%zx @ 0x%08lx), burst 0x%x (parent 0x%0x)\n", + sc->sc_node, + sc->sc.iot, + (size_t)sc->sc.sc_size, + sc->sc.ioh, + sc->sc_burst, + sbsc->sc_burst); + + /* we're SPECIAL!!! */ + /* sc->sc.sc_endian = OHCI_BIG_ENDIAN; */ + + /* Disable interrupts, so we don't get any spurious ones. */ + bus_space_write_4(sc->sc.iot, sc->sc.ioh, OHCI_INTERRUPT_DISABLE, + OHCI_ALL_INTRS); + + sc->sc.sc_bus.ub_dmatag = sa->sa_dmatag; + /* sc->sc.sc_bus.ub_dmatag = (void*)((char*)sc->sc.ioh + 0x10000); */ + + /* Enable the device. */ + /* CHECKME: not needed ? */ + + /* Map and establish the interrupt. 
*/ + if (sa->sa_nintr != 0) { + sc->sc_ih = bus_intr_establish(sc->sc.iot, sa->sa_pri, + IPL_NET, ohci_intr, sc); // checkme: interrupt priority + if (sc->sc_ih == NULL) { + aprint_error_dev(self, "couldn't establish interrupt (%d)\n", sa->sa_nintr); + } else + aprint_normal_dev(self, "interrupting at %d / %d / %d\n", sa->sa_nintr, sa->sa_pri, IPL_NET); + } else { + aprint_error_dev(self, "no interrupt defined in PROM\n"); + goto fail; + } + + int err = ohci_init(&sc->sc); + if (err) { + aprint_error_dev(self, "init failed, error=%d\n", err); + goto fail; + } + + if (!pmf_device_register1(self, ohci_suspend, ohci_resume, + ohci_shutdown)) + aprint_error_dev(self, "couldn't establish power handler\n"); + + /* Attach usb device. */ + sc->sc.sc_child = config_found(self, &sc->sc.sc_bus, usbctlprint); + return; + +fail: + /* should we unmap ? */ + return; +} + +static int +ohci_sbus_detach(device_t self, int flags) +{ + struct ohci_sbus_softc *sc = device_private(self); + int rv; + + rv = ohci_detach(&sc->sc, flags); + if (rv) + return rv; + + pmf_device_deregister(self); + + ohci_shutdown(self, flags); + + /* Disable interrupts, so we don't get any spurious ones. */ + bus_space_write_4(sc->sc.iot, sc->sc.ioh, + OHCI_INTERRUPT_DISABLE, OHCI_ALL_INTRS); + + /* can we disestablish the interrupt ? */ + /* can we unmap the registers ? 
*/ + return 0; +} + +CFATTACH_DECL3_NEW(ohci_sbus, sizeof(struct ohci_sbus_softc), + ohci_sbus_match, ohci_sbus_attach, ohci_sbus_detach, ohci_activate, NULL, + ohci_childdet, DVF_DETACH_SHUTDOWN); diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/rdfpga_sdcard.c b/NetBSD/9.0/usr/src/sys/dev/sbus/rdfpga_sdcard.c index de00be6..c8968ba 100644 --- a/NetBSD/9.0/usr/src/sys/dev/sbus/rdfpga_sdcard.c +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/rdfpga_sdcard.c @@ -112,7 +112,7 @@ extern struct cfdriver rdfpga_sdcard_cd; static int rdfpga_sdcard_wait_dma_ready(struct rdfpga_sdcard_softc *sc, const int count); static int rdfpga_sdcard_wait_device_ready(struct rdfpga_sdcard_softc *sc, const int count); -static int rdfpga_sdcard_read_block(struct rdfpga_sdcard_softc *sc, const u_int32_t block, void *data); +static int rdfpga_sdcard_read_block(struct rdfpga_sdcard_softc *sc, const u_int32_t block, const u_int32_t blkcnt, void *data); static int rdfpga_sdcard_write_block(struct rdfpga_sdcard_softc *sc, const u_int32_t block, void *data); struct rdfpga_sdcard_rb_32to512 { @@ -179,7 +179,7 @@ rdfpga_sdcard_ioctl (dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) case RDFPGA_SDCARD_RB: { struct rdfpga_sdcard_rb_32to512* u = data; - err = rdfpga_sdcard_read_block(sc, u->block, u->data); + err = rdfpga_sdcard_read_block(sc, u->block, 1, u->data); break; } case RDFPGA_SDCARD_WB: @@ -188,49 +188,6 @@ rdfpga_sdcard_ioctl (dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) err = rdfpga_sdcard_write_block(sc, u->block, u->data); break; } - - - #if 0 - case DIOCGDINFO: - *(struct disklabel *)data = *(sc->dk.sc_dkdev.dk_label); - break; - - case DIOCGDEFLABEL: - { - struct disklabel *lp = sc->dk.sc_dkdev.dk_label; - struct cpu_disklabel *clp = sc->dk.sc_dkdev.dk_cpulabel; - memset(lp, 0, sizeof(struct disklabel)); - memset(clp, 0, sizeof(struct cpu_disklabel)); - if (readdisklabel(dev, rdfpga_sdcard_strategy, lp, clp) != NULL) { - int i; - aprint_normal_dev(sc->dk.sc_dev, "read 
disk label OK\n"); - strncpy(lp->d_packname, "default label", sizeof(lp->d_packname)); - /* - * Reset the partition info; it might have gotten - * trashed in readdisklabel(). - * - * XXX Why do we have to do this? readdisklabel() - * should be safe... - */ - for (i = 0; i < MAXPARTITIONS; ++i) { - lp->d_partitions[i].p_offset = 0; - if (i == RAW_PART) { - lp->d_partitions[i].p_size = - lp->d_secpercyl * lp->d_ncylinders; - lp->d_partitions[i].p_fstype = FS_BSDFFS; - } else { - lp->d_partitions[i].p_size = 0; - lp->d_partitions[i].p_fstype = FS_UNUSED; - } - } - lp->d_npartitions = RAW_PART + 1; - memcpy(data, lp, sizeof(struct disklabel)); - } else { - aprint_normal_dev(sc->dk.sc_dev, "read disk label FAILED\n"); - } - } - break; -#endif /* case VNDIOCCLR: */ /* case VNDIOCCLR50: */ @@ -505,9 +462,10 @@ static int rdfpga_sdcard_wait_device_ready(struct rdfpga_sdcard_softc *sc, const return rdfpga_sdcard_wait_dma_ready(sc, count); } -static int rdfpga_sdcard_read_block(struct rdfpga_sdcard_softc *sc, const u_int32_t block, void *data) { +static int rdfpga_sdcard_read_block(struct rdfpga_sdcard_softc *sc, const u_int32_t block, const u_int32_t blkcnt, void *data) { int res = 0; - u_int32_t ctrl; + u_int32_t ctrl = 0; + u_int32_t idx = 0; /* aprint_normal_dev(sc->dk.sc_dev, "Reading block %u from sdcard\n", block); */ if ((res = rdfpga_sdcard_wait_device_ready(sc, 50000)) != 0) @@ -524,9 +482,6 @@ static int rdfpga_sdcard_read_block(struct rdfpga_sdcard_softc *sc, const u_int3 bus_dmamem_free(sc->sc_dmatag, &sc->sc_segs, 1); return ENXIO; } - - /* for testing only, remove */ - //memcpy(kvap, data, 512); if (bus_dmamap_load(sc->sc_dmatag, sc->sc_dmamap, kvap, RDFPGA_SDCARD_VAL_DMA_MAX_SZ, /* kernel space */ NULL, BUS_DMA_NOWAIT | BUS_DMA_STREAMING | BUS_DMA_WRITE)) { @@ -536,24 +491,28 @@ static int rdfpga_sdcard_read_block(struct rdfpga_sdcard_softc *sc, const u_int3 return ENXIO; } - bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, 512, BUS_DMASYNC_PREWRITE); + 
bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, blkcnt * 512, BUS_DMASYNC_PREWRITE); - /* set DMA address */ - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs, RDFPGA_SDCARD_REG_DMAW_ADDR, (uint32_t)(sc->sc_dmamap->dm_segs[0].ds_addr)); - /* set block to read */ - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs, RDFPGA_SDCARD_REG_ADDR, block); - ctrl = RDFPGA_SDCARD_CTRL_START | RDFPGA_SDCARD_CTRL_READ; - /* initiate reading block from SDcard; once the read request is acknowledged, the HW will start the DMA engine */ - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs, RDFPGA_SDCARD_REG_CTRL, ctrl); + for (idx = 0 ; idx < blkcnt && !res; idx++) { + bus_addr_t addr = sc->sc_dmamap->dm_segs[0].ds_addr + 512 * idx; + + /* set DMA address */ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs, RDFPGA_SDCARD_REG_DMAW_ADDR, (uint32_t)(addr)); + /* set block to read */ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs, RDFPGA_SDCARD_REG_ADDR, (block + idx)); + ctrl = RDFPGA_SDCARD_CTRL_START | RDFPGA_SDCARD_CTRL_READ; + /* initiate reading block from SDcard; once the read request is acknowledged, the HW will start the DMA engine */ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs, RDFPGA_SDCARD_REG_CTRL, ctrl); + + res = rdfpga_sdcard_wait_device_ready(sc, 100000); + } - res = rdfpga_sdcard_wait_device_ready(sc, 100000); - - bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, 512, BUS_DMASYNC_POSTWRITE); + bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, blkcnt * 512, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->sc_dmatag, sc->sc_dmamap); /* aprint_normal_dev(sc->dk.sc_dev, "dma: unloaded\n"); */ - memcpy(data, kvap, 512); + memcpy(data, kvap, blkcnt * 512); bus_dmamem_unmap(sc->sc_dmatag, kvap, RDFPGA_SDCARD_VAL_DMA_MAX_SZ); /* aprint_normal_dev(sc->dk.sc_dev, "dma: unmapped\n"); */ @@ -625,100 +584,9 @@ static int rdfpga_sdcard_write_block(struct rdfpga_sdcard_softc *sc, const u_int void rdfpga_sdcard_strategy(struct buf *bp) { -#if 0 - struct rdfpga_sdcard_softc *sc = 
device_lookup_private(&rdfpga_sdcard_cd, DISKUNIT(bp->b_dev)); - int err = 0; - if (sc == NULL) { - aprint_error("%s:%d: sc == NULL! giving up\n", __PRETTY_FUNCTION__, __LINE__); - bp->b_resid = bp->b_bcount; - bp->b_error = EINVAL; - goto done; - } - /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bflags = 0x%08x\n", __PRETTY_FUNCTION__, __LINE__, bp->b_flags); */ - /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bufsize = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_bufsize); */ - /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_blkno = %lld\n", __PRETTY_FUNCTION__, __LINE__, bp->b_blkno); */ - /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_rawblkno = %lld\n", __PRETTY_FUNCTION__, __LINE__, bp->b_rawblkno); */ - /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bcount = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_bcount); */ - - bp->b_resid = bp->b_bcount; - - if (bp->b_bcount == 0) { - goto done; - } - - if (bp->b_flags & B_READ) { - unsigned char* data = bp->b_data; - daddr_t blk = bp->b_blkno; - struct partition *p = NULL; - - if (DISKPART(bp->b_dev) != RAW_PART) { - if ((err = bounds_check_with_label(&sc->dk.sc_dkdev, bp, 0)) <= 0) { - aprint_error("%s:%d: bounds_check_with_label -> %d\n", __PRETTY_FUNCTION__, __LINE__, err); - bp->b_resid = bp->b_bcount; - goto done; - } - p = &sc->dk.sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)]; - blk = bp->b_blkno + p->p_offset; - } - - while (bp->b_resid >= 512 && !bp->b_error) { - if (blk < 62521344) { - aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_blkno = %lld, computed %lld (part %d)\n", __PRETTY_FUNCTION__, __LINE__, bp->b_blkno, blk, DISKPART(bp->b_dev)); -aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_rawblkno = %lld\n", __PRETTY_FUNCTION__, __LINE__, bp->b_rawblkno); - bp->b_error = rdfpga_sdcard_read_block(sc, blk, data); - } else { - aprint_error("%s:%d: blk = %lld read out of range! 
giving up\n", __PRETTY_FUNCTION__, __LINE__, blk); - bp->b_error = EINVAL; - } - blk ++; - data += 512; - bp->b_resid -= 512; - } - } else { -#if 1 - bp->b_error = EINVAL; - aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bflags = 0x%08x\n", __PRETTY_FUNCTION__, __LINE__, bp->b_flags); - aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bufsize = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_bufsize); - aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_blkno = %lld\n", __PRETTY_FUNCTION__, __LINE__, bp->b_blkno); - aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_rawblkno = %lld\n", __PRETTY_FUNCTION__, __LINE__, bp->b_rawblkno); - aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bcount = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_bcount); -#else - unsigned char* data = bp->b_data; - daddr_t blk = bp->b_blkno; - - if (DISKPART(bp->b_dev) != RAW_PART) { - if (bounds_check_with_label(&sc->dk.sc_dkdev, bp, 0) <= 0) { - bp->b_resid = bp->b_bcount; - goto done; - } - p = &sc->dk.sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)]; - blk = bp->b_blkno + p->p_offset; - } - - while (bp->b_resid >= 512 && !bp->b_error) { - if (blk < 62521344) { - bp->b_error = rdfpga_sdcard_write_block(sc, blk, data); - } else { - aprint_error("%s:%d: blk = %lld write out of range! 
giving up\n", __PRETTY_FUNCTION__, __LINE__, blk); - bp->b_error = EINVAL; - } - blk ++; - data += 512; - bp->b_resid -= 512; - } -#endif - } - - /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_resid = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_resid); */ - /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_error = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_error); */ - - done: - biodone(bp); -#else struct rdfpga_sdcard_softc *sc = device_lookup_private(&rdfpga_sdcard_cd, DISKUNIT(bp->b_dev)); dk_strategy(&sc->dk, bp); -#endif } static void rdfpga_sdcard_set_geometry(struct rdfpga_sdcard_softc *sc) { @@ -749,8 +617,8 @@ rdfpga_sdcard_size(dev_t dev) { static void rdfpga_sdcard_minphys(struct buf *bp) { - if (bp->b_bcount > 16) - bp->b_bcount = 16; + if (bp->b_bcount > RDFPGA_SDCARD_VAL_DMA_MAX_SZ) + bp->b_bcount = RDFPGA_SDCARD_VAL_DMA_MAX_SZ; } static int @@ -792,15 +660,20 @@ rdfpga_sdcard_diskstart(device_t self, struct buf *bp) /* } */ while (bp->b_resid >= 512 && !err) { - if (blk < 62521344) { - err = rdfpga_sdcard_read_block(sc, blk, data); + u_int32_t blkcnt = bp->b_resid / 512; + + if (blkcnt > (RDFPGA_SDCARD_VAL_DMA_MAX_SZ/512)) + blkcnt = (RDFPGA_SDCARD_VAL_DMA_MAX_SZ/512); + + if (blk+blkcnt <= 62521344) { + err = rdfpga_sdcard_read_block(sc, blk, blkcnt, data); } else { aprint_error("%s:%d: blk = %lld read out of range! 
giving up\n", __PRETTY_FUNCTION__, __LINE__, blk); err = EINVAL; } - blk ++; - data += 512; - bp->b_resid -= 512; + blk += blkcnt; + data += 512 * blkcnt; + bp->b_resid -= 512 * blkcnt; } } else { #if 1 diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/rdfpga_sdcard.h b/NetBSD/9.0/usr/src/sys/dev/sbus/rdfpga_sdcard.h index 2756faf..f00f0b0 100644 --- a/NetBSD/9.0/usr/src/sys/dev/sbus/rdfpga_sdcard.h +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/rdfpga_sdcard.h @@ -66,7 +66,7 @@ struct rdfpga_sdcard_softc { #define RDFPGA_SDCARD_CTRL_START 0x80000000 #define RDFPGA_SDCARD_CTRL_READ 0x40000000 -/* one page, though we're likely to only use 512 bytes (one block) ATM */ -#define RDFPGA_SDCARD_VAL_DMA_MAX_SZ (4096) +/* 16 pages, though we're likely to only use 512 bytes (one block) ATM */ +#define RDFPGA_SDCARD_VAL_DMA_MAX_SZ (65536) #endif /* _RDFPGA_SDCARD_H_ */ diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c new file mode 100644 index 0000000..1f623d6 --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c @@ -0,0 +1,1070 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 Romain Dolbeau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +int sbusfpga_curve25519engine_print(void *, const char *); +int sbusfpga_curve25519engine_match(device_t, cfdata_t, void *); +void sbusfpga_curve25519engine_attach(device_t, device_t, void *); + +CFATTACH_DECL_NEW(sbusfpga_c29e, sizeof(struct sbusfpga_curve25519engine_softc), + sbusfpga_curve25519engine_match, sbusfpga_curve25519engine_attach, NULL, NULL); + +dev_type_open(sbusfpga_curve25519engine_open); +dev_type_close(sbusfpga_curve25519engine_close); +dev_type_ioctl(sbusfpga_curve25519engine_ioctl); +dev_type_mmap(sbusfpga_curve25519engine_mmap); + + + +const struct cdevsw sbusfpga_c29e_cdevsw = { + .d_open = sbusfpga_curve25519engine_open, + .d_close = sbusfpga_curve25519engine_close, + .d_read = noread, + .d_write = nowrite, + .d_ioctl = sbusfpga_curve25519engine_ioctl, + .d_stop = nostop, + .d_tty = notty, + .d_poll = nopoll, + .d_mmap = sbusfpga_curve25519engine_mmap, + .d_kqfilter = nokqfilter, + .d_discard = nodiscard, + .d_flag = 0 +}; + +extern struct cfdriver sbusfpga_c29e_cd; + +struct 
sbusfpga_curve25519engine_montgomeryjob { + /* uint32_t x0_u[8]; */ + /* uint32_t x0_w[8]; */ + /* uint32_t x1_u[8]; */ + /* uint32_t x1_w[8]; */ + uint32_t affine_u[8]; + uint32_t scalar[8]; +}; +struct sbusfpga_curve25519engine_aesjob { + uint32_t data[8]; + uint32_t keys[120]; +}; + +static int init_programs(struct sbusfpga_curve25519engine_softc *sc); +static int write_inputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusfpga_curve25519engine_montgomeryjob *job, const int window); +static int start_job(struct sbusfpga_curve25519engine_softc *sc); +static int wait_job(struct sbusfpga_curve25519engine_softc *sc, uint32_t param); +static int read_outputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusfpga_curve25519engine_montgomeryjob *job, const int window); +static int dma_init(struct sbusfpga_curve25519engine_softc *sc); + +static int power_on(struct sbusfpga_curve25519engine_softc *sc); +static int power_off(struct sbusfpga_curve25519engine_softc *sc); + +int +sbusfpga_curve25519engine_open(dev_t dev, int flags, int mode, struct lwp *l) +{ + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); + struct sbusfpga_curve25519engine_softc *sc = device_lookup_private(&sbusfpga_c29e_cd, driver); + + if (sc == NULL) + return ENODEV; + + if ((unit != 0) && ((sc->active_sessions & (1 << unit)) == 0)) { + return ENODEV; + } + + /* first we need to turn the engine power on ... 
*/ + power_on(sc); + + return (0); +} + +int +sbusfpga_curve25519engine_close(dev_t dev, int flags, int mode, struct lwp *l) +{ + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); + struct sbusfpga_curve25519engine_softc *sc = device_lookup_private(&sbusfpga_c29e_cd, driver); + + if (sc == NULL) + return ENODEV; + + if ((unit != 0) && (sc->active_sessions & (1 << unit))) { + device_printf(sc->sc_dev, "warning: close() on active session\n"); + sc->active_sessions &= ~(1 << unit); + sc->mapped_sessions &= ~(1 << unit); + } + + if (sc->active_sessions == 0) + power_off(sc); + + return (0); +} + +int +sbusfpga_curve25519engine_print(void *aux, const char *busname) +{ + + sbus_print(aux, busname); + return (UNCONF); +} + +int +sbusfpga_curve25519engine_match(device_t parent, cfdata_t cf, void *aux) +{ + struct sbus_attach_args *sa = (struct sbus_attach_args *)aux; + + return (strcmp("betrustedc25519e", sa->sa_name) == 0); +} + +static const uint32_t program_ec25519[134] = {0x00640840, 0x00680800, 0x006c0600, 0x00700840, 0x004c0a80, 0x00480800, 0x007407cc, 0x007c07cb, 0x0049d483, 0x0079b643, 0x0079e482, 0x00659783, 0x006db783, 0x0079c683, 0x0079e482, 0x0069a783, 0x0071c783, 0x00480740, 0x0001a645, 0x00780008, 0x0001e006, 0x0069a8c6, 0x0005a645, 0x00780048, 0x0005e046, 0x0009c6c5, 0x00780088, 0x0009e086, 0x0071c8c6, 0x000dc6c5, 0x007800c8, 0x000de0c6, 0x00100007, 0x00141047, 0x007458c6, 0x0019d105, 0x00780188, 0x0019e186, 0x001c3007, 0x00202047, 0x002481c5, 0x00780248, 0x0025e246, 0x007488c6, 0x0029d1c5, 0x00780288, 0x0029e286, 0x006c9247, 0x0030a287, 0x00346907, 0x00645107, 0x003c5345, 0x007803c8, 0x003de3c6, 0x0068f187, 0x0070c607, 0x010004c9, 0x004e14c6, 0xe5800809, 0x0079b643, 0x0079e482, 0x00659783, 0x006db783, 0x0079c683, 0x0079e482, 0x0069a783, 0x0071c783, 0x00740640, 0x00780680, 0x0001e787, 0x00040007, 0x00041047, 0x00081787, 0x000c2007, 0x001030c7, 0x00144087, 0x00700940, 0x00185147, 0x00721706, 0x01000709, 0x00186187, 
0xfe000809, 0x001c5187, 0x00700980, 0x002071c7, 0x00721706, 0x01000709, 0x00208207, 0xfe000809, 0x00247207, 0x007009c0, 0x00289247, 0x00721706, 0x01000709, 0x0028a287, 0xfe000809, 0x002c9287, 0x00700980, 0x0030b2c7, 0x00721706, 0x01000709, 0x0030c307, 0xfe000809, 0x00347307, 0x00700a00, 0x0038d347, 0x00721706, 0x01000709, 0x0038e387, 0xfe000809, 0x003cd387, 0x00700a40, 0x0040f3c7, 0x00721706, 0x01000709, 0x00410407, 0xfe000809, 0x0044f407, 0x00700a00, 0x00491447, 0x00721706, 0x01000709, 0x00492487, 0xfe000809, 0x004cd487, 0x00700940, 0x005134c7, 0x00721706, 0x01000709, 0x00514507, 0xfe000809, 0x00543507, 0x007d5747, 0x0000000a }; + +static const uint32_t program_gcm[20] = {0x0010100d, 0x0094100d, 0x0118100d, 0x019c100d, 0x00186143, 0x00160191, 0x00186811, 0x001c61c3, 0x00105103, 0x008441ce, 0x0082010e, 0x00080010, 0x008e008f, 0x0112008f, 0x0396008f, 0x00083083, 0x00105103, 0x00084083, 0x00001083, 0x0000000a }; + +static const uint32_t program_aes[16] = {0x0001f003,0x0005e012,0x0001d052,0x0005c012,0x0001b052,0x0005a012,0x00019052,0x00058012,0x00017052,0x00056012,0x00015052,0x00054012,0x00013052,0x00052012,0x00811052,0x0000000a }; + +static const uint32_t program_gcm_pfx[30] = {0x01400411,0x00080840,0x00040800,0x0001f043,0x0005e012,0x0001d052,0x0005c012,0x0001b052,0x0005a012,0x00019052,0x00058012,0x00017052,0x00056012,0x00015052,0x00054012,0x00013052,0x00052012,0x00811052,0x03800089,0x003c0000,0x01400411,0x0042b405,0x01400411,0x00080800,0x00040400,0xf4800809,0x00380000,0x01bc03d1,0x003cf3d1,0x00340800 }; + +static const uint32_t program_gcm_ad[29] = {0x0d800309,0x000000d3,0x01800011,0x00000011,0x0000d003,0x000f00c5,0x00321306,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x00800309,0xf2800809,0x0000000a }; + +static const uint32_t program_gcm_aes[50] = 
{0x18000309,0x01400411,0x0042b405,0x01400411,0x0001f403,0x0005e012,0x0001d052,0x0005c012,0x0001b052,0x0005a012,0x00019052,0x00058012,0x00017052,0x00056012,0x00015052,0x00054012,0x00013052,0x00052012,0x00851052,0x000000d3,0x00001003,0x00ac02d3,0x01800011,0x00000011,0x0000d003,0x000f00c5,0x002f02c5,0x00321306,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x00800309,0xe8000809,0x0000000a }; + +static const uint32_t program_gcm_finish[71] = {0x16000309,0x01400411,0x0042b405,0x01400411,0x0001f403,0x0005e012,0x0001d052,0x0005c012,0x0001b052,0x0005a012,0x00019052,0x00058012,0x00017052,0x00056012,0x00015052,0x00054012,0x00013052,0x00052012,0x00851052,0x0004a054,0x000000d3,0x00001003,0x00ac02d3,0x01800011,0x00000011,0x0000d003,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x01a40251,0x00249251,0x0000d243,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x01b40351,0x0034d351,0x0020e343,0x0000000a }; + +// second and third are for testing and shall be removed +static const uint32_t* programs[8] = { program_ec25519, program_gcm, program_aes, program_gcm_pfx, program_gcm_ad, program_gcm_aes, program_gcm_finish, NULL }; +static const uint32_t program_len[8] = { 134, 20, 16, 30, 29, 50, 71, 0 }; +static uint32_t program_offset[8]; + +/* + * Attach all the sub-devices we can find + */ +void +sbusfpga_curve25519engine_attach(device_t parent, device_t self, void *aux) +{ + struct sbus_attach_args *sa = aux; + struct sbusfpga_curve25519engine_softc *sc = device_private(self); + struct sbus_softc *sbsc = 
device_private(parent); + int node; + int sbusburst; + + sc->sc_bustag = sa->sa_bustag; + sc->sc_dmatag = sa->sa_dmatag; + sc->sc_dev = self; + + aprint_normal("\n"); + + if (sa->sa_nreg < 3) { + aprint_error(": Not enough registers spaces\n"); + return; + } + + /* map registers */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[0].oa_space /* sa_slot */, + sa->sa_reg[0].oa_base /* sa_offset */, + sa->sa_reg[0].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_curve25519engine) != 0) { + aprint_error(": cannot map Curve25519Engine registers\n"); + return; + } else { + aprint_normal_dev(self, "Curve25519Engine registers @ %p\n", (void*)sc->sc_bhregs_curve25519engine); + } + /* map microcode */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[1].oa_space /* sa_slot */, + sa->sa_reg[1].oa_base /* sa_offset */, + sa->sa_reg[1].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_microcode) != 0) { + aprint_error(": cannot map Curve25519Engine microcode\n"); + return; + } else { + aprint_normal_dev(self, "Curve25519Engine microcode @ %p\n", (void*)sc->sc_bhregs_microcode); + } + /* map register file */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[2].oa_space /* sa_slot */, + sa->sa_reg[2].oa_base /* sa_offset */, + sa->sa_reg[2].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_regfile) != 0) { + aprint_error(": cannot map Curve25519Engine regfile\n"); + return; + } else { + aprint_normal_dev(self, "Curve25519Engine regfile @ %p\n", (void*)sc->sc_bhregs_regfile); + } + sc->sc_bufsiz_curve25519engine = sa->sa_reg[0].oa_size; + sc->sc_bufsiz_microcode = sa->sa_reg[1].oa_size; + sc->sc_bufsiz_regfile = sa->sa_reg[2].oa_size; + + node = sc->sc_node = sa->sa_node; + + /* + * Get transfer burst size from PROM + */ + sbusburst = sbsc->sc_burst; + if (sbusburst == 0) + sbusburst = SBUS_BURST_32 - 1; /* 1->16 */ + + sc->sc_burst = prom_getpropint(node, "burst-sizes", -1); + if (sc->sc_burst == -1) + /* take SBus burst sizes */ + sc->sc_burst 
= sbusburst; + + /* Clamp at parent's burst sizes */ + sc->sc_burst &= sbusburst; + + aprint_normal("\n"); + aprint_normal_dev(self, "nid 0x%x, bustag %p, burst 0x%x (parent 0x%0x)\n", + sc->sc_node, + sc->sc_bustag, + sc->sc_burst, + sbsc->sc_burst); + + /* first we need to turn the engine power on ... */ + power_on(sc); + + if (init_programs(sc)) { + if (init_programs(sc)) { + aprint_normal_dev(sc->sc_dev, "INIT - FAILED\n"); + sc->initialized = 0; + } else { + sc->initialized = 1; + } + } else { + sc->initialized = 1; + } + + power_off(sc); + + sc->active_sessions = 0; + sc->mapped_sessions = 0; + + if (!dma_init(sc)) { + // ouch + sc->active_sessions = 0xFFFFFFFF; + sc->mapped_sessions = 0xFFFFFFFF; + } +} + +#define CONFIG_CSR_DATA_WIDTH 32 +// define CSR_LEDS_BASE & others to avoid defining the CSRs of HW we don't handle +#define CSR_LEDS_BASE +//#define CSR_CURVE25519ENGINE_BASE +#define CSR_DDRPHY_BASE +#define CSR_EXCHANGE_WITH_MEM_BASE +#define CSR_SBUS_BUS_STAT_BASE +#define CSR_SDRAM_BASE +#define CSR_SDBLOCK2MEM_BASE +#define CSR_SDCORE_BASE +#define CSR_SDIRQ_BASE +#define CSR_SDMEM2BLOCK_BASE +#define CSR_SDPHY_BASE +#define CSR_TRNG_BASE +#include "dev/sbus/litex_csr.h" +#undef CSR_LEDS_BASE +//#undef CSR_CURVE25519ENGINE_BASE +#undef CSR_DDRPHY_BASE +#undef CSR_EXCHANGE_WITH_MEM_BASE +#undef CSR_SBUS_BUS_STAT_BASE +#undef CSR_SDRAM_BASE +#undef CSR_SDBLOCK2MEM_BASE +#undef CSR_SDCORE_BASE +#undef CSR_SDIRQ_BASE +#undef CSR_SDMEM2BLOCK_BASE +#undef CSR_SDPHY_BASE +#undef CSR_TRNG_BASE + +#define REG_BASE(reg) (base + (reg * 32)) +#define SUBREG_ADDR(reg, off) (REG_BASE(reg) + (off)*4) + +#include +//cprng_strong32() +struct sbusfpga_curve25519engine_session { + uint32_t session; + uint32_t cookie; +}; +struct sbusfpga_curve25519engine_session_len { + uint32_t session; + uint32_t cookie; + uint32_t len; +}; +struct sbusfpga_curve25519engine_session_len_data { + uint32_t session; + uint32_t cookie; + uint32_t len; + uint32_t data[8]; + uint32_t 
keys[60]; +}; +struct sbusfpga_curve25519engine_session_len_final { + uint32_t session; + uint32_t cookie; + uint32_t len; + uint32_t data[8]; +}; + +#define CHECKSESSION(ses) \ + do { \ + if ((ses->session >= MAX_ACTIVE_SESSION) || (ses->session >= MAX_SESSION)) \ + return EINVAL; \ + if (sc->sessions_cookies[ses->session] == 0) \ + return EINVAL; \ + if (sc->sessions_cookies[ses->session] != ses->cookie) \ + return EINVAL; \ + if (ses->session != unit) \ + return EINVAL; \ + if ((sc->active_sessions & (1 << ses->session)) == 0) \ + return EINVAL; \ + } while (0) + +#define SBUSFPGA_DO_MONTGOMERYJOB _IOWR(0, 0, struct sbusfpga_curve25519engine_montgomeryjob) +#define SBUSFPGA_EC25519_CHECKGCM _IOW(0, 1, struct sbusfpga_curve25519engine_montgomeryjob) +#define SBUSFPGA_EC25519_CHECKAES _IOW(0, 2, struct sbusfpga_curve25519engine_aesjob) + +#define SBUSFPGA_EC25519_GETSESSION _IOR(1, 0, struct sbusfpga_curve25519engine_session) +#define SBUSFPGA_EC25519_OPENSESSION _IOW(1, 1, struct sbusfpga_curve25519engine_session) +#define SBUSFPGA_EC25519_CLOSESESSION _IOW(1, 2, struct sbusfpga_curve25519engine_session) +#define SBUSFPGA_EC25519_GCMPFX _IOW(1, 3, struct sbusfpga_curve25519engine_session_len_data) +#define SBUSFPGA_EC25519_GCMAD _IOW(1, 4, struct sbusfpga_curve25519engine_session_len) +#define SBUSFPGA_EC25519_GCMAES _IOW(1, 5, struct sbusfpga_curve25519engine_session_len) +#define SBUSFPGA_EC25519_GCMFINISH _IOWR(1, 6, struct sbusfpga_curve25519engine_session_len_final) + +static int get_session(struct sbusfpga_curve25519engine_softc *sc) { + int i; + /* don't use 0, we use it for testing */ + /* also minor 0 is used to request session, 1-7 to open/close/map using session # */ + for (i = 1 ; (i < MAX_ACTIVE_SESSION) && (i < MAX_SESSION) ; i++) { + if (((sc->active_sessions & (1<mapped_sessions & (1<active_sessions |= (1<initialized) { + if (init_programs(sc)) { + return ENXIO; + } else { + sc->initialized = 1; + } + } + switch (cmd) { + case 
SBUSFPGA_DO_MONTGOMERYJOB: { + if (unit != 0) + return ENOTTY; + + struct sbusfpga_curve25519engine_montgomeryjob* job = (struct sbusfpga_curve25519engine_montgomeryjob*)data; + curve25519engine_mpstart_write(sc, program_offset[0]); /* EC25519 */ + curve25519engine_mplen_write(sc, program_len[0]); /* EC25519 */ + + err = write_inputs(sc, job, 0); + if (err) + return err; + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, 1); + if (err) + return err; + err = read_outputs(sc, job, 0); + if (err) + return err; + } + break; + case SBUSFPGA_EC25519_CHECKGCM: { + if (unit != 0) + return ENOTTY; + + const uint32_t base = 0; + struct sbusfpga_curve25519engine_montgomeryjob* job = (struct sbusfpga_curve25519engine_montgomeryjob*)data; + int reg, i; + + curve25519engine_mpstart_write(sc, program_offset[1]); /* GCM */ + curve25519engine_mplen_write(sc, program_len[1]); /* GCM */ + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,i), job->affine_u[i]); + } + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(1,i), job->scalar[i]); + } + + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, 1); + /* if (err) */ + /* return err; */ + + for (reg = 0 ; reg < 32 ; reg++) { + uint32_t buf[8]; + for (i = 0 ; i < 8 ; i ++) { + buf[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i)); + } + device_printf(sc->sc_dev, "GCM %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); + } + } + break; + case SBUSFPGA_EC25519_CHECKAES: { + if (unit != 0) + return ENOTTY; + + const uint32_t base = 0; + struct sbusfpga_curve25519engine_aesjob* job = (struct sbusfpga_curve25519engine_aesjob*)data; + int reg, i; + + curve25519engine_mpstart_write(sc, program_offset[2]); /* AES */ + curve25519engine_mplen_write(sc, program_len[2]); /* AES */ + 
for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,i), job->data[i]); + } + for (reg = 31 ; reg > 16 ; reg--) { + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i), job->keys[i+8*(31-reg)]); + } + } + + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, 1); + /* if (err) */ + /* return err; */ + + for (reg = 0 ; reg < 32 ; reg++) { + uint32_t buf[8]; + for (i = 0 ; i < 8 ; i ++) { + buf[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i)); + } + device_printf(sc->sc_dev, "AES %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); + } + } + break; + + case SBUSFPGA_EC25519_GCMPFX: { + if (unit == 0) + return ENOTTY; + + /* FIXME: need a lock!!! */ + + const uint32_t base = unit * 0x400; + struct sbusfpga_curve25519engine_session_len_data* job = (struct sbusfpga_curve25519engine_session_len_data*)data; + int reg, i; + void* rd_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) ); + //void* wr_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) + 2048); + + CHECKSESSION(job); + + if (job->len > 128) { + device_printf(sc->sc_dev, "job->len too big: %u", job->len); + return EINVAL; + } + + curve25519engine_mpstart_write(sc, program_offset[3]); /* GCM_PFX */ + curve25519engine_mplen_write(sc, program_len[3] + program_len[4]); /* GCM_PFX + GCM_AD */ + curve25519engine_window_write(sc, unit); /* to each session its own register file */ + + /* read_addr */ + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), (i == 0) ? 
((uint32_t)rd_ptr) : 0); + } + /* write_len */ + for (i = 0 ; i < 8 ; i ++) { // all the way to 8 to make sure we have zero in every bit checked by BRZ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), (i == 0) ? ((uint32_t)job->len) : 0); + } + /* data */ + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(16,i), job->data[i]); + } + for (reg = 31 ; reg > 16 ; reg--) { + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i), job->keys[i+4*(31-reg)]); + } + } + + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, job->len); + if (err) + return err; + +#if 0 + for (reg = 0 ; reg < 32 ; reg++) { + uint32_t buf[8]; + for (i = 0 ; i < 8 ; i ++) { + buf[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i)); + } + device_printf(sc->sc_dev, "GCM_PFX %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); + } +#endif + } + break; + + case SBUSFPGA_EC25519_GCMAD: { + if (unit == 0) + return ENOTTY; + + /* FIXME: need a lock!!! */ + + const uint32_t base = unit * 0x400; + struct sbusfpga_curve25519engine_session_len* job = (struct sbusfpga_curve25519engine_session_len*)data; + int i; + void* rd_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) ); + //void* wr_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) + 2048); + + CHECKSESSION(job); + + if (job->len > 128) + return EINVAL; + + curve25519engine_mpstart_write(sc, program_offset[4]); /* GCM_AD */ + curve25519engine_mplen_write(sc, program_len[4]); /* GCM_AD */ + curve25519engine_window_write(sc, unit); /* to each session its own register file */ + + /* read_addr */ + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), (i == 0) ? 
((uint32_t)rd_ptr) : 0); + } + /* write_len */ + for (i = 0 ; i < 8 ; i ++) { // all the way to 8 to make sure we have zero in every bit checked by BRZ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), (i == 0) ? ((uint32_t)job->len) : 0); + } + + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, job->len); + if (err) + return err; + +#if 0 + int reg; + for (reg = 0 ; reg < 32 ; reg++) { + uint32_t buf[8]; + for (i = 0 ; i < 8 ; i ++) { + buf[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i)); + } + device_printf(sc->sc_dev, "GCM_AD %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); + } +#endif + } + break; + + case SBUSFPGA_EC25519_GCMAES: { + if (unit == 0) + return ENOTTY; + + /* FIXME: need a lock!!! */ + + const uint32_t base = unit * 0x400; + struct sbusfpga_curve25519engine_session_len* job = (struct sbusfpga_curve25519engine_session_len*)data; + int i; + void* rd_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) ); + void* wr_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) + 2048); + + CHECKSESSION(job); + + if (job->len > 128) + return EINVAL; + + curve25519engine_mpstart_write(sc, program_offset[5]); /* GCM_AES */ + curve25519engine_mplen_write(sc, program_len[5]); /* GCM_AES */ + curve25519engine_window_write(sc, unit); /* to each session its own register file */ + + /* read_addr */ + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), (i == 0) ? ((uint32_t)rd_ptr) : 0); + } + /* write_addr */ + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(11,i), (i == 0) ? 
((uint32_t)wr_ptr) : 0); + } + /* write_len */ + for (i = 0 ; i < 8 ; i ++) { // all the way to 8 to make sure we have zero in every bit checked by BRZ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), (i == 0) ? ((uint32_t)job->len) : 0); + } + + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, job->len); + if (err) + return err; +#if 0 + int reg; + for (reg = 0 ; reg < 32 ; reg++) { + uint32_t buf[8]; + for (i = 0 ; i < 8 ; i ++) { + buf[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i)); + } + device_printf(sc->sc_dev, "GCM_AES %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); + } +#endif + } + break; + + + case SBUSFPGA_EC25519_GCMFINISH: { + if (unit == 0) + return ENOTTY; + + /* FIXME: need a lock!!! */ + + const uint32_t base = unit * 0x400; + struct sbusfpga_curve25519engine_session_len_final* job = (struct sbusfpga_curve25519engine_session_len_final*)data; + int i; + void* rd_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) ); + void* wr_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) + 2048); + + CHECKSESSION(job); + + if (job->len > 15) + return EINVAL; + + curve25519engine_mpstart_write(sc, program_offset[6]); /* GCM_FINISH */ + curve25519engine_mplen_write(sc, program_len[6]); /* GCM_FINISH */ + curve25519engine_window_write(sc, unit); /* to each session its own register file */ + + /* read_addr */ + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), (i == 0) ? ((uint32_t)rd_ptr) : 0); + } + /* write_addr */ + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(11,i), (i == 0) ? 
((uint32_t)wr_ptr) : 0); + } + /* write_len */ + for (i = 0 ; i < 8 ; i ++) { // all the way to 8 to make sure we have zero in every bit checked by BRZ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), (i == 0) ? ((uint32_t)job->len) : 0); + } + /* final block */ + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(9,i), job->data[i]); + } + /* create and generate MMASK */ + for (i = 0 ; i < 4 ; i ++) { + uint32_t mask; + int idx = i; + if (job->len <= (idx*4)) { + mask = 0; + } else if (job->len >= (idx+1)*4) { + mask = 0xFFFFFFFF; + } else { + mask = 0xFFFFFFFF >> (8*(4-(job->len%4))); + } + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(10,i), mask); + } + + + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, job->len); + if (err) + return err; + + /* final accum */ + for (i = 0 ; i < 4 ; i ++) { + job->data[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(8,i)); + } + +#if 0 + int reg; + for (reg = 0 ; reg < 32 ; reg++) { + uint32_t buf[8]; + for (i = 0 ; i < 8 ; i ++) { + buf[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i)); + } + device_printf(sc->sc_dev, "GCM_FINISH %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); + } +#endif + } + break; + + case SBUSFPGA_EC25519_GETSESSION:{ + if (unit != 0) + return ENOTTY; + + struct sbusfpga_curve25519engine_session* ses = (struct sbusfpga_curve25519engine_session*)data; + int s = get_session(sc); + if (s < 0) + return EBUSY; + ses->session = s; + sc->sessions_cookies[s] = cprng_strong32(); + ses->cookie = sc->sessions_cookies[s]; + } + break; + case SBUSFPGA_EC25519_OPENSESSION:{ + if (unit == 0) + return ENOTTY; + + struct sbusfpga_curve25519engine_session* ses = (struct sbusfpga_curve25519engine_session*)data; + CHECKSESSION(ses); + if ((sc->mapped_sessions & 
(1 << ses->session)) != 0) + return EINVAL; + } + break; + case SBUSFPGA_EC25519_CLOSESESSION:{ + if (unit == 0) + return ENOTTY; + + struct sbusfpga_curve25519engine_session* ses = (struct sbusfpga_curve25519engine_session*)data; + + CHECKSESSION(ses); + + /* if ((sc->mapped_sessions & (1 << ses->session)) != 0) */ + /* return EBUSY; */ + sc->sessions_cookies[ses->session] = 0; + sc->active_sessions &= ~(1 << ses->session); + sc->mapped_sessions &= ~(1 << ses->session); // FIXME + } + break; + + default: + err = EINVAL; + break; + } + + return(err); +} + + +static int power_on(struct sbusfpga_curve25519engine_softc *sc) { + int err = 0; + if ((curve25519engine_power_read(sc) & 1) == 0) { + curve25519engine_power_write(sc, 1); + delay(1); + } + return err; +} +static int power_off(struct sbusfpga_curve25519engine_softc *sc) { + int err = 0; + curve25519engine_power_write(sc, 0); + return err; +} + +static int init_programs(struct sbusfpga_curve25519engine_softc *sc) { + /* the microcode is a the beginning */ + int err = 0; + uint32_t i, j; + uint32_t offset = 0; + + for (j = 0 ; programs[j] != NULL; j ++) { + program_offset[j] = offset; + for (i = 0 ; i < program_len[j] ; i++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_microcode, ((offset+i)*4), programs[j][i]); + if ((i%16)==15) + delay(1); + } + offset += program_len[j]; + } + + curve25519engine_window_write(sc, 0); /* could use window_window to access fields, but it creates a RMW cycle for nothing */ + curve25519engine_mpstart_write(sc, 0); /* EC25519 */ + curve25519engine_mplen_write(sc, program_len[0]); /* EC25519 */ + + aprint_normal_dev(sc->sc_dev, "INIT - Curve25519Engine status: 0x%08x\n", curve25519engine_status_read(sc)); + +#if 1 + /* double check */ + u_int32_t x; + int count = 0; + for (i = 0 ; i < program_len[0] && count < 10; i++) { + x = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_microcode, (i*4)); + if (x != programs[0][i]) { + aprint_error_dev(sc->sc_dev, "INIT - Curve25519Engine 
program failure: [%d] 0x%08x <> 0x%08x\n", i, x, programs[0][i]); + err = 1; + count ++; + } + if ((i%8)==7) + delay(1); + } + if ((x = curve25519engine_window_read(sc)) != 0) { + aprint_error_dev(sc->sc_dev, "INIT - Curve25519Engine register failure: window = 0x%08x\n", x); + err = 1; + } + if ((x = curve25519engine_mpstart_read(sc)) != 0) { + aprint_error_dev(sc->sc_dev, "INIT - Curve25519Engine register failure: mpstart = 0x%08x\n", x); + err = 1; + } + if ((x = curve25519engine_mplen_read(sc)) != program_len[0]) { + aprint_error_dev(sc->sc_dev, "INIT - Curve25519Engine register failure: mplen = 0x%08x\n", x); + err = 1; + } + const int test_reg_num = 73; + const uint32_t test_reg_value = 0x0C0FFEE0; + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile, 4*test_reg_num, test_reg_value); + delay(1); + if ((x = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile, 4*test_reg_num)) != test_reg_value) { + aprint_error_dev(sc->sc_dev, "INIT - Curve25519Engine register file failure: 0x%08x != 0x%08x\n", x, test_reg_value); + err = 1; + } +#endif + + return err; +} + +static int write_inputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusfpga_curve25519engine_montgomeryjob *job, const int window) { + const uint32_t base = window * 0x400; + int i; + uint32_t status = curve25519engine_status_read(sc); + int err = 0; + if (status & (1<sc_dev, "WRITE - Curve25519Engine status: 0x%08x, still running?\n", status); + return ENXIO; + } + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(24,i), job->affine_u[i]); + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(25,i), job->x0_u[i]); */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(26,i), job->x0_w[i]); */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(27,i), job->x1_u[i]); */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(28,i), job->x1_w[i]); */ + 
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(31,i), job->scalar[i]); + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(19,i), ((i == 0) ? 254 : 0)); */ + /* delay(1); */ + } + +#if 1 + for (i = 0 ; i < 8 && !err; i ++) { + if (job->affine_u[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(24,i))) err = EIO; + /* if (job->x0_u[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(25,i))) err = EIO; */ + /* if (job->x0_w[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(26,i))) err = EIO; */ + /* if (job->x1_u[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(27,i))) err = EIO; */ + /* if (job->x1_w[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(28,i))) err = EIO; */ + if (job->scalar[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(31,i))) err = EIO; + /* delay(1); */ + } + if (err) aprint_error_dev(sc->sc_dev, "WRITE - data did not read-write properly\n"); +#endif + + return err; +} + +static int start_job(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t status = curve25519engine_status_read(sc); + if (status & (1<sc_dev, "START - Curve25519Engine status: 0x%08x, still running?\n", status); + return ENXIO; + } + curve25519engine_control_write(sc, 1); + //aprint_normal_dev(sc->sc_dev, "START - Curve25519Engine status: 0x%08x\n", curve25519engine_status_read(sc)); + + return 0; +} + +static int wait_job(struct sbusfpga_curve25519engine_softc *sc, uint32_t param) { + uint32_t status = curve25519engine_status_read(sc); + int count = 0; + int max_count = 250; + int del = 1; + const int max_del = 32; + static int max_del_seen = 1; + static int max_cnt_seen = 0; + + while ((status & (1<sc_dev, "WAIT - ongoing, Curve25519Engine status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, ls_status); + count ++; + delay(del); + del = del < max_del ? 
2*del : del; + status = curve25519engine_status_read(sc); + } + if (del > max_del_seen) { + max_del_seen = del; + aprint_normal_dev(sc->sc_dev, "WAIT - new max delay %d after %d count (param was %u)\n", max_del_seen, count, param); + } + if (count > max_cnt_seen) { + max_cnt_seen = count; + aprint_normal_dev(sc->sc_dev, "WAIT - new max count %d with %d delay (param was %u)\n", max_cnt_seen, del, param); + + } + + //curve25519engine_control_write(sc, 0); + if (status & (1<sc_dev, "WAIT - Curve25519Engine status: 0x%08x, did not finish in time? [inst: 0x%08x ls_status: 0x%08x]\n", status, curve25519engine_instruction_read(sc), curve25519engine_ls_status_read(sc)); + return ENXIO; + } else if (status & (1<sc_dev, "WAIT - Curve25519Engine status: 0x%08x, sigill [inst: 0x%08x ls_status: 0x%08x]\n", status, curve25519engine_instruction_read(sc), curve25519engine_ls_status_read(sc)); + return ENXIO; + } else if (status & (1<sc_dev, "WAIT - Curve25519Engine status: 0x%08x, aborted [inst: 0x%08x ls_status: 0x%08x]\n", status, curve25519engine_instruction_read(sc), curve25519engine_ls_status_read(sc)); + return ENXIO; + } else { + //aprint_normal_dev(sc->sc_dev, "WAIT - Curve25519Engine status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, curve25519engine_ls_status_read(sc)); + } + + return 0; +} + +static int read_outputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusfpga_curve25519engine_montgomeryjob *job, const int window) { + const uint32_t base = window * 0x400; + int i; + uint32_t status = curve25519engine_status_read(sc); + if (status & (1<sc_dev, "READ - Curve25519Engine status: 0x%08x, still running?\n", status); + return ENXIO; + } + + for (i = 0 ; i < 8 ; i ++) { + /* job->affine_u[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(24,i)); */ + /* job->x0_u[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(25,i)); */ + /* job->x0_w[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(26,i)); 
*/ + /* job->x1_u[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(27,i)); */ + /* job->x1_w[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(28,i)); */ + job->scalar[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,i)); + /* delay(1); */ + } + aprint_normal_dev(sc->sc_dev, "READ - Curve25519Engine 19 low 32 bits: 0x%08x\n", bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(19,0))); + + return 0; +} + +/* Set up the DMA buffer of SBUSFPGA_CURVE25519ENGINE_VAL_DMA_MAX_SZ bytes: create the map, allocate the memory, map it into kernel VA (sc_dma_kva) and load the map. Returns 1 on success; on failure releases whatever was acquired so far and returns 0. */ +static int +dma_init(struct sbusfpga_curve25519engine_softc *sc) { + + /* Allocate a dmamap */ + if (bus_dmamap_create(sc->sc_dmatag, SBUSFPGA_CURVE25519ENGINE_VAL_DMA_MAX_SZ, 1, SBUSFPGA_CURVE25519ENGINE_VAL_DMA_MAX_SZ, 0, BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW, &sc->sc_dmamap) != 0) { + aprint_error_dev(sc->sc_dev, "DMA map create failed\n"); + return 0; + } else { + aprint_normal_dev(sc->sc_dev, "dmamap: %lu %lu %d (%p)\n", sc->sc_dmamap->dm_maxsegsz, sc->sc_dmamap->dm_mapsize, sc->sc_dmamap->dm_nsegs, sc->sc_dmatag->_dmamap_load); + } + + if (bus_dmamem_alloc(sc->sc_dmatag, SBUSFPGA_CURVE25519ENGINE_VAL_DMA_MAX_SZ, 64, 64, &sc->sc_segs, 1, &sc->sc_rsegs, BUS_DMA_NOWAIT | BUS_DMA_STREAMING)) { + aprint_error_dev(sc->sc_dev, "cannot allocate DVMA memory\n"); + bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap); + return 0; + } + + if (bus_dmamem_map(sc->sc_dmatag, &sc->sc_segs, 1, SBUSFPGA_CURVE25519ENGINE_VAL_DMA_MAX_SZ, &sc->sc_dma_kva, BUS_DMA_NOWAIT)) { + aprint_error_dev(sc->sc_dev, "cannot allocate DVMA address\n"); + bus_dmamem_free(sc->sc_dmatag, &sc->sc_segs, 1); + bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap); + return 0; + } + + if (bus_dmamap_load(sc->sc_dmatag, sc->sc_dmamap, sc->sc_dma_kva, SBUSFPGA_CURVE25519ENGINE_VAL_DMA_MAX_SZ, /* kernel space */ NULL, + BUS_DMA_NOWAIT | BUS_DMA_STREAMING | BUS_DMA_WRITE)) { + aprint_error_dev(sc->sc_dev, "cannot load dma map\n"); + bus_dmamem_unmap(sc->sc_dmatag, sc->sc_dma_kva, SBUSFPGA_CURVE25519ENGINE_VAL_DMA_MAX_SZ); /* bus_dmamem_unmap() takes the KVA itself, not a pointer to it */ + 
bus_dmamem_free(sc->sc_dmatag, &sc->sc_segs, 1); + bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap); + return 0; + } + + aprint_normal_dev(sc->sc_dev, "DMA: SW -> kernel address is %p, dvma address is 0x%08llx, seg %llx / %ld\n", sc->sc_dma_kva, sc->sc_dmamap->dm_segs[0].ds_addr, sc->sc_segs.ds_addr, sc->sc_segs.ds_len); + + return 1; +} +/* mmap entry point: for an active session `unit` (0 < unit < MAX_ACTIVE_SESSION), offset 0 and no PROT_EXEC, returns the physical address of that session's 4096-byte page in the DMA buffer, stamps its first word with 0xDEADBEEF and marks the session as mapped. Returns -1 in every other case. */ +paddr_t sbusfpga_curve25519engine_mmap(dev_t dev, off_t offset, int prot) { + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); /* NOTE(review): always 0 because unit is already masked; confirm the intended minor -> device mapping */ + struct sbusfpga_curve25519engine_softc *sc = device_lookup_private(&sbusfpga_c29e_cd, driver); + paddr_t addr = -1; + if (sc == NULL) return -1; /* no device behind this minor: do not dereference sc below */ + device_printf(sc->sc_dev, "%s:%d: %lld %d for %d / %d\n", __PRETTY_FUNCTION__, __LINE__, offset, prot, driver, unit); + + if (offset != 0) + return -1; + if (prot & PROT_EXEC) + return -1; + /* if (sc->mapped_sessions & (1 << unit)) */ + /* return -1; */ + if (unit <= 0) + return -1; + if (unit >= MAX_ACTIVE_SESSION) + return -1; + if ((sc->active_sessions & (1 << unit)) == 0) /* unit is range-checked above, before it is used as a shift count */ + return -1; + + // addr = bus_dmamem_mmap(sc->sc_dmatag, sc->sc_dmamap->dm_segs, 1, (off_t)(4096*unit), prot, BUS_DMA_NOWAIT); + if (pmap_extract(pmap_kernel(), ((vaddr_t)sc->sc_dma_kva) + (unit * 4096), &addr)) { + + device_printf(sc->sc_dev, "mapped page %d to 0x%08lx [0x%08lx], kernel is %p\n", unit, addr, atop(addr), (void*)(((vaddr_t)sc->sc_dma_kva) + (unit * 4096))); + + ((uint32_t*)(((vaddr_t)sc->sc_dma_kva) + (unit * 4096)))[0] = 0xDEADBEEF; + sc->mapped_sessions |= (1 << unit); + + return addr; + } + + return -1; +} diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.h b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.h new file mode 100644 index 0000000..df352ca --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.h @@ -0,0 +1,62 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 Romain Dolbeau + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SBUSFPGA_CURVE25519ENGINE_H_ +#define _SBUSFPGA_CURVE25519ENGINE_H_ + +#define MAX_SESSION 32 // HW limit +#define MAX_ACTIVE_SESSION 8 // SW-imposed limit +// One 4 KiB page per session +#define SBUSFPGA_CURVE25519ENGINE_VAL_DMA_MAX_SZ (MAX_ACTIVE_SESSION*4*1024) + +struct sbusfpga_curve25519engine_softc { + device_t sc_dev; /* us as a device */ + u_int sc_rev; /* revision */ + int sc_node; /* PROM node ID */ + int sc_burst; /* DVMA burst size in effect */ + bus_space_tag_t sc_bustag; /* bus tag */ + bus_space_handle_t sc_bhregs_curve25519engine; /* bus handle: engine registers (register space 0) */ + bus_space_handle_t sc_bhregs_microcode; /* bus handle: microcode memory (register space 1) */ + bus_space_handle_t sc_bhregs_regfile; /* bus handle: register file (register space 2) */ + //void * sc_buffer; /* VA of the registers */ + int sc_bufsiz_curve25519engine; /* size of register space 0 */ + int sc_bufsiz_microcode; /* size of register space 1 */ + int sc_bufsiz_regfile; /* size of register space 2 */ + int initialized; /* non-zero once init_programs() has succeeded */ + uint32_t active_sessions; /* bitmap: bit n set while session n is allocated; all ones if DMA setup failed */ + uint32_t mapped_sessions; /* bitmap: bit n set once session n's page has been mmap'ed; all ones if DMA setup failed */ + uint32_t sessions_cookies[MAX_ACTIVE_SESSION]; /* cookie handed out with each session; 0 = no session */ + /* DMA kernel structures */ + bus_dma_tag_t sc_dmatag; /* DMA tag taken from the SBus attach args */ + bus_dmamap_t sc_dmamap; /* map loaded over the DMA buffer */ + bus_dma_segment_t sc_segs; /* segment backing the DMA buffer */ + int sc_rsegs; /* segment count returned by bus_dmamem_alloc() */ + void * sc_dma_kva; /* kernel VA of the DMA buffer */ +}; + +#endif /* _SBUSFPGA_CURVE25519ENGINE_H_ */ diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_sdram.c b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_sdram.c new file mode 100644 index 0000000..c99832b --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_sdram.c @@ -0,0 +1,1898 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2021 Romain Dolbeau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include + + #include + +int sbusfpga_sdram_match(device_t, cfdata_t, void *); +void sbusfpga_sdram_attach(device_t, device_t, void *); + +CFATTACH_DECL_NEW(sbusfpga_sdram, sizeof(struct sbusfpga_sdram_softc), + sbusfpga_sdram_match, sbusfpga_sdram_attach, NULL, NULL); + +dev_type_open(sbusfpga_sdram_open); +dev_type_close(sbusfpga_sdram_close); +dev_type_read(sbusfpga_sdram_read); +dev_type_write(sbusfpga_sdram_write); +dev_type_ioctl(sbusfpga_sdram_ioctl); +dev_type_strategy(sbusfpga_sdram_strategy); +dev_type_size(sbusfpga_sdram_size); + +const struct bdevsw sbusfpga_sdram_bdevsw = { + .d_open = sbusfpga_sdram_open, + .d_close = sbusfpga_sdram_close, + .d_strategy = sbusfpga_sdram_strategy, + .d_ioctl = sbusfpga_sdram_ioctl, + .d_dump = nodump, + .d_psize = sbusfpga_sdram_size, + .d_discard = nodiscard, + .d_flag = D_DISK +}; + +const struct cdevsw sbusfpga_sdram_cdevsw = { + .d_open = sbusfpga_sdram_open, + .d_close = sbusfpga_sdram_close, + .d_read = sbusfpga_sdram_read, + .d_write = sbusfpga_sdram_write, + .d_ioctl = sbusfpga_sdram_ioctl, + .d_stop = nostop, + .d_tty = notty, + .d_poll = nopoll, + .d_mmap = nommap, + .d_kqfilter = nokqfilter, + .d_discard = nodiscard, + .d_flag = 0 +}; + +static void sbusfpga_sdram_set_geometry(struct sbusfpga_sdram_softc *sc); +static void sbusfpga_sdram_minphys(struct buf *); +static int sbusfpga_sdram_diskstart(device_t self, struct buf *bp); + +struct dkdriver sbusfpga_sdram_dkdriver = { + .d_strategy = sbusfpga_sdram_strategy, + .d_minphys = sbusfpga_sdram_minphys, + .d_diskstart = sbusfpga_sdram_diskstart +}; + +extern struct cfdriver sbusfpga_sdram_cd; + +static int sbusfpga_sdram_read_block(struct sbusfpga_sdram_softc *sc, const u_int32_t block, const u_int32_t blkcnt, void *data); 
+static int sbusfpga_sdram_write_block(struct sbusfpga_sdram_softc *sc, const u_int32_t block, const u_int32_t blkcnt, void *data); +/* Argument structure of the SBUSFPGA_READ_PG / SBUSFPGA_WRITE_PG ioctls defined below. */ +struct sbusfpga_sdram_rwpg { + u_int32_t pgdata[512]; + u_int32_t checksum[8]; + u_int32_t checksumbis[8]; + u_int32_t pgnum; + u_int32_t last_blk; + u_int32_t last_dma; + u_int32_t dma_wrdone; +}; +#define SBUSFPGA_READ_PG _IOWR('X', 0, struct sbusfpga_sdram_rwpg) +#define SBUSFPGA_WRITE_PG _IOWR('X', 1, struct sbusfpga_sdram_rwpg) + +static inline void exchange_with_mem_checksum_read(struct sbusfpga_sdram_softc *sc, uint32_t* data); +static inline void exchange_with_mem_checksum_write(struct sbusfpga_sdram_softc *sc, uint32_t* data); +/* Open entry point: looks up the softc for DISKUNIT(dev); returns ENXIO if there is none, ENODEV if the device is not active, otherwise the result of dk_open(). */ +int +sbusfpga_sdram_open(dev_t dev, int flag, int fmt, struct lwp *l) +{ + struct sbusfpga_sdram_softc *sd = device_lookup_private(&sbusfpga_sdram_cd, DISKUNIT(dev)); + struct dk_softc *dksc; + int error = 0; + + if (sd == NULL) { + aprint_error("%s:%d: sd == NULL! giving up\n", __PRETTY_FUNCTION__, __LINE__); + return (ENXIO); + } else { + aprint_normal("%s:%d: open device, part is %d\n", __PRETTY_FUNCTION__, __LINE__, DISKPART(dev)); + } + dksc = &sd->dk; + + if (!device_is_active(dksc->sc_dev)) { + return (ENODEV); + } + + error = dk_open(dksc, dev, flag, fmt, l); + + return error; +} +/* Close entry point: returns ENXIO if no softc exists for DISKUNIT(dev), otherwise the result of dk_close(). */ +int +sbusfpga_sdram_close(dev_t dev, int flag, int fmt, struct lwp *l) +{ + struct sbusfpga_sdram_softc *sd = device_lookup_private(&sbusfpga_sdram_cd, DISKUNIT(dev)); + struct dk_softc *dksc; + int error = 0; + + if (sd == NULL) { + aprint_error("%s:%d: sd == NULL! 
giving up\n", __PRETTY_FUNCTION__, __LINE__); + return (ENXIO); + } + + dksc = &sd->dk; + + error = dk_close(dksc, dev, flag, fmt, l); + + return error; +} +/* Character-device read: hands the request to physio() with this driver's strategy and minphys routines. */ +int +sbusfpga_sdram_read(dev_t dev, struct uio *uio, int flags) +{ + return physio(sbusfpga_sdram_strategy, NULL, dev, B_READ, sbusfpga_sdram_minphys, uio); +} +/* Character-device write: same as the read path, with B_WRITE. */ +int +sbusfpga_sdram_write(dev_t dev, struct uio *uio, int flags) +{ + return physio(sbusfpga_sdram_strategy, NULL, dev, B_WRITE, sbusfpga_sdram_minphys, uio); +} +/* Match routine: returns non-zero only for the SBus node named "RDOL,sdram". */ +int +sbusfpga_sdram_match(device_t parent, cfdata_t cf, void *aux) +{ + struct sbus_attach_args *sa = (struct sbus_attach_args *)aux; + + return (strcmp("RDOL,sdram", sa->sa_name) == 0); +} + +int +sdram_init(struct sbusfpga_sdram_softc *sc); + +int +dma_init(struct sbusfpga_sdram_softc *sc); + +int +dma_memtest(struct sbusfpga_sdram_softc *sc); + +/* + * Attach all the sub-devices we can find + */ +void +sbusfpga_sdram_attach(device_t parent, device_t self, void *aux) +{ + struct sbus_attach_args *sa = aux; + struct sbusfpga_sdram_softc *sc = device_private(self); + struct sbus_softc *sbsc = device_private(parent); + int node; + int sbusburst; + + sc->sc_bustag = sa->sa_bustag; + sc->sc_dmatag = sa->sa_dmatag; + sc->dk.sc_dev = self; + + aprint_normal("\n"); + + if (sa->sa_nreg < 3) { + aprint_error(": Not enough registers spaces\n"); + return; + } + + /* map DDR PHY */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[0].oa_space /* sa_slot */, + sa->sa_reg[0].oa_base /* sa_offset */, + sa->sa_reg[0].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_ddrphy) != 0) { + aprint_error(": cannot map DDR PHY registers\n"); + return; + } else { + aprint_normal_dev(self, "DDR PHY registers @ %p\n", (void*)sc->sc_bhregs_ddrphy); + } + /* map SDRAM DFII */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[1].oa_space /* sa_slot */, + sa->sa_reg[1].oa_base /* sa_offset */, + sa->sa_reg[1].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_sdram) != 0) { + aprint_error(": 
cannot map SDRAM DFII registers\n"); + return; + } else { + aprint_normal_dev(self, "SDRAM DFII registers @ %p\n", (void*)sc->sc_bhregs_sdram); + } + /* custom DMA */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[2].oa_space /* sa_slot */, + sa->sa_reg[2].oa_base /* sa_offset */, + sa->sa_reg[2].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_exchange_with_mem) != 0) { + aprint_error(": cannot map DMA registers\n"); + return; + } else { + aprint_normal_dev(self, "DMA registers @ %p\n", (void*)sc->sc_bhregs_exchange_with_mem); + } +#if 0 + if (sa->sa_nreg >= 4) { + /* if we map some of the memory itself */ + /* normally disabled, it's a debug feature */ + if (sbus_bus_map(sc->sc_bustag, + sa->sa_reg[3].oa_space /* sa_slot */, + sa->sa_reg[3].oa_base /* sa_offset */, + sa->sa_reg[3].oa_size /* sa_size */, + BUS_SPACE_MAP_LINEAR, + &sc->sc_bhregs_mmap) != 0) { + aprint_error(": cannot map MMAP\n"); + return; + } else { + aprint_normal_dev(self, "MMAP @ %p\n", (void*)sc->sc_bhregs_mmap); + } + sc->sc_bufsiz_mmap = sa->sa_reg[3].oa_size; + } else { + sc->sc_bufsiz_mmap = 0; + } +#else + sc->sc_bufsiz_mmap = 0; +#endif + + sc->sc_bufsiz_ddrphy = sa->sa_reg[0].oa_size; + sc->sc_bufsiz_sdram = sa->sa_reg[1].oa_size; + sc->sc_bufsiz_exchange_with_mem = sa->sa_reg[2].oa_size; + + node = sc->sc_node = sa->sa_node; + + /* + * Get transfer burst size from PROM + */ + sbusburst = sbsc->sc_burst; + if (sbusburst == 0) + sbusburst = SBUS_BURST_32 - 1; /* 1->16 */ + + sc->sc_burst = prom_getpropint(node, "burst-sizes", -1); + if (sc->sc_burst == -1) + /* take SBus burst sizes */ + sc->sc_burst = sbusburst; + + /* Clamp at parent's burst sizes */ + sc->sc_burst &= sbusburst; + + aprint_normal_dev(self, "nid 0x%x, bustag %p, burst 0x%x (parent 0x%0x)\n", + sc->sc_node, + sc->sc_bustag, + sc->sc_burst, + sbsc->sc_burst); + + if (!sdram_init(sc)) { + aprint_error_dev(self, "couldn't initialize SDRAM\n"); + return; + } + + if (!dma_init(sc)) { + aprint_error_dev(self, 
"couldn't initialize DMA for SDRAM\n"); + return; + } + + if (!dma_memtest(sc)) { + aprint_error_dev(self, "DMA-MEMTEST failed for SDRAM\n"); + return; + } + + /* we seem OK hardware-wise */ + dk_init(&sc->dk, self, DKTYPE_FLASH); + disk_init(&sc->dk.sc_dkdev, device_xname(sc->dk.sc_dev), &sbusfpga_sdram_dkdriver); + dk_attach(&sc->dk); + disk_attach(&sc->dk.sc_dkdev); + sbusfpga_sdram_set_geometry(sc); + + bufq_alloc(&sc->dk.sc_bufq, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK); /* needed ? */ + if (0) { + struct disklabel *lp = sc->dk.sc_dkdev.dk_label; + struct cpu_disklabel *clp = sc->dk.sc_dkdev.dk_cpulabel; + memset(lp, 0, sizeof(struct disklabel)); + memset(clp, 0, sizeof(struct cpu_disklabel)); + + lp->d_type = DKTYPE_FLASH; + lp->d_secsize = 512; + lp->d_nsectors = 4; + lp->d_ntracks = 2; + lp->d_ncylinders = sc->dma_real_mem_size / (lp->d_secsize * lp->d_nsectors * lp->d_ntracks); + lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors; + lp->d_secperunit = lp->d_secpercyl * lp->d_ncylinders; + lp->d_rpm = 3600; + + strncpy(lp->d_typename, "sdramdisk", sizeof(lp->d_typename)); + strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); + lp->d_interleave = 0; + + lp->d_partitions[0].p_offset = lp->d_secpercyl * lp->d_secsize; + lp->d_partitions[0].p_size = lp->d_secpercyl * (lp->d_ncylinders - 1); + lp->d_partitions[0].p_fstype = FS_SWAP; + + lp->d_partitions[RAW_PART].p_offset = 0; + lp->d_partitions[RAW_PART].p_size = lp->d_secpercyl * lp->d_ncylinders; + lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED; + lp->d_npartitions = RAW_PART + 1; + + lp->d_magic = DISKMAGIC; + lp->d_magic2 = DISKMAGIC; + lp->d_checksum = dkcksum(lp); + } + + /* + aprint_normal_dev(self, "sc->dk.sc_dkdev.dk_blkshift = %d\n", sc->dk.sc_dkdev.dk_blkshift); + aprint_normal_dev(self, "sc->dk.sc_dkdev.dk_byteshift = %d\n", sc->dk.sc_dkdev.dk_byteshift); + aprint_normal_dev(self, "sc->dk.sc_dkdev.dk_label = %p\n", sc->dk.sc_dkdev.dk_label); + aprint_normal_dev(self, 
"sc->dk.sc_dkdev.dk_cpulabel = %p\n", sc->dk.sc_dkdev.dk_cpulabel); + */ +} + +void +sbusfpga_sdram_strategy(struct buf *bp) +{ + struct sbusfpga_sdram_softc *sc = device_lookup_private(&sbusfpga_sdram_cd, DISKUNIT(bp->b_dev)); + + dk_strategy(&sc->dk, bp); +} + +static void sbusfpga_sdram_set_geometry(struct sbusfpga_sdram_softc *sc) { + struct dk_softc *dksc = &sc->dk; + struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; + + memset(dg, 0, sizeof(*dg)); + + dg->dg_secsize = 512; + dg->dg_nsectors = 2; + dg->dg_ntracks = 4; + dg->dg_ncylinders = sc->dma_real_mem_size / (dg->dg_secsize * dg->dg_nsectors * dg->dg_ntracks); + dg->dg_secpercyl = dg->dg_nsectors * dg->dg_ntracks; + dg->dg_secperunit = dg->dg_secpercyl * dg->dg_ncylinders; + dg->dg_pcylinders = dg->dg_ncylinders; + dg->dg_sparespertrack = 0; + dg->dg_sparespercyl = 0; + + disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, "sbusfpga_sdram"); +} + +int +sbusfpga_sdram_size(dev_t dev) { + struct sbusfpga_sdram_softc *sc = device_lookup_private(&sbusfpga_sdram_cd, DISKUNIT(dev)); + return sc->dma_real_mem_size / 512; +} + +static void +sbusfpga_sdram_minphys(struct buf *bp) +{ + if (bp->b_bcount > SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ) + bp->b_bcount = SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ; +} + + +static int +sbusfpga_sdram_diskstart(device_t self, struct buf *bp) +{ + struct sbusfpga_sdram_softc *sc = device_private(self); + int err = 0; + if (sc == NULL) { + aprint_error("%s:%d: sc == NULL! 
giving up\n", __PRETTY_FUNCTION__, __LINE__); + err = EINVAL; + goto done; + } + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: part %d\n", __PRETTY_FUNCTION__, __LINE__, DISKPART(bp->b_dev)); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bflags = 0x%08x\n", __PRETTY_FUNCTION__, __LINE__, bp->b_flags); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bufsize = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_bufsize); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_blkno = %lld\n", __PRETTY_FUNCTION__, __LINE__, bp->b_blkno); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_rawblkno = %lld\n", __PRETTY_FUNCTION__, __LINE__, bp->b_rawblkno); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bcount = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_bcount); */ + + bp->b_resid = bp->b_bcount; + + if (bp->b_bcount == 0) { + goto done; + } + + /* + { + paddr_t pap; + pmap_t pk = pmap_kernel(); + if (pmap_extract(pk, (vaddr_t)bp->b_data, &pap)) { + aprint_normal_dev(sc->dk.sc_dev, "KVA %p mapped to PA 0x%08lx\n", bp->b_data, pap); + if (bp->b_bcount > 4096) { + u_int32_t np = (bp->b_bcount + 4095) / 4096; + u_int32_t pn; + for (pn = 1 ; pn < np ; pn ++) { + paddr_t papn; + if (pmap_extract(pk, (vaddr_t)bp->b_data + pn * 4096, &papn)) { + if (papn != (pap + pn * 4096)) + break; + } else break; + } + aprint_normal_dev(sc->dk.sc_dev, "And we have %u out %u consecutive PA pages\n", pn, np); + } + } else { + aprint_normal_dev(sc->dk.sc_dev, "KVA %p not mapped\n", bp->b_data); + } + } + */ + + if (bp->b_flags & B_READ) { + unsigned char* data = bp->b_data; + daddr_t blk = bp->b_rawblkno; + /* struct partition *p = NULL; */ + + /* if (DISKPART(bp->b_dev) != RAW_PART) { */ + /* if ((err = bounds_check_with_label(&sc->dk.sc_dkdev, bp, 0)) <= 0) { */ + /* aprint_error("%s:%d: bounds_check_with_label -> %d\n", __PRETTY_FUNCTION__, __LINE__, err); */ + /* bp->b_resid = bp->b_bcount; */ + /* goto done; */ + /* } */ + /* p = 
&sc->dk.sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)]; */ + /* blk = bp->b_blkno + p->p_offset; */ + /* } */ + + while (bp->b_resid >= 512 && !err) { + u_int32_t blkcnt = bp->b_resid / 512; + + if (blkcnt > (SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ/512)) + blkcnt = (SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ/512); + + if (blk+blkcnt <= (sc->dma_real_mem_size / 512)) { + err = sbusfpga_sdram_read_block(sc, blk, blkcnt, data); + } else { + aprint_error("%s:%d: blk = %lld read out of range! giving up\n", __PRETTY_FUNCTION__, __LINE__, blk); + err = EINVAL; + break; + } + blk += blkcnt; + data += 512 * blkcnt; + bp->b_resid -= 512 * blkcnt; + } + } else { + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: part %d\n", __PRETTY_FUNCTION__, __LINE__, DISKPART(bp->b_dev)); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bflags = 0x%08x\n", __PRETTY_FUNCTION__, __LINE__, bp->b_flags); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bufsize = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_bufsize); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_blkno = %lld\n", __PRETTY_FUNCTION__, __LINE__, bp->b_blkno); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_rawblkno = %lld\n", __PRETTY_FUNCTION__, __LINE__, bp->b_rawblkno); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_bcount = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_bcount); */ + unsigned char* data = bp->b_data; + daddr_t blk = bp->b_rawblkno; + /* struct partition *p = NULL; */ + + /* if (DISKPART(bp->b_dev) != RAW_PART) { */ + /* if (bounds_check_with_label(&sc->dk.sc_dkdev, bp, 0) <= 0) { */ + /* bp->b_resid = bp->b_bcount; */ + /* goto done; */ + /* } */ + /* p = &sc->dk.sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)]; */ + /* blk = bp->b_blkno + p->p_offset; */ + /* } */ + + while (bp->b_resid >= 512 && !err) { + u_int32_t blkcnt = bp->b_resid / 512; + + if (blkcnt > (SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ/512)) + blkcnt = (SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ/512); + + if (blk+blkcnt <= (sc->dma_real_mem_size / 
512)) { + err = sbusfpga_sdram_write_block(sc, blk, blkcnt, data); + } else { + aprint_error("%s:%d: blk = %lld write out of range! giving up\n", __PRETTY_FUNCTION__, __LINE__, blk); + err = EINVAL; + break; + } + blk += blkcnt; + data += 512 * blkcnt; + bp->b_resid -= 512 * blkcnt; + } + } + + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_resid = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_resid); */ + /* aprint_normal_dev(sc->dk.sc_dev, "%s:%d: bp->b_error = %d\n", __PRETTY_FUNCTION__, __LINE__, bp->b_error); */ + + done: + biodone(bp); + return err; +} + + +#define CONFIG_CSR_DATA_WIDTH 32 +// define CSR_LEDS_BASE & others to avoid defining the CSRs of HW we don't handle +#define CSR_LEDS_BASE +#define CSR_CURVE25519ENGINE_BASE +//#define CSR_DDRPHY_BASE +//#define CSR_SDRAM_BASE +//#define CSR_EXCHANGE_WITH_MEM_BASE +#define CSR_SBUS_BUS_STAT_BASE +#define CSR_SDBLOCK2MEM_BASE +#define CSR_SDCORE_BASE +#define CSR_SDIRQ_BASE +#define CSR_SDMEM2BLOCK_BASE +#define CSR_SDPHY_BASE +#define CSR_TRNG_BASE + +/* grrr */ +#define sbusfpga_exchange_with_mem_softc sbusfpga_sdram_softc +#define sbusfpga_ddrphy_softc sbusfpga_sdram_softc + +#include "dev/sbus/litex_csr.h" +#undef CSR_LEDS_BASE +#undef CSR_CURVE25519ENGINE_BASE +//#undef CSR_DDRPHY_BASE +//#undef CSR_SDRAM_BASE +//#undef CSR_EXCHANGE_WITH_MEM_BASE +#undef CSR_SBUS_BUS_STAT_BASE +#undef CSR_SDBLOCK2MEM_BASE +#undef CSR_SDCORE_BASE +#undef CSR_SDIRQ_BASE +#undef CSR_SDMEM2BLOCK_BASE +#undef CSR_SDPHY_BASE +#undef CSR_TRNG_BASE + +/* not yet generated */ +static inline void exchange_with_mem_checksum_read(struct sbusfpga_sdram_softc *sc, uint32_t* data) { + int i; + for (i = 0 ; i < 8 ; i++) { // FIXME + data[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 4*i+(CSR_EXCHANGE_WITH_MEM_CHECKSUM_ADDR - CSR_EXCHANGE_WITH_MEM_BASE)); + } +} +static inline void exchange_with_mem_checksum_write(struct sbusfpga_sdram_softc *sc, uint32_t* data) { + int i; + for (i = 0 ; i < 8 ; i++) { // FIXME 
+ bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 4*i+(CSR_EXCHANGE_WITH_MEM_CHECKSUM_ADDR - CSR_EXCHANGE_WITH_MEM_BASE), data[i]); + } +} + +int +sbusfpga_sdram_ioctl (dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) +{ + struct sbusfpga_sdram_softc *sc = device_lookup_private(&sbusfpga_sdram_cd, DISKUNIT(dev)); + int err = 0;//, err2 = 0; + + if (sc == NULL) { + aprint_error("%s:%d: sc == NULL! giving up\n", __PRETTY_FUNCTION__, __LINE__); + return (ENXIO); + } + + switch (cmd) { + case SBUSFPGA_READ_PG: { + struct sbusfpga_sdram_rwpg* pg = (struct sbusfpga_sdram_rwpg*)data; + exchange_with_mem_checksum_write(sc, pg->checksum); + err = sbusfpga_sdram_read_block(sc, pg->pgnum * 4, 4, pg->pgdata); + exchange_with_mem_checksum_read(sc, pg->checksum); + delay(1); + exchange_with_mem_checksum_read(sc, pg->checksumbis); + pg->last_blk = exchange_with_mem_last_blk_read(sc); + pg->last_dma = exchange_with_mem_last_dma_read(sc); + pg->dma_wrdone = exchange_with_mem_dma_wrdone_read(sc); + if (err != 0) + err = EIO; + goto done; + } + case SBUSFPGA_WRITE_PG: { + struct sbusfpga_sdram_rwpg* pg = (struct sbusfpga_sdram_rwpg*)data; + exchange_with_mem_checksum_write(sc, pg->checksum); + err = sbusfpga_sdram_write_block(sc, pg->pgnum * 4, 4, pg->pgdata); + exchange_with_mem_checksum_read(sc, pg->checksum); + delay(1); + exchange_with_mem_checksum_read(sc, pg->checksumbis); + pg->last_blk = exchange_with_mem_last_blk_read(sc); + pg->last_dma = exchange_with_mem_last_dma_read(sc); + pg->dma_wrdone = exchange_with_mem_dma_wrdone_read(sc); + if (err != 0) + err = EIO; + goto done; + } + } + + err = dk_ioctl(&sc->dk, dev, cmd, data, flag, l); + /*if (err2 != EPASSTHROUGH) + err = err2; + else + err = ENOTTY;*/ + + done: + return err; +} + +#define DMA_STATUS_CHECK_BITS (0x01F) + +int +dma_init(struct sbusfpga_sdram_softc *sc) { + sc->dma_blk_size = exchange_with_mem_blk_size_read(sc); + sc->dma_blk_base = exchange_with_mem_blk_base_read(sc); + 
sc->dma_mem_size = exchange_with_mem_mem_size_read(sc); + sc->dma_real_mem_size = sc->dma_mem_size * sc->dma_blk_size; + aprint_normal_dev(sc->dk.sc_dev, "DMA: HW -> block size is %d, base address is 0x%08x (%d MiB)\n", + sc->dma_blk_size, + sc->dma_blk_base * sc->dma_blk_size, + sc->dma_real_mem_size / 1048576); + + /* Allocate a dmamap */ + if (bus_dmamap_create(sc->sc_dmatag, SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ, 1, SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ, 0, BUS_DMA_NOWAIT | BUS_DMA_ALLOCNOW, &sc->sc_dmamap) != 0) { + aprint_error_dev(sc->dk.sc_dev, "DMA map create failed\n"); + return 0; + } else { + aprint_normal_dev(sc->dk.sc_dev, "dmamap: %lu %lu %d (%p)\n", sc->sc_dmamap->dm_maxsegsz, sc->sc_dmamap->dm_mapsize, sc->sc_dmamap->dm_nsegs, sc->sc_dmatag->_dmamap_load); + } + + if (bus_dmamem_alloc(sc->sc_dmatag, SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ, 64, 64, &sc->sc_segs, 1, &sc->sc_rsegs, BUS_DMA_NOWAIT | BUS_DMA_STREAMING)) { + aprint_error_dev(sc->dk.sc_dev, "cannot allocate DVMA memory"); + bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap); + return 0; + } + + if (bus_dmamem_map(sc->sc_dmatag, &sc->sc_segs, 1, SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ, &sc->sc_dma_kva, BUS_DMA_NOWAIT)) { + aprint_error_dev(sc->dk.sc_dev, "cannot allocate DVMA address"); + bus_dmamem_free(sc->sc_dmatag, &sc->sc_segs, 1); + bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap); + return 0; + } + + if (bus_dmamap_load(sc->sc_dmatag, sc->sc_dmamap, sc->sc_dma_kva, SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ, /* kernel space */ NULL, + BUS_DMA_NOWAIT | BUS_DMA_STREAMING | BUS_DMA_WRITE)) { + aprint_error_dev(sc->dk.sc_dev, "cannot load dma map"); + bus_dmamem_unmap(sc->sc_dmatag, &sc->sc_dma_kva, SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ); + bus_dmamem_free(sc->sc_dmatag, &sc->sc_segs, 1); + bus_dmamap_destroy(sc->sc_dmatag, sc->sc_dmamap); + return 0; + } + + aprint_normal_dev(sc->dk.sc_dev, "DMA: SW -> kernel address is %p, dvma address is 0x%08llx\n", sc->sc_dma_kva, sc->sc_dmamap->dm_segs[0].ds_addr); + + return 1; +} + +/* tuned on my 
SPARCstation 20 with 25 MHz SBus & 2*SM61 */ +/* asynchronous would be better ... */ +#define DEF_BLK_DELAY 14 + +static inline unsigned long +lfsr (unsigned long bits, unsigned long prev); +int +dma_memtest(struct sbusfpga_sdram_softc *sc) { + unsigned long *kva_ulong = (unsigned long*)sc->sc_dma_kva; + unsigned long val; + unsigned int blkn = 0; // 113; + const unsigned int testdatasize = 4096; + unsigned int blkcnt ; + int count; + + aprint_normal_dev(sc->dk.sc_dev, "Initializing DMA buffer.\n"); + + val = 0xDEADBEEF; + for (int i = 0 ; i < testdatasize/sizeof(unsigned long) ; i++) { + val = lfsr(32, val); + kva_ulong[i] = val; + } + aprint_normal_dev(sc->dk.sc_dev, "First / last value: 0x%08lx 0x%08lx\n", kva_ulong[0], kva_ulong[(testdatasize/sizeof(unsigned long))-1]); + +#if 0 + if (sc->sc_bufsiz_mmap > 0) { + int idx = blkn * sc->dma_blk_size / sizeof(unsigned long), x; + int bound = sc->sc_bufsiz_mmap / sizeof(unsigned long); + if (bound > idx) { + if ((bound - idx) > 10) + bound = idx + 10; + count = 0; + for (x = idx ; x < bound; x++) { + unsigned long data = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_mmap, x*sizeof(unsigned long)); + aprint_normal_dev(sc->dk.sc_dev, "Prior to write [mmap] at %d: 0x%08lx\n", x, data); + } + } + } +#endif + + bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, 4096, BUS_DMASYNC_PREREAD); + + aprint_normal_dev(sc->dk.sc_dev, "Starting DMA Write-to-Sdram.\n"); + + exchange_with_mem_blk_addr_write(sc, blkn + sc->dma_blk_base); + exchange_with_mem_dma_addr_write(sc, sc->sc_dmamap->dm_segs[0].ds_addr); + exchange_with_mem_blk_cnt_write(sc, 0x80000000 | (testdatasize / sc->dma_blk_size)); + + aprint_normal_dev(sc->dk.sc_dev, "DMA Write-to-Sdram started, polling\n"); + + bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, 4096, BUS_DMASYNC_POSTREAD); + + delay(DEF_BLK_DELAY * 8); + + count = 0; + while (((blkcnt = exchange_with_mem_blk_cnt_read(sc)) != 0) && (count < 10)) { + aprint_normal_dev(sc->dk.sc_dev, "DMA Write-to-Sdram 
ongoing (%u, status 0x%08x, lastblk req 0x%08x, last phys addr written 0x%08x)\n", + blkcnt & 0x0000FFFF, + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_wr_tosdram_read(sc)); + count ++; + delay(DEF_BLK_DELAY); + } + + if (blkcnt) { + aprint_error_dev(sc->dk.sc_dev, "DMA Write-to-Sdram didn't finish ? (%u, status 0x%08x, 0x%08x, 0x%08x, lastblk req 0x%08x, last phys addr written 0x%08x)\n", + blkcnt & 0x0000FFFF, + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_wr_tosdram_read(sc)); + return 0; + } else { + aprint_normal_dev(sc->dk.sc_dev, "DMA Write-to-Sdram done (status 0x%08x, 0x%08x, 0x%08x, 0x%08x, last phys addr written 0x%08x)\n", + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc), + exchange_with_mem_wr_tosdram_read(sc)); + } + + count = 0; + while ((((blkcnt = exchange_with_mem_dma_status_read(sc)) & DMA_STATUS_CHECK_BITS) != 0) && (count < 10)) { + aprint_normal_dev(sc->dk.sc_dev, "DMA Write-to-Sdram hasn't reached SDRAM yet (status 0x%08x)\n", blkcnt); + count ++; + delay(DEF_BLK_DELAY); + } + + if (blkcnt & DMA_STATUS_CHECK_BITS) { + aprint_error_dev(sc->dk.sc_dev, "DMA Write-to-Sdram can't reach SDRAM ? 
(%u, status 0x%08x, 0x%08x, 0x%08x, 0x%08x)\n", blkcnt & 0x0000FFFF, + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc)); + return 0; + } else { + aprint_normal_dev(sc->dk.sc_dev, "DMA Write-to-Sdram has reached SDRAM (status 0x%08x, 0x%08x, 0x%08x, 0x%08x)\n", + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc)); + } + +#if 0 + if (sc->sc_bufsiz_mmap > 0) { + int idx = blkn * sc->dma_blk_size / sizeof(unsigned long), x; + int bound = sc->sc_bufsiz_mmap / sizeof(unsigned long); + if (bound > idx) { + count = 0; + val = 0xDEADBEEF; + if ((bound - idx) > (testdatasize / sizeof(unsigned long))) + bound = idx + (testdatasize / sizeof(unsigned long)); + for (x = idx ; x < bound && count < 10; x++) { + unsigned long data = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_mmap, x*sizeof(unsigned long)); + val = lfsr(32, val); + if (val != data) { + aprint_error_dev(sc->dk.sc_dev, "Read-after-write [mmap] error at %d: 0x%08lx vs. 
0x%08lx (0x%08lx)\n", x, data, val, val ^ data); + count ++; + } + } + } + } +#endif + + for (int i = 0 ; i < testdatasize/sizeof(unsigned long) ; i++) { + kva_ulong[i] = 0x0c0ffee0; + } + aprint_normal_dev(sc->dk.sc_dev, "First / last value: 0x%08lx 0x%08lx\n", kva_ulong[0], kva_ulong[(testdatasize/sizeof(unsigned long))-1]); + + bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, 4096, BUS_DMASYNC_PREWRITE); + + aprint_normal_dev(sc->dk.sc_dev, "Starting DMA Read-from-Sdram.\n"); + + exchange_with_mem_blk_addr_write(sc, blkn + sc->dma_blk_base); + exchange_with_mem_dma_addr_write(sc, sc->sc_dmamap->dm_segs[0].ds_addr); + exchange_with_mem_blk_cnt_write(sc, 0x00000000 | (testdatasize / sc->dma_blk_size)); + + aprint_normal_dev(sc->dk.sc_dev, "DMA Read-from-Sdram started, polling\n"); + + bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, 4096, BUS_DMASYNC_POSTWRITE); + + delay(DEF_BLK_DELAY * 8); + + count = 0; + while (((blkcnt = exchange_with_mem_blk_cnt_read(sc)) != 0) && (count < 10)) { + aprint_normal_dev(sc->dk.sc_dev, "DMA Read-from-Sdram ongoing (%u, status 0x%08x)\n", blkcnt & 0x0000FFFF, exchange_with_mem_dma_status_read(sc)); + count ++; + delay(DEF_BLK_DELAY); + } + + if (blkcnt) { + aprint_error_dev(sc->dk.sc_dev, "DMA Read-from-Sdram didn't finish ? 
(%u, status 0x%08x, 0x%08x, 0x%08x, 0x%08x)\n", + blkcnt & 0x0000FFFF, + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc)); + return 0; + } else { + aprint_normal_dev(sc->dk.sc_dev, "DMA Read-from-Sdram done (status 0x%08x, 0x%08x, 0x%08x, 0x%08x)\n", + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc)); + } + + count = 0; + while ((((blkcnt = exchange_with_mem_dma_status_read(sc)) & DMA_STATUS_CHECK_BITS) != 0) && (count < 10)) { + aprint_normal_dev(sc->dk.sc_dev, "DMA Read-from-Sdram hasn't reached memory yet (status 0x%08x)\n", blkcnt); + count ++; + delay(DEF_BLK_DELAY); + } + + aprint_normal_dev(sc->dk.sc_dev, "First /last value: 0x%08lx 0x%08lx\n", kva_ulong[0], kva_ulong[(testdatasize/sizeof(unsigned long))-1]); + + if (blkcnt & DMA_STATUS_CHECK_BITS) { + aprint_error_dev(sc->dk.sc_dev, "DMA Read-from-Sdram can't reach memory ? (%u, status 0x%08x, 0x%08x, 0x%08x, 0x%08x)\n", blkcnt & 0x0000FFFF, + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc)); + return 0; + } else { + aprint_normal_dev(sc->dk.sc_dev, "DMA Read-from-Sdram has reached memory (status 0x%08x, 0x%08x, 0x%08x, 0x%08x)\n", + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc)); + } + + count = 0; + val = 0xDEADBEEF; + for (int i = 0 ; i < testdatasize/sizeof(unsigned long) && count < 10; i++) { + val = lfsr(32, val); + if (kva_ulong[i] != val) { + aprint_error_dev(sc->dk.sc_dev, "Read-after-write error at %d: 0x%08lx vs. 
0x%08lx (0x%08lx)\n", i, kva_ulong[i], val, val ^ kva_ulong[i]); + count ++; + } + } + + if (count) + return 0; + + return 1; +} + + +static int sbusfpga_sdram_read_block(struct sbusfpga_sdram_softc *sc, const u_int32_t block, const u_int32_t blkcnt, void *data) { + int res = 0; + int count; + unsigned int check; + + bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, blkcnt * 512, BUS_DMASYNC_PREWRITE); + + exchange_with_mem_blk_addr_write(sc, sc->dma_blk_base + (block * 512 / sc->dma_blk_size) ); + exchange_with_mem_dma_addr_write(sc, sc->sc_dmamap->dm_segs[0].ds_addr); + exchange_with_mem_blk_cnt_write(sc, 0x00000000 | (blkcnt * 512 / sc->dma_blk_size) ); + + delay(DEF_BLK_DELAY * blkcnt); + + count = 0; + while (((check = exchange_with_mem_blk_cnt_read(sc)) != 0) && (count < (4*blkcnt))) { + count ++; + delay(DEF_BLK_DELAY); + } + + if (check) { + aprint_error_dev(sc->dk.sc_dev, "DMA didn't finish ? (%u, status 0x%08x, 0x%08x, 0x%08x, lastblk req 0x%08x, last phys addr written 0x%08x)\n", + check & 0x0000FFFF, + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_wr_tosdram_read(sc)); + return ENXIO; + } +#if 0 + else { + aprint_normal_dev(sc->dk.sc_dev, "DMA READ finish for %d blk in %d attempts.\n", blkcnt, count); + } +#endif + + count = 0; + while ((((check = exchange_with_mem_dma_status_read(sc)) & DMA_STATUS_CHECK_BITS) != 0) && (count < blkcnt)) { + //aprint_normal_dev(sc->dk.sc_dev, "DMA Write-to-Sdram hasn't reached SDRAM yet (status 0x%08x)\n", check); + count ++; + delay(DEF_BLK_DELAY); + } + + if (check & DMA_STATUS_CHECK_BITS) { + aprint_error_dev(sc->dk.sc_dev, "DMA can't reach memory/SDRAM ? 
(%u, status 0x%08x, 0x%08x, 0x%08x, 0x%08x)\n", + check & 0x0000FFFF, + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc)); + return ENXIO; + } + bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, blkcnt * 512, BUS_DMASYNC_POSTWRITE); + + memcpy(data, sc->sc_dma_kva, blkcnt * 512); + + return res; +} + + +static int sbusfpga_sdram_write_block(struct sbusfpga_sdram_softc *sc, const u_int32_t block, const u_int32_t blkcnt, void *data) { + int res = 0; + int count; + unsigned int check; + + memcpy(sc->sc_dma_kva, data, blkcnt * 512); + + bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, blkcnt * 512, BUS_DMASYNC_PREREAD); + + exchange_with_mem_blk_addr_write(sc, sc->dma_blk_base + (block * 512 / sc->dma_blk_size) ); + exchange_with_mem_dma_addr_write(sc, sc->sc_dmamap->dm_segs[0].ds_addr); + exchange_with_mem_blk_cnt_write(sc, 0x80000000 | (blkcnt * 512 / sc->dma_blk_size) ); + + delay(DEF_BLK_DELAY * blkcnt); + + count = 0; + while (((check = exchange_with_mem_blk_cnt_read(sc)) != 0) && (count < (4*blkcnt))) { + count ++; + delay(DEF_BLK_DELAY); + } + + if (check) { + aprint_error_dev(sc->dk.sc_dev, "DMA didn't finish ? 
(%u, status 0x%08x, 0x%08x, 0x%08x, lastblk req 0x%08x, last phys addr written 0x%08x)\n", + check & 0x0000FFFF, + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_wr_tosdram_read(sc)); + return ENXIO; + } +#if 0 + else { + aprint_normal_dev(sc->dk.sc_dev, "DMA WRITE finish for %d blk in %d attempts.\n", blkcnt, count); + } +#endif + + count = 0; + while ((((check = exchange_with_mem_dma_status_read(sc)) & DMA_STATUS_CHECK_BITS) != 0) && (count < blkcnt)) { + //aprint_normal_dev(sc->dk.sc_dev, "DMA Read_from-Sdram hasn't reached SDRAM yet (status 0x%08x)\n", check); + count ++; + delay(DEF_BLK_DELAY); + } + + if (check & DMA_STATUS_CHECK_BITS) { + aprint_error_dev(sc->dk.sc_dev, "DMA can't reach memory/SDRAM ? (%u, status 0x%08x, 0x%08x, 0x%08x, 0x%08x)\n", + check & 0x0000FFFF, + exchange_with_mem_dma_status_read(sc), + exchange_with_mem_last_blk_read(sc), + exchange_with_mem_last_dma_read(sc), + exchange_with_mem_blk_rem_read(sc)); + return ENXIO; + } + bus_dmamap_sync(sc->sc_dmatag, sc->sc_dmamap, 0, blkcnt * 512, BUS_DMASYNC_POSTREAD); + + return res; +} + +/* auto-generated sdram_phy.h + sc */ +#define DFII_CONTROL_SEL 0x01 +#define DFII_CONTROL_CKE 0x02 +#define DFII_CONTROL_ODT 0x04 +#define DFII_CONTROL_RESET_N 0x08 + +#define DFII_COMMAND_CS 0x01 +#define DFII_COMMAND_WE 0x02 +#define DFII_COMMAND_CAS 0x04 +#define DFII_COMMAND_RAS 0x08 +#define DFII_COMMAND_WRDATA 0x10 +#define DFII_COMMAND_RDDATA 0x20 + +#define SDRAM_PHY_A7DDRPHY +#define SDRAM_PHY_XDR 2 +#define SDRAM_PHY_DATABITS 16 +#define SDRAM_PHY_PHASES 4 +#define SDRAM_PHY_CL 6 +#define SDRAM_PHY_CWL 5 +#define SDRAM_PHY_CMD_LATENCY 0 +#define SDRAM_PHY_RDPHASE 2 +#define SDRAM_PHY_WRPHASE 3 +#define SDRAM_PHY_WRITE_LATENCY_CALIBRATION_CAPABLE +#define SDRAM_PHY_READ_LEVELING_CAPABLE +#define SDRAM_PHY_MODULES SDRAM_PHY_DATABITS/8 +#define SDRAM_PHY_DELAYS 32 +#define 
SDRAM_PHY_BITSLIPS 8 + +void cdelay(int i); + +__attribute__((unused)) static inline void command_p0(struct sbusfpga_sdram_softc *sc, int cmd) +{ + sdram_dfii_pi0_command_write(sc, cmd); + sdram_dfii_pi0_command_issue_write(sc, 1); +} +__attribute__((unused)) static inline void command_p1(struct sbusfpga_sdram_softc *sc, int cmd) +{ + sdram_dfii_pi1_command_write(sc, cmd); + sdram_dfii_pi1_command_issue_write(sc, 1); +} +__attribute__((unused)) static inline void command_p2(struct sbusfpga_sdram_softc *sc, int cmd) +{ + sdram_dfii_pi2_command_write(sc, cmd); + sdram_dfii_pi2_command_issue_write(sc, 1); +} +__attribute__((unused)) static inline void command_p3(struct sbusfpga_sdram_softc *sc, int cmd) +{ + sdram_dfii_pi3_command_write(sc, cmd); + sdram_dfii_pi3_command_issue_write(sc, 1); +} + +#define DFII_PIX_DATA_SIZE CSR_SDRAM_DFII_PI0_WRDATA_SIZE + +static inline unsigned long sdram_dfii_pix_wrdata_addr(int phase){ + switch (phase) { + case 0: return CSR_SDRAM_DFII_PI0_WRDATA_ADDR; + case 1: return CSR_SDRAM_DFII_PI1_WRDATA_ADDR; + case 2: return CSR_SDRAM_DFII_PI2_WRDATA_ADDR; + case 3: return CSR_SDRAM_DFII_PI3_WRDATA_ADDR; + default: return 0; + } +} + +static inline unsigned long sdram_dfii_pix_rddata_addr(int phase){ + switch (phase) { + case 0: return CSR_SDRAM_DFII_PI0_RDDATA_ADDR; + case 1: return CSR_SDRAM_DFII_PI1_RDDATA_ADDR; + case 2: return CSR_SDRAM_DFII_PI2_RDDATA_ADDR; + case 3: return CSR_SDRAM_DFII_PI3_RDDATA_ADDR; + default: return 0; + } +} + +#define DDRX_MR_WRLVL_ADDRESS 1 +#define DDRX_MR_WRLVL_RESET 6 +#define DDRX_MR_WRLVL_BIT 7 + +static inline void init_sequence(struct sbusfpga_sdram_softc *sc) +{ + /* Release reset */ + sdram_dfii_pi0_address_write(sc, 0x0); + sdram_dfii_pi0_baddress_write(sc, 0); + sdram_dfii_control_write(sc, DFII_CONTROL_ODT|DFII_CONTROL_RESET_N); + cdelay(50000); + + /* Bring CKE high */ + sdram_dfii_pi0_address_write(sc, 0x0); + sdram_dfii_pi0_baddress_write(sc, 0); + sdram_dfii_control_write(sc, 
DFII_CONTROL_CKE|DFII_CONTROL_ODT|DFII_CONTROL_RESET_N); + cdelay(10000); + + /* Load Mode Register 2, CWL=5 */ + sdram_dfii_pi0_address_write(sc, 0x200); + sdram_dfii_pi0_baddress_write(sc, 2); + command_p0(sc, DFII_COMMAND_RAS|DFII_COMMAND_CAS|DFII_COMMAND_WE|DFII_COMMAND_CS); + + /* Load Mode Register 3 */ + sdram_dfii_pi0_address_write(sc, 0x0); + sdram_dfii_pi0_baddress_write(sc, 3); + command_p0(sc, DFII_COMMAND_RAS|DFII_COMMAND_CAS|DFII_COMMAND_WE|DFII_COMMAND_CS); + + /* Load Mode Register 1 */ + sdram_dfii_pi0_address_write(sc, 0x6); + sdram_dfii_pi0_baddress_write(sc, 1); + command_p0(sc, DFII_COMMAND_RAS|DFII_COMMAND_CAS|DFII_COMMAND_WE|DFII_COMMAND_CS); + + /* Load Mode Register 0, CL=6, BL=8 */ + sdram_dfii_pi0_address_write(sc, 0x920); + sdram_dfii_pi0_baddress_write(sc, 0); + command_p0(sc, DFII_COMMAND_RAS|DFII_COMMAND_CAS|DFII_COMMAND_WE|DFII_COMMAND_CS); + cdelay(200); + + /* ZQ Calibration */ + sdram_dfii_pi0_address_write(sc, 0x400); + sdram_dfii_pi0_baddress_write(sc, 0); + command_p0(sc, DFII_COMMAND_WE|DFII_COMMAND_CS); + cdelay(200); +} + +/* from hw/common.h, +sc */ + +/* CSR data width (subreg. width) in bytes, for direct comparison to sizeof() */ +#define CSR_DW_BYTES (CONFIG_CSR_DATA_WIDTH/8) +#define CSR_OFFSET_BYTES 4 + +/* Number of subregs required for various total byte sizes, by subreg width: + * NOTE: 1, 2, 4, and 8 bytes represent uint[8|16|32|64]_t C types; However, + * CSRs of intermediate bit sizes (24, 40, 48, and 56) are NOT padded + * (with extra unallocated subregisters) to the next valid C type! + * +-----+-----------------+ + * | csr | bytes | + * | _dw | 1 2 3 4 5 6 7 8 | + * | |-----=---=-=-=---| + * | 1 | 1 2 3 4 5 6 7 8 | + * | 2 | 1 1 2 2 3 3 4 4 | + * | 4 | 1 1 1 1 2 2 2 2 | + * | 8 | 1 1 1 1 1 1 1 1 | + * +-----+-----------------+ */ +static inline int num_subregs(int csr_bytes) +{ + return (csr_bytes - 1) / CSR_DW_BYTES + 1; +} + +/* Read a CSR of size 'csr_bytes' located at address 'a'.
*/ +static inline uint64_t _csr_rd(struct sbusfpga_sdram_softc *sc, unsigned long a, int csr_bytes) +{ + uint64_t r = bus_space_read_4(sc->sc_bustag, 0, a); + for (int i = 1; i < num_subregs(csr_bytes); i++) { + r <<= CONFIG_CSR_DATA_WIDTH; + a += CSR_OFFSET_BYTES; + r |= bus_space_read_4(sc->sc_bustag, 0, a); + } + return r; +} + +/* Write value 'v' to a CSR of size 'csr_bytes' located at address 'a'. */ +static inline void _csr_wr(struct sbusfpga_sdram_softc *sc, unsigned long a, uint64_t v, int csr_bytes) +{ + int ns = num_subregs(csr_bytes); + for (int i = 0; i < ns; i++) { + bus_space_write_4(sc->sc_bustag, 0, a , v >> (CONFIG_CSR_DATA_WIDTH * (ns - 1 - i))); + a += CSR_OFFSET_BYTES; + } +} + +// FIXME: - should we provide 24, 40, 48, and 56 bit csr_[rd|wr] methods? + +static inline uint8_t csr_rd_uint8(struct sbusfpga_sdram_softc *sc, unsigned long a) +{ + return _csr_rd(sc, a, sizeof(uint8_t)); +} + +static inline void csr_wr_uint8(struct sbusfpga_sdram_softc *sc, uint8_t v, unsigned long a) +{ + _csr_wr(sc, a, v, sizeof(uint8_t)); +} + +static inline uint16_t csr_rd_uint16(struct sbusfpga_sdram_softc *sc, unsigned long a) +{ + return _csr_rd(sc, a, sizeof(uint16_t)); +} + +static inline void csr_wr_uint16(struct sbusfpga_sdram_softc *sc, uint16_t v, unsigned long a) +{ + _csr_wr(sc, a, v, sizeof(uint16_t)); +} + +static inline uint32_t csr_rd_uint32(struct sbusfpga_sdram_softc *sc, unsigned long a) +{ + return _csr_rd(sc, a, sizeof(uint32_t)); +} + +static inline void csr_wr_uint32(struct sbusfpga_sdram_softc *sc, uint32_t v, unsigned long a) +{ + _csr_wr(sc, a, v, sizeof(uint32_t)); +} + +static inline uint64_t csr_rd_uint64(struct sbusfpga_sdram_softc *sc, unsigned long a) +{ + return _csr_rd(sc, a, sizeof(uint64_t)); +} + +static inline void csr_wr_uint64(struct sbusfpga_sdram_softc *sc, uint64_t v, unsigned long a) +{ + _csr_wr(sc, a, v, sizeof(uint64_t)); +} + +/* Read a CSR located at address 'a' into an array 'buf' of 'cnt' elements. 
+ * + * NOTE: Since CSR_DW_BYTES is a constant here, we might be tempted to further + * optimize things by leaving out one or the other of the if() branches below, + * depending on each unsigned type width; + * However, this code is also meant to serve as a reference for how CSRs are + * to be manipulated by other programs (e.g., an OS kernel), which may benefit + * from dynamically handling multiple possible CSR subregister data widths + * (e.g., by passing a value in through the Device Tree). + * Ultimately, if CSR_DW_BYTES is indeed a constant, the compiler should be + * able to determine on its own whether it can automatically optimize away one + * of the if() branches! */ +#define _csr_rd_buf(sc, a, buf, cnt) \ +{ \ + int i, j, nsubs, n_sub_elem; \ + uint64_t r; \ + if (sizeof(buf[0]) >= CSR_DW_BYTES) { \ + /* one or more subregisters per element */ \ + for (i = 0; i < cnt; i++) { \ + buf[i] = _csr_rd(sc, a, sizeof(buf[0])); \ + a += CSR_OFFSET_BYTES * num_subregs(sizeof(buf[0])); \ + } \ + } else { \ + /* multiple elements per subregister (2, 4, or 8) */ \ + nsubs = num_subregs(sizeof(buf[0]) * cnt); \ + n_sub_elem = CSR_DW_BYTES / sizeof(buf[0]); \ + for (i = 0; i < nsubs; i++) { \ + r = bus_space_read_4(sc->sc_bustag, 0, a); \ + for (j = n_sub_elem - 1; j >= 0; j--) { \ + if (i * n_sub_elem + j < cnt) \ + buf[i * n_sub_elem + j] = r; \ + r >>= sizeof(buf[0]) * 8; \ + } \ + a += CSR_OFFSET_BYTES; \ + } \ + } \ +} + +/* Write an array 'buf' of 'cnt' elements to a CSR located at address 'a'. + * + * NOTE: The same optimization considerations apply here as with _csr_rd_buf() + * above. 
+ */ +#define _csr_wr_buf(sc, a, buf, cnt) \ +{ \ + int i, j, nsubs, n_sub_elem; \ + uint64_t v; \ + if (sizeof(buf[0]) >= CSR_DW_BYTES) { \ + /* one or more subregisters per element */ \ + for (i = 0; i < cnt; i++) { \ + _csr_wr(sc, a, buf[i], sizeof(buf[0])); \ + a += CSR_OFFSET_BYTES * num_subregs(sizeof(buf[0])); \ + } \ + } else { \ + /* multiple elements per subregister (2, 4, or 8) */ \ + nsubs = num_subregs(sizeof(buf[0]) * cnt); \ + n_sub_elem = CSR_DW_BYTES / sizeof(buf[0]); \ + for (i = 0; i < nsubs; i++) { \ + v = buf[i * n_sub_elem + 0]; \ + for (j = 1; j < n_sub_elem; j++) { \ + if (i * n_sub_elem + j == cnt) \ + break; \ + v <<= sizeof(buf[0]) * 8; \ + v |= buf[i * n_sub_elem + j]; \ + } \ + bus_space_write_4(sc->sc_bustag, 0, a, v); \ + a += CSR_OFFSET_BYTES; \ + } \ + } \ +} + +static inline void csr_rd_buf_uint8(struct sbusfpga_sdram_softc *sc, unsigned long a, uint8_t *buf, int cnt) +{ + _csr_rd_buf(sc, a, buf, cnt); +} + +static inline void csr_wr_buf_uint8(struct sbusfpga_sdram_softc *sc, unsigned long a, + const uint8_t *buf, int cnt) +{ + _csr_wr_buf(sc, a, buf, cnt); +} + +static inline void csr_rd_buf_uint16(struct sbusfpga_sdram_softc *sc, unsigned long a, uint16_t *buf, int cnt) +{ + _csr_rd_buf(sc, a, buf, cnt); +} + +static inline void csr_wr_buf_uint16(struct sbusfpga_sdram_softc *sc, unsigned long a, + const uint16_t *buf, int cnt) +{ + _csr_wr_buf(sc, a, buf, cnt); +} + +static inline void csr_rd_buf_uint32(struct sbusfpga_sdram_softc *sc, unsigned long a, uint32_t *buf, int cnt) +{ + _csr_rd_buf(sc, a, buf, cnt); +} + +static inline void csr_wr_buf_uint32(struct sbusfpga_sdram_softc *sc, unsigned long a, + const uint32_t *buf, int cnt) +{ + _csr_wr_buf(sc, a, buf, cnt); +} + +/* NOTE: the macros' "else" branch is unreachable, no need to be warned + * about a >= 64bit left shift! 
*/ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshift-count-overflow" +static inline void csr_rd_buf_uint64(struct sbusfpga_sdram_softc *sc, unsigned long a, uint64_t *buf, int cnt) +{ + _csr_rd_buf(sc, a, buf, cnt); +} + +static inline void csr_wr_buf_uint64(struct sbusfpga_sdram_softc *sc, unsigned long a, + const uint64_t *buf, int cnt) +{ + _csr_wr_buf(sc, a, buf, cnt); +} +#pragma GCC diagnostic pop + +/* sdram.c from liblitedram, preprocessed for our case, + sc */ + +static inline unsigned long +lfsr (unsigned long bits, unsigned long prev) +{ + static const unsigned long long lfsr_taps[] = { + 0x0L, + 0x0L, + 0x3L, + 0x6L, + 0xcL, + 0x14L, + 0x30L, + 0x60L, + 0xb8L, + 0x110L, + 0x240L, + 0x500L, + 0x829L, + 0x100dL, + 0x2015L, + 0x6000L, + 0xd008L, + 0x12000L, + 0x20400L, + 0x40023L, + 0x90000L, + 0x140000L, + 0x300000L, + 0x420000L, + 0xe10000L, + 0x1200000L, + 0x2000023L, + 0x4000013L, + 0x9000000L, + 0x14000000L, + 0x20000029L, + 0x48000000L, + 0x80200003L, + 0x100080000L, + 0x204000003L, + 0x500000000L, + 0x801000000L, + 0x100000001fL, + 0x2000000031L, + 0x4400000000L, + 0xa000140000L, + 0x12000000000L, + 0x300000c0000L, + 0x63000000000L, + 0xc0000030000L, + 0x1b0000000000L, + 0x300003000000L, + 0x420000000000L, + 0xc00000180000L, + 0x1008000000000L, + 0x3000000c00000L, + 0x6000c00000000L, + 0x9000000000000L, + 0x18003000000000L, + 0x30000000030000L, + 0x40000040000000L, + 0xc0000600000000L, + 0x102000000000000L, + 0x200004000000000L, + 0x600003000000000L, + 0xc00000000000000L, + 0x1800300000000000L, + 0x3000000000000030L, + 0x6000000000000000L, + 0x800000000000000dL + }; + unsigned long lsb = prev & 1; + prev >>= 1; + prev ^= (-lsb) & lfsr_taps[bits]; + return prev; +} + +__attribute__((unused)) + void + cdelay (int i) +{ + while (i > 0) { + __asm__ volatile (""); + i--; + } +} +#if 0 +int +sdram_get_databits (void) +{ + return 16; +} +int +sdram_get_freq (void) +{ + return 2 * 4 * 100000000; +} +int +sdram_get_cl (void) +{ + return 
6; +} +int +sdram_get_cwl (void) +{ + return 5; +} +#endif +static unsigned char +sdram_dfii_get_rdphase(struct sbusfpga_sdram_softc *sc) +{ + return ddrphy_rdphase_read(sc); +} +static unsigned char +sdram_dfii_get_wrphase(struct sbusfpga_sdram_softc *sc) +{ + return ddrphy_wrphase_read(sc); +} +static void +sdram_dfii_pix_address_write(struct sbusfpga_sdram_softc *sc, unsigned char phase, unsigned int value) +{ + switch (phase) { + case 3: + sdram_dfii_pi3_address_write(sc, value); + break; + case 2: + sdram_dfii_pi2_address_write(sc, value); + break; + case 1: + sdram_dfii_pi1_address_write(sc, value); + break; + default: + sdram_dfii_pi0_address_write(sc, value); + } +} +static void +sdram_dfii_pird_address_write(struct sbusfpga_sdram_softc *sc, unsigned int value) +{ + unsigned char rdphase = sdram_dfii_get_rdphase(sc); + sdram_dfii_pix_address_write(sc, rdphase, value); +} +static void +sdram_dfii_piwr_address_write(struct sbusfpga_sdram_softc *sc, unsigned int value) +{ + unsigned char wrphase = sdram_dfii_get_wrphase(sc); + sdram_dfii_pix_address_write(sc, wrphase, value); +} +static void +sdram_dfii_pix_baddress_write(struct sbusfpga_sdram_softc *sc, unsigned char phase, unsigned int value) +{ + switch (phase) { + case 3: + sdram_dfii_pi3_baddress_write(sc, value); + break; + case 2: + sdram_dfii_pi2_baddress_write(sc, value); + break; + case 1: + sdram_dfii_pi1_baddress_write(sc, value); + break; + default: + sdram_dfii_pi0_baddress_write(sc, value); + } +} +static void +sdram_dfii_pird_baddress_write(struct sbusfpga_sdram_softc *sc, unsigned int value) +{ + unsigned char rdphase = sdram_dfii_get_rdphase(sc); + sdram_dfii_pix_baddress_write(sc, rdphase, value); +} +static void +sdram_dfii_piwr_baddress_write(struct sbusfpga_sdram_softc *sc, unsigned int value) +{ + unsigned char wrphase = sdram_dfii_get_wrphase(sc); + sdram_dfii_pix_baddress_write(sc, wrphase, value); +} +static void +command_px(struct sbusfpga_sdram_softc *sc, unsigned char phase, 
unsigned int value) +{ + switch (phase) { + case 3: + command_p3(sc, value); + break; + case 2: + command_p2(sc, value); + break; + case 1: + command_p1(sc, value); + break; + default: + command_p0(sc, value); + } +} +static void +command_prd(struct sbusfpga_sdram_softc *sc, unsigned int value) +{ + unsigned char rdphase = sdram_dfii_get_rdphase(sc); + command_px(sc, rdphase, value); +} +static void +command_pwr (struct sbusfpga_sdram_softc *sc, unsigned int value) +{ + unsigned char wrphase = sdram_dfii_get_wrphase(sc); + command_px(sc, wrphase, value); +} +static void +sdram_software_control_on(struct sbusfpga_sdram_softc *sc) +{ + unsigned int previous; + previous = sdram_dfii_control_read(sc); + if (previous != (0x02 | 0x04 | 0x08)) { + sdram_dfii_control_write(sc, (0x02 | 0x04 | 0x08)); + aprint_normal_dev(sc->dk.sc_dev, "Switching SDRAM to software control.\n"); + } +} +static void +sdram_software_control_off(struct sbusfpga_sdram_softc *sc) +{ + unsigned int previous; + previous = sdram_dfii_control_read(sc); + if (previous != (0x01)) { + sdram_dfii_control_write(sc, (0x01)); + aprint_normal_dev(sc->dk.sc_dev, "Switching SDRAM to hardware control.\n"); + } +} +__attribute__((unused)) static void +sdram_mode_register_write(struct sbusfpga_sdram_softc *sc, char reg, int value) +{ + sdram_dfii_pi0_address_write(sc, value); + sdram_dfii_pi0_baddress_write(sc, reg); + command_p0(sc, 0x08 | 0x04 | 0x02 | 0x01); +} +typedef void (*delay_callback) (struct sbusfpga_sdram_softc *sc, int module); +static void +sdram_activate_test_row(struct sbusfpga_sdram_softc *sc) +{ + sdram_dfii_pi0_address_write(sc, 0); + sdram_dfii_pi0_baddress_write(sc, 0); + command_p0(sc, 0x08 | 0x01); + cdelay (15); +} +static void +sdram_precharge_test_row(struct sbusfpga_sdram_softc *sc) +{ + sdram_dfii_pi0_address_write(sc, 0); + sdram_dfii_pi0_baddress_write(sc, 0); + command_p0(sc, 0x08 | 0x02 | 0x01); + cdelay (15); +} +#if 0 +// available from kern.h +static unsigned int +popcount 
(unsigned int x) +{ + x -= ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x + (x >> 4)) & 0x0F0F0F0F; + x += (x >> 8); + x += (x >> 16); + return x & 0x0000003F; +} +#endif +static void +print_scan_errors (unsigned int errors) +{ + aprint_normal("%d", errors == 0); +} +static unsigned int +sdram_write_read_check_test_pattern (struct sbusfpga_sdram_softc *sc, int module, unsigned int seed) +{ + int p, i; + unsigned int errors; + unsigned int prv; + unsigned char tst[1 * 32 / 8]; + unsigned char prs[4][1 * 32 / 8]; + prv = seed; + for (p = 0; p < 4; p++) { + for (i = 0; i < 1 * 32 / 8; i++) { + prv = lfsr (32, prv); + prs[p][i] = prv; + } + } + sdram_activate_test_row(sc); + for (p = 0; p < 4; p++) + csr_wr_buf_uint8(sc, sc->sc_bhregs_sdram + (sdram_dfii_pix_wrdata_addr (p) - CSR_SDRAM_BASE), prs[p], 1 * 32 / 8); /* cleanme */ + sdram_dfii_piwr_address_write(sc, 0); + sdram_dfii_piwr_baddress_write(sc, 0); + command_pwr(sc, 0x04 | 0x02 | 0x01 | 0x10); + cdelay (15); + sdram_dfii_pird_address_write(sc, 0); + sdram_dfii_pird_baddress_write(sc, 0); + command_prd(sc, 0x04 | 0x01 | 0x20); + cdelay (15); + sdram_precharge_test_row(sc); + errors = 0; + for (p = 0; p < 4; p++) { + csr_rd_buf_uint8(sc, sc->sc_bhregs_sdram + (sdram_dfii_pix_rddata_addr (p) - CSR_SDRAM_BASE), tst, 1 * 32 / 8); /* cleanme */ + errors += + popcount (prs[p][16 / 8 - 1 - module] ^ tst[16 / 8 - 1 - module]); + errors += + popcount (prs[p][2 * 16 / 8 - 1 - module] ^ + tst[2 * 16 / 8 - 1 - module]); + } + return errors; +} +static void +sdram_leveling_center_module (struct sbusfpga_sdram_softc *sc, int module, int show_short, int show_long, + delay_callback rst_delay, + delay_callback inc_delay) +{ + int i; + int show; + int working; + unsigned int errors; + int delay, delay_mid, delay_range; + int delay_min = -1, delay_max = -1; + if (show_long) + aprint_normal_dev(sc->dk.sc_dev, "m%d: |", module); + delay = 0; + rst_delay(sc, module); + while (1) { + errors = 
sdram_write_read_check_test_pattern(sc, module, 42); + errors += sdram_write_read_check_test_pattern(sc, module, 84); + working = errors == 0; + show = show_long; + if (show) + print_scan_errors(errors); + if (working && delay_min < 0) { + delay_min = delay; + break; + } + delay++; + if (delay >= 32) + break; + inc_delay(sc, module); + } + delay++; + inc_delay(sc, module); + while (1) { + errors = sdram_write_read_check_test_pattern(sc, module, 42); + errors += sdram_write_read_check_test_pattern(sc, module, 84); + working = errors == 0; + show = show_long; + if (show) + print_scan_errors(errors); + if (!working && delay_max < 0) { + delay_max = delay; + } + delay++; + if (delay >= 32) + break; + inc_delay(sc, module); + } + if (delay_max < 0) { + delay_max = delay; + } + if (show_long) + aprint_normal_dev(sc->dk.sc_dev, "| "); + delay_mid = (delay_min + delay_max) / 2 % 32; + delay_range = (delay_max - delay_min) / 2; + if (show_short) { + if (delay_min < 0) + aprint_normal("delays: -"); + else + aprint_normal("delays: %02d+-%02d", delay_mid, delay_range); + } + if (show_long) + aprint_normal("\n"); + rst_delay(sc, module); + cdelay (100); + for (i = 0; i < delay_mid; i++) { + inc_delay(sc, module); + cdelay (100); + } +} +int _sdram_tck_taps; +int _sdram_write_leveling_bitslips[16]; +static void +sdram_read_leveling_rst_delay (struct sbusfpga_sdram_softc *sc, int module) +{ + ddrphy_dly_sel_write(sc, 1 << module); + ddrphy_rdly_dq_rst_write(sc, 1); + ddrphy_dly_sel_write(sc, 0); +} +static void +sdram_read_leveling_inc_delay (struct sbusfpga_sdram_softc *sc, int module) +{ + ddrphy_dly_sel_write(sc, 1 << module); + ddrphy_rdly_dq_inc_write(sc, 1); + ddrphy_dly_sel_write(sc, 0); +} +static void +sdram_read_leveling_rst_bitslip (struct sbusfpga_sdram_softc *sc, char m) +{ + ddrphy_dly_sel_write(sc, 1 << m); + ddrphy_rdly_dq_bitslip_rst_write(sc, 1); + ddrphy_dly_sel_write(sc, 0); +} +static void +sdram_read_leveling_inc_bitslip (struct sbusfpga_sdram_softc *sc, 
char m) +{ + ddrphy_dly_sel_write(sc, 1 << m); + ddrphy_rdly_dq_bitslip_write(sc, 1); + ddrphy_dly_sel_write(sc, 0); +} +static unsigned int +sdram_read_leveling_scan_module (struct sbusfpga_sdram_softc *sc, int module, int bitslip, int show) +{ + const unsigned int max_errors = 2 * (4 * 2 * 32); + int i; + unsigned int score; + unsigned int errors; + score = 0; + if (show) + aprint_normal_dev(sc->dk.sc_dev, " m%d, b%02d: |", module, bitslip); + sdram_read_leveling_rst_delay(sc, module); + for (i = 0; i < 32; i++) { + int working; + int _show = show; + errors = sdram_write_read_check_test_pattern(sc, module, 42); + errors += sdram_write_read_check_test_pattern(sc, module, 84); + working = errors == 0; + score += (working * max_errors * 32) + (max_errors - errors); + if (_show) { + print_scan_errors(errors); + } + sdram_read_leveling_inc_delay(sc, module); + } + if (show) + aprint_normal("| "); + return score; +} +static void +sdram_read_leveling(struct sbusfpga_sdram_softc *sc) +{ + int module; + int bitslip; + unsigned int score; + unsigned int best_score; + int best_bitslip; + for (module = 0; module < 16 / 8; module++) { + best_score = 0; + best_bitslip = 0; + sdram_read_leveling_rst_bitslip(sc, module); + for (bitslip = 0; bitslip < 8; bitslip++) { + score = sdram_read_leveling_scan_module(sc, module, bitslip, 1); + sdram_leveling_center_module(sc, module, 1, 0, + sdram_read_leveling_rst_delay, + sdram_read_leveling_inc_delay); + aprint_normal("\n"); + if (score > best_score) { + best_bitslip = bitslip; + best_score = score; + } + if (bitslip == 8 - 1) + break; + sdram_read_leveling_inc_bitslip(sc, module); + } + aprint_normal_dev(sc->dk.sc_dev, " best: m%d, b%02d ", module, best_bitslip); + sdram_read_leveling_rst_bitslip(sc, module); + for (bitslip = 0; bitslip < best_bitslip; bitslip++) + sdram_read_leveling_inc_bitslip(sc, module); + sdram_leveling_center_module(sc, module, 1, 0, + sdram_read_leveling_rst_delay, + sdram_read_leveling_inc_delay); + 
aprint_normal("\n"); + } +} +static void +sdram_write_latency_calibration(struct sbusfpga_sdram_softc *sc) +{ + int i; + int module; + int bitslip; + unsigned int score; + unsigned int subscore; + unsigned int best_score; + int best_bitslip; + for (module = 0; module < 16 / 8; module++) { + best_score = 0; + best_bitslip = -1; + for (bitslip = 0; bitslip < 8; bitslip += 2) { + score = 0; + ddrphy_dly_sel_write(sc, 1 << module); + ddrphy_wdly_dq_bitslip_rst_write(sc, 1); + for (i = 0; i < bitslip; i++) { + ddrphy_wdly_dq_bitslip_write(sc, 1); + } + ddrphy_dly_sel_write(sc, 0); + score = 0; + sdram_read_leveling_rst_bitslip(sc, module); + for (i = 0; i < 8; i++) { + subscore = sdram_read_leveling_scan_module(sc, module, i, 0); + score = subscore > score ? subscore : score; + sdram_read_leveling_inc_bitslip(sc, module); + } + if (score > best_score) { + best_bitslip = bitslip; + best_score = score; + } + } + if (_sdram_write_leveling_bitslips[module] < 0) + bitslip = best_bitslip; + else + bitslip = _sdram_write_leveling_bitslips[module]; + if (bitslip == -1) + aprint_normal_dev(sc->dk.sc_dev, "m%d:- ", module); + else + aprint_normal_dev(sc->dk.sc_dev, "m%d:%d ", module, bitslip); + ddrphy_dly_sel_write(sc, 1 << module); + ddrphy_wdly_dq_bitslip_rst_write(sc, 1); + for (i = 0; i < bitslip; i++) { + ddrphy_wdly_dq_bitslip_write(sc, 1); + } + ddrphy_dly_sel_write(sc, 0); + } + aprint_normal("\n"); +} +static int +sdram_leveling(struct sbusfpga_sdram_softc *sc) +{ + int module; + sdram_software_control_on(sc); + for (module = 0; module < 16 / 8; module++) { + sdram_read_leveling_rst_delay(sc, module); + sdram_read_leveling_rst_bitslip(sc, module); + } + aprint_normal_dev(sc->dk.sc_dev, "Write latency calibration:\n"); + sdram_write_latency_calibration(sc); + aprint_normal_dev(sc->dk.sc_dev, "Read leveling:\n"); + sdram_read_leveling(sc); + sdram_software_control_off(sc); + return 1; +} +int +sdram_init(struct sbusfpga_sdram_softc *sc) +{ + ddrphy_rdphase_write(sc, 2); + 
ddrphy_wrphase_write(sc, 3); + aprint_normal_dev(sc->dk.sc_dev, "Initializing SDRAM @0x%08lx...\n", 0x80000000L); + sdram_software_control_on(sc); + ddrphy_rst_write(sc, 1); + cdelay (1000); + ddrphy_rst_write(sc, 0); + cdelay (1000); + init_sequence(sc); + sdram_leveling(sc); + sdram_software_control_off(sc); +#if 0 + if (!memtest ((unsigned int *) 0x80000000L, (2 * 1024 * 1024))) { + return 0; + } + memspeed ((unsigned int *) 0x80000000L, (2 * 1024 * 1024), 0); +#endif + return 1; +} diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_sdram.h b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_sdram.h new file mode 100644 index 0000000..2539b0d --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_sdram.h @@ -0,0 +1,62 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 Romain Dolbeau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SBUSFPGA_SDRAM_H_ +#define _SBUSFPGA_SDRAM_H_ + +struct sbusfpga_sdram_softc { + struct dk_softc dk; + /* device_t sc_dev; */ /* us as a device */ /* in dk */ + u_int sc_rev; /* revision */ + int sc_node; /* PROM node ID */ + int sc_burst; /* DVMA burst size in effect */ + bus_space_tag_t sc_bustag; /* bus tag */ + bus_space_handle_t sc_bhregs_ddrphy; /* bus handle */ + bus_space_handle_t sc_bhregs_sdram; /* bus handle */ + bus_space_handle_t sc_bhregs_exchange_with_mem; /* bus handle */ + bus_space_handle_t sc_bhregs_mmap; /* bus handle */ + int sc_bufsiz_ddrphy; /* Size of buffer */ + int sc_bufsiz_sdram; /* Size of buffer */ + int sc_bufsiz_exchange_with_mem; /* Size of buffer */ + int sc_bufsiz_mmap; /* Size of buffer */ + /* specific to the DMA engine */ + u_int dma_blk_size; + u_int dma_blk_base; + u_int dma_mem_size; /* in blk_size */ + u_int dma_real_mem_size; /* precomputed in bytes */ + /* DMA kernel structures */ + bus_dma_tag_t sc_dmatag; + bus_dmamap_t sc_dmamap; + bus_dma_segment_t sc_segs; + int sc_rsegs; + void * sc_dma_kva; +}; + +#define SBUSFPGA_SDRAM_VAL_DMA_MAX_SZ (64*1024) + +#endif /* _SBUSFPGA_SDRAM_H_ */ diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_stat.c b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_stat.c new file mode 100644 index 0000000..c63dd02 --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_stat.c @@ -0,0 +1,257 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 Romain Dolbeau + *
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +int sbusfpga_stat_print(void *, const char *); +int sbusfpga_stat_match(device_t, cfdata_t, void *); +void sbusfpga_stat_attach(device_t, device_t, void *); + +CFATTACH_DECL_NEW(sbusfpga_stat, sizeof(struct sbusfpga_sbus_bus_stat_softc), + sbusfpga_stat_match, sbusfpga_stat_attach, NULL, NULL); + +dev_type_open(sbusfpga_stat_open); +dev_type_close(sbusfpga_stat_close); +dev_type_ioctl(sbusfpga_stat_ioctl); + + +const struct cdevsw sbusfpga_stat_cdevsw = { + .d_open = sbusfpga_stat_open, + .d_close = sbusfpga_stat_close, + .d_read = noread, + .d_write = nowrite, + .d_ioctl = sbusfpga_stat_ioctl, + .d_stop = nostop, + .d_tty = notty, + .d_poll = nopoll, + .d_mmap = nommap, + .d_kqfilter = nokqfilter, + .d_discard = nodiscard, + .d_flag = 0 +}; + +extern struct cfdriver sbusfpga_stat_cd; +int +sbusfpga_stat_open(dev_t dev, int flags, int mode, struct lwp *l) +{ + return (0); +} + +int +sbusfpga_stat_close(dev_t dev, int flags, int mode, struct lwp *l) +{ + return (0); +} + +int +sbusfpga_stat_print(void *aux, const char *busname) +{ + + sbus_print(aux, busname); + return (UNCONF); +} + +int +sbusfpga_stat_match(device_t parent, cfdata_t cf, void *aux) +{ + struct sbus_attach_args *sa = (struct sbus_attach_args *)aux; + + return (strcmp("RDOL,sbusstat", sa->sa_name) == 0); +} + +#define CONFIG_CSR_DATA_WIDTH 32 +// define CSR_LEDS_BASE & others to avoid defining the CSRs of HW we don't handle +#define CSR_LEDS_BASE +#define CSR_CURVE25519ENGINE_BASE +#define CSR_DDRPHY_BASE +#define CSR_EXCHANGE_WITH_MEM_BASE +// #define CSR_SBUS_BUS_STAT_BASE +#define CSR_SDRAM_BASE +#define CSR_SDBLOCK2MEM_BASE +#define CSR_SDCORE_BASE +#define CSR_SDIRQ_BASE +#define CSR_SDMEM2BLOCK_BASE +#define CSR_SDPHY_BASE +#define CSR_TRNG_BASE +#include "dev/sbus/litex_csr.h" +#undef 
CSR_LEDS_BASE +#undef CSR_CURVE25519ENGINE_BASE +#undef CSR_DDRPHY_BASE +#undef CSR_EXCHANGE_WITH_MEM_BASE +// #undef CSR_SBUS_BUS_STAT_BASE +#undef CSR_SDRAM_BASE +#undef CSR_SDBLOCK2MEM_BASE +#undef CSR_SDCORE_BASE +#undef CSR_SDIRQ_BASE +#undef CSR_SDMEM2BLOCK_BASE +#undef CSR_SDPHY_BASE +//#undef CSR_TRNG_BASE + + +static void sbusfpga_stat_display(void *); + +/* + * Attach all the sub-devices we can find + */ +void +sbusfpga_stat_attach(device_t parent, device_t self, void *aux) +{ + struct sbus_attach_args *sa = aux; + struct sbusfpga_sbus_bus_stat_softc *sc = device_private(self); + struct sbus_softc *sbsc = device_private(parent); + int node; + int sbusburst; + + sc->sc_bustag = sa->sa_bustag; + sc->sc_dev = self; + + if (sbus_bus_map(sc->sc_bustag, sa->sa_slot, sa->sa_offset, sa->sa_size, + BUS_SPACE_MAP_LINEAR, &sc->sc_bhregs_sbus_bus_stat) != 0) { + aprint_error(": cannot map registers\n"); + return; + } + + sc->sc_bufsiz = sa->sa_size; + + node = sc->sc_node = sa->sa_node; + + /* + * Get transfer burst size from PROM + */ + sbusburst = sbsc->sc_burst; + if (sbusburst == 0) + sbusburst = SBUS_BURST_32 - 1; /* 1->16 */ + + sc->sc_burst = prom_getpropint(node, "burst-sizes", -1); + if (sc->sc_burst == -1) + /* take SBus burst sizes */ + sc->sc_burst = sbusburst; + + /* Clamp at parent's burst sizes */ + sc->sc_burst &= sbusburst; + + aprint_normal("\n"); + aprint_normal_dev(self, "nid 0x%x, bustag %p, burst 0x%x (parent 0x%0x)\n", + sc->sc_node, + sc->sc_bustag, + sc->sc_burst, + sbsc->sc_burst); + + sc->sc_delay = 5 * hz; // five seconds + + callout_init(&sc->sc_display, CALLOUT_MPSAFE); + callout_setfunc(&sc->sc_display, sbusfpga_stat_display, sc); + /* disable by default */ + sc->sc_enable = 0; + /* do it once during boot*/ + callout_schedule(&sc->sc_display, sc->sc_delay); +} + +#define SBUSFPGA_STAT_ON _IO(0, 1) +#define SBUSFPGA_STAT_OFF _IO(0, 0) + +int +sbusfpga_stat_ioctl (dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) +{ + struct 
sbusfpga_sbus_bus_stat_softc *sc = device_lookup_private(&sbusfpga_stat_cd, minor(dev)); + int err = 0; + if (sc == NULL) return ENXIO; /* no unit attached for this minor; sbusfpga_stat_open() does not check, so guard here */ + switch (cmd) { + case SBUSFPGA_STAT_ON: /* start the periodic display unless already enabled */ + if (!sc->sc_enable) { + sc->sc_enable = 1; + callout_schedule(&sc->sc_display, sc->sc_delay); + } + break; + case SBUSFPGA_STAT_OFF: /* cancel the pending callout; the display only re-arms itself while sc_enable is set */ + if (sc->sc_enable) { + callout_stop(&sc->sc_display); + sc->sc_enable = 0; + } + break; + default: /* unknown command */ + err = ENOTTY; + break; + } + + return err; +} + +static void sbusfpga_stat_display(void *args) { + struct sbusfpga_sbus_bus_stat_softc *sc = args; + unsigned int c = sbus_bus_stat_stat_cycle_counter_read(sc), c2; + int count; + sbus_bus_stat_stat_ctrl_write(sc, 1); + delay(1); + count = 0; + while (count < 10 && ((c2 = sbus_bus_stat_stat_cycle_counter_read(sc)) == c)) { + count ++; + delay(1); + } + if ((c2 == c) || (c2 == 0)){ + device_printf(sc->sc_dev, "Statistics didn't update\n"); + } else { + device_printf(sc->sc_dev, "%u: slave %u %u %u %u\n", + c2, + sbus_bus_stat_stat_slave_start_counter_read(sc), + sbus_bus_stat_stat_slave_done_counter_read(sc), + sbus_bus_stat_stat_slave_rerun_counter_read(sc), + sbus_bus_stat_stat_slave_early_error_counter_read(sc)); + device_printf(sc->sc_dev, "%u: master %u %u %u %u (0x%08x)\n", + c2, + sbus_bus_stat_stat_master_start_counter_read(sc), + sbus_bus_stat_stat_master_done_counter_read(sc), + sbus_bus_stat_stat_master_error_counter_read(sc), + sbus_bus_stat_stat_master_rerun_counter_read(sc), + sbus_bus_stat_sbus_master_error_virtual_read(sc)); + } + sbus_bus_stat_stat_ctrl_write(sc, 0); + if (sc->sc_enable) + callout_schedule(&sc->sc_display, sc->sc_delay); +} diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_stat.h b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_stat.h new file mode 100644 index 0000000..1a6699f --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_stat.h @@ -0,0 +1,45 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 Romain Dolbeau + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SBUSFPGA_STAT_H_ +#define _SBUSFPGA_STAT_H_ + +struct sbusfpga_sbus_bus_stat_softc { + device_t sc_dev; /* us as a device */ + u_int sc_rev; /* revision */ + int sc_node; /* PROM node ID */ + int sc_burst; /* DVMA burst size in effect */ + bus_space_tag_t sc_bustag; /* bus tag */ + bus_space_handle_t sc_bhregs_sbus_bus_stat; /* bus handle */ + int sc_bufsiz; /* Size of buffer */ + callout_t sc_display; + int sc_delay; + int sc_enable; +}; + +#endif /* _SBUSFPGA_STAT_H_ */ diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_trng.c b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_trng.c new file mode 100644 index 0000000..96b7c85 --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_trng.c @@ -0,0 +1,213 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 Romain Dolbeau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +int sbusfpga_trng_print(void *, const char *); +int sbusfpga_trng_match(device_t, cfdata_t, void *); +void sbusfpga_trng_attach(device_t, device_t, void *); + +CFATTACH_DECL_NEW(sbusfpga_trng, sizeof(struct sbusfpga_trng_softc), + sbusfpga_trng_match, sbusfpga_trng_attach, NULL, NULL); + +dev_type_open(sbusfpga_trng_open); +dev_type_close(sbusfpga_trng_close); +dev_type_ioctl(sbusfpga_trng_ioctl); + + + +const struct cdevsw sbusfpga_trng_cdevsw = { + .d_open = sbusfpga_trng_open, + .d_close = sbusfpga_trng_close, + .d_read = noread, + .d_write = nowrite, + .d_ioctl = noioctl, + .d_stop = nostop, + .d_tty = notty, + .d_poll = nopoll, + .d_mmap = nommap, + .d_kqfilter = nokqfilter, + .d_discard = nodiscard, + .d_flag = 0 +}; + +extern struct cfdriver sbusfpga_trng_cd; +int +sbusfpga_trng_open(dev_t dev, int flags, int mode, struct lwp *l) +{ + return (0); +} + +int +sbusfpga_trng_close(dev_t dev, int flags, int mode, struct lwp *l) +{ + return (0); +} + +int +sbusfpga_trng_print(void *aux, const char *busname) +{ + + sbus_print(aux, busname); + return (UNCONF); +} + +int +sbusfpga_trng_match(device_t parent, cfdata_t cf, void *aux) +{ + struct sbus_attach_args *sa = (struct sbus_attach_args *)aux; + + return (strcmp("RDOL,neorv32trng", 
sa->sa_name) == 0); +} + +#define CONFIG_CSR_DATA_WIDTH 32 +// define CSR_LEDS_BASE & others to avoid defining the CSRs of HW we don't handle +#define CSR_LEDS_BASE +#define CSR_CURVE25519ENGINE_BASE +#define CSR_DDRPHY_BASE +#define CSR_EXCHANGE_WITH_MEM_BASE +#define CSR_SBUS_BUS_STAT_BASE +#define CSR_SDRAM_BASE +#define CSR_SDBLOCK2MEM_BASE +#define CSR_SDCORE_BASE +#define CSR_SDIRQ_BASE +#define CSR_SDMEM2BLOCK_BASE +#define CSR_SDPHY_BASE +//#define CSR_TRNG_BASE +#include "dev/sbus/litex_csr.h" +#undef CSR_LEDS_BASE +#undef CSR_CURVE25519ENGINE_BASE +#undef CSR_DDRPHY_BASE +#undef CSR_EXCHANGE_WITH_MEM_BASE +#undef CSR_SBUS_BUS_STAT_BASE +#undef CSR_SDRAM_BASE +#undef CSR_SDBLOCK2MEM_BASE +#undef CSR_SDCORE_BASE +#undef CSR_SDIRQ_BASE +#undef CSR_SDMEM2BLOCK_BASE +#undef CSR_SDPHY_BASE +//#undef CSR_TRNG_BASE + +static void +sbusfpga_trng_getentropy(size_t nbytes, void *cookie) { /* rndsource callback: feed TRNG words to the entropy pool until nbytes are gathered or reads keep coming back empty */ + struct sbusfpga_trng_softc *sc = cookie; + size_t dbytes = 0; + int failure = 0; + while (nbytes > dbytes) { + u_int32_t data = trng_data_read(sc); + if (data) { /* a zero word is treated as "nothing available yet" */ + rnd_add_data_sync(&sc->sc_rndsource, &data, 4, 32); // 32 is perhaps optimistic + dbytes += 4; + } else { + failure ++; + if (failure > (1+(dbytes/4))) { // empty reads outnumber words gathered (plus one): give up + device_printf(sc->sc_dev, "out of entropy after %zu / %zu bytes\n", dbytes, nbytes); + return; + } + delay(1); + } + if (((dbytes%32)==0) && (nbytes > dbytes)) + delay(1); // let the hardware breathe if the OS needs a lot of bytes + } + device_printf(sc->sc_dev, "gathered %zu bytes [%d]\n", dbytes, failure); +} + +/* + * Map the registers, start the TRNG and register it as an entropy source + */ +void +sbusfpga_trng_attach(device_t parent, device_t self, void *aux) +{ + struct sbus_attach_args *sa = aux; + struct sbusfpga_trng_softc *sc = device_private(self); + struct sbus_softc *sbsc = device_private(parent); + int node; + int sbusburst; + + sc->sc_bustag = sa->sa_bustag; + sc->sc_dev = self; + + if (sbus_bus_map(sc->sc_bustag, sa->sa_slot, sa->sa_offset, sa->sa_size,
BUS_SPACE_MAP_LINEAR, &sc->sc_bhregs_trng) != 0) { + aprint_error(": cannot map registers\n"); + return; + } + + //sc->sc_buffer = bus_space_vaddr(sc->sc_bustag, sc->sc_bhregs_trng); + sc->sc_bufsiz = sa->sa_size; + + node = sc->sc_node = sa->sa_node; + + /* + * Get transfer burst size from PROM + */ + sbusburst = sbsc->sc_burst; + if (sbusburst == 0) + sbusburst = SBUS_BURST_32 - 1; /* 1->16 */ + + sc->sc_burst = prom_getpropint(node, "burst-sizes", -1); + if (sc->sc_burst == -1) + /* take SBus burst sizes */ + sc->sc_burst = sbusburst; + + /* Clamp at parent's burst sizes */ + sc->sc_burst &= sbusburst; + + aprint_normal("\n"); + aprint_normal_dev(self, "nid 0x%x, bustag %p, burst 0x%x (parent 0x%0x)\n", + sc->sc_node, + sc->sc_bustag, + sc->sc_burst, + sbsc->sc_burst); + + trng_ctrl_write(sc, 0x02); // start the TRNG + + rndsource_setcb(&sc->sc_rndsource, sbusfpga_trng_getentropy, sc); + rnd_attach_source(&sc->sc_rndsource, device_xname(self), RND_TYPE_RNG, RND_FLAG_HASCB | RND_FLAG_COLLECT_VALUE); +} diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_trng.h b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_trng.h new file mode 100644 index 0000000..86557de --- /dev/null +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_trng.h @@ -0,0 +1,43 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 Romain Dolbeau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SBUSFPGA_TRNG_H_ +#define _SBUSFPGA_TRNG_H_ + +struct sbusfpga_trng_softc { + device_t sc_dev; /* us as a device */ + u_int sc_rev; /* revision */ + int sc_node; /* PROM node ID */ + int sc_burst; /* DVMA burst size in effect */ + bus_space_tag_t sc_bustag; /* bus tag */ + bus_space_handle_t sc_bhregs_trng; /* bus handle */ + int sc_bufsiz; /* Size of buffer */ + struct krndsource sc_rndsource; +}; + +#endif /* _SBUSFPGA_TRNG_H_ */ diff --git a/README.md b/README.md index 8fd0df9..a969c94 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Goal -The goal of this repository is to be able to interface a modern (2020 era) [FPGA](https://en.wikipedia.org/wiki/Field-programmable_gate_array) with a [SBus](https://en.wikipedia.org/wiki/SBus) host. SBus was widely used in SPARCstation and compatibles system in the first halt of the 90s. It was progressively displaced by PCI from the mid-90s onward, and is thoroughly obsolete. +The goal of this repository is to be able to interface a modern (2020 era) [FPGA](https://en.wikipedia.org/wiki/Field-programmable_gate_array) with a [SBus](https://en.wikipedia.org/wiki/SBus) host. SBus was widely used in SPARCstation and compatibles system in the first half of the 90s. 
It was progressively displaced by PCI from the mid-90s onward, and is thoroughly obsolete. So unless you're a retrocomputing enthusiast with such a machine, this is useless. To be honest, even if you are such an enthusiast, it's probably not that useful... @@ -12,25 +12,50 @@ To save on PCB cost, the board is smaller than a 'true' SBus board; the hardware ## Current status -2021-03-21: The adapter board seems to work fine in two different SS20. Currently the embedded PROM code exposes three devices in the FPGA: +2021-07-18: The old VHDL gateware has been replaced by a new Migen-based gateware, see below for details. -* "RDOL,cryptoengine": exposes a (way too large) polynomial multiplier to implement GCM mode and a AES block. Currently used to implement DMA-based acceleration of AES-256-CBC through /dev/crypto. Unfortunately OpenSSL doesn't support AES-256-GCM in the cryptodev engine, and disagree with NetBSD's /dev/crypto on how to implement AES-256-CTR. And the default SSH cannot use cryptodev, it closes all file descriptors after cryptodev has opened /dev/crypto... still WiP. - -* "RDOL,trng": exposes a 5 MHz counter (didn't realize the SS20 already had a good counter) and a so-far-not-true TRNG (implemented by a PRNG). The 'true' random generators I've found make Vivado screams very loudly when synthesizing... anyway both works fine in NetBSD 9.0 as a timecounter and an entropy source (which a PRNG really isn't, I know). still WiP. - -* "RDOL,sdcard": trying to expose the micro-sd card slot as a storage device, at first using SPI mode. So far reading seems to work, and NetBSD can see a Sun disklabel on the micro-sd card if it has been partitioned that way. Mounting a FAT filesystem read-only now works (with very little testing as of yet). Writing not working yet. Very much WiP. +2021-08-22: Short version: the board enables a 256 MiB SDRAM disk (for fast swapping), a TRNG, a USB OHCI host controller (for USB peripherals) and a Curve25519 accelerator. 
## The hardware Directory 'sbus-to-ztex' -The custom board is a SBus-compliant (I hope...) board, designed to receive a [ZTex USB-FPGA Module 2.13](https://www.ztex.de/usb-fpga-2/usb-fpga-2.13.e.html) as a daughterboard. The ZTex module contains the actual FPGA (Artix-7), some RAM, programming hardware, etc. The SBus board contains level-shifters ICs to interface between the SBus signals and the FPGA, a serial header, some Leds, a JTAG header, and a micro-sd card slot. +The custom board is a SBus-compliant (I hope...) board, designed to receive a [ZTex USB-FPGA Module 2.13](https://www.ztex.de/usb-fpga-2/usb-fpga-2.13.e.html) as a daughterboard. The ZTex module contains the actual FPGA (Artix-7), some RAM, programming hardware, etc. The SBus board contains level-shifters ICs to interface between the SBus signals and the FPGA, a serial header, some Leds, a JTAG header, and a micro-sd card slot. It only connects interrupt lines 7 (highest priority) and 1 (lowest priority), which was a mistake (more interrupts are needed and 7 is too high-priority to use at this stage, so only level 1 is usable), but otherwise supports every SBus feature except the optional parity (i.e. it can do both slave and master modes). The PCB was designed with Kicad 5.0 -## The gateware +## The gateware (Migen) -Directory 'sbus-to-ztex-gateware' +### Intro + +The gateware was rewritten from scratch in the Migen language, chosen because that's what [Litex](https://github.com/enjoy-digital/litex/) uses. +It implements a simple CPU-less Litex SoC built around a Wishbone bus, with a bridge between the SBus and the Wishbone.
+ +A ROM, an SDRAM controller ([litedram](https://github.com/enjoy-digital/litedram) to the on-board DDR3), a TRNG (using the [NeoRV32](https://github.com/stnolting/neorv32) TRNG), a USB OHCI (host controller, using the Litex wrapper around the [SpinalHDL](https://github.com/SpinalHDL/SpinalHDL) implementation) and a Curve25519 Crypto Engine (taken from the [Betrusted.IO](https://betrusted.io/) project) are connected to that bus. + +### Details + +Master accesses to the SBus by the host are routed to the Wishbone to access the various CSRs / control registers of the devices. + +The ROM doesn't do much beyond exposing the devices' existence and specifications to the host. + +The SDRAM has its own custom DMA controller, using native Litedram DMA to the memory, and some FIFOs to/from the SBus. A custom NetBSD driver exposes it as a drive on which you can swap. It's also usable as a 'fast', volatile disk (for e.g. /tmp or similar temporary filesystem). It could use an interrupt line, but the only usable one in the current HW design is in use by the USB. + +The TRNG has a NetBSD driver to add entropy to the entropy pool. + +The USB OHCI DMA is bridged from the Wishbone to the SBus by passing the physical addresses of the Wishbone (that match the virtual addresses from NetBSD DVMA allocations) to the bridge. Reads are buffered in blocks of 16 bytes; currently writes are unbuffered (and somewhat slow, as they need a full SBus master cycle for every transaction of 32 bits or less). The standard NetBSD OHCI driver is used, with just a small custom SBus-OHCI driver mirroring the PCI-OHCI one. It uses the interrupt level 1 available on the board. As the board has no USB connectors, the D+ and D- lines are routed to the Serial header pins, those (and GND) are connected to a pair of pins of [Dolu1990's USB PMod](https://github.com/Dolu1990/pmod_usb_host_x4), and the associated USB port is connected to an external self-powered USB hub (which is the one supplying the VBus).
It's quite ugly but it works (of course I should redesign the PCB with a proper USB connector and a VBus). + +The Curve25519 Engine currently exposes an IOCTL to do the computation, which has yet to be integrated usefully in e.g. OpenSSL. It could use an interrupt line, but the only usable one in the current HW design is in use by the USB. + +### Special Notes + +Currently the design uses a Wishbone Crossbar Interconnect from Litex instead of a Shared Interconnect, as for some reason using a Shared Interconnect causes issues between devices (disabling the USB OHCI also seems to solve the issue, as it generates a lot of cycles on the buses). I might be misusing Wishbone. With the Crossbar, all devices are usable simultaneously. + +As not everything lives in the same clock domain, the design also uses a Wishbone CDC, a wrapper around the one from [Verilog Wishbone Components](https://github.com/alexforencich/verilog-wishbone). + +## The gateware (VHDL, obsolete) + +Directory 'sbus-to-ztex-gateware', this is obsolete and replaced by the Migen gateware above. The function embedded in the FPGA currently includes the PROM, lighting Led to display a 32-bits value, and a GHASH MAC (128 polynomial accumulator, used for the AES-GCM encryption scheme). The device is a fairly basic scale, but should be able to read from the PROM and read/write from the GCM space with any kind of SBus burst (1, 2, 4, 8 or 16 words). @@ -40,5 +65,5 @@ The gateware is currently synthesized with Vivado 2020.1 Directory 'NetBSD' -Some basic drivers for NetBSD 9.0/sparc to enable the deviced as described above. +Some basic drivers for NetBSD 9.0/sparc to enable the devices as described above.
diff --git a/sbus-to-ztex-gateware-migen/engine.py b/sbus-to-ztex-gateware-migen/engine.py new file mode 100644 index 0000000..d18d232 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/engine.py @@ -0,0 +1,2530 @@ +from migen import * +from migen.genlib.cdc import MultiReg + +from litex.soc.interconnect.csr import * +from litex.soc.integration.doc import AutoDoc, ModuleDoc +from litex.soc.interconnect import wishbone +from litex.soc.interconnect.csr_eventmanager import * + +prime_string = "$2^{{255}}-19$" # 2\ :sup:`255`-19 +field_latex = "$\mathbf{{F}}_{{{{2^{{255}}}}-19}}$" + +opcode_bits = 6 # number of bits used to encode the opcode field +opcodes = { # mnemonic : [bit coding, docstring] + "UDF" : [-1, "Placeholder for undefined opcodes"], + "PSA" : [0, "Wd $\gets$ Ra // pass A"], + "PSB" : [1, "Wd $\gets$ Rb // pass B"], # Is that needed ??? + "MSK" : [2, "Wd $\gets$ Replicate(Ra[0], 256) & Rb // for doing cswap()"], + "XOR" : [3, "Wd $\gets$ Ra ^ Rb // bitwise XOR"], + "NOT" : [4, "Wd $\gets$ ~Ra // binary invert"], + "ADD" : [5, "Wd $\gets$ Ra + Rb // 256-bit binary add, must be followed by TRD,SUB"], + "SUB" : [6, "Wd $\gets$ Ra - Rb // 256-bit binary subtraction, this is not the same as a subtraction in the finite field"], + "MUL" : [7, f"Wd $\gets$ Ra * Rb // multiplication in {field_latex} - result is reduced"], + "TRD" : [8, "If Ra $\geqq 2^{{255}}-19$ then Wd $\gets$ $2^{{255}}-19$, else Wd $\gets$ 0 // Test reduce"], + "BRZ" : [9, "If Ra == 0 then mpc[9:0] $\gets$ mpc[9:0] + immediate[9:0] + 1, else mpc $\gets$ mpc + 1 // Branch if zero"], + "FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"], + "SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"], + "XBT" : [12, "Wd[0] $\gets$ Ra[254] // extract the 255th bit of Ra and put it into the 0th bit of Wd"], + "AND" : [20, "Wd $\gets$ Ra & Rb // bitwise AND"], + # for CLMUL, bit #31 indicates both lanes are needed; currently same speed + 
"CLMUL": [13, "carry-less multiplication; reg-reg only; per 128-bits block"], # basically 256-bits form of vpclmulqdq + "GCM_SHLMI": [14, "Shift A left by imm, insert B MSB as dest LSB; reg-reg or reg-imm; per 128-bits block"], # make SHL redundant: SHL %rd, %ra == GCM_SHLMI %rd, %ra, #0, #1 + "GCM_SHRMI": [15, "Shift A right by imm, insert B LSB as dest MSB; reg-reg or reg-imm; per 128-bits block"], # + "GCM_CMPD": [16, "Compute D:X0 from X1:X0; reg ; per 128-bits block"], # specific + "GCM_SWAP64": [17, "Swap doubleword (64 bits) ; reg-reg or imm-reg or reg-imm; per 128-bits block ; imm != 0 -> BYTEREV*"], # + # for AESESMI, bit #31 indicates both lanes are needed; currently same speed + "AESESMI" : [18, "AES ; reg-reg ; per 128-bits block; imm[0] is 1 for aesesi (shared opcode)" ], + # for MEM, bit #31 indicates both lanes are needed; b[31] == 0 faster as the second access is not done + "MEM" : [19, "MEM ; imm[0] == 0 for LOAD, imm[0] == 1 for STORE (beware, store copy the address in the output reg)" ], + "MAX" : [21, "Maximum opcode number (for bounds checking)"], +} + +num_registers = 32 +instruction_layout = [ + ("opcode", opcode_bits, "opcode to be executed"), + ("ra", log2_int(num_registers), "operand A read register"), + ("ca", 1, "set to substitute constant table value for A"), + ("rb", log2_int(num_registers), "operand B read register"), + ("cb", 1, "set to substitute constant table value for B"), + ("wd", log2_int(num_registers), "write register"), + ("immediate", 9, "Used by jumps to load the next PC value") +] + +class RegisterFile(Module, AutoDoc): + def __init__(self, depth=512, width=256, bypass=False): + reset_cycles = 4 + self.intro = ModuleDoc(title="Register File", body=""" +This implements the register file for the Curve25519 engine. It's implemented using +7-series specific block RAMs in order to take advantage of architecture-specific features +to ensure a compact and performant implementation. + +The core primitive is the RAMB36E1. 
This can be configured as a 64/72-bit wide memory +but only if used in "SDP" (simple dual port) mode. In SDP, you have one read, one write port. +However, the register file needs to produce two operands per cycle, while accepting up to +one operand per cycle. + +In order to do this, we stipulate that the RF runs at `rf_clk` (200MHz), but uses four phases +to produce/consume data. "Engine clock" `eng_clk` (50MHz) runs at a lower rate to accommodate +large-width arithmetic in a single cycle. + +The phasing is defined as follows: + +Phase 0: + - read from port A +Phase 1: + - read from port B +Phase 2: + - write data +Phase 3: + - quite cycle, used to create extra setup time for next stage (requires multicycle-path constraints) + +The writing of data is done in the second phase means that write happen to the same address +as being read, you get the old value. For pipelined operation, it could be desirable to shift +the write to happen before the reads, but as of now the implementation is not pipelined. + +The register file is unavailable for {} `eng_clk` cycles after reset. + +When configured as a 64 bit memory, the depth of the block is 512 bits, corresponding to +an address width of 9 bits. + + """.format(reset_cycles)) + + instruction = Record(instruction_layout) + phase = Signal(2) # internal phase + self.phase = Signal() # external phase + self.comb += self.phase.eq(phase[1]) # divide down internal phase so slower modules can capture it + + # these are the signals in and out of the register file + self.ra_dat = Signal(width) # this is passed in from outside the module because we want to mux with e.g. 
memory bus + self.ra_adr = Signal(log2_int(depth)) + self.rb_dat = Signal(width) + self.rb_adr = Signal(log2_int(depth)) + + # register file pipelines the write target address, going to the exec units; also needs the window to be complete + # window is assumed to be static and does not change throughout a given program run, so it's not pipelined + self.instruction_pipe_in = Signal(len(instruction)) + self.instruction_pipe_out = Signal(len(instruction)) + self.window = Signal(log2_int(depth) - log2_int(num_registers)) + + # this is the immediate data to write in, coming from the exec units + self.wd_dat = Signal(width) + self.wd_adr = Signal(log2_int(depth)) + self.wd_bwe = Signal(width//8) # byte masks for writing + self.we = Signal() + self.clear = Signal() + + self.running = Signal() # used for activity gating to RAM + + eng_sync = Signal(reset=1) + + rf_adr = Signal(log2_int(depth)) + self.comb += [ + If(phase == 0, + rf_adr.eq(self.ra_adr), + ).Elif(phase == 1, + rf_adr.eq(self.rb_adr), + ) + ] + rf_dat = Signal(width) + self.sync.eng_clk += [ + # TODO: check that this is in sync with expected values + self.instruction_pipe_out.eq(self.instruction_pipe_in), + ] + # unfortunately, -1L speed grade is too slow to support pipeline bypassing of the register file: + # bypass path closes at about 5.4ns, which fails to meet the 5ns cycle time target for the four-phase RF + if bypass: + self.sync.rf_clk += [ + If(phase == 1, + If((self.wd_adr != self.ra_adr) | ~self.we, + self.ra_dat.eq(rf_dat), + ).Else( + self.ra_dat.eq(self.wd_dat), + ), + self.rb_dat.eq(self.rb_dat), + ).Elif(phase == 2, + self.ra_dat.eq(self.ra_dat), + If((self.wd_adr != self.rb_adr) | ~self.we, + self.rb_dat.eq(rf_dat), + ).Else( + self.rb_dat.eq(self.wd_dat), + ) + ).Else( + self.ra_dat.eq(self.ra_dat), + self.rb_dat.eq(self.rb_dat), + ), + ] + else: + self.sync.rf_clk += [ + If(phase == 1, + self.ra_dat.eq(rf_dat), + self.rb_dat.eq(self.rb_dat), + ).Elif(phase == 2, + self.ra_dat.eq(self.ra_dat),
+ self.rb_dat.eq(rf_dat), + ).Else( + self.ra_dat.eq(self.ra_dat), + self.rb_dat.eq(self.rb_dat), + ), + ] + wren_pipe = Signal() # do not change this variable name, it is constrained in the XDC + self.sync.rf_clk += [ + If(eng_sync, + phase.eq(0), + ).Else( + phase.eq(phase + 1), + ), + wren_pipe.eq((phase == 1) & self.we), # we want wren to hit on phase==2, but we pipeline it to relax timing. so capture the input to the pipe on phase == 1 + ] + wd_bwe_pipe = Signal(width//8) + self.sync.rf_clk += [ + # add a register to relax timing on wd_bwe. This offsets the signal by one rf_clk (clk200) period, + # but because write happens on phase 2 and the signal is valid on eng_clk (clk50) edges, this will + # not affect the functionality + wd_bwe_pipe.eq(self.wd_bwe) + ] + + for word in range(int(256/64)): + self.specials += Instance("BRAM_SDP_MACRO", name="RF_RAMB" + str(word), + p_BRAM_SIZE = "36Kb", + p_DEVICE = "7SERIES", + p_WRITE_WIDTH = 64, + p_READ_WIDTH = 64, + p_DO_REG = 0, + p_INIT_FILE = "NONE", + p_SIM_COLLISION_CHECK = "ALL", # "WARNING_ONLY", "GENERATE_X_ONLY", "NONE" + p_SRVAL = 0, + p_WRITE_MODE = "READ_FIRST", + i_RDCLK = ClockSignal("rf_clk"), + i_WRCLK = ClockSignal("rf_clk"), + i_RDADDR = rf_adr, + i_WRADDR = self.wd_adr, + i_DI = self.wd_dat[word*64 : word*64 + 64], + o_DO = rf_dat[word*64 : word*64 + 64], + i_RDEN = self.running, # reduce power when not running + i_WREN = wren_pipe, # (phase == 2) & self.we, but pipelined one stage + i_RST = ResetSignal("rf_clk"), + i_WE = wd_bwe_pipe[word*8 : word*8 + 8], + + i_REGCE = 1, # should be ignored, but added to quiet down simulation warnings + ) + + # create an internal reset signal that synchronizes the "eng" to the "rf" domains + # it will also reset the register file on demand + reset_counter = Signal(log2_int(reset_cycles), reset=reset_cycles - 1) + self.sync.eng_clk += [ + If(self.clear, + reset_counter.eq(reset_cycles - 1), + eng_sync.eq(1), + ).Else( + If(reset_counter != 0, + 
reset_counter.eq(reset_counter - 1), + eng_sync.eq(1), + ).Else( + eng_sync.eq(0) + ), + ) + ] + +class Curve25519Const(Module, AutoDoc): + def __init__(self, insert_docs=False): + global did_const_doc + constant_defs = { + 0: [0, "zero", "The number zero"], + 1: [1, "one", "The number one"], + 2: [121665, "am24", "The value $\\frac{{A-2}}{{4}}$"], + 3: [0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFED, "field", f"Binary coding of {prime_string}"], + 4: [121666, "ap24", "The value $\\frac{{A+2}}{{4}}$"], + 5: [5, "five", "The number 5 (for pow22501)"], + 6: [10, "ten", "The number 10 (for pow22501)"], + 7: [20, "twenty", "The number 20 (for pow22501)"], + 8: [50, "fifty", "The number 50 (for pow22501)"], + 9: [100, "one hundred", "The number 100 (for pow22501)"], + 10: [254, "two hundred fifty four", "The number 254 (iteration count)"], + 11: [0x00000001_00000000_00000000_00000000_00000001_00000000_00000000_00000000, "increment for GCM counter (LE)", "increment for GCM counter (LE)"], + 12: [0x00000000_00000000_00000000_00000010_00000000_00000000_00000000_00000010, "sixteen (twice)", "The number 16 (for block-size address increment)"], + 13: [0x00000000_00000000_00000000_00000001_00000000_00000000_00000000_00000001, "decrement for GCM dual-loops (LE)", "decrement for GCM dual-loops"], + # 14 + # 15 + 16: [16, "sixteen", "The number 16"], + } + self.adr = Signal(5) + self.const = Signal(256) + constant_str = "This module encodes the constants that can be substituted for any register value. 
Therefore, up to 32 constants can be encoded.\n\n" + for code, const in constant_defs.items(): + self.comb += [ + If(self.adr == code, + self.const.eq(const[0]), + ) + ] + constant_str += """ +**{}** + + Substitute register {} with {}: {}\n""".format(const[1], code, const[2], const[0]) + if insert_docs: + self.constants = ModuleDoc(title="Curve25519 Constants", body=constant_str) + +# ------------------------------------------------------------------------ EXECUTION UNITS +class ExecUnit(Module, AutoDoc): + def __init__(self, width=256, opcode_list=["UDF"], insert_docs=False): + if insert_docs: + self.intro = ModuleDoc(title="ExecUnit class", body=""" + ExecUnit is the superclass template for execution units. + + Configuration Arguments: + - `opcode_list` is the list of opcodes that an ExecUnit can process + - `width` is the bit-width of the execution pathway + + Signal API for an exec unit: + - `a` and `b` are the inputs. + - `instruction_in` is the instruction corresponding to the currently present `a` and `b` inputs + - `start` is a single-clock signal which indicates processing should start + - `q` is the output + - `instruction_out` is the instruction for the result present at the `q` output + - `q_valid` is a single cycle pulse that indicates that the `q` result and `wa_out` value is valid + + + """) + self.instruction = Record(instruction_layout) + + self.a = Signal(width) + self.b = Signal(width) + self.q = Signal(width) + self.start = Signal() + self.q_valid = Signal() + # pipeline the instruction + self.instruction_in = Signal(len(self.instruction)) + self.instruction_out = Signal(len(self.instruction)) + + self.opcode_list = opcode_list + self.comb += [ + self.instruction.raw_bits().eq(self.instruction_in) + ] + +class ExecMask(ExecUnit): + def __init__(self, width=256): + ExecUnit.__init__(self, width, ["MSK"], insert_docs=True) # set insert_docs to True for exactly one module, exactly once + self.intro = ModuleDoc(title="Masking ExecUnit Subclass",
body=f""" +This execution unit implements the bit-mask and operation. It takes Ra[0] (the +zeroth bit of Ra) and replicates it to {str(width)} bits wide, and then ANDs it with +the full contents of Rb. This operation is introduced as one of the elements of +the `cswap()` routine, which is a constant-time swap of two variables based on a `swap` flag. + +Here is an example of how to swap the contents of `ra` and `rb` based on the value of the 0th bit of `swap`:: + + XOR dummy, ra, rb // dummy $\gets$ ra ^ rb + MSK dummy, swap, dummy // If swap[0] then dummy $\gets$ dummy, else dummy $\gets$ 0 + XOR ra, dummy, ra // ra $\gets$ ra ^ dummy + XOR rb, dummy, rb // rb $\gets$ rb ^ dummy +""") + self.sync.eng_clk += [ + self.q_valid.eq(self.start), + self.instruction_out.eq(self.instruction_in), + ] + self.comb += [ + self.q.eq(self.b & Replicate(self.a[0], width)), + ] + +class ExecLogic(ExecUnit): + def __init__(self, width=256): + ExecUnit.__init__(self, width, ["XOR", "NOT", "PSA", "PSB", "XBT", "SHL", "AND"]) + self.intro = ModuleDoc(title="Logic ExecUnit Subclass", body=f""" +This execution unit implements bit-wise logic operations: XOR, NOT, and +passthrough. 
+ +* XOR returns the result of A^B +* NOT returns the result of !A +* PSA returns the value of A +* PSB returns the value of B +* SHL returns A << 1 +* XBT returns the 255th bit of A, reported in the 0th bit of the result +* AND returns the result of A&B + +""") + + zeros = Signal(255, reset=0) + self.sync.eng_clk += [ + self.q_valid.eq(self.start), + self.instruction_out.eq(self.instruction_in), + ] + self.comb += [ + If(self.instruction.opcode == opcodes["XOR"][0], + self.q.eq(self.a ^ self.b) + ).Elif(self.instruction.opcode == opcodes["NOT"][0], + self.q.eq(~self.a) + ).Elif(self.instruction.opcode == opcodes["PSA"][0], + self.q.eq(self.a), + ).Elif(self.instruction.opcode == opcodes["PSB"][0], + self.q.eq(self.b), + ).Elif(self.instruction.opcode == opcodes["XBT"][0], + self.q.eq(Cat(self.a[254], zeros)) + ).Elif(self.instruction.opcode == opcodes["SHL"][0], + self.q.eq(Cat(0, self.a[:255])), + ).Elif(self.instruction.opcode == opcodes["AND"][0], + self.q.eq(self.a & self.b), + ), + ] + +class ExecAddSub(ExecUnit, AutoDoc): + def __init__(self, width=256): + ExecUnit.__init__(self, width, ["ADD", "SUB"]) + self.notes = ModuleDoc(title="Add/Sub ExecUnit Subclass", body=f""" +This execution module implements 256-bit binary addition and subtraction. + +Note that to implement operations in $\mathbf{{F}}_p$, where *p* is $2^{{255}}-19$, this must be compounded +with other operators as follows: + +Addition of Ra + Rb into Rc in {field_latex}: + +.. code-block:: c + + ADD Rc, Ra, Rb // Rc <- Ra + Rb + TRD Rd, Rc // Rd <- ReductionValue(Rc) + SUB Rc, Rc, Rd // Rc <- Rc - Rd + +Negation of Ra into Rc in {field_latex}: + +.. code-block:: c + + SUB Rc, #FIELDPRIME, Ra // Rc <- 2^255-19 - Ra + +Note that **#FIELDPRIME** is one of the 32 available hard-coded constants +that can be substituted for any register in any arithmetic operation, please +see the section on "Constants" for more details. + +Subtraction of Ra - Rb into Rc in {field_latex}: + +.. 
code-block:: c + + SUB Rb, #FIELDPRIME, Rb // Rb <- 2^255-19 - Rb + ADD Rc, Ra, Rb // Rc <- Ra + Rb + TRD Rd, Rc // Rd <- ReductionValue(Rc) + SUB Rc, Rc, Rd // Rc <- Rc - Rd + +In all the examples above, Ra and Rb must be members of {field_latex}. + """) + + self.sync.eng_clk += [ + self.q_valid.eq(self.start), + self.instruction_out.eq(self.instruction_in), + ] + self.comb += [ + If(self.instruction.opcode == opcodes["ADD"][0], + self.q.eq(self.a + self.b), + ).Elif(self.instruction.opcode == opcodes["SUB"][0], + self.q.eq(self.a - self.b), + ), + ] + +class ExecTestReduce(ExecUnit, AutoDoc): + def __init__(self, width=256): + ExecUnit.__init__(self, width, ["TRD"]) + + self.notes = ModuleDoc(title="Modular Reduction Test ExecUnit Subclass", body=f""" +First, observe that $2^{{255}}-19$ is 0x7FFF....FFED. +Next, observe that arithmetic in the field of {prime_string} will never set +the 256th bit. + +Modular reduction must happen when an arithmetic operation +overflows the bounds of the modulus. When this happens, one must +subtract the modulus (in this case {prime_string}). + +The reduce operation is done in two halves. The first half is +to check if a reduction must happen. The second is to do the subtraction. +In order to allow for constant-time operation, we always do the subtraction, +even if it is not strictly necessary. + +We use this to our advantage, and compute a reduction using +a test operator that produces a residue, and a subtraction operation. + +It's up to the programmer to ensure that the two-instruction sequence +is never broken up. + +Thus the reduction algorithm is as follows: + +1. TestReduce + - If the 256th bit is set (i.e., ra[255]), then return {prime_string} + - If bits ra[254:5] are all 1, and bits ra[4:0] are greater than or equal to 0x0D, then return {prime_string} + - Otherwise return 0 +2. 
Subtract + - Subtract the return value of TestReduce from the tested value + + """) + self.sync.eng_clk += [ + self.q_valid.eq(self.start), + self.instruction_out.eq(self.instruction_in), + ] + self.comb += [ + If( (self.a >= 0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFED), + self.q.eq(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFED) + ).Else( + self.q.eq(0x0) + ), + ] + +class ExecMul(ExecUnit, AutoDoc): + def __init__(self, width=256, sim=False): + ExecUnit.__init__(self, width, ["MUL"]) + + self.sync.eng_clk += [ # pipeline the instruction + self.instruction_out.eq(self.instruction_in), + ] + self.notes = ModuleDoc(title=f"Multiplication in {field_latex} ExecUnit Subclass", body=f""" +Unlike the ADD/SUB module, this operator explicitly works in {field_latex}. It takes in two inputs, +Ra and Rb, and both must be members of {field_latex}. The result is also reduced to a member of {field_latex}. + +The multiplier is designed with a separate clock, `mul_clk` so that it can be remapped to a faster +domain than `engine_clk` for better performance. The nominal target for `mul_clk` is 100MHz. + +The base algorithm for this implementation is lifted from the paper "Compact and Flexible FPGA Implementation +of Ed25519 and X25519" by Furkan Turan and Ingrid Verbauwhede (https://doi.org/10.1145/3312742). The algorithm +specified in this paper is optimized for the DSP48E blocks found inside a 7-Series Xilinx FPGA. In particular, +we can compute 17-bit multiplies using this hardware block, and 255 divides evenly into 17 to produce +a requirement of 15x DSP48E blocks. + +At a high level, the steps to compute the multiplication are: + +1. Schoolbook multiplication +2. Collapse partial sums +3. Propagate carries +4. Is the sum $\geq$ $2^{{255}}-19$? +5. If yes, add 19; else add 0 +6. 
Propagate carries again, in case the addition by 19 causes overflows + +The multiplier would run about 30% faster if step (6) were skipped. This step happens +in a fairly small minority of cases, maybe a fraction of 1%, and the worst-case +carry propagate through every limb (mathspeak for "digits") is diminishingly rare. The test for +whether or not to propagate carries is fairly straightforward. However, short-circuiting +the carry propagate step based upon the properties of the data creates +a timing side-channel. Therefore, we prefer a slower but safer implementation, even if +we are spending a bunch of cycles propagating zeros most of the time. + +A constant-time optimization would be for the multiplier to simply produce a 256-bit +result, and then use a subsequent TRD/SUB instruction pair. However, the non-pipelined +version of the engine25519 executes at a rate of 60ns per instruction, or 120ns total to +compute the TRD/SUB combination, whereas iterating through the carry propagates +would take 140ns total (as the mul core runs 2x clock speed of the rest of the engine). +This is basically a wash. + +However, if pipelining (and bypassing) were implemented, this might become a viable +optimization, but bypassing such a wide core would also have resource and speed +implications of its own. + +The above steps are coordinated by the `mseq` state machine. Control lines for +the DSP48E blocks are grouped into two sets, one controls the global state of +things such as the operation mode and input modes, and the other controls the +routing of individual 17-bit limbs (e.g. "digits" of our 17-bit representation of +numbers) to various sources and destinations. + +The following sections walk through the algorithm in detail. + +Schoolbook Multiplication +------------------------- + +The first step in the algorithm is called "schoolbook multiplication". It's +almost that, but with a twist. 
Below is what actual schoolbook multiplication +would be like, if you had a pair of numbers that were broken into three "limbs" (digits) +A[2:0] and B[2:0]. + +:: + + | A2 A1 A0 + x | B2 B1 B0 + ------------------------------------------ + | A2*B0 A1*B0 A0*B0 + A2*B1 | A1*B1 A0*B1 + A2*B2 A1*B2 | A0*B2 + (overflow) (not overflowing) + +The result of schoolbook multiplication is a result that potentially has +up to 2x the number of limbs of either multiplicand. + +Mapping the overflow back into the prime field (i.e. wrapping the overflow around) +is a process called reduction. It turns out that for +a prime field like {field_latex}, reduction works out to taking the limbs that +extend beyond the base number of limbs in the field, shifting them right by the +number of limbs, multiplying them by 19, and adding them back in; and if the result +isn't a member of the field, add 19 one last time, and take the result as just +the bottom 255 bits (ignore any carry overflow). + +This trick works because the form of the field is $2^{{n}}-p$: it is a power +of 2, reduced by some small amount $p$. By starting from a power of 2, +most of the binary numbers representable in an n-bit word are valid members of +the field. The only ones that are not valid field members are the numbers that are greater than or equal +to $2^{{n}}-p$ and no larger than $2^{{n}}-1$ (the biggest number that fits in n bits). +To turn these invalid binary numbers into members of the field, you just need +to add $p$, and the reduction is complete. + +.. image:: https://raw.githubusercontent.com/betrusted-io/gateware/master/gateware/curve25519/reduction_diagram.png + :alt: A diagram illustrating modular reduction + +The diagram above draws out the number lines for both a simple binary number line, +and for some field $\mathbf{{F}}_{{{{2^{{n}}}}-p}}$. Both lines start at 0 on the left, +and increment until they roll over. 
The point at which $\mathbf{{F}}_{{{{2^{{n}}}}-p}}$ +rolls over is a distance $p$ from the end of the binary number line: thus, we can +observe that $2^{{n}}-1$ reduces to $p-1$. Adding 1 results in $2^{{n}}$, which reduces +to $p$: that is, the top bit, wrapped around and multiplied +by $p$. + +As we continue toward the right, the numbers continue to go up and wrap around, and +for each wrap the distance between the binary wrap point and the $\mathbf{{F}}_{{{{2^{{n}}}}-p}}$ +wrap point increases by another $p$, such that $2^{{n+1}}$ reduces to $2*p$. Thus modular +reduction of natural binary numbers that are larger than our field $2^{{n}}-p$ +consists of taking the bits that overflow an $n$-bit representation, shifting them to +the right by $n$, and multiplying by $p$. + +A more tractable example to compute than {field_latex} is the field $\mathbf{{F}}_{{{{2^{{6}}}}-5}}$, whose modulus is $2^{{6}}-5 = 59$. +The members of the field are from 0-58, and reduction is done by taking any number modulo 59. Thus, +the number 59 reduces to 0; 60 reduces to 1; 61 reduces to 2, and so forth, until we get to 64, which +reduces to 5 -- the value of the overflowed bits (1) times $p$. + +Let's look at some more examples. First, recall that the biggest member of the +field, 58, in binary is 0b00_11_1010. + +Let's consider a simple case where we are presented a partial sum that overflows +the field by one bit, say, the number 0b01_11_0000, which is decimal 112. In this case, we take +the overflowed bit, shift it to the right, multiply by 5: + + 0b01_11_0000 + ^ move this bit to the right multiply by 0b101 (5) + 0b00_11_0000 + 0b101 = 0b00_11_0101 = 53 + +And we can confirm using a calculator that 112 % 59 = 53. Now let's overflow +by yet another bit, say, the number 0b11_11_0000. 
Let's try the math again: + + 0b11_11_0000 + ^ move to the right and multiply by 0b101: 0b101 * 0b11 = 0b1111 + 0b00_11_0000 + 0b1111 = 0b00_11_1111 + +This result is still not a member of the field, as the maximum value is 0b0011_1010. +In this case, we need to add the number 5 once again to resolve this "special-case" +overflow where we have a binary number that fits in $n$ bits but is in that sliver +between $2^{{n}}-p$ and $2^{{n}}-1$: + + 0b00_11_1111 + 0b101 = 0b01_00_0100 + +At this step, we can discard the MSB overflow, and the result is 0b0100 = 4; +and we can check with a calculator that 240 % 59 = 4. + +Therefore, when doing schoolbook multiplication, the partial products that start to +overflow to the left can be brought back around to the right hand side, after +multiplying by $p$, in this case, the number 19. This magical property is one +of the reasons why {field_latex} is quite amenable to math on binary machines. + +Let's use this finding to rewrite the straight schoolbook +multiplication form from above, but now with the modular reduction applied to +the partial sums, so it all wraps around into this compact form: +:: + + | A2 A1 A0 + x | B2 B1 B0 + ------------------------------------------ + | A2*B0 A1*B0 A0*B0 + | A1*B1 A0*B1 19*A2*B1 + + | A0*B2 19*A2*B2 19*A1*B2 + ---------------------------- + S2 S1 S0 + +As discussed above, each overflowed limb is wrapped around and multiplied by 19, +creating a number of partial sums S[2:0] that now has as many terms as +there are limbs, but with each partial sum still potentially +overflowing the native width of the limb. Thus, the inputs to a limb are 17 bits wide, +but we retain precision up to 48 bits during the partial sum stage, and then do a +subsequent condensation of partial sums to reduce things back down to 17 bits again. +The condensation is done in the next three steps, "collapse partial sums", "propagate carries", +and finally "normalize". 
+ +However, before moving on to those sections, there is an additional trick we need +to apply for an efficient implementation of this multiplication step in hardware. + +In order to minimize the amount of data movement, we observe that for each row, +the "B" values are shared between all the multipliers, and the "A" values are +constant along the diagonals. Thus we can avoid re-loading the "A" values every +cycle by shifting the partial sums diagonally through the computation, allowing +the "A" values to be loaded as "A" and "A*19" into holding registers once before +the computation starts, and selecting between the two options based on the step +number during the computation. + +.. image:: https://raw.githubusercontent.com/betrusted-io/gateware/master/gateware/curve25519/mapping.png + :alt: Mapping schoolbook multiply onto the hardware array to minimize data movement + +The diagram above illustrates how the schoolbook multiply is mapped onto the hardware +array. The top diagram is an exact redrawing of the previous text box, where the +partial sums that would extend to the left have been multiplied by 19 and wrapped around. +Each colored block corresponds to a given DSP48E1 block. The red arrow +illustrates the path of a partial sum in both the schoolbook form and the unwrapped +form for hardware implementation. In the bottom diagram, one can clearly see that +the Ax coefficients are constant for each column, and that for each row, the Bx +values are identical across all blocks in each step. Thus each column corresponds to +a single DSP48E1 block. We take advantage of the ability of the DSP48E1 block to +hold two selectable A values to pre-load Ax and Ax*19 before the computation starts, and +we bus together the Bx values and change them in sequence with each round. The +partial sums are then routed "down and right" to complete the mapping. The final +result is one cycle shifted from the canonical mapping. 
+ +We have a one-cycle structural pipeline delay going from this step to the next one, so +we use this pipeline delay to do a shift with no add by setting the `opmode` from `C+M` to +`C+0` (in other words, instead of adding to the current multiplication output for the last +step, we squash that input and set it to 0). + +The fact that we pipeline the data also gives us an opportunity to pick up the upper limb +of the partial sum collapse "for free" by copying it into the "D" register of the DSP48E1 +during the shift step. + +In C, the code basically looks like this: + +.. code-block:: c + + // initialize the a_bar set of data + for( int i = 0; i < DSP17_ARRAY_LEN; i++ ) {{ + a_bar_dsp[i] = a_dsp[i] * 19; + }} + operand p; + for( int i = 0; i < DSP17_ARRAY_LEN; i++ ) {{ + p[i] = 0; + }} + + // core multiply + for( int col = 0; col < 15; col++ ) {{ + for( int row = 0; row < 15; row++ ) {{ + if( row >= col ) {{ + p[row] += a_dsp[row-col] * b_dsp[col]; + }} else {{ + p[row] += a_bar_dsp[15+row-col] * b_dsp[col]; + }} + }} + }} + +This completes in 15 cycles. + +Collapse Partial Sums +--------------------- + +The potential width of the partial sum is up to 43 bits wide (according to +the paper cited above; the native partial sum precision of the DSP48E1 is 48 bits). +This step divides the partial sums up into 17-bit words, and then shifts the higher +to the next limbs over, allowing them to collapse into a smaller sum that +overflows less. + +:: + + ... P2[16:0] P1[16:0] P0[16:0] + ... P1[33:17] P0[33:17] P14[33:17]*19 + ... P0[50:34] P14[50:34]*19 P13[50:34]*19 + +Again, the magic number 19 shows up to allow sums which "wrapped around" +to add back in. Note that in the timing diagram below, we refer to the +mid- and upper- words of the shifted partial sums as "Q" and "R" respectively, +because the timing diagram lacks the width within a data bubble to +write out the full notation: so `Q0,1` is P14[33:17] and `R0,2` is P13[50:34] for P0[16:0]. 
+ +This is what the C code equivalent looks like for this operation. + +.. code-block:: c + + // the lowest limb has to handle two upper limbs wrapping around (Q/R) + prop[0] = (p[0] & 0x1ffff) + + (((p[14] * 1) >> 17) & 0x1ffff) * 19 + + (((p[13] * 1) >> 34) & 0x1ffff) * 19; + // the second lowest limb has to handle just one limb wrapping around (Q) + prop[1] = (p[1] & 0x1ffff) + + ((p[0] >> 17) & 0x1ffff) + + (((p[14] * 1) >> 34) & 0x1ffff) * 19; + // the rest are just shift-and-add without the modular wrap-around + for(int bitslice = 2; bitslice < 15; bitslice += 1) {{ + prop[bitslice] = (p[bitslice] & 0x1ffff) + ((p[bitslice - 1] >> 17) & 0x1ffff) + ((p[bitslice - 2] >> 34)); + }} + +This completes in 2 cycles after a one-cycle pipeline stall delay penalty to retrieve +the partial sum result from the previous step. + +Propagate Carries +----------------- + +The partial sums will generate carries, which need to be propagated down the +chain. The C-code equivalent of this looks as follows: + +.. code-block:: c + + for(int i = 0; i < 15; i++) {{ + if ( i+1 < 15 ) {{ + prop[i+1] = (prop[i] >> 17) + prop[i+1]; + prop[i] = prop[i] & 0x1ffff; + }} + }} + +This completes in 14 cycles. + +Normalize +--------- + +We're almost here, except that $0 \leq result \leq 2^{{256}}-1$, which is slightly +larger than the range of {field_latex}. + +Thus we need to check if number is somewhere in between 0x7ff....ffed and +0x7ff....ffff, or if the 256th bit will be set. In these cases, we need to add 19 to +the result, so that the result is a member of the field $2^{{255}}-19$ (the 256th bit +is dropped automatically when concatenating the fifteen 17-bit limbs together). + +We use the DSP48E1 block to help accelerate the test for this case, so that it +can complete in a single cycle without slowing down the machine. 
We use the "pattern +detect" (PD) feature of the DSP48E1 to check for all "1's" in bit positions 255-5, and a +single LUT to compare the final 5 bits to check for numbers between {prime_string} and +$2^{{255}}-1$. We then OR this result with the 256th bit. + +If the result falls within this special "overflow" case, we add the number 19, otherwise, +we add 0. Note that this add-by-19-or-0 step is implemented by pre-loading the number 19 into the A:B +pipeline registers of the DSP48E1 block during the "propagate" stage. Selection of +whether to add 19 or 0 relies on the fact that the DSP48E1 block has an input multiplexer +to its internal adder that can pick data from multiple sources, including the ability to +pick no source by loading the number 0. Thus the operation mode of the DSP48E1 is adjusted +to either pull an input from A:B (that is, the number 19) or the number 0, based on the +result of the overflow computation. Thus the PD feature is important in preventing this +step from being rate-limiting. With the PD feature we only have to check an effective 16 +intermediate results, instead of 256 raw bits, and then set the operation mode of +the ALU. + +Thus, this operation completes in a single cycle. + +After adding the number 19, we have to once again propagate carries. Even if we add the number +0, we also have to "propagate carries" for constant-time operation. This is done by +running the carry propagate operation described above a second time. + +Once the second carry propagate is finished, we have the final result. 
+ +Potential corner case +--------------------- + +There is a potential corner case where if the carry-propagated result going into +"normalize" is between + + 0xFFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFDA and + 0xFFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFEC + +In this case, the top bit would be wrapped around, multiplied by 19, and added to +the LSB, but the result would not be a member of $2^{{255}}-19$ (it would be one +of the 19 numbers just short of $2^{{255}}-1$), and the multiplier would pass it +on as if it were a valid result. + +In some cases, this isn't even a problem, because if the subsequent result goes through +any operation that includes a "TRD" instruction, it should reduce the number +correctly. + +However, I do not think this corner case is possible, because the overflow path to set the +high bit is from the top limb going from 0x1_FFFF -> 0x2_0000 (that is, 0x7FFFC -> 0x80000 +when written MSB-aligned) due to a carry coming in from the lower limb, and +it would require the carry to be very large, not just +1 as shown in the simple +rollover case, but a value from 0x1_FFED-0x1_FFDB. + +I don't have a formal mathematical proof of this, but I strongly suspect that +carry values going into the top limb cannot approach these large numbers, and therefore +it is not possible to hit this corner case. + +In the case that it _could_ be hit, the fix would be to add an additional +detection stage to handle the case that the result is not normalized, and +to add 19 to the final sum. This can be accelerated to a single cycle by also +adding 1 into the partial products, short-circuiting the carry propagate because +this should be the only special case we're trying to check for (we should definitely +not be able to re-overflow because we are only adding at most 19 to the final result +in the previous step). + +It'd be great to have a real mathematician comment if this is a real corner case. 
+ +Maybe this is a more solid reasoning why this corner case can't happen: + +The biggest value of a partial sum is 0x53_FFAC_0015 (0x1_FFFF * 0x1_FFFF * 15). +This means the biggest value of the third overflowed 17-bit limb is 0x14. Therefore +the biggest value resulting from the "collapse partial sums" stage is +0x1_FFFF + 0x1_FFFF + 0x14 = 0x4_0012. Thus the largest carry term that has +to propagate is 0x4_0012 >> 17 = 2. 2 is much smaller than the amount required +to trigger this condition, that is, a value in the range of 0x1_FFED-0x1_FFDB. +Thus, perhaps this condition simply can't happen? + +""") + # array of 15, 17-bit wide signals = 255 bits + a_17 = [Signal(17),Signal(17),Signal(17),Signal(17),Signal(17), + Signal(17),Signal(17),Signal(17),Signal(17),Signal(17), + Signal(17),Signal(17),Signal(17),Signal(17),Signal(17),] + b_17 = [Signal(17),Signal(17),Signal(17),Signal(17),Signal(17), + Signal(17),Signal(17),Signal(17),Signal(17),Signal(17), + Signal(17),Signal(17),Signal(17),Signal(17),Signal(17),] + # split incoming data into 17-bit wide chunks + for i in range(15): + self.comb += [ + a_17[i].eq(self.a[i*17:i*17+17]), + b_17[i].eq(self.b[i*17:i*17+17]), + ] + + # signals common to all DSP blocks + dsp_alumode = Signal(4) + dsp_opmode = Signal(7) + dsp_reset = Signal() + dsp_a1_ce = Signal() + dsp_a2_ce = Signal() + dsp_b1_ce = Signal() + dsp_b2_ce = Signal() + dsp_d_ce = Signal() + dsp_p_ce = Signal() + self.comb += [ + dsp_reset.eq(ResetSignal()), + dsp_b1_ce.eq(0), # not used + ] + zeros = Signal(48, reset=0) # dummy zeros signals to tie off unused bits of the DSP48E + self.comb += zeros.eq(0) + + step = Signal(max=15+1) # controls the multiplication step + prop = Signal() # count the propagations + + for i in range(15): + # create all the per-block DSP signals before we loop through and connect them + setattr(self, "dsp_a" + str(i), Signal(48, name="dsp_a" + str(i))) + setattr(self, "dsp_b" + str(i), Signal(17, name="dsp_b" + str(i))) + setattr(self, 
"dsp_c" + str(i), Signal(48, name="dsp_c" + str(i))) + setattr(self, "dsp_d" + str(i), Signal(17, name="dsp_d" + str(i))) + setattr(self, "dsp_match" + str(i), Signal(name="dsp_match"+str(i))) + setattr(self, "dsp_p" + str(i), Signal(48, name="dsp_p"+str(i))) + setattr(self, "dsp_p_ce" + str(i), Signal(48, name="dsp_p_ce"+str(i))) + setattr(self, "dsp_inmode" + str(i), Signal(5, name="dsp_inmode"+str(i))) + + self.timing = ModuleDoc(title="Detailed timing operation", body=""" + +Below is a detailed timing diagram that illustrates the expected sequence of events +by the implementation of this code. + +Signal descriptions: + +* `clk` is `mul_clk`, nominally 100MHz (2x engine clock) +* `go` is the signal from the microcode sequencer to latch inputs and start computation +* `self.a` is the `a` operand +* `self.b` is the `b` operand +* `state` is the current `mseq` state machine's state +* `step` is a counter used by `mseq` to control how many iterations to run in a given state +* `prop` is a counter used to count which iteration of the carry propagate we're on +* `dsp.a`-`dsp.d` is the `a-d` inputs to the DSP48E1 blocks +* `A1_CE` is the enable to the A1 pipe register. Note that we configure 2x pipeline registers on the A input. +* `A1` is a pipe register internal to the DSP48E1 block +* `A2_CE` is the enable to the A2 pipe register +* `A2` is a pipe register internal to the DSP48E1 block +* `B2_CE` is the enable to the B2 pipe register. Note that we configure 1x pipeline registers on the B input, and when 1x register is selected, the second pipe register (B2) is used. Thus there is no B1 register. +* `B2` is a pipe register internal to the DSP48E1 block +* `C` is the C input value. Note that this one input is *not* pipelined, and thus there is no register enable for it. Because it is not pipelined it's also likely to be critical-path. We use this mainly to loop P results back into the ALU with masking operations applied within a single cycle. 
+* `D_CE` is the enable to the D pipe register. There is only one possible D register in the DSP48E1 +* `D` is a pipe register internal to the DSP48E1 block that feeds the pre-adder +* `inmode` configures the input mode to the DSP48E1 ALU blocks. It is not pipelined and allows us to re-route data from A, B, C, and D to various ALU internals. +* `opmode` configures what computation to perform by the DSP48E1 ALU on the current cycle. It is not pipelined. +* `P_CE` is the enable for the output product register. +* `P` is the output product register presented by the DSP48E1 ALU. +* `overflow` is the overflow detection output from the DSP48E1 ALU. Its result timing is synchronous with the `P` register. +* `done` is the signal from the multiplier back to the microcode sequencer to latch the result and finish computation + +.. wavedrom:: + :caption: Detailed timing of the multiply operation + + { "config": {skin : "default"}, + "signal" : [ + { "name": "clk", "wave": "p......|.........|.......|....." 
}, + { "name": "go", "wave": "010..........................10" }, + { "name": "self.a", "wave": "x2...........................2.", "data": ["A0[255:0]","A1[255:0]"] }, + { "name": "self.b", "wave": "x2...........................2.", "data": ["B0[255:0]","B1[255:0]"] }, + { "name": "state", "wave": "2.34......5555...|..86...|..923", "data":["IDLE","SETA","MPY","DLY","PLSB","PMSB","PROP","NORM","PROP","DONE","IDLE","SETA"]}, + { "name": "step", "wave": "x..2===|==5...55|5556.666|66xxx", "data":["0","1", "2", "3","13","14","0","1","2","11","12","13","0","1","2","11","12","13"]}, + { "name": "prop", "wave": "x.........5.....|...6....|..xxx", "data":["0","1"]}, + { "name": "dsp.a", "wave": "x2x2x.....8x.................2x", "data": ["A0xx","A19","0", "A1xx"] }, + { "name": "dsp.b", "wave": "x2====|==x55xxxxxxx8xx.......2=", "data": ["19","B00","B01","B02","B03","B13","B14","1or19","1or19","19","19","B1_00"] }, + { "name": "dsp.c", "wave": "x...2===|=x5x5...|..x6...|..xxx", "data":["Q0","Q1","Q2","Q3","Q13","P0,0","C* >> 17 ","C* >> 17 "]}, + { "name": "dsp.d", "wave": "x.........55x.xxxxxx...xxxxxxx.", "data":["*Q0,1","R0,2"]}, + {}, + { "name": "A1_CE", "wave": "1.010.....10..................." }, + { "name": "A1", "wave": "x.2.2......8.........x.........", "data": ["A0xx","A0xx*19","0"] }, + { "name": "A2_CE", "wave": "0..10......10.................." }, + { "name": "A2", "wave": "x...2.......8........x.........", "data":["A0xx","0"] }, + { "name": "B2_CE", "wave": "01.......01.0......10.........." }, + { "name": "B2", "wave": "x.22===|==x55xxxx.xx8x.........", "data": ["19","B00","B01","B02","B03","B13","B14","1or19","1or19","19"] }, + { "name": "C", "wave": "x...2===|==555...|..86...|..x..", "data": ["Q0","Q1","Q2","Q3","Q13","Q14","P0,0","*P","C* >> 17 ","C&","C* >> 17 "] }, + { "name": "D_CE", "wave": "0.........1.0.................." 
}, + { "name": "D", "wave": "x..........55xx................", "data": ["Q0,1","R0,2","QS14,1","RS14,2","QS14,1","RS14,2"] }, + { "name": "inmode", "wave": "x.2.2.....x5.x.xx.xx8x.........", "data":["A1B2","AnB2","DB2","0B2"]}, + { "name": "opmode", "wave": "x.2.=.....2555...|..86...|..xxx", "data":["M","C+M","C+0","C+M","P+M","C+P","AB/0+C","C+P"]}, + {}, + { "name": "P_CE", "wave": "0.1.....|....5555|5516666|660.1", "data": ["P1", "P2", "P3","P4","P13","P14","P1", "P2", "P3","P4","P13","P14"] }, + { "name": "P", "wave": "x..2====|===55555|5552666|666x.", "data": ["A19","P0","P1","P2","P3","P13","P14","P0","PLSB","PMSB","C1","C2","C3","C12", "C13","C14","S+","C1","C2","C3","C12", "C13","C14","final"] }, + { "name": "overflow", "wave": "x...................2x.........", "data":["Y/N"]}, + { "name": "done", "wave": "0...........................10." }, + ]} + +Notes: + +1. the final product sum on the first DLY cycle is just a shift to get the + product results into the right unit. Thus, for the load of `dsp.d` `*Q0,1`, it needs + to pick the result off of the neighboring DSP unit, because it needs to acquire the value + before the final shift. +2. The `S+` on the P line is the non-normalized sum. This is basically the final result, but + sometimes with the 19 added to the least significant limb, in the case that the result is greater than + or equal to $2^{{255}}-19$. This addition must be propagated through the whole result. +3. The "done" state is slightly more complicated than illustrated here. Because the multiplier runs at + twice the speed of the sequencing engine (two `mul_clk` per `eng_clk`), "done" actually spans between + 2 and 3 states. In the case that the computation finishes in-phase with the slower engine clock, we assert + "done" for two cycles. In the case that we finish out of phase, have to wait a half `eng_clk` cycle + (one state in `mul_clk`) before asserting the done pulse for two `mul_clk` cycles (thus 3 total cycles). 
+ The computation is fixed-time, so the determination of how many wait states is done at the design stage and + hard-coded. However, anytime the algorithm is adjusted, the designer needs to re-check the number of + cycles it took and pick the correct "done" sequencing. + + """) + + self.diagrams = ModuleDoc(title="Dataflow Diagrams", body=""" + +Here's a collection of data flow diagrams that help illustrate how to configure the DSP48E1 block. +The DSP48E1 block has a lot of configuration options, so instead of overlaying on the messy overall +diagram of the DSP48E1, we simplify its construction and draw only the pieces relevant to each phase +of the algorithm. + +There's no substitute for consulting Xilinx UG479 (https://www.xilinx.com/support/documentation/user_guides/ug479_7Series_DSP48E1.pdf), +but if you're just getting started here's a few breadcrumbs to help you steer around the block. + +1. The block contains a pre-adder, multiplier, and "ALU". +2. It has four major inputs, A, B, C, and D. A/B are typically multiplier inputs, C is mostly intended for carry propagation and shuttling partial sums, and D is a pre-adder input. Thus a common form of computation is P = (A+D)*B + C. +3. Almost any input can be zero'd out, and so if you wanted to compute just A*B, what is actually computed is (A+D)*B + C but with the C and D values zero'd out. This is controlled by combinations of `inmode` and `opmode`. +4. Inputs A-D and output P can all be registered, and for this implementation we put two registers on A, one register on B, zero registers on C, one register on D, and one register on P. +5. Inputs A and B can have two pipeline registers. While the datasheet makes it look like you could be able to selectively write from the DSP48E1 input to either A1/A2 or B1/B2, in fact, you can't. + A2 can only get a value from A1 (thus setting A2 necessitates overwriting the value in A1). 
However, you can gate the A2's enable, so it can hold a value indefinitely, and the multiplier can route an input from either A1 or A2. We use this to our advantage and load `dsp.a` into the A2 register, and `dsp.a*19` into the A1 register, and then use the `inmode` configuration to switch between these two inputs based on which partial sum we're computing at the moment. + I think normally this feature is used to implement pipelining and pipeline bypassing in other applications, and we are slightly abusing it here to our advantage. +6. Because we configured C to have no input register, it can be used for cycle-to-cycle feedback of partial sums. + Introducing an input register here (per the DRC recommendation emitted by Vivado) could speed up the clock rate but it also introduces a single-cycle stall every time we have to do a partial sum feedback, which is a greater performance impact for our implementation. +7. The "ALU" part of the DSP48E1 is used as the partial sum adder in our implementation (but it can also do logic operations and other fun things that we don't need). It actually adds four numbers: P <- X + Y + Z + Carry bit. + We don't use the carry "bit" as it is only one-bit wide and we are propagating several bits of carry at once, so it is hard-wired to 0. X/Y/Z are up to 48 bits wide, and allow us to add combinations of the multiplier output, a concatenation of A:B (A as MSB, B as LSB), C, P, the number 0, and a couple other source options we don't use in this implementation. This is controlled by `opmode`. +8. In parallel to the "ALU" is a pattern detector. The pattern being detected is hard-coded into the bitstream, and in this case we are looking for a run of `1`'s to help accelerate the overflow detection problem. The output of the pattern detector is always being computed, and is dataflow-synchronous to the P output. +9. 
Unused bits of Verilog instances in Migen need to be tied to 0; Migen does not automatically extend/pad shorter `Signal` values to match Verilog input widths. This is important because the DSP48E1 input widths don't always exactly match the Migen widths. We create a "zeros" signal and `Cat()` it onto the MSBs as necessary to ensure all inputs to the DSP48E1 are properly specified. + +.. image:: https://raw.githubusercontent.com/betrusted-io/gateware/master/gateware/curve25519/mpy_pipe3.png + :alt: data flow block diagram of the multiplier core + +Above are the relevant elements of the DSP48E1 block as configured for the systolic dataflow for the "schoolbook" +multiply operation. Items shaded in gray are external to the DSP48E1 block. + +.. image:: https://raw.githubusercontent.com/betrusted-io/gateware/master/gateware/curve25519/psum3.png + :alt: data flow block diagram of the partial sum step + +Above is the configuration of the DSP48E1 block for the partial sum steps. Partial sum takes two cycles to +sum together the three 17-bit segments of the partial sums. + +.. image:: https://raw.githubusercontent.com/betrusted-io/gateware/master/gateware/curve25519/carry_prop3.png + :alt: data flow block diagram of the carry propagate + +Above is the configuration of the DSP48E1 block for the carry propagate step. This step must be repeated +14 times to handle the worst-case carry propagate path. During the carry propagate step, the pattern +detector is active, and on the final step we check it to see if the result overflows $2^{{255}}-19$. + +.. image:: https://raw.githubusercontent.com/betrusted-io/gateware/master/gateware/curve25519/normalize4.png + :alt: data flow block diagram of the normalization step + +Above is the configuration of the DSP48E1 block for the normalization step. If the result overflows $2^{{255}}-19$, +we must add 19 to make it a member of the prime field once again. 
We can do this in a single cycle by +short-circuiting the carry propagate: we already know we will have to propagate a carry to handle the overflow +case (there are only 19 possible numbers that will overflow this, and all of them have 1's set up the entire +chain), so we pre-add the carry simultaneous with adding the number 19 to the least significant limb. We also +use this step to mask out the upper level bits on the partial sums, because the top bits are now the old +carries that have already been propagated. If we fail to do this, then we re-propagate the carries from the last step. + + """) + + start_pipe = Signal() + self.sync.mul_clk += start_pipe.eq(self.start) # break critical path of instruction decode -> SETUP_A state muxes + self.submodules.mseq = mseq = ClockDomainsRenamer("mul_clk")(FSM(reset_state="IDLE")) + mseq.act("IDLE", + NextValue(step, 0), + NextValue(prop, 0), + If(start_pipe, + NextState("SETUP_A") + ) + ) + mseq.act("SETUP_A", # SETA, load the a, a19 values values + NextState("MULTIPLY"), + ) + mseq.act("MULTIPLY", # MPY + If(step < 14, + NextValue(step, step + 1) + ).Else( + NextState("P_DELAY"), + NextValue(step, 0), + ) + ) + mseq.act("P_DELAY", # DLY - due to pipelining of P register, we have a structural hazard that delays feedback by one cycle + # we take advantage of this time to (1) shift the results into canonical position and (2) nab a copy of the data for the PSUM_MSB state + NextState("PSUM_LSB") + ) + mseq.act("PSUM_LSB", # PLSB + NextState("PSUM_MSB") + ) + mseq.act("PSUM_MSB", # PMSB + NextState("CARRYPROP") + ) + mseq.act("CARRYPROP", # PROP + If( step == 13, + If( prop == 0, + NextState("NORMALIZE"), + NextValue(step, 0), + ).Else( + NextState("DONE"), # if modifying to the "DONE" state, change q-latch statement at the end + ) + ).Else( + NextValue(step, step + 1), + ) + ) + mseq.act("NORMALIZE", # NORM + NextState("CARRYPROP"), + NextValue(prop, 1), + NextValue(step, 0), + ) + ### note that the post-amble "manually" 
aligns the mul_clk to eng_clk phases + ### this can have one of two outcomes if the previous number of states is even or odd + ### in this case, we end up phase mis-aligned, so we have to burn a dummy cycle to sync clocks + ### see q_valid logic at end of this module + mseq.act("DONE", # DONE -- we are actually finished on an odd phase of the eng_clk, can't assert RF here + NextState("DONE2"), + ) + mseq.act("DONE2", # assert valid to the RF here + NextState("DONE3"), + ) + mseq.act("DONE3", # second done state, because we are latching into a half-rate clock domain, so valid is good for one full eng_clk + NextState("IDLE"), + # Note: we could, in theory, pipeline the next multiply by detecting if go goes high here, + # and bypassing IDLE and going straight to SETA, but... + ) + + # DSP48E opcode encodings + # general DSP48E computation is P <- X + Y + Z + C + OP_PASS_M = 0b000_01_01 # X:Y <- M; Z <-0; P <- 0 + M + 0 + OP_M_PLUS_PCIN = 0b001_01_01 # X:Y <- M; Z <-PCIN; P <- PCIN + M + 0 + OP_M_PLUS_C = 0b011_01_01 # X:Y <- M; Z <-C; P <- C + M + 0 + OP_M_PLUS_P = 0b010_01_01 # X:Y <- M; Z <-P ; P <- P + M + 0 + OP_P_PLUS_PCIN17 = 0b101_10_00 # X <- P; Y <- 0; Z <- PCIN >> 17; P <- PCIN>>17 + P + 0 + OP_C_PLUS_P = 0b010_11_00 # X <- 0; Y <- C; Z <- P; P <- 0 + C + P + OP_AB_PLUS_P = 0b010_00_11 # X <- A:B; Y <- 0; Z <- P; P <- A:B + 0 + P + 0 + OP_AB_PLUS_C = 0b011_00_11 # X <- A:B; Y <- 0; Z <- C; P <- A:B + 0 + C + 0 + OP_0_PLUS_P = 0b010_00_00 # X <- 0; Y <- 0; Z <- P; P <- 0 + 0 + P + 0 + OP_C_PLUS_0 = 0b011_00_00 # X <- 0; Y <- 0; Z <- C; P <- C + 0 + 0 + 0 + INMODE_A1 = 0b0001 + INMODE_A2 = 0b0000 + INMODE_D = 0b0110 + INMODE_0 = 0b0010 + INMODE_B2 = 0b0 + # INMODE_B1 = 0b1 # should not be used in this configuration, only 1 BREG configured + + overflow_25519 = Signal() # set during normalize if we're overflowing 2^255-19 + + # see the self.timing documentation (above, best viewed after post-processing with sphinx) for how this all works. 
+ self.comb += [ + dsp_alumode.eq(0), + If(mseq.before_entering("SETUP_A"), + dsp_b2_ce.eq(1), + dsp_a1_ce.eq(1), + ).Elif(mseq.ongoing("SETUP_A"), + # at this point, these are already loaded: A1 <- Axx, B2 <- 19 + # P <- A1 * B2 + dsp_opmode.eq(OP_PASS_M), + # pipeline in the b1 value for the first round of the multiply + dsp_b2_ce.eq(1), + dsp_p_ce.eq(1), + ).Elif(mseq.ongoing("MULTIPLY"), + dsp_p_ce.eq(1), + If(step == 0, + dsp_a1_ce.eq(1), + dsp_a2_ce.eq(1), # latch the pipelined Axx * 19 signal on the first round of multiply + dsp_opmode.eq(OP_PASS_M), # don't add PCIN on the first partial product, as it's bogus on step 0 + ).Else( + dsp_a1_ce.eq(0), + dsp_a2_ce.eq(0), + dsp_opmode.eq(OP_M_PLUS_C), + ), + If(step != 14, + dsp_b2_ce.eq(1), + ).Else( + dsp_b2_ce.eq(0), + ) + ).Elif(mseq.ongoing("P_DELAY"), + dsp_opmode.eq(OP_C_PLUS_0), + dsp_p_ce.eq(1), + dsp_b2_ce.eq(1), + dsp_d_ce.eq(1), + dsp_a1_ce.eq(1), + ).Elif(mseq.ongoing("PSUM_LSB"), + dsp_p_ce.eq(1), + dsp_b2_ce.eq(1), + dsp_d_ce.eq(1), + dsp_opmode.eq(OP_M_PLUS_C), + dsp_a2_ce.eq(1), + ).Elif(mseq.ongoing("PSUM_MSB"), + dsp_p_ce.eq(1), + dsp_opmode.eq(OP_M_PLUS_P), + ).Elif(mseq.ongoing("CARRYPROP"), + dsp_p_ce.eq(0), # move to individual unit P_CEs for this stage + dsp_opmode.eq(OP_C_PLUS_P), + If(step==13, + dsp_b2_ce.eq(1), + ) + ).Elif(mseq.ongoing("NORMALIZE"), + dsp_p_ce.eq(1), + If(overflow_25519 | (self.dsp_p14[17] == 1), + dsp_opmode.eq(OP_AB_PLUS_C), + ).Else( + dsp_opmode.eq(OP_C_PLUS_0), + ) + ) + ] + b_step = Signal(17) + self.comb += [ + # the code below doesn't synthesize well, so let's write out the barrel shifter explicitly + # getattr(self, "dsp_b" + str(i)).eq((self.b >> (17 * (step + 1))) & 0x1_ffff), # b_17[step+1] + # written out explicitly because the fancy for-loop format also leads to a weird synthesis result... 
+ If(step == 0, b_step.eq(b_17[1]) + ).Elif(step == 1, b_step.eq(b_17[2]) + ).Elif(step == 2, b_step.eq(b_17[3]) + ).Elif(step == 3, b_step.eq(b_17[4]) + ).Elif(step == 4, b_step.eq(b_17[5]) + ).Elif(step == 5, b_step.eq(b_17[6]) + ).Elif(step == 6, b_step.eq(b_17[7]) + ).Elif(step == 7, b_step.eq(b_17[8]) + ).Elif(step == 8, b_step.eq(b_17[9]) + ).Elif(step == 9, b_step.eq(b_17[10]) + ).Elif(step == 10, b_step.eq(b_17[11]) + ).Elif(step == 11, b_step.eq(b_17[12]) + ).Elif(step == 12, b_step.eq(b_17[13]) + ).Elif(step == 13, b_step.eq(b_17[14]) + ) + ] + + # reduce width of DSP's INMODE combinational path using a sub machine that reduces + # the complexity of the `mseq` machine and allows for a pipeline stage to be inserted... + INMODE_IDLE = 0 + INMODE_MPY = 1 + INMODE_PROP1 = 2 + INMODE_PROP2 = 3 + inmode_sel = Signal(2) + self.sync.mul_clk += [ + If(mseq.ongoing("IDLE") | mseq.ongoing("SETUP_A"), + inmode_sel.eq(INMODE_IDLE) + ).Elif(mseq.ongoing("MULTIPLY"), + inmode_sel.eq(INMODE_MPY), + ).Elif(mseq.ongoing("P_DELAY") | mseq.ongoing("PSUM_LSB"), + inmode_sel.eq(INMODE_PROP1) + ).Else( + inmode_sel.eq(INMODE_PROP2) + ) + ] + + for i in range(15): + # INMODE is a critical path, so rewrite code not in computation order but in signal use order to better + # understand how to optimize it. 
+ self.comb += [ + If(inmode_sel == INMODE_IDLE, + getattr(self, "dsp_inmode" + str(i)).eq(Cat(INMODE_A1, INMODE_B2)), + ), + If(inmode_sel == INMODE_MPY, + If(step == 0, + getattr(self, "dsp_inmode" + str(i)).eq(Cat(INMODE_A1, INMODE_B2)), + # A1 has Axx on the first step only + ).Elif(i > (14 - step), # lay out the diagonal wrap-around of partial sums + getattr(self, "dsp_inmode" + str(i)).eq(Cat(INMODE_A1, INMODE_B2)), # A1 has Axx*19 + ).Else( + getattr(self, "dsp_inmode" + str(i)).eq(Cat(INMODE_A2, INMODE_B2)), + # A2 has Axx for rest of steps + ) + ), + If(inmode_sel == INMODE_PROP1, + getattr(self, "dsp_inmode" + str(i)).eq(Cat(INMODE_D, INMODE_B2)), + ), + If(inmode_sel == INMODE_PROP2, + getattr(self, "dsp_inmode" + str(i)).eq(Cat(INMODE_0, INMODE_B2)), + ) + ] + + # rest of signals are in computation order below + self.comb += [ + If(mseq.before_entering("SETUP_A"), + getattr( self, "dsp_a" + str(i) ).eq(Cat(a_17[i], zeros[:(30-17)])), + getattr( self, "dsp_b" + str(i) ).eq(19), + ).Elif(mseq.ongoing("SETUP_A"), + getattr(self, "dsp_b" + str(i)).eq(b_17[0]), # preload B00 + ).Elif(mseq.ongoing("MULTIPLY"), + getattr(self, "dsp_c" + str(i)).eq(getattr(self, "dsp_p" + str( (i+1) % 15 ))), + If(step == 0, + getattr(self, "dsp_a" + str(i)).eq(getattr(self, "dsp_p" + str(i))), + ), + If(step < 14, + getattr(self, "dsp_b" + str(i)).eq(Cat(b_step, zeros[:1])), # b_17[step+1]; note that b input is 18 bits wide, so pad with one 0 to prevent a dangling X on the high bit + ), + ) + ] + + if i > 0: # sum is different from bottom limb, as the top MSB wraps around + self.comb += [ + If(mseq.ongoing("P_DELAY"), + getattr(self, "dsp_c" + str(i)).eq(getattr(self, "dsp_p" + str((i + 1) % 15))), + getattr(self, "dsp_d" + str(i)).eq((getattr(self, "dsp_p" + str(i)) >> 17) & 0x1_ffff), # (i-1)+1, the +1 is because the result has not been shifted yet + getattr(self, "dsp_b" + str(i)).eq(1), + )] + else: + self.comb += [ + If(mseq.ongoing("P_DELAY"), + getattr(self, "dsp_a" + 
str(i)).eq(zeros), + getattr(self, "dsp_c" + str(i)).eq(getattr(self, "dsp_p" + str((i + 1) % 15))), + getattr(self, "dsp_d" + str(i)).eq((getattr(self, "dsp_p" + str(0)) >> 17) & 0x1_ffff), + getattr(self, "dsp_b" + str(i)).eq(19), + )] + + self.comb += [ + If(mseq.ongoing("PSUM_LSB"), + getattr(self, "dsp_c" + str(i)).eq(getattr(self, "dsp_p" + str(i)) & 0x1_ffff), + )] + if i > 1: # sum-ordering is different for the bottom two limbs, as the top wraps around into two limbs + self.comb += [ + If(mseq.ongoing("PSUM_LSB"), + getattr(self, "dsp_d" + str(i)).eq((getattr(self, "dsp_p" + str(i - 2)) >> 34) & 0x1_ffff), + getattr(self, "dsp_b" + str(i)).eq(1), + )] + elif i == 1: + self.comb += [ + If(mseq.ongoing("PSUM_LSB"), + getattr(self, "dsp_d" + str(i)).eq((getattr(self, "dsp_p" + str(14)) >> 34) & 0x1_ffff), + getattr(self, "dsp_b" + str(i)).eq(19), + )] + else: + self.comb += [ + If(mseq.ongoing("PSUM_LSB"), + getattr(self, "dsp_d" + str(i)).eq((getattr(self, "dsp_p" + str(13)) >> 34) & 0x1_ffff), + getattr(self, "dsp_b" + str(i)).eq(19), + )] + + self.comb += [ + If(mseq.ongoing("PSUM_MSB"), + getattr(self, "dsp_c0").eq(zeros), # dsp_c is actually don't care due to the opmode + ).Elif(mseq.ongoing("NORMALIZE"), + getattr(self, "dsp_c" + str(i)).eq(getattr(self, "dsp_p" + str(i)) & 0x1_ffff), + ) + ] + + if i == 0: + self.comb += [ + If(mseq.ongoing("CARRYPROP"), + getattr(self, "dsp_c" + str(i)).eq( zeros ), + ), + If(mseq.ongoing("CARRYPROP") & (step == 13), + getattr(self, "dsp_b" + str(i)).eq( 19 ), # special-case constant to handle normalization in overflow of prime field; a is loded with 0 on previous cycle + ), + ] + else: + self.comb += [ + If(mseq.ongoing("CARRYPROP"), + getattr(self, "dsp_c" + str(i)).eq( Cat(getattr(self, "dsp_p" + str(i - 1)) >> 17, zeros[:17]) ), + getattr(self, "dsp_p_ce" + str(i)).eq(step == (i-1)), + ), + If(mseq.ongoing("CARRYPROP") & (step == 13), + getattr(self, "dsp_b" + str(i)).eq(0), + ) + ] + if sim: + instance = 
"DSP48E1_sim" + else: + instance = "DSP48E1" + self.specials += [ + Instance(instance, name="DSP_ENG25519_" + str(i), + # configure number of input registers + p_ACASCREG=1, + p_AREG=2, + p_ADREG=0, + p_ALUMODEREG=0, + p_BCASCREG=1, + p_BREG=1, + + # only pipeline at the output + p_CARRYINREG=0, + p_CARRYINSELREG=0, + p_CREG=0, + p_DREG=1, # i think we can use this to save some fabric registers + p_INMODEREG=0, + p_MREG=0, + p_OPMODEREG=0, + p_PREG=1, + + p_A_INPUT="DIRECT", + p_B_INPUT="DIRECT", + p_USE_DPORT="TRUE", + p_USE_MULT="DYNAMIC", + p_USE_SIMD="ONE48", + + # setup pattern detector to catch the case of mostly 1's + p_AUTORESET_PATDET="NO_RESET", + p_MASK =0xffff_fffe_0000, #'1'*(48-17)+'0'*17, # 1 bits are ignored, 0 compared + p_PATTERN=0x1_ffff, # '0'*(48-17)+'1'*17, # compare against 0x1_FFFF + p_SEL_MASK="MASK", + p_SEL_PATTERN="PATTERN", + p_USE_PATTERN_DETECT="PATDET", + + # signals + i_A=getattr(self, "dsp_a" + str(i)), + i_ALUMODE=dsp_alumode, + i_B=Cat(getattr(self, "dsp_b" + str(i)), zeros[:(18-17)]), # extra bits must be set to zero + i_C=getattr(self, "dsp_c" + str(i)), + i_CARRYIN=0, + i_CARRYINSEL=zeros[:3], + i_CEA1=dsp_a1_ce, + i_CEA2=dsp_a2_ce, + i_CEAD=0, # no pipe + i_CEALUMODE=0, # no pipe + i_CEB1=dsp_b1_ce, + i_CEB2=dsp_b2_ce, + i_CEC=0, # no pipe + i_CECARRYIN=0, + i_CECTRL=0, # no pipe on opmode + i_CED=dsp_d_ce, + i_CEP=dsp_p_ce | getattr(self, "dsp_p_ce" + str(i)), + i_CLK=ClockSignal("mul_clk"), # run at 2x speed of engine clock + i_D=Cat(getattr(self, "dsp_d" + str(i)), zeros[:(25-17)]), + i_INMODE=getattr(self, "dsp_inmode" + str(i)), + i_OPMODE=dsp_opmode, + o_P=getattr(self, "dsp_p" + str(i)), + o_PATTERNDETECT=getattr(self, "dsp_match" + str(i)), + + # tie unused CE + i_CEM=0, + i_CEINMODE=1, + + # resets + i_RSTA=dsp_reset, + i_RSTALLCARRYIN=dsp_reset, + i_RSTALUMODE=dsp_reset, + i_RSTB=dsp_reset, + i_RSTC=dsp_reset, + i_RSTCTRL=dsp_reset, + i_RSTD=dsp_reset, + i_RSTINMODE=dsp_reset, + i_RSTM=dsp_reset, + i_RSTP=dsp_reset, 
+ ) + ] + self.sync.mul_clk += [ # this syncs into the eng_clk domain + If(mseq.ongoing("DONE"), ## mod this to sync with the phase that the state machine ends on + self.q[i * 17:i * 17 + 17].eq(getattr(self, "dsp_p" + str(i))[:17]), + ).Else( + self.q[i * 17:i * 17 + 17].eq(self.q[i * 17:i * 17 + 17]), + ), + ] + # whether we are asserting on DONE/DONE2 or DONE2/DONE3 depends on even/odd # of states previously spent to compute the mul + self.sync.mul_clk += [ + If(mseq.ongoing("DONE2") | mseq.ongoing("DONE3"), + self.q_valid.eq(1), + ).Else( + self.q_valid.eq(0), + ) + ] + # compute special-case detection if the partial sum output is >= 2^255-19 + self.comb += [ + overflow_25519.eq( + self.dsp_match14 & + self.dsp_match13 & + self.dsp_match12 & + self.dsp_match11 & + self.dsp_match10 & + self.dsp_match9 & + self.dsp_match8 & + self.dsp_match7 & + self.dsp_match6 & + self.dsp_match5 & + self.dsp_match4 & + self.dsp_match3 & + self.dsp_match2 & + self.dsp_match1 & + (self.dsp_p0 >= 0x1_ffed) + ) + ] + +class ExecClmul(ExecUnit, AutoDoc): + def clmul64(self, IN2, IN1): + return (Replicate(IN2[0], 64) & (IN1[0:64])) ^ Cat(Signal(1, reset = 0), (Replicate(IN2[1], 63) & IN1[0:63])) ^ Cat(Signal(2, reset = 0), (Replicate(IN2[2], 62) & IN1[0:62])) ^ Cat(Signal(3, reset = 0), (Replicate(IN2[3], 61) & IN1[0:61])) ^ Cat(Signal(4, reset = 0), (Replicate(IN2[4], 60) & IN1[0:60])) ^ Cat(Signal(5, reset = 0), (Replicate(IN2[5], 59) & IN1[0:59])) ^ Cat(Signal(6, reset = 0), (Replicate(IN2[6], 58) & IN1[0:58])) ^ Cat(Signal(7, reset = 0), (Replicate(IN2[7], 57) & IN1[0:57])) ^ Cat(Signal(8, reset = 0), (Replicate(IN2[8], 56) & IN1[0:56])) ^ Cat(Signal(9, reset = 0), (Replicate(IN2[9], 55) & IN1[0:55])) ^ Cat(Signal(10, reset = 0), (Replicate(IN2[10], 54) & IN1[0:54])) ^ Cat(Signal(11, reset = 0), (Replicate(IN2[11], 53) & IN1[0:53])) ^ Cat(Signal(12, reset = 0), (Replicate(IN2[12], 52) & IN1[0:52])) ^ Cat(Signal(13, reset = 0), (Replicate(IN2[13], 51) & IN1[0:51])) ^ 
Cat(Signal(14, reset = 0), (Replicate(IN2[14], 50) & IN1[0:50])) ^ Cat(Signal(15, reset = 0), (Replicate(IN2[15], 49) & IN1[0:49])) ^ Cat(Signal(16, reset = 0), (Replicate(IN2[16], 48) & IN1[0:48])) ^ Cat(Signal(17, reset = 0), (Replicate(IN2[17], 47) & IN1[0:47])) ^ Cat(Signal(18, reset = 0), (Replicate(IN2[18], 46) & IN1[0:46])) ^ Cat(Signal(19, reset = 0), (Replicate(IN2[19], 45) & IN1[0:45])) ^ Cat(Signal(20, reset = 0), (Replicate(IN2[20], 44) & IN1[0:44])) ^ Cat(Signal(21, reset = 0), (Replicate(IN2[21], 43) & IN1[0:43])) ^ Cat(Signal(22, reset = 0), (Replicate(IN2[22], 42) & IN1[0:42])) ^ Cat(Signal(23, reset = 0), (Replicate(IN2[23], 41) & IN1[0:41])) ^ Cat(Signal(24, reset = 0), (Replicate(IN2[24], 40) & IN1[0:40])) ^ Cat(Signal(25, reset = 0), (Replicate(IN2[25], 39) & IN1[0:39])) ^ Cat(Signal(26, reset = 0), (Replicate(IN2[26], 38) & IN1[0:38])) ^ Cat(Signal(27, reset = 0), (Replicate(IN2[27], 37) & IN1[0:37])) ^ Cat(Signal(28, reset = 0), (Replicate(IN2[28], 36) & IN1[0:36])) ^ Cat(Signal(29, reset = 0), (Replicate(IN2[29], 35) & IN1[0:35])) ^ Cat(Signal(30, reset = 0), (Replicate(IN2[30], 34) & IN1[0:34])) ^ Cat(Signal(31, reset = 0), (Replicate(IN2[31], 33) & IN1[0:33])) ^ Cat(Signal(32, reset = 0), (Replicate(IN2[32], 32) & IN1[0:32])) ^ Cat(Signal(33, reset = 0), (Replicate(IN2[33], 31) & IN1[0:31])) ^ Cat(Signal(34, reset = 0), (Replicate(IN2[34], 30) & IN1[0:30])) ^ Cat(Signal(35, reset = 0), (Replicate(IN2[35], 29) & IN1[0:29])) ^ Cat(Signal(36, reset = 0), (Replicate(IN2[36], 28) & IN1[0:28])) ^ Cat(Signal(37, reset = 0), (Replicate(IN2[37], 27) & IN1[0:27])) ^ Cat(Signal(38, reset = 0), (Replicate(IN2[38], 26) & IN1[0:26])) ^ Cat(Signal(39, reset = 0), (Replicate(IN2[39], 25) & IN1[0:25])) ^ Cat(Signal(40, reset = 0), (Replicate(IN2[40], 24) & IN1[0:24])) ^ Cat(Signal(41, reset = 0), (Replicate(IN2[41], 23) & IN1[0:23])) ^ Cat(Signal(42, reset = 0), (Replicate(IN2[42], 22) & IN1[0:22])) ^ Cat(Signal(43, reset = 0), (Replicate(IN2[43], 21) & 
IN1[0:21])) ^ Cat(Signal(44, reset = 0), (Replicate(IN2[44], 20) & IN1[0:20])) ^ Cat(Signal(45, reset = 0), (Replicate(IN2[45], 19) & IN1[0:19])) ^ Cat(Signal(46, reset = 0), (Replicate(IN2[46], 18) & IN1[0:18])) ^ Cat(Signal(47, reset = 0), (Replicate(IN2[47], 17) & IN1[0:17])) ^ Cat(Signal(48, reset = 0), (Replicate(IN2[48], 16) & IN1[0:16])) ^ Cat(Signal(49, reset = 0), (Replicate(IN2[49], 15) & IN1[0:15])) ^ Cat(Signal(50, reset = 0), (Replicate(IN2[50], 14) & IN1[0:14])) ^ Cat(Signal(51, reset = 0), (Replicate(IN2[51], 13) & IN1[0:13])) ^ Cat(Signal(52, reset = 0), (Replicate(IN2[52], 12) & IN1[0:12])) ^ Cat(Signal(53, reset = 0), (Replicate(IN2[53], 11) & IN1[0:11])) ^ Cat(Signal(54, reset = 0), (Replicate(IN2[54], 10) & IN1[0:10])) ^ Cat(Signal(55, reset = 0), (Replicate(IN2[55], 9) & IN1[0:9])) ^ Cat(Signal(56, reset = 0), (Replicate(IN2[56], 8) & IN1[0:8])) ^ Cat(Signal(57, reset = 0), (Replicate(IN2[57], 7) & IN1[0:7])) ^ Cat(Signal(58, reset = 0), (Replicate(IN2[58], 6) & IN1[0:6])) ^ Cat(Signal(59, reset = 0), (Replicate(IN2[59], 5) & IN1[0:5])) ^ Cat(Signal(60, reset = 0), (Replicate(IN2[60], 4) & IN1[0:4])) ^ Cat(Signal(61, reset = 0), (Replicate(IN2[61], 3) & IN1[0:3])) ^ Cat(Signal(62, reset = 0), (Replicate(IN2[62], 2) & IN1[0:2])) ^ Cat(Signal(63, reset = 0), (Replicate(IN2[63], 1) & IN1[0:1])) + + def clmul64h(self, IN2, IN1): + return Cat((((Replicate(IN2[0], 1)) & IN1[63:64]) ^ ((Replicate(IN2[1], 2)) & IN1[62:64]) ^ ((Replicate(IN2[2], 3)) & IN1[61:64]) ^ ((Replicate(IN2[3], 4)) & IN1[60:64]) ^ ((Replicate(IN2[4], 5)) & IN1[59:64]) ^ ((Replicate(IN2[5], 6)) & IN1[58:64]) ^ ((Replicate(IN2[6], 7)) & IN1[57:64]) ^ ((Replicate(IN2[7], 8)) & IN1[56:64]) ^ ((Replicate(IN2[8], 9)) & IN1[55:64]) ^ ((Replicate(IN2[9], 10)) & IN1[54:64]) ^ ((Replicate(IN2[10], 11)) & IN1[53:64]) ^ ((Replicate(IN2[11], 12)) & IN1[52:64]) ^ ((Replicate(IN2[12], 13)) & IN1[51:64]) ^ ((Replicate(IN2[13], 14)) & IN1[50:64]) ^ ((Replicate(IN2[14], 15)) & IN1[49:64]) ^ 
((Replicate(IN2[15], 16)) & IN1[48:64]) ^ ((Replicate(IN2[16], 17)) & IN1[47:64]) ^ ((Replicate(IN2[17], 18)) & IN1[46:64]) ^ ((Replicate(IN2[18], 19)) & IN1[45:64]) ^ ((Replicate(IN2[19], 20)) & IN1[44:64]) ^ ((Replicate(IN2[20], 21)) & IN1[43:64]) ^ ((Replicate(IN2[21], 22)) & IN1[42:64]) ^ ((Replicate(IN2[22], 23)) & IN1[41:64]) ^ ((Replicate(IN2[23], 24)) & IN1[40:64]) ^ ((Replicate(IN2[24], 25)) & IN1[39:64]) ^ ((Replicate(IN2[25], 26)) & IN1[38:64]) ^ ((Replicate(IN2[26], 27)) & IN1[37:64]) ^ ((Replicate(IN2[27], 28)) & IN1[36:64]) ^ ((Replicate(IN2[28], 29)) & IN1[35:64]) ^ ((Replicate(IN2[29], 30)) & IN1[34:64]) ^ ((Replicate(IN2[30], 31)) & IN1[33:64]) ^ ((Replicate(IN2[31], 32)) & IN1[32:64]) ^ ((Replicate(IN2[32], 33)) & IN1[31:64]) ^ ((Replicate(IN2[33], 34)) & IN1[30:64]) ^ ((Replicate(IN2[34], 35)) & IN1[29:64]) ^ ((Replicate(IN2[35], 36)) & IN1[28:64]) ^ ((Replicate(IN2[36], 37)) & IN1[27:64]) ^ ((Replicate(IN2[37], 38)) & IN1[26:64]) ^ ((Replicate(IN2[38], 39)) & IN1[25:64]) ^ ((Replicate(IN2[39], 40)) & IN1[24:64]) ^ ((Replicate(IN2[40], 41)) & IN1[23:64]) ^ ((Replicate(IN2[41], 42)) & IN1[22:64]) ^ ((Replicate(IN2[42], 43)) & IN1[21:64]) ^ ((Replicate(IN2[43], 44)) & IN1[20:64]) ^ ((Replicate(IN2[44], 45)) & IN1[19:64]) ^ ((Replicate(IN2[45], 46)) & IN1[18:64]) ^ ((Replicate(IN2[46], 47)) & IN1[17:64]) ^ ((Replicate(IN2[47], 48)) & IN1[16:64]) ^ ((Replicate(IN2[48], 49)) & IN1[15:64]) ^ ((Replicate(IN2[49], 50)) & IN1[14:64]) ^ ((Replicate(IN2[50], 51)) & IN1[13:64]) ^ ((Replicate(IN2[51], 52)) & IN1[12:64]) ^ ((Replicate(IN2[52], 53)) & IN1[11:64]) ^ ((Replicate(IN2[53], 54)) & IN1[10:64]) ^ ((Replicate(IN2[54], 55)) & IN1[9:64]) ^ ((Replicate(IN2[55], 56)) & IN1[8:64]) ^ ((Replicate(IN2[56], 57)) & IN1[7:64]) ^ ((Replicate(IN2[57], 58)) & IN1[6:64]) ^ ((Replicate(IN2[58], 59)) & IN1[5:64]) ^ ((Replicate(IN2[59], 60)) & IN1[4:64]) ^ ((Replicate(IN2[60], 61)) & IN1[3:64]) ^ ((Replicate(IN2[61], 62)) & IN1[2:64]) ^ ((Replicate(IN2[62], 63)) & 
IN1[1:64]) ^ ((Replicate(IN2[63], 64)) & IN1[0:64])), Signal(1, reset = 0))[1:65] + + def __init__(self, width=256): + ExecUnit.__init__(self, width, ["CLMUL"]) + self.notes = ModuleDoc(title="Clmul ExecUnit Subclass", body=f""" + """) + + clmul64x_in1 = Signal(64) + clmul64x_in2 = Signal(64) + clmul64_out = Signal(64) + clmul64h_out = Signal(64) + nlane = width // 128 + clmul_buf = Signal(nlane * 128) ## width must be a multiple of 128... + lanec = Signal(log2_int(nlane, False)) + assert(nlane == 2) ## fixme + + self.sync.eng_clk += [ + clmul64_out.eq(self.clmul64(clmul64x_in1, clmul64x_in2)), + clmul64h_out.eq(self.clmul64h(clmul64x_in1, clmul64x_in2)), + ] + + self.sync.eng_clk += [ + #self.q_valid.eq(self.start), + self.instruction_out.eq(self.instruction_in), + ] + + self.submodules.seq = seq = ClockDomainsRenamer("eng_clk")(FSM(reset_state="IDLE")) + seq.act("IDLE", + If(self.start, + NextValue(lanec, 0), + Case(self.instruction.immediate[0:2], { + 0x0: [ NextValue(clmul64x_in1, self.a[ 0: 64]), NextValue(clmul64x_in2, self.b[ 0: 64]) ], + 0x1: [ NextValue(clmul64x_in1, self.a[ 0: 64]), NextValue(clmul64x_in2, self.b[ 64:128]) ], + 0x2: [ NextValue(clmul64x_in1, self.a[ 64:128]), NextValue(clmul64x_in2, self.b[ 0: 64]) ], + 0x3: [ NextValue(clmul64x_in1, self.a[ 64:128]), NextValue(clmul64x_in2, self.b[ 64:128]) ], + }), + NextState("NEXT"))) + seq.act("NEXT", + Case(self.instruction.immediate[0:2], { + 0x0: [ NextValue(clmul64x_in1, self.a[128:192]), NextValue(clmul64x_in2, self.b[128:192]) ], + 0x1: [ NextValue(clmul64x_in1, self.a[128:192]), NextValue(clmul64x_in2, self.b[192:256]) ], + 0x2: [ NextValue(clmul64x_in1, self.a[192:256]), NextValue(clmul64x_in2, self.b[128:192]) ], + 0x3: [ NextValue(clmul64x_in1, self.a[192:256]), NextValue(clmul64x_in2, self.b[192:256]) ], + }), + NextState("WRITE")) + seq.act("WRITE", + Case(lanec, { + 0: [ NextValue(clmul_buf[0:128], Cat(clmul64_out, clmul64h_out)), + NextValue(lanec, 1), + ], + 1: [ 
NextValue(clmul_buf[128:256], Cat(clmul64_out, clmul64h_out)), + NextState("OUT"), + ], + })) + seq.act("OUT", + self.q_valid.eq(1), + If(self.instruction.immediate[8:9], + self.q.eq(clmul_buf), + ).Else( + self.q.eq(Cat(clmul_buf[0:128], Signal(128, reset = 0))) + ), + NextState("IDLE")); + + +class ExecGCMShifts(ExecUnit, AutoDoc): + def __init__(self, width=256): + ExecUnit.__init__(self, width, ["GCM_SHLMI", "GCM_SHRMI", "GCM_CMPD", "GCM_SWAP64"]) + self.notes = ModuleDoc(title="GCM Shifts ExecUnit Subclass", body=f""" + """) + + assert(width == 256) # fixme + + self.sync.eng_clk += [ + self.q_valid.eq(self.start), + self.instruction_out.eq(self.instruction_in), + ] + self.comb += [ + If(self.instruction.opcode == opcodes["GCM_CMPD"][0], + self.q.eq(Cat(self.a[ 0: 64], self.a[ 64:128] ^ Cat(Signal(63, reset = 0), self.a[ 0: 1]) ^ Cat(Signal(62, reset = 0), self.a[ 0: 2]) ^ Cat(Signal(57, reset = 0), self.a[ 0: 7]), + self.a[128:192], self.a[192:256] ^ Cat(Signal(63, reset = 0), self.a[128:129]) ^ Cat(Signal(62, reset = 0), self.a[128:130]) ^ Cat(Signal(57, reset = 0), self.a[128:135])) + ) #eq + ).Elif(self.instruction.opcode == opcodes["GCM_SHRMI"][0], + Case(self.instruction.immediate[0:3], { + 0x0: self.q.eq(self.a), + 0x1: self.q.eq(Cat(self.a[1:128], self.b[0:1], self.a[129:256], self.b[128:129])), + 0x2: self.q.eq(Cat(self.a[2:128], self.b[0:2], self.a[130:256], self.b[128:130])), + 0x3: self.q.eq(Cat(self.a[3:128], self.b[0:3], self.a[131:256], self.b[128:131])), + 0x4: self.q.eq(Cat(self.a[4:128], self.b[0:4], self.a[132:256], self.b[128:132])), + 0x5: self.q.eq(Cat(self.a[5:128], self.b[0:5], self.a[133:256], self.b[128:133])), + 0x6: self.q.eq(Cat(self.a[6:128], self.b[0:6], self.a[134:256], self.b[128:134])), + 0x7: self.q.eq(Cat(self.a[7:128], self.b[0:7], self.a[135:256], self.b[128:135])), + }) + ).Elif(self.instruction.opcode == opcodes["GCM_SHLMI"][0], + Case(self.instruction.immediate[0:3], { + 0x0: self.q.eq(self.a), + 0x1: 
self.q.eq(Cat(self.b[127:128], self.a[0:127], self.b[255:256], self.a[128:255])), + 0x2: self.q.eq(Cat(self.b[126:128], self.a[0:126], self.b[254:256], self.a[128:254])), + 0x3: self.q.eq(Cat(self.b[125:128], self.a[0:125], self.b[253:256], self.a[128:253])), + 0x4: self.q.eq(Cat(self.b[124:128], self.a[0:124], self.b[252:256], self.a[128:252])), + 0x5: self.q.eq(Cat(self.b[123:128], self.a[0:123], self.b[251:256], self.a[128:251])), + 0x6: self.q.eq(Cat(self.b[122:128], self.a[0:122], self.b[250:256], self.a[128:250])), + 0x7: self.q.eq(Cat(self.b[121:128], self.a[0:121], self.b[249:256], self.a[128:249])), + }) + ).Elif(self.instruction.opcode == opcodes["GCM_SWAP64"][0], + # also gcm_brev*, gcm_swap32 + Case(self.instruction.immediate[0:2], { + # SWAP64 + 0: self.q.eq(Cat(self.b[ 64:128], self.a[ 0: 64], + self.b[192:256], self.a[128:192])), + # SWAP32 + 4: self.q.eq(Cat(self.b[ 32: 64], self.a[ 0: 32], self.b[ 96:128], self.a[ 64: 96], + self.b[160:192], self.a[128:160], self.b[224:256], self.a[192:224])), + # BREV16 + 1: self.q.eq(Cat(self.a[ 8: 16], self.a[ 0: 8], self.a[ 24: 32], self.a[ 16: 24], self.a[ 40: 48], self.a[ 32: 40], self.a[ 56: 64], self.a[ 48: 56], + self.a[ 72: 80], self.a[ 64: 72], self.a[ 88: 96], self.a[ 80: 88], self.a[104:112], self.a[ 96:104], self.a[120:128], self.a[112:120], + self.a[136:144], self.a[128:136], self.a[152:160], self.a[144:152], self.a[168:176], self.a[160:168], self.a[184:192], self.a[176:184], + self.a[200:208], self.a[192:200], self.a[216:224], self.a[208:216], self.a[232:240], self.a[224:232], self.a[248:256], self.a[240:248])), + # BREV32 + 2: self.q.eq(Cat(self.a[ 24: 32], self.a[ 16: 24], self.a[ 8: 16], self.a[ 0: 8], + self.a[ 56: 64], self.a[ 48: 56], self.a[ 40: 48], self.a[ 32: 40], + self.a[ 88: 96], self.a[ 80: 88], self.a[ 72: 80], self.a[ 64: 72], + self.a[120:128], self.a[112:120], self.a[104:112], self.a[ 96:104], + self.a[152:160], self.a[144:152], self.a[136:144], self.a[128:136], + self.a[184:192], 
self.a[176:184], self.a[168:176], self.a[160:168], + self.a[216:224], self.a[208:216], self.a[200:208], self.a[192:200], + self.a[248:256], self.a[240:248], self.a[232:240], self.a[224:232])), + # BREV64 + 3: self.q.eq(Cat(self.a[ 56: 64], self.a[ 48: 56], self.a[ 40: 48], self.a[ 32: 40], self.a[ 24: 32], self.a[ 16: 24], self.a[ 8: 16], self.a[ 0: 8], + self.a[120:128], self.a[112:120], self.a[104:112], self.a[ 96:104], self.a[ 88: 96], self.a[ 80: 88], self.a[ 72: 80], self.a[ 64: 72], + self.a[184:192], self.a[176:184], self.a[168:176], self.a[160:168], self.a[152:160], self.a[144:152], self.a[136:144], self.a[128:136], + self.a[248:256], self.a[240:248], self.a[232:240], self.a[224:232], self.a[216:224], self.a[208:216], self.a[200:208], self.a[192:200])), + }) + ) + ] + +class ExecAES(ExecUnit, AutoDoc): + def __init__(self, width=256): + ExecUnit.__init__(self, width, ["AESESMI"]) + self.notes = ModuleDoc(title="AES ExecUnit Subclass", body=f""" + """) + + assert(width == 256) # fixme + nlane = width // 128 + aes_buf = Signal(nlane * 128) ## width must be a multiple of 128... 
+ assert(nlane == 2) ## fixme + + aes_in = Array(Signal(8) for a in range(4)) + aes_out = Array(Signal(24) for a in range(4)) + for i in range(4): + self.sync.mul_clk += Case(aes_in[i], { 0x00: aes_out[i].eq(0xa563c6), 0x01: aes_out[i].eq(0x847cf8), 0x02: aes_out[i].eq(0x9977ee), 0x03: aes_out[i].eq(0x8d7bf6), 0x04: aes_out[i].eq(0x0df2ff), 0x05: aes_out[i].eq(0xbd6bd6), 0x06: aes_out[i].eq(0xb16fde), 0x07: aes_out[i].eq(0x54c591), 0x08: aes_out[i].eq(0x503060), 0x09: aes_out[i].eq(0x030102), 0x0a: aes_out[i].eq(0xa967ce), 0x0b: aes_out[i].eq(0x7d2b56), 0x0c: aes_out[i].eq(0x19fee7), 0x0d: aes_out[i].eq(0x62d7b5), 0x0e: aes_out[i].eq(0xe6ab4d), 0x0f: aes_out[i].eq(0x9a76ec), 0x10: aes_out[i].eq(0x45ca8f), 0x11: aes_out[i].eq(0x9d821f), 0x12: aes_out[i].eq(0x40c989), 0x13: aes_out[i].eq(0x877dfa), 0x14: aes_out[i].eq(0x15faef), 0x15: aes_out[i].eq(0xeb59b2), 0x16: aes_out[i].eq(0xc9478e), 0x17: aes_out[i].eq(0x0bf0fb), 0x18: aes_out[i].eq(0xecad41), 0x19: aes_out[i].eq(0x67d4b3), 0x1a: aes_out[i].eq(0xfda25f), 0x1b: aes_out[i].eq(0xeaaf45), 0x1c: aes_out[i].eq(0xbf9c23), 0x1d: aes_out[i].eq(0xf7a453), 0x1e: aes_out[i].eq(0x9672e4), 0x1f: aes_out[i].eq(0x5bc09b), 0x20: aes_out[i].eq(0xc2b775), 0x21: aes_out[i].eq(0x1cfde1), 0x22: aes_out[i].eq(0xae933d), 0x23: aes_out[i].eq(0x6a264c), 0x24: aes_out[i].eq(0x5a366c), 0x25: aes_out[i].eq(0x413f7e), 0x26: aes_out[i].eq(0x02f7f5), 0x27: aes_out[i].eq(0x4fcc83), 0x28: aes_out[i].eq(0x5c3468), 0x29: aes_out[i].eq(0xf4a551), 0x2a: aes_out[i].eq(0x34e5d1), 0x2b: aes_out[i].eq(0x08f1f9), 0x2c: aes_out[i].eq(0x9371e2), 0x2d: aes_out[i].eq(0x73d8ab), 0x2e: aes_out[i].eq(0x533162), 0x2f: aes_out[i].eq(0x3f152a), 0x30: aes_out[i].eq(0x0c0408), 0x31: aes_out[i].eq(0x52c795), 0x32: aes_out[i].eq(0x652346), 0x33: aes_out[i].eq(0x5ec39d), 0x34: aes_out[i].eq(0x281830), 0x35: aes_out[i].eq(0xa19637), 0x36: aes_out[i].eq(0x0f050a), 0x37: aes_out[i].eq(0xb59a2f), 0x38: aes_out[i].eq(0x09070e), 0x39: aes_out[i].eq(0x361224), 0x3a: 
aes_out[i].eq(0x9b801b), 0x3b: aes_out[i].eq(0x3de2df), 0x3c: aes_out[i].eq(0x26ebcd), 0x3d: aes_out[i].eq(0x69274e), 0x3e: aes_out[i].eq(0xcdb27f), 0x3f: aes_out[i].eq(0x9f75ea), 0x40: aes_out[i].eq(0x1b0912), 0x41: aes_out[i].eq(0x9e831d), 0x42: aes_out[i].eq(0x742c58), 0x43: aes_out[i].eq(0x2e1a34), 0x44: aes_out[i].eq(0x2d1b36), 0x45: aes_out[i].eq(0xb26edc), 0x46: aes_out[i].eq(0xee5ab4), 0x47: aes_out[i].eq(0xfba05b), 0x48: aes_out[i].eq(0xf652a4), 0x49: aes_out[i].eq(0x4d3b76), 0x4a: aes_out[i].eq(0x61d6b7), 0x4b: aes_out[i].eq(0xceb37d), 0x4c: aes_out[i].eq(0x7b2952), 0x4d: aes_out[i].eq(0x3ee3dd), 0x4e: aes_out[i].eq(0x712f5e), 0x4f: aes_out[i].eq(0x978413), 0x50: aes_out[i].eq(0xf553a6), 0x51: aes_out[i].eq(0x68d1b9), 0x52: aes_out[i].eq(0x000000), 0x53: aes_out[i].eq(0x2cedc1), 0x54: aes_out[i].eq(0x602040), 0x55: aes_out[i].eq(0x1ffce3), 0x56: aes_out[i].eq(0xc8b179), 0x57: aes_out[i].eq(0xed5bb6), 0x58: aes_out[i].eq(0xbe6ad4), 0x59: aes_out[i].eq(0x46cb8d), 0x5a: aes_out[i].eq(0xd9be67), 0x5b: aes_out[i].eq(0x4b3972), 0x5c: aes_out[i].eq(0xde4a94), 0x5d: aes_out[i].eq(0xd44c98), 0x5e: aes_out[i].eq(0xe858b0), 0x5f: aes_out[i].eq(0x4acf85), 0x60: aes_out[i].eq(0x6bd0bb), 0x61: aes_out[i].eq(0x2aefc5), 0x62: aes_out[i].eq(0xe5aa4f), 0x63: aes_out[i].eq(0x16fbed), 0x64: aes_out[i].eq(0xc54386), 0x65: aes_out[i].eq(0xd74d9a), 0x66: aes_out[i].eq(0x553366), 0x67: aes_out[i].eq(0x948511), 0x68: aes_out[i].eq(0xcf458a), 0x69: aes_out[i].eq(0x10f9e9), 0x6a: aes_out[i].eq(0x060204), 0x6b: aes_out[i].eq(0x817ffe), 0x6c: aes_out[i].eq(0xf050a0), 0x6d: aes_out[i].eq(0x443c78), 0x6e: aes_out[i].eq(0xba9f25), 0x6f: aes_out[i].eq(0xe3a84b), 0x70: aes_out[i].eq(0xf351a2), 0x71: aes_out[i].eq(0xfea35d), 0x72: aes_out[i].eq(0xc04080), 0x73: aes_out[i].eq(0x8a8f05), 0x74: aes_out[i].eq(0xad923f), 0x75: aes_out[i].eq(0xbc9d21), 0x76: aes_out[i].eq(0x483870), 0x77: aes_out[i].eq(0x04f5f1), 0x78: aes_out[i].eq(0xdfbc63), 0x79: aes_out[i].eq(0xc1b677), 0x7a: 
aes_out[i].eq(0x75daaf), 0x7b: aes_out[i].eq(0x632142), 0x7c: aes_out[i].eq(0x301020), 0x7d: aes_out[i].eq(0x1affe5), 0x7e: aes_out[i].eq(0x0ef3fd), 0x7f: aes_out[i].eq(0x6dd2bf), 0x80: aes_out[i].eq(0x4ccd81), 0x81: aes_out[i].eq(0x140c18), 0x82: aes_out[i].eq(0x351326), 0x83: aes_out[i].eq(0x2fecc3), 0x84: aes_out[i].eq(0xe15fbe), 0x85: aes_out[i].eq(0xa29735), 0x86: aes_out[i].eq(0xcc4488), 0x87: aes_out[i].eq(0x39172e), 0x88: aes_out[i].eq(0x57c493), 0x89: aes_out[i].eq(0xf2a755), 0x8a: aes_out[i].eq(0x827efc), 0x8b: aes_out[i].eq(0x473d7a), 0x8c: aes_out[i].eq(0xac64c8), 0x8d: aes_out[i].eq(0xe75dba), 0x8e: aes_out[i].eq(0x2b1932), 0x8f: aes_out[i].eq(0x9573e6), 0x90: aes_out[i].eq(0xa060c0), 0x91: aes_out[i].eq(0x988119), 0x92: aes_out[i].eq(0xd14f9e), 0x93: aes_out[i].eq(0x7fdca3), 0x94: aes_out[i].eq(0x662244), 0x95: aes_out[i].eq(0x7e2a54), 0x96: aes_out[i].eq(0xab903b), 0x97: aes_out[i].eq(0x83880b), 0x98: aes_out[i].eq(0xca468c), 0x99: aes_out[i].eq(0x29eec7), 0x9a: aes_out[i].eq(0xd3b86b), 0x9b: aes_out[i].eq(0x3c1428), 0x9c: aes_out[i].eq(0x79dea7), 0x9d: aes_out[i].eq(0xe25ebc), 0x9e: aes_out[i].eq(0x1d0b16), 0x9f: aes_out[i].eq(0x76dbad), 0xa0: aes_out[i].eq(0x3be0db), 0xa1: aes_out[i].eq(0x563264), 0xa2: aes_out[i].eq(0x4e3a74), 0xa3: aes_out[i].eq(0x1e0a14), 0xa4: aes_out[i].eq(0xdb4992), 0xa5: aes_out[i].eq(0x0a060c), 0xa6: aes_out[i].eq(0x6c2448), 0xa7: aes_out[i].eq(0xe45cb8), 0xa8: aes_out[i].eq(0x5dc29f), 0xa9: aes_out[i].eq(0x6ed3bd), 0xaa: aes_out[i].eq(0xefac43), 0xab: aes_out[i].eq(0xa662c4), 0xac: aes_out[i].eq(0xa89139), 0xad: aes_out[i].eq(0xa49531), 0xae: aes_out[i].eq(0x37e4d3), 0xaf: aes_out[i].eq(0x8b79f2), 0xb0: aes_out[i].eq(0x32e7d5), 0xb1: aes_out[i].eq(0x43c88b), 0xb2: aes_out[i].eq(0x59376e), 0xb3: aes_out[i].eq(0xb76dda), 0xb4: aes_out[i].eq(0x8c8d01), 0xb5: aes_out[i].eq(0x64d5b1), 0xb6: aes_out[i].eq(0xd24e9c), 0xb7: aes_out[i].eq(0xe0a949), 0xb8: aes_out[i].eq(0xb46cd8), 0xb9: aes_out[i].eq(0xfa56ac), 0xba: 
aes_out[i].eq(0x07f4f3), 0xbb: aes_out[i].eq(0x25eacf), 0xbc: aes_out[i].eq(0xaf65ca), 0xbd: aes_out[i].eq(0x8e7af4), 0xbe: aes_out[i].eq(0xe9ae47), 0xbf: aes_out[i].eq(0x180810), 0xc0: aes_out[i].eq(0xd5ba6f), 0xc1: aes_out[i].eq(0x8878f0), 0xc2: aes_out[i].eq(0x6f254a), 0xc3: aes_out[i].eq(0x722e5c), 0xc4: aes_out[i].eq(0x241c38), 0xc5: aes_out[i].eq(0xf1a657), 0xc6: aes_out[i].eq(0xc7b473), 0xc7: aes_out[i].eq(0x51c697), 0xc8: aes_out[i].eq(0x23e8cb), 0xc9: aes_out[i].eq(0x7cdda1), 0xca: aes_out[i].eq(0x9c74e8), 0xcb: aes_out[i].eq(0x211f3e), 0xcc: aes_out[i].eq(0xdd4b96), 0xcd: aes_out[i].eq(0xdcbd61), 0xce: aes_out[i].eq(0x868b0d), 0xcf: aes_out[i].eq(0x858a0f), 0xd0: aes_out[i].eq(0x9070e0), 0xd1: aes_out[i].eq(0x423e7c), 0xd2: aes_out[i].eq(0xc4b571), 0xd3: aes_out[i].eq(0xaa66cc), 0xd4: aes_out[i].eq(0xd84890), 0xd5: aes_out[i].eq(0x050306), 0xd6: aes_out[i].eq(0x01f6f7), 0xd7: aes_out[i].eq(0x120e1c), 0xd8: aes_out[i].eq(0xa361c2), 0xd9: aes_out[i].eq(0x5f356a), 0xda: aes_out[i].eq(0xf957ae), 0xdb: aes_out[i].eq(0xd0b969), 0xdc: aes_out[i].eq(0x918617), 0xdd: aes_out[i].eq(0x58c199), 0xde: aes_out[i].eq(0x271d3a), 0xdf: aes_out[i].eq(0xb99e27), 0xe0: aes_out[i].eq(0x38e1d9), 0xe1: aes_out[i].eq(0x13f8eb), 0xe2: aes_out[i].eq(0xb3982b), 0xe3: aes_out[i].eq(0x331122), 0xe4: aes_out[i].eq(0xbb69d2), 0xe5: aes_out[i].eq(0x70d9a9), 0xe6: aes_out[i].eq(0x898e07), 0xe7: aes_out[i].eq(0xa79433), 0xe8: aes_out[i].eq(0xb69b2d), 0xe9: aes_out[i].eq(0x221e3c), 0xea: aes_out[i].eq(0x928715), 0xeb: aes_out[i].eq(0x20e9c9), 0xec: aes_out[i].eq(0x49ce87), 0xed: aes_out[i].eq(0xff55aa), 0xee: aes_out[i].eq(0x782850), 0xef: aes_out[i].eq(0x7adfa5), 0xf0: aes_out[i].eq(0x8f8c03), 0xf1: aes_out[i].eq(0xf8a159), 0xf2: aes_out[i].eq(0x808909), 0xf3: aes_out[i].eq(0x170d1a), 0xf4: aes_out[i].eq(0xdabf65), 0xf5: aes_out[i].eq(0x31e6d7), 0xf6: aes_out[i].eq(0xc64284), 0xf7: aes_out[i].eq(0xb868d0), 0xf8: aes_out[i].eq(0xc34182), 0xf9: aes_out[i].eq(0xb09929), 0xfa: 
aes_out[i].eq(0x772d5a), 0xfb: aes_out[i].eq(0x110f1e), 0xfc: aes_out[i].eq(0xcbb07b), 0xfd: aes_out[i].eq(0xfc54a8), 0xfe: aes_out[i].eq(0xd6bb6d), 0xff: aes_out[i].eq(0x3a162c) } ) + + self.sync.eng_clk += [ + #self.q_valid.eq(self.start), + self.instruction_out.eq(self.instruction_in), + ] + + start_pipe = Signal() + self.sync.mul_clk += start_pipe.eq(self.start) # break critical path of instruction decode -> SETUP_A state muxes + + self.submodules.seq = seq = ClockDomainsRenamer("mul_clk")(FSM(reset_state="IDLE")) + seq.act("IDLE", + If(start_pipe, + # put the first byte in the lookup tables (LANE1) + # [ NextValue(aes_in[i], self.a[32*i:32*i+8]) for i in range(0, 4) ], + NextValue(aes_in[0], self.a[ 0: 8]), + NextValue(aes_in[1], self.a[ 32: 40]), + NextValue(aes_in[2], self.a[ 64: 72]), + NextValue(aes_in[3], self.a[ 96:104]), + NextState("LANE2_1"))) + seq.act("LANE2_1", + # put the first byte in the lookup tables (LANE2) + NextValue(aes_in[0], self.a[128:136]), + NextValue(aes_in[1], self.a[160:168]), + NextValue(aes_in[2], self.a[192:200]), + NextValue(aes_in[3], self.a[224:232]), + NextState("LANE1_2")) + seq.act("LANE1_2", + # store the xor'ed result for LANE1, byte 1 in aes_buf + Case(self.instruction.immediate[0:1], { + 0:[ NextValue(aes_buf[ 0: 32], self.b[ 0: 32] ^ Cat(aes_out[0][ 0:16], aes_out[0][ 8:24])), + NextValue(aes_buf[ 32: 64], self.b[ 32: 64] ^ Cat(aes_out[1][ 0:16], aes_out[1][ 8:24])), + NextValue(aes_buf[ 64: 96], self.b[ 64: 96] ^ Cat(aes_out[2][ 0:16], aes_out[2][ 8:24])), + NextValue(aes_buf[ 96:128], self.b[ 96:128] ^ Cat(aes_out[3][ 0:16], aes_out[3][ 8:24]))], + 1:[ NextValue(aes_buf[ 0: 32], self.b[ 0: 32] ^ Cat(aes_out[0][ 8:16], Signal(24, reset = 0))), + NextValue(aes_buf[ 32: 64], self.b[ 32: 64] ^ Cat(aes_out[1][ 8:16], Signal(24, reset = 0))), + NextValue(aes_buf[ 64: 96], self.b[ 64: 96] ^ Cat(aes_out[2][ 8:16], Signal(24, reset = 0))), + NextValue(aes_buf[ 96:128], self.b[ 96:128] ^ Cat(aes_out[3][ 8:16], Signal(24, reset 
= 0)))], + }), + # put the second byte in the lookup tables (LANE1) + NextValue(aes_in[3], self.a[ 8: 16]), + NextValue(aes_in[0], self.a[ 40: 48]), + NextValue(aes_in[1], self.a[ 72: 80]), + NextValue(aes_in[2], self.a[104:112]), + NextState("LANE2_2")) + seq.act("LANE2_2", + # store the xor'ed result for LANE2, byte 1 in aes_buf + Case(self.instruction.immediate[0:1], { + 0:[ NextValue(aes_buf[128:160], self.b[128:160] ^ Cat(aes_out[0][ 0:16], aes_out[0][ 8:24])), + NextValue(aes_buf[160:192], self.b[160:192] ^ Cat(aes_out[1][ 0:16], aes_out[1][ 8:24])), + NextValue(aes_buf[192:224], self.b[192:224] ^ Cat(aes_out[2][ 0:16], aes_out[2][ 8:24])), + NextValue(aes_buf[224:256], self.b[224:256] ^ Cat(aes_out[3][ 0:16], aes_out[3][ 8:24]))], + 1:[ NextValue(aes_buf[128:160], self.b[128:160] ^ Cat(aes_out[0][ 8:16], Signal(24, reset = 0))), + NextValue(aes_buf[160:192], self.b[160:192] ^ Cat(aes_out[1][ 8:16], Signal(24, reset = 0))), + NextValue(aes_buf[192:224], self.b[192:224] ^ Cat(aes_out[2][ 8:16], Signal(24, reset = 0))), + NextValue(aes_buf[224:256], self.b[224:256] ^ Cat(aes_out[3][ 8:16], Signal(24, reset = 0)))], + }), + # put the second byte in the lookup tables (LANE2) + NextValue(aes_in[3], self.a[136:144]), + NextValue(aes_in[0], self.a[168:176]), + NextValue(aes_in[1], self.a[200:208]), + NextValue(aes_in[2], self.a[232:240]), + NextState("LANE1_3")) + seq.act("LANE1_3", + # store the xor'ed result for LANE1, byte 2 in aes_buf + Case(self.instruction.immediate[0:1], { + 0:[ NextValue(aes_buf[ 0: 32], aes_buf[ 0: 32] ^ Cat(aes_out[0][16:24], aes_out[0][ 0:16], aes_out[0][ 8:16])), + NextValue(aes_buf[ 32: 64], aes_buf[ 32: 64] ^ Cat(aes_out[1][16:24], aes_out[1][ 0:16], aes_out[1][ 8:16])), + NextValue(aes_buf[ 64: 96], aes_buf[ 64: 96] ^ Cat(aes_out[2][16:24], aes_out[2][ 0:16], aes_out[2][ 8:16])), + NextValue(aes_buf[ 96:128], aes_buf[ 96:128] ^ Cat(aes_out[3][16:24], aes_out[3][ 0:16], aes_out[3][ 8:16]))], + 1:[ NextValue(aes_buf[ 0: 32], aes_buf[ 0: 
32] ^ Cat(Signal(8, reset = 0), aes_out[0][ 8:16], Signal(16, reset = 0))), + NextValue(aes_buf[ 32: 64], aes_buf[ 32: 64] ^ Cat(Signal(8, reset = 0), aes_out[1][ 8:16], Signal(16, reset = 0))), + NextValue(aes_buf[ 64: 96], aes_buf[ 64: 96] ^ Cat(Signal(8, reset = 0), aes_out[2][ 8:16], Signal(16, reset = 0))), + NextValue(aes_buf[ 96:128], aes_buf[ 96:128] ^ Cat(Signal(8, reset = 0), aes_out[3][ 8:16], Signal(16, reset = 0)))], + }), + # put the third byte in the lookup tables (LANE1) + NextValue(aes_in[2], self.a[ 16: 24]), + NextValue(aes_in[3], self.a[ 48: 56]), + NextValue(aes_in[0], self.a[ 80: 88]), + NextValue(aes_in[1], self.a[112:120]), + NextState("LANE2_3")) + seq.act("LANE2_3", + # store the xor'ed result for LANE2, byte 2 in aes_buf + Case(self.instruction.immediate[0:1], { + 0:[ NextValue(aes_buf[128:160], aes_buf[128:160] ^ Cat(aes_out[0][16:24], aes_out[0][ 0:16], aes_out[0][ 8:16])), + NextValue(aes_buf[160:192], aes_buf[160:192] ^ Cat(aes_out[1][16:24], aes_out[1][ 0:16], aes_out[1][ 8:16])), + NextValue(aes_buf[192:224], aes_buf[192:224] ^ Cat(aes_out[2][16:24], aes_out[2][ 0:16], aes_out[2][ 8:16])), + NextValue(aes_buf[224:256], aes_buf[224:256] ^ Cat(aes_out[3][16:24], aes_out[3][ 0:16], aes_out[3][ 8:16]))], + 1:[ NextValue(aes_buf[128:160], aes_buf[128:160] ^ Cat(Signal(8, reset = 0), aes_out[0][ 8:16], Signal(16, reset = 0))), + NextValue(aes_buf[160:192], aes_buf[160:192] ^ Cat(Signal(8, reset = 0), aes_out[1][ 8:16], Signal(16, reset = 0))), + NextValue(aes_buf[192:224], aes_buf[192:224] ^ Cat(Signal(8, reset = 0), aes_out[2][ 8:16], Signal(16, reset = 0))), + NextValue(aes_buf[224:256], aes_buf[224:256] ^ Cat(Signal(8, reset = 0), aes_out[3][ 8:16], Signal(16, reset = 0)))], + }), + # put the third byte in the lookup tables (LANE2) + NextValue(aes_in[2], self.a[144:152]), + NextValue(aes_in[3], self.a[176:184]), + NextValue(aes_in[0], self.a[208:216]), + NextValue(aes_in[1], self.a[240:248]), + NextState("LANE1_4")) + 
seq.act("LANE1_4", + # store the xor'ed result for LANE1, byte 3 in aes_buf + Case(self.instruction.immediate[0:1], { + 0:[ NextValue(aes_buf[ 0: 32], aes_buf[ 0: 32] ^ Cat(aes_out[0][ 8:24], aes_out[0][ 0:16])), + NextValue(aes_buf[ 32: 64], aes_buf[ 32: 64] ^ Cat(aes_out[1][ 8:24], aes_out[1][ 0:16])), + NextValue(aes_buf[ 64: 96], aes_buf[ 64: 96] ^ Cat(aes_out[2][ 8:24], aes_out[2][ 0:16])), + NextValue(aes_buf[ 96:128], aes_buf[ 96:128] ^ Cat(aes_out[3][ 8:24], aes_out[3][ 0:16]))], + 1:[ NextValue(aes_buf[ 0: 32], aes_buf[ 0: 32] ^ Cat(Signal(16, reset = 0), aes_out[0][ 8:16], Signal(8, reset = 0))), + NextValue(aes_buf[ 32: 64], aes_buf[ 32: 64] ^ Cat(Signal(16, reset = 0), aes_out[1][ 8:16], Signal(8, reset = 0))), + NextValue(aes_buf[ 64: 96], aes_buf[ 64: 96] ^ Cat(Signal(16, reset = 0), aes_out[2][ 8:16], Signal(8, reset = 0))), + NextValue(aes_buf[ 96:128], aes_buf[ 96:128] ^ Cat(Signal(16, reset = 0), aes_out[3][ 8:16], Signal(8, reset = 0)))], + }), + # put the fourth byte in the lookup tables (LANE1) + NextValue(aes_in[1], self.a[ 24: 32]), + NextValue(aes_in[2], self.a[ 56: 64]), + NextValue(aes_in[3], self.a[ 88: 96]), + NextValue(aes_in[0], self.a[120:128]), + NextState("LANE2_4")) + seq.act("LANE2_4", + # store the xor'ed result for LANE2, byte 3 in aes_buf + Case(self.instruction.immediate[0:1], { + 0:[ NextValue(aes_buf[128:160], aes_buf[128:160] ^ Cat(aes_out[0][ 8:24], aes_out[0][ 0:16])), + NextValue(aes_buf[160:192], aes_buf[160:192] ^ Cat(aes_out[1][ 8:24], aes_out[1][ 0:16])), + NextValue(aes_buf[192:224], aes_buf[192:224] ^ Cat(aes_out[2][ 8:24], aes_out[2][ 0:16])), + NextValue(aes_buf[224:256], aes_buf[224:256] ^ Cat(aes_out[3][ 8:24], aes_out[3][ 0:16]))], + 1:[ NextValue(aes_buf[128:160], aes_buf[128:160] ^ Cat(Signal(16, reset = 0), aes_out[0][ 8:16], Signal(8, reset = 0))), + NextValue(aes_buf[160:192], aes_buf[160:192] ^ Cat(Signal(16, reset = 0), aes_out[1][ 8:16], Signal(8, reset = 0))), + NextValue(aes_buf[192:224], 
aes_buf[192:224] ^ Cat(Signal(16, reset = 0), aes_out[2][ 8:16], Signal(8, reset = 0))), + NextValue(aes_buf[224:256], aes_buf[224:256] ^ Cat(Signal(16, reset = 0), aes_out[3][ 8:16], Signal(8, reset = 0)))], + }), + # put the fourth byte in the lookup tables (LANE2) + NextValue(aes_in[1], self.a[152:160]), + NextValue(aes_in[2], self.a[184:192]), + NextValue(aes_in[3], self.a[216:224]), + NextValue(aes_in[0], self.a[248:256]), + NextState("LANE1_F")) + seq.act("LANE1_F", + # store the xor'ed result for LANE1, byte 4 in aes_buf + Case(self.instruction.immediate[0:1], { + 0:[ NextValue(aes_buf[ 0: 32], aes_buf[ 0: 32] ^ Cat(aes_out[0][ 8:16], aes_out[0][ 8:24], aes_out[0][ 0: 8])), + NextValue(aes_buf[ 32: 64], aes_buf[ 32: 64] ^ Cat(aes_out[1][ 8:16], aes_out[1][ 8:24], aes_out[1][ 0: 8])), + NextValue(aes_buf[ 64: 96], aes_buf[ 64: 96] ^ Cat(aes_out[2][ 8:16], aes_out[2][ 8:24], aes_out[2][ 0: 8])), + NextValue(aes_buf[ 96:128], aes_buf[ 96:128] ^ Cat(aes_out[3][ 8:16], aes_out[3][ 8:24], aes_out[3][ 0: 8]))], + 1:[ NextValue(aes_buf[ 0: 32], aes_buf[ 0: 32] ^ Cat(Signal(24, reset = 0), aes_out[0][ 8:16])), + NextValue(aes_buf[ 32: 64], aes_buf[ 32: 64] ^ Cat(Signal(24, reset = 0), aes_out[1][ 8:16])), + NextValue(aes_buf[ 64: 96], aes_buf[ 64: 96] ^ Cat(Signal(24, reset = 0), aes_out[2][ 8:16])), + NextValue(aes_buf[ 96:128], aes_buf[ 96:128] ^ Cat(Signal(24, reset = 0), aes_out[3][ 8:16]))], + }), + NextState("LANE2_F")) + seq.act("LANE2_F", + # store the xor'ed result for LANE2, byte 4 in aes_buf + Case(self.instruction.immediate[0:1], { + 0:[ NextValue(aes_buf[128:160], aes_buf[128:160] ^ Cat(aes_out[0][ 8:16], aes_out[0][ 8:24], aes_out[0][ 0: 8])), + NextValue(aes_buf[160:192], aes_buf[160:192] ^ Cat(aes_out[1][ 8:16], aes_out[1][ 8:24], aes_out[1][ 0: 8])), + NextValue(aes_buf[192:224], aes_buf[192:224] ^ Cat(aes_out[2][ 8:16], aes_out[2][ 8:24], aes_out[2][ 0: 8])), + NextValue(aes_buf[224:256], aes_buf[224:256] ^ Cat(aes_out[3][ 8:16], aes_out[3][ 8:24], 
aes_out[3][ 0: 8]))], + 1:[ NextValue(aes_buf[128:160], aes_buf[128:160] ^ Cat(Signal(24, reset = 0), aes_out[0][ 8:16])), + NextValue(aes_buf[160:192], aes_buf[160:192] ^ Cat(Signal(24, reset = 0), aes_out[1][ 8:16])), + NextValue(aes_buf[192:224], aes_buf[192:224] ^ Cat(Signal(24, reset = 0), aes_out[2][ 8:16])), + NextValue(aes_buf[224:256], aes_buf[224:256] ^ Cat(Signal(24, reset = 0), aes_out[3][ 8:16]))], + }), + NextState("AES_EVEN1")) + seq.act("AES_EVEN1", + NextState("AES_EVEN2")) + seq.act("AES_EVEN2", + NextState("IDLE")) + + self.sync.mul_clk += [ + If(seq.ongoing("AES_EVEN1") | seq.ongoing("AES_EVEN2"), + self.q_valid.eq(1), + If(self.instruction.immediate[8:9], + self.q.eq(aes_buf), + ).Else( + self.q.eq(Cat(aes_buf[0:128], Signal(128, reset = 0))), + ) + ).Else( + self.q_valid.eq(0), + ) + ] + +class ExecLS(ExecUnit, AutoDoc): + def __init__(self, width=256, interface=None): + ExecUnit.__init__(self, width, ["MEM"]) + + self.notes = ModuleDoc(title=f"Load/Store ExecUnit Subclass", body=f""" + """) + + self.sync.eng_clk += [ # pipeline the instruction + self.instruction_out.eq(self.instruction_in), + ] + + assert(width == 256) # fixme + assert(len(interface.sel) == 16) # 128 bits Wishbone + + start_pipe = Signal() + self.sync.mul_clk += start_pipe.eq(self.start) # break critical path of instruction decode -> SETUP_A state muxes + self.submodules.lsseq = lsseq = ClockDomainsRenamer("mul_clk")(FSM(reset_state="IDLE")) + cpar = Signal() # to keep track of the odd-ness of our cycle, so we can align 2 mul_clk cycles of output on 1 eng_clk cycle + lbuf = Signal(width) + timeout = Signal(11) + #tries = Signal() + self.has_failure = Signal(2) + self.has_timeout = Signal(2) + + self.sync.mul_clk += If(timeout > 0, timeout.eq(timeout - 1)) + + lsseq.act("IDLE", + If(start_pipe, + #NextValue(lbuf, 0xF00FF00F_0FF00FF0_F00FF00F_0FF00FF0_F00FF00F_0FF00FF0_F00FF00F_0FF00FF0), + NextValue(cpar, 0), + NextValue(self.has_timeout, 0), + NextValue(self.has_failure, 0), 
+ NextValue(interface.cyc, 1), + NextValue(interface.stb, 1), + NextValue(interface.sel, 2**len(interface.sel)-1), + NextValue(interface.adr, self.a[4:32]), + NextValue(interface.we, self.instruction.immediate[0]), + NextValue(timeout, 2047), + If(self.instruction.immediate[0], # do we need those tests or could we always update dat_w/dat_r ? + NextValue(interface.dat_w, self.b[0:128])), + NextState("MEMl") # MEMl + ) + ) + lsseq.act("MEMl", + NextValue(cpar, cpar ^ 1), + If(interface.ack, + If(~self.instruction.immediate[0], + NextValue(lbuf[0:128], interface.dat_r)), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("MEMl2") + ).Elif(interface.err, + NextValue(self.has_failure[0], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + ).Elif(timeout == 0, + NextValue(self.has_timeout[0], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + )) + lsseq.act("MEMl2", + NextValue(cpar, cpar ^ 1), + If(~interface.ack, + If(self.instruction.immediate[8:9], + NextValue(interface.cyc, 1), + NextValue(interface.stb, 1), + NextValue(interface.sel, 2**len(interface.sel)-1), + NextValue(interface.adr, self.a[132:160]), + NextValue(interface.we, self.instruction.immediate[0]), + NextValue(timeout, 2047), + If(self.instruction.immediate[0], + NextValue(interface.dat_w, self.b[128:256])), + NextState("MEMh") + ).Else( + NextValue(lbuf[128:256], 0), + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + ) + )) + lsseq.act("MEMh", + NextValue(cpar, cpar ^ 1), + If(interface.ack, + If(~self.instruction.immediate[0], + NextValue(lbuf[128:256], interface.dat_r)), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("MEMh2") + ).Elif(interface.err, + NextValue(self.has_failure[1], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + ).Elif(timeout == 0, + NextValue(self.has_timeout[1], 1), + 
NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + )) + lsseq.act("MEMh2", + NextValue(cpar, cpar ^ 1), + If(~interface.ack, + #NextValue(tries, 0), + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + )) + lsseq.act("MEM_ODD", # clock alignement cycle + NextState("MEM_EVEN1")) + lsseq.act("MEM_EVEN1", + NextState("MEM_EVEN2")) + lsseq.act("MEM_EVEN2", + NextValue(cpar, 0), + NextValue(self.has_failure, 0), + NextValue(self.has_timeout, 0), + NextState("IDLE")) + lsseq.act("ERR", + #If(~tries, # second attempt + # NextValue(cpar, 0), + # NextValue(tries, 1), + # NextState("IDLE") + #).Else(NextValue(tries, 0), # no third attempt, give up + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + #) + ) + self.sync.mul_clk += [ + If(lsseq.ongoing("MEM_EVEN1") | lsseq.ongoing("MEM_EVEN2"), + self.q_valid.eq(1), + If(~self.instruction.immediate[0], + self.q.eq(lbuf), + ).Else( + # self.q.eq(Cat((self.a[0:32] + 16)[0:32], self.a[32:128], + # (self.a[128:160] + 16)[0:32], self.a[160:256])), + self.q.eq(self.a), + ), + ).Else( + self.q_valid.eq(0), + ) + ] + + self.state = Signal(32) + self.sync.mul_clk += self.state[0].eq(lsseq.ongoing("IDLE")) + self.sync.mul_clk += self.state[1].eq(lsseq.ongoing("MEMl")) + self.sync.mul_clk += self.state[2].eq(lsseq.ongoing("MEMl2")) + self.sync.mul_clk += self.state[3].eq(lsseq.ongoing("MEMh")) + self.sync.mul_clk += self.state[4].eq(lsseq.ongoing("MEMh2")) + self.sync.mul_clk += self.state[5].eq(lsseq.ongoing("MEM_ODD")) + self.sync.mul_clk += self.state[6].eq(lsseq.ongoing("MEM_EVEN1")) + self.sync.mul_clk += self.state[7].eq(lsseq.ongoing("MEM_EVEN2")) + self.sync.mul_clk += self.state[8].eq(lsseq.ongoing("MEM_ERR")) + self.sync.mul_clk += self.state[28:30].eq((self.state[28:30] & Replicate(~start_pipe, 2)) | self.has_timeout) + self.sync.mul_clk += self.state[30:32].eq((self.state[30:32] & Replicate(~start_pipe, 2)) | self.has_failure) + + 
+class Engine(Module, AutoCSR, AutoDoc): + def __init__(self, platform, prefix, sim=False, build_prefix=""): + opdoc = "\n" + for mnemonic, description in opcodes.items(): + opdoc += f" * **{mnemonic}** ({str(description[0])}) -- {description[1]} \n" + + self.intro = ModuleDoc(title="Curve25519 Engine", body=""" +The Curve25519 engine is a microcoded hardware accelerator for Curve25519 operations. +The Engine loosely resembles a Harvard architecture microcoded CPU, with a single +512-entry, 256-bit wide 2R1W windowed-register file, a handful of execution units, and a "mailbox" +unit (like a load/store, but transactional to wishbone). The Engine's microcode is +contained in a 1k-entry, 32-bit wide microcode block. Microcode procedures are written to +the block, and execution will start from the `mpstart` offset when the `go` bit is set. +Execution will stop after either one of two conditions are met: either a `FIN` instruction +is executed, or the microcode program counter (mpc) goes past the stop threshold, computed +as `mpstart` + `mplen`. + +The register file is "windowed". A single window consists of 32x256-bit wide registers, +and there are up to 16 windows. The concept behind windows is that core routines, such +as point doubling and point addition, are codable using no more than 32 intermediate +registers. The same microcode can be used, then, to serve point operations to up to +16 different clients, selectable by setting the appropriate window. Note that the register +file will stripe across four 4kiB pages, which means that memory protection can be +enforced at page-level boundaries by hardware (with the help of the OS) for up to four +separate clients, each getting four register windows. + +Every register read can be overridden from a constant ROM, by asserting `ca` or `cb` for +registers a and b respectively. 
When either of these bits is asserted, the respective +register address is fed into a "constants" lookup table, and the result of that table lookup is +substituted for the register value. This means up to 32 commonly used constants may be stored +in the hardware for quick retrieval. + +.. image:: https://raw.githubusercontent.com/betrusted-io/gateware/master/gateware/curve25519/block_diagram.png + :alt: High-level block diagram of the Curve25519 engine + +Above is a high-level block diagram of the Curve25519 engine. Four clocks are present +in this microarchitecture, and they are phase-aligned thanks to the 7-Series MMCM +and low-skew global clock network. `eng_clk` is 50MHz, `mul_clk` is 100MHz, and +`rf_clk` is 200MHz. The slowest 50MHz `eng_clk` clock controls the `seq` state machine, whose +state names are listed on the left. A 50MHz base clock is chosen because this allows a +single-cycle 256-bit add/sub using hardware carry chains in the Spartan7 -1L speed grade, +greatly simplifying most of the arithmetic blocks. Faster clocks are used to pump the microcode +RAM (100MHz) and register file (200MHz), so that we are wasting less time fetching instructions +and operands. In particular, the register file uses four phases because we are emulating +a three-port register file (2R1W) using a single-port memory primitive, and the microcode RAM +runs at 100MHz (sysclk) for convenience of reading/writing instructions from the Wishbone bus. +Not shown in the diagram are the global "window" register bits, or the multiplexers that +switch off the datapaths when the system is not running, allowing Wishbone full access to +the machine state. + +Execution units are subclasses of "ExecUnit", and their instantiation is controlled by +inclusion in the `exec_units` dictionary. Likewise, opcodes are defined in the `opcodes` +dictionary, and opcodes are bound to ExecUnits by passing them as the `opcode_list` argument +to the execution units. 
+ +Note that execution units can take an arbitrary amount of time to complete. Most will complete +in one cycle, but for example, the multiplier takes 52 cycles @ 100MHz, or 26 `eng_clk` cycles. +The current implementation does not allow pipelined operation; registered stages are provided +to break combinational paths and bring up the base clock rate, but every instruction must go through +the entire FETCH-EXEC-WAIT_DONE cycle before the next one can issue. + +The design is partially outfitted with registers to facilitate pipelining in the future, but +the current simplified implementation is expected to provide adequate speedup. It's +probably not worth the additional resources to do e.g. pipeline bypassing and hazard checking, +as the target FPGA design is nearly at capacity. + +A conservative implementation (no optimization of intermediate values, immediate reduction of +every add/sub operation) of Montgomery scalar multiplication using Engine25519 +completes one scalar multiply operation in 2.270ms, compared to 103ms in software. +This does not include the time required to do the final affine inversion (done in software, +with significant overhead -- about 100ms), or the time to load the microcode and operands (about 5us). +The affine inversion can also be microcoded, it just hasn't been done yet. 
+ +The Engine address space is divided up as follows (expressed as offset from base):: + + 0x0_0000 - 0x0_0fff: microcode (one 4k byte page) + 0x1_0000 - 0x1_3fff: memory-mapped register file (4 x 4k pages = 16kbytes) + +Here are the currently implemented opcodes for The Engine: +{} + """.format(opdoc)) + + microcode_width = 32 + microcode_depth = 1024 + running = Signal() # asserted when microcode is running + + instruction = Record(instruction_layout) # current instruction to execute + illegal_opcode = Signal() + abort = Signal(); + + ### register file + rf_depth_raw = 512 + rf_width_raw = 256 + self.submodules.rf = rf = RegisterFile(depth=rf_depth_raw, width=rf_width_raw) + self.window = CSRStorage(fields=[ + CSRField("window", size=log2_int(rf_depth_raw) - log2_int(num_registers), description="Selects the current register window to use"), + ]) + + self.mpstart = CSRStorage(fields=[ + CSRField("mpstart", size=log2_int(microcode_depth), description="Where to start execution") + ]) + self.mplen = CSRStorage(fields=[ + CSRField("mplen", size=log2_int(microcode_depth), description="Length of the current microcode program. Thus valid code must be in the range of [mpstart, mpstart + mplen]"), + ]) + self.control = CSRStorage(fields=[ + CSRField("go", size=1, pulse=True, description="Writing to this puts the engine in `run` mode, and it will execute mplen microcode instructions starting at mpstart"), + ]) + self.mpresume = CSRStatus(fields=[ + CSRField("mpresume", size=log2_int(microcode_depth), description="Where to resume execution after a pause") + ]) + + self.power = CSRStorage(fields=[ + CSRField("on", size=1, reset=0, + description="Writing `1` turns on the clocks to this block, `0` stops the clocks (for power savings). 
The handling of the clock gate is in a different module, this is just a flag to that block."), + CSRField("pause_req", size=1, description="Writing a `1` to this block will pause execution at the next micro-op, and allow for read-out of data from RF/microcode. Must check pause_gnt to confirm the pause has happened. Used to interrupt flow for suspend/resume."), + ]) + # bring pause into the eng_clk domain + pause_req = Signal() + self.sync.eng_clk += pause_req.eq(self.power.fields.pause_req) + # re-sync the eng_clk phase to the RF phase whenever clocks are re-applied. We don't guarantee that the clocks start exactly + # at the same time, so you can get phase shift... + power_on_delay = Signal(max=16, reset=15) + eng_powered_on = Signal() + self.sync += [ # stretch out any power on pulse so we can process a reset in the clk50 domain after its enable has been switched on + If(~self.power.fields.on, + power_on_delay.eq(15) + ).Elif(power_on_delay > 0, + power_on_delay.eq(power_on_delay - 1) + ).Else( + power_on_delay.eq(0) + ), + eng_powered_on.eq(power_on_delay == 0), # make a signal that specifies that the engine is powered on that happens 16 cycles after the clocks are turned on + # note that this signal drops only *after* the power has been toggled, because when the clock is cut, + # the downstream "eng_clk" domain signals won't capture the latest state. So, once the power comes on, + # eng_powered_on must drop for a few cycles, then come back up again, which properly triggers a synchronization of the RF. 
+ ] + eng_on_50 = Signal() + eng_on_50_r = Signal() + self.specials += MultiReg(eng_powered_on, eng_on_50, "eng_clk") + self.sync.eng_clk += eng_on_50_r.eq(eng_on_50) + rf_reset_clear = Signal() + self.specials += MultiReg(ResetSignal("eng_clk"), rf_reset_clear, "eng_clk") # sync up the register file's fast clock to our slow clock + self.comb += rf.clear.eq(rf_reset_clear | (eng_on_50 & ~eng_on_50_r)) + + self.status = CSRStatus(fields=[ + CSRField("running", size=1, description="When set, the microcode engine is running. All wishbone access to RF and microcode memory areas will stall until this bit is clear"), + CSRField("mpc", size=log2_int(microcode_depth), description="Current location of the microcode program counter. Mostly for debug."), + CSRField("pause_gnt", size=1, description="When set, the engine execution has been paused, and the RF & microcode ROM can be read out for suspend/resume"), + CSRField("sigill", size=1, description="Illegal Instruction"), + CSRField("abort", size=1, description="Abort from failure"), + CSRField("finished", size=1, description="Finished"), + ]) + pause_gnt = Signal() + mpc = Signal(log2_int(microcode_depth)) # the microcode program counter + running_r = Signal() + self.sync += [ + self.status.fields.running.eq(running), + self.status.fields.pause_gnt.eq(pause_gnt), + self.status.fields.mpc.eq(mpc), + self.status.fields.sigill.eq(illegal_opcode), + self.status.fields.abort.eq(abort), + self.status.fields.finished.eq(((~running & running_r) | self.status.fields.finished) & (~(running & ~running_r))), + ] + + self.submodules.ev = EventManager() + self.ev.finished = EventSourcePulse(description="Microcode run finished execution") + self.ev.illegal_opcode = EventSourcePulse(description="Illegal opcode encountered") + self.ev.finalize() + ill_op_r = Signal() + self.sync += [ + running_r.eq(running), + ill_op_r.eq(illegal_opcode), + ] + self.comb += [ + self.ev.finished.trigger.eq(~running & running_r), # falling edge pulse on 
running + self.ev.illegal_opcode.trigger.eq(~ill_op_r & illegal_opcode), + ] + + ### microcode memory - 1rd/1wr dedicated to wishbone, 1rd for execution + microcode = Memory(microcode_width, microcode_depth) + self.specials += microcode + micro_wrport = microcode.get_port(write_capable=True, mode=READ_FIRST) # READ_FIRST allows BRAM inference + self.specials += micro_wrport + micro_rdport = microcode.get_port(mode=READ_FIRST) + self.specials += micro_rdport + micro_runport = microcode.get_port(mode=READ_FIRST) # , clock_domain="eng_clk" + self.specials += micro_runport + + self.comb += [ + micro_runport.adr.eq(mpc), + instruction.raw_bits().eq(micro_runport.dat_r), # mapping should follow the record definition *exactly* + instruction.eq(micro_runport.dat_r), + ] + instruction_fields = [] + for opcode, bits, description in instruction_layout: + instruction_fields.append(CSRField(opcode, size=bits, description=description)) + self.instruction = CSRStatus(description="Current instruction being executed by the engine. 
The format of this register exactly reflects the binary layout of an Engine instruction.", fields=instruction_fields) + self.comb += [ + self.instruction.status.eq(micro_runport.dat_r) + ] + + self.ls_status = CSRStatus(32, description="Status of the L/S unit") + + ### wishbone bus interface: decode the two address spaces and dispatch accordingly + self.bus = bus = wishbone.Interface() + wdata = Signal(32) + wadr = Signal(log2_int(rf_depth_raw) + 3) # wishbone bus is 32-bits wide, so 3 extra bits to select the sub-words out of the 256-bit registers + wmask = Signal(4) + wdata_we = Signal() + rdata_re = Signal() + rdata_ack = Signal() + rdata_req = Signal() + radr = Signal(log2_int(rf_depth_raw) + 3) + + micro_rd_waitstates = 2 + micro_rdack = Signal(max=(micro_rd_waitstates+1)) + self.sync += [ + If( ((bus.adr & ((0xFFFF_C000) >> 2)) >= ((prefix | 0x1_0000) >> 2)) & (((bus.adr & ((0xFFFF_C000) >> 2)) < ((prefix | 0x1_4000) >> 2))), + # fully decode register file address to avoid aliasing + If(bus.cyc & bus.stb & bus.we & ~bus.ack, + If(~running | pause_gnt, + wdata.eq(bus.dat_w), + wadr.eq(bus.adr[:wadr.nbits]), + wmask.eq(bus.sel), + wdata_we.eq(1), + If(rf.phase, + bus.ack.eq(1), + ).Else( + bus.ack.eq(0), + ), + ).Else( + wdata_we.eq(0), + bus.ack.eq(0), + ) + ).Elif(bus.cyc & bus.stb & ~bus.we & ~bus.ack, + If(~running | pause_gnt, + radr.eq(bus.adr[:radr.nbits]), + rdata_re.eq(1), + bus.dat_r.eq( rf.ra_dat >> ((radr & 0x7) * 32) ), + bus.ack.eq(rdata_ack), + rdata_req.eq(1), + ).Else( + rdata_re.eq(0), + bus.ack.eq(0), + rdata_req.eq(0), + ) + ).Else( + wdata_we.eq(0), + bus.ack.eq(0), + rdata_req.eq(0), + rdata_re.eq(0), + ) + ).Elif( (bus.adr & ((0xFFFF_F000) >> 2)) == ((0x0 | prefix) >> 2), + # fully decode microcode address to avoid aliasing + If(bus.cyc & bus.stb & bus.we & ~bus.ack, + micro_wrport.adr.eq(bus.adr), + micro_wrport.dat_w.eq(bus.dat_w), + micro_wrport.we.eq(1), + bus.ack.eq(1), + ).Elif(bus.cyc & bus.stb & ~bus.we & ~bus.ack, + 
micro_wrport.we.eq(0), + micro_rdport.adr.eq(bus.adr), + bus.dat_r.eq(micro_rdport.dat_r), + + If(micro_rdack == 0, # 1 cycle delay for read to occur + bus.ack.eq(1), + ).Else( + bus.ack.eq(0), + micro_rdack.eq(micro_rdack - 1), + ) + ).Else( + micro_wrport.we.eq(0), + micro_rdack.eq(micro_rd_waitstates), + bus.ack.eq(0), + ) + ).Else( + # handle all mis-target reads not explicitly decoded + If(bus.cyc & bus.stb & ~bus.we & ~bus.ack, + bus.dat_r.eq(0xC0DE_BADD), + bus.ack.eq(1), + ).Elif(bus.cyc & bus.stb & bus.we & ~bus.ack, + bus.ack.eq(1), # ignore writes -- but don't hang the bus + ).Else( + bus.ack.eq(0), + ) + + ) + ] + + ### execution path signals to register file + ra_dat = Signal(rf_width_raw) + ra_adr = Signal(log2_int(num_registers)) + ra_const = Signal() + rb_dat = Signal(rf_width_raw) + rb_adr = Signal(log2_int(num_registers)) + rb_const = Signal() + wd_dat = Signal(rf_width_raw) + wd_adr = Signal(log2_int(num_registers)) + rf_write = Signal() + + self.submodules.ra_const_rom = Curve25519Const(insert_docs=True) + self.submodules.rb_const_rom = Curve25519Const() + + ### merge execution path signals with host access paths + self.comb += [ + ra_const.eq(instruction.ca), + rb_const.eq(instruction.cb), + ra_adr.eq(instruction.ra), + rb_adr.eq(instruction.rb), + self.ra_const_rom.adr.eq(ra_adr), + self.rb_const_rom.adr.eq(rb_adr), + rf.window.eq(self.window.fields.window), + + If(running & ~pause_gnt, + rf.ra_adr.eq(Cat(ra_adr, self.window.fields.window)), + rf.rb_adr.eq(Cat(rb_adr, self.window.fields.window)), + rf.instruction_pipe_in.eq(instruction.raw_bits()), + rf.wd_adr.eq(Cat(wd_adr, self.window.fields.window)), + rf.wd_dat.eq(wd_dat), + rf.wd_bwe.eq(0xFFFF_FFFF), # enable all bytes + rf.we.eq(rf_write), + ).Else( + rf.ra_adr.eq(radr >> 3), + rf.wd_adr.eq(wadr >> 3), + rf.wd_dat.eq(Cat(wdata,wdata,wdata,wdata,wdata,wdata,wdata,wdata)), # replicate; use byte-enable to multiplex + rf.wd_bwe.eq(0xF << ((wadr & 0x7) * 4)), # select the byte + 
rf.we.eq(wdata_we), + ), + If(~ra_const, + ra_dat.eq(rf.ra_dat), + ).Else( + ra_dat.eq(self.ra_const_rom.const) + ), + If(~rb_const, + rb_dat.eq(rf.rb_dat), + ).Else( + rb_dat.eq(self.rb_const_rom.const) + ) + ] + # simple machine to wait 2 RF clock cycles for data to propagate out of the register file and back to the host + rd_wait_states=4 + bus_rd_wait = Signal(max=(rd_wait_states+1)) + self.sync.rf_clk += [ + If(rdata_req, + If(~running | pause_gnt, + If(bus_rd_wait != 0, + bus_rd_wait.eq(bus_rd_wait-1), + ).Else( + rdata_ack.eq(1), + ) + ) + ).Else( + rdata_ack.eq(0), + bus_rd_wait.eq(rd_wait_states), + ) + ] + + sext_immediate = Signal(log2_int(microcode_depth)) + self.comb += sext_immediate.eq(Cat(instruction.immediate, instruction.immediate[8])) # migen signed math failed us. so manually sign extend. this breaks the configurability of the code. + + ### Microcode sequencer. Very simple: it can only run linear sections of microcode. Feature not bug; + ### constant time operation is a defense against timing attacks. + + # pulse-stretch the go from sys->eng_clk. Don't use Migen CDC primitives, as they add latency; a BlindTransfer + # primitive on its own will take about as much time as a couple instructions on The Engine. 
+ engine_go = Signal() + go_stretch = Signal(2) + self.sync += [ # note that we will miss this if the system throttles our clocks when this pulse arrives + If(self.control.fields.go, + go_stretch.eq(2) + ).Else( + If(go_stretch != 0, + go_stretch.eq(go_stretch - 1), + ) + ) + ] + self.comb += engine_go.eq(self.control.fields.go | (go_stretch != 0)) + + self.submodules.seq = seq = ClockDomainsRenamer("eng_clk")(FSM(reset_state="IDLE")) + mpc_stop = Signal(log2_int(microcode_depth)) + window_latch = Signal(self.window.fields.window.size) + exec = Signal() # indicates to execution units to start running + done = Signal() # indicates when the given execution units are done (as-muxed from subunits) + self.comb += rf.running.eq(~seq.ongoing("IDLE") | rdata_re), # let the RF know when we're not executing, so it can idle to save power + seq.act("IDLE", + NextValue(pause_gnt, 0), + If(engine_go, + If(pause_req, + NextValue(mpc, self.mpresume.fields.mpresume) + ).Else( + NextValue(mpc, self.mpstart.fields.mpstart) + ), + NextValue(mpc_stop, self.mpstart.fields.mpstart + self.mplen.fields.mplen - 1), + NextValue(window_latch, self.window.fields.window), + NextValue(running, 1), + NextState("FETCH"), + ).Else( + NextValue(running, 0), + ) + ) + seq.act("FETCH", + If(pause_req, + NextState("PAUSED"), + NextValue(pause_gnt, 1), + ).Else( + # one cycle latency for instruction fetch + NextState("EXEC"), + NextValue(pause_gnt, 0), + ) + ) + seq.act("EXEC", # not a great name. This is actually where the register file fetches its contents. + If(instruction.opcode == opcodes["BRZ"][0], + NextState("DO_BRZ"), + ).Elif(instruction.opcode == opcodes["FIN"][0], + NextState("IDLE"), + NextValue(running, 0), + ).Elif(instruction.opcode < opcodes["MAX"][0], # check if the opcode is legal before running it + exec.eq(1), + NextState("WAIT_DONE"), + ).Else( + NextState("ILLEGAL_OPCODE"), + ) + ) + seq.act("WAIT_DONE", # this is where the actual instruction execution happens. 
+ If(done, # TODO: for now, we just wait for each instruction to finish; but the foundations are around for pipelining... + If(mpc < mpc_stop, + NextState("FETCH"), + NextValue(mpc, mpc + 1), + ).Else( + NextState("IDLE"), + NextValue(running, 0), + ) + ) + ) + seq.act("ILLEGAL_OPCODE", + NextState("IDLE"), + NextValue(running, 0), + illegal_opcode.eq(1), + ) + seq.act("DO_BRZ", + If(ra_dat == 0, + If( (sext_immediate + mpc + 1 < mpc_stop) & (sext_immediate + mpc + 1 >= self.mpstart.fields.mpstart), # validate new PC is in range + NextState("FETCH"), + NextValue(mpc, sext_immediate + mpc + 1), + ).Else( + NextState("IDLE"), + NextValue(running, 0), + ) + ).Else( + If(abort, + NextState("IDLE"), + NextValue(running, 0), + ).Elif(mpc < mpc_stop, + NextState("FETCH"), + NextValue(mpc, mpc + 1), + ).Else( + NextState("IDLE"), + NextValue(running, 0), + ) + ), + ) + seq.act("PAUSED", + If(~pause_req, + NextValue(pause_gnt, 0), + NextState("FETCH"), # could probably go directly to "EXEC", but, this is a minor detail recovering from pause + ) + ) + + self.busls = wishbone.Interface(data_width = 128, adr_width = 28) + exec_units = { + "exec_mask" : ExecMask(width=rf_width_raw), + "exec_logic" : ExecLogic(width=rf_width_raw), + "exec_addsub" : ExecAddSub(width=rf_width_raw), + "exec_testreduce": ExecTestReduce(width=rf_width_raw), + "exec_mul" : ExecMul(width=rf_width_raw, sim=sim), + "exec_clmul" : ExecClmul(width=rf_width_raw), + "exec_gcmshifts" : ExecGCMShifts(width=rf_width_raw), + "exec_aes" : ExecAES(width=rf_width_raw), + "exec_ls" : ExecLS(width=rf_width_raw,interface=self.busls) + } + index = 0 + for name, unit in exec_units.items(): + setattr(self.submodules, name, unit); + setattr(self, "done" + str(index), Signal(name="done"+str(index))) + setattr(self, "unit_q" + str(index), Signal(wd_dat.nbits, name="unit_q"+str(index))) + setattr(self, "unit_sel" + str(index), Signal(name="unit_sel"+str(index))) + setattr(self, "unit_wd" + str(index), 
Signal(log2_int(num_registers), name="unit_wd"+str(index))) + subdecode = Signal() + for op in unit.opcode_list: + self.comb += [ + If(instruction.opcode == opcodes[op][0], + subdecode.eq(1) + ) + ] + instruction_out = Record(instruction_layout) + self.comb += [ + instruction_out.raw_bits().eq(unit.instruction_out) + ] + self.comb += [ + unit.start.eq(exec & subdecode), + getattr(self, "done" + str(index)).eq(unit.q_valid), + unit.a.eq(ra_dat), + unit.b.eq(rb_dat), + unit.instruction_in.eq(instruction.raw_bits()), + getattr(self, "unit_q" + str(index)).eq(unit.q), + getattr(self, "unit_sel" + str(index)).eq(subdecode), + getattr(self, "unit_wd" + str(index)).eq(instruction_out.wd), + ] + index += 1 + + for i in range(index): + self.comb += [ + If(getattr(self, "done" + str(i)), + done.eq(1), # TODO: for proper pipelining, handle case of two units done simultaneously! + wd_dat.eq(getattr(self, "unit_q" + str(i))), + wd_adr.eq(getattr(self, "unit_wd" + str(i))), + ).Elif(seq.ongoing("IDLE"), + done.eq(0), + ) + ] + + self.comb += [ + rf_write.eq(done), + ] + + self.sync += abort.eq((abort & ~engine_go) | (self.exec_ls.has_failure[0] | self.exec_ls.has_failure[1] | self.exec_ls.has_timeout[0] | self.exec_ls.has_timeout[1])) + self.comb += self.ls_status.status.eq(self.exec_ls.state) + + ##### TIMING CONSTRAINTS -- you want these. Trust me. 
+ + clk50 = "clk50" + #clk100 = "clk100" + clk100 = "sysclk" + clk200 = "clk200" + # registered exec units need this set of rules + ### clk200->clk50 multi-cycle paths: + # we architecturally guarantee extra setup time from the register file to the point of consumption: + # read data is stable by the 3rd phase of the RF fetch cycle, and so it is in fact ready even before + # the other signals that trigger the execute mode, hence 4+1 cycles total setup time + platform.add_platform_command("set_multicycle_path 5 -setup -start -from [get_clocks " + clk200 + "] -to [get_clocks " + clk50 + "] -through [get_cells *rf_r*_dat_reg*]") + platform.add_platform_command("set_multicycle_path 4 -hold -end -from [get_clocks " + clk200 + "] -to [get_clocks " + clk50 + "] -through [get_cells *rf_r*_dat_reg*]") + ### clk200->clk100 multi-cycle paths: + # same as above, but for the multiplier path. + platform.add_platform_command("set_multicycle_path 3 -setup -start -from [get_clocks " + clk200 + "] -to [get_clocks " + clk100 + "] -through [get_cells *rf_r*_dat_reg*]") + platform.add_platform_command("set_multicycle_path 2 -hold -end -from [get_clocks " + clk200 + "] -to [get_clocks " + clk100 + "] -through [get_cells *rf_r*_dat_reg*]") + + # unregistered exec units need this set of rules + ### clk200->clk200 multi-cycle paths: + # this is for the case when we don't register the data, and just go straight from RF output to RF input. In the worst case + # we have three (? maybe five?)
clk200 cycles to compute as we phase through the reads and writes + platform.add_platform_command("set_multicycle_path 3 -setup -from [get_clocks " + clk200 + "] -to [get_clocks " + clk200 + "] -through [get_cells *rf_r*_dat_reg*]") + platform.add_platform_command("set_multicycle_path 2 -hold -end -from [get_clocks " + clk200 + "] -to [get_clocks " + clk200 + "] -through [get_cells *rf_r*_dat_reg*]") + + # other paths + ### sys->clk200 multi-cycle paths: + # microcode fetch is stable 10ns before use by the register file, by design + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=ra_const) + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=ra_const) + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=rb_const) + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=rb_const) + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.ra_const_rom.adr) + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.ra_const_rom.adr) + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.rb_const_rom.adr) + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk100 + "] -through [get_nets {net}*]", net=self.rb_const_rom.adr) + # ignore the clk200 
reset path for timing purposes -- there is >1 cycle guaranteed after reset for everything to settle before anything moves on these paths + platform.add_platform_command("set_false_path -through [get_nets " + clk200 + "_rst]") + # ignore the clk50 reset path for timing purposes -- there is > 1 cycle guaranteed after reset for everything to settle before anything moves on these paths (applies for other crypto engines, (SHA/AES) as well) + platform.add_platform_command("set_false_path -through [get_nets " + clk50 + "_rst]") + ### sys->clk50 multi-cycle paths: + # microcode fetch is guaranteed not to transition in the middle of an exec computation + platform.add_platform_command("set_multicycle_path 2 -setup -start -from [get_clocks " + clk100 + "] -to [get_clocks " + clk50 + "] -through [get_cells microcode_reg*]") + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk50 + "] -through [get_cells microcode_reg*]") + ### clk50->clk200 multi-cycle paths: + # engine running will set up a full eng_clk cycle before any RF accesses need to be valid + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_nets {{ {net1} {net2} {net3} }}]", net1=running, net2=running_r, net3=rf.running) + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_nets {{ {net1} {net2} {net3} }}]", net1=running, net2=running_r, net3=rf.running) + # this signal is a combo from clk50+sys + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]") + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]") + # data writeback happens on phase==2, 
and thus is stable for at least two clk200 clocks extra + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]") + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]") + platform.add_platform_command("set_multicycle_path 2 -setup -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]") + platform.add_platform_command("set_multicycle_path 1 -hold -end -from [get_clocks " + clk50 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]") + ### sys->clk200 multi-cycle paths: + # data writeback happens on phase==2, and thus is stable for at least two clk200 clocks extra + one full eng_clk (total 25ns) + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]") + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/DI*DI*]") + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]") + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins RF_RAMB*/*/ADDR*ADDR*]") + # this signal is a combo from clk50+sys + platform.add_platform_command("set_multicycle_path 4 -setup -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]") + platform.add_platform_command("set_multicycle_path 3 -hold -end -from [get_clocks " + clk100 + "] -to [get_clocks " + clk200 + "] -through [get_pins *rf_wren_pipe_reg/D]") diff --git 
a/sbus-to-ztex-gateware-migen/engine_code/Cargo.toml b/sbus-to-ztex-gateware-migen/engine_code/Cargo.toml new file mode 100644 index 0000000..a877b59 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/engine_code/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "engine_code" +version = "0.1.0" +authors = ["Romain Dolbeau "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + +[dependencies.engine25519-as] +#git="https://github.com/betrusted-io/engine25519-as.git" +#rev="6681e73c1fdc4a460b5ef9f9c7c91aef546d00f3" +path = "/home/dolbeau/engine25519-as" + +[dev-dependencies.engine25519-as] +#git="https://github.com/betrusted-io/engine25519-as.git" +#rev="6681e73c1fdc4a460b5ef9f9c7c91aef546d00f3" +path = "/home/dolbeau/engine25519-as" + +[[bin]] +name = "engine_code" +path = "engine_code.rs" diff --git a/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs b/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs new file mode 100644 index 0000000..42974be --- /dev/null +++ b/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs @@ -0,0 +1,1146 @@ +#![recursion_limit="768"] + +extern crate engine25519_as; +use engine25519_as::*; + +fn main() -> std::io::Result<()> { + let mcode = assemble_engine25519!( + start: + // P.U in %20 + // P.W in %21 + // Q.U in %22 + // Q.W in %23 + // affine_PmQ in %24 // I + // %30 is the TRD scratch register and cswap dummy + // %29 is the subtraction temporary value register and k_t + // x0.U in %25 // I + // x0.W in %26 // I + // x1.U in %27 // I + // x1.W in %28 /// I + // %19 is the loop counter, starts with 254 (if 0, loop runs exactly once) // I + // %31 is the scalar // I + // %18 is the swap variable + psa %18, #0 + + // for i in (0..255).rev() + mainloop: + // let choice: u8 = (bits[i + 1] ^ bits[i]) as u8; + // ProjectivePoint::conditional_swap(&mut x0, &mut x1, choice.into()); + xbt %29, %31 // originally[k_t = (k>>t) & 1] now[k_t = k[254]] + shl
%31, %31 // k = k<<1 + xor %18, %18, %29 // swap ^= k_t + + // cswap x0.U (%25), x1.U (%27) + xor %30, %25, %27 + msk %30, %18, %30 + xor %25, %30, %25 + xor %27, %30, %27 + // cswap x0.W (%26), x1.W (%28) + xor %30, %26, %28 + msk %30, %18, %30 + xor %26, %30, %26 + xor %28, %30, %28 + + psa %18, %29 // swap = k_t + + // differential_add_and_double(&mut x0, &mut x1, &affine_u); + psa %20, %25 + psa %21, %26 + psa %22, %27 + psa %23, %28 + // affine_u is already in %24 + + // let t0 = &P.U + &P.W; + add %0, %20, %21 + trd %30, %0 + sub %0, %0, %30 + // let t1 = &P.U - &P.W; + sub %21, #3, %21 // negate &P.W using #FIELDPRIME (#3) + add %1, %20, %21 + trd %30, %1 + sub %1, %1, %30 + // let t2 = &Q.U + &Q.W; + add %2, %22, %23 + trd %30, %2 + sub %2, %2, %30 + // let t3 = &Q.U - &Q.W; + sub %23, #3, %23 + add %3, %22, %23 + trd %30, %3 + sub %3, %3, %30 + // let t4 = t0.square(); // (U_P + W_P)^2 = U_P^2 + 2 U_P W_P + W_P^2 + mul %4, %0, %0 + // let t5 = t1.square(); // (U_P - W_P)^2 = U_P^2 - 2 U_P W_P + W_P^2 + mul %5, %1, %1 + // let t6 = &t4 - &t5; // 4 U_P W_P + sub %29, #3, %5 + add %6, %4, %29 + trd %30, %6 + sub %6, %6, %30 + // let t7 = &t0 * &t3; // (U_P + W_P) (U_Q - W_Q) = U_P U_Q + W_P U_Q - U_P W_Q - W_P W_Q + mul %7, %0, %3 + // let t8 = &t1 * &t2; // (U_P - W_P) (U_Q + W_Q) = U_P U_Q - W_P U_Q + U_P W_Q - W_P W_Q + mul %8, %1, %2 + // let t9 = &t7 + &t8; // 2 (U_P U_Q - W_P W_Q) + add %9, %7, %8 + trd %30, %9 + sub %9, %9, %30 + // let t10 = &t7 - &t8; // 2 (W_P U_Q - U_P W_Q) + sub %29, #3, %8 + add %10, %7, %29 + trd %30, %10 + sub %10, %10, %30 + // let t11 = t9.square(); // 4 (U_P U_Q - W_P W_Q)^2 + mul %11, %9, %9 + // let t12 = t10.square(); // 4 (W_P U_Q - U_P W_Q)^2 + mul %12, %10, %10 + // let t13 = &APLUS2_OVER_FOUR * &t6; // (A + 2) U_P U_Q + mul %13, #4, %6 // #4 is A+2/4 + // let t14 = &t4 * &t5; // ((U_P + W_P)(U_P - W_P))^2 = (U_P^2 - W_P^2)^2 + mul %14, %4, %5 + // let t15 = &t13 + &t5; // (U_P - W_P)^2 + (A + 2) U_P W_P + add %15, 
%13, %5 + trd %30, %15 + sub %15, %15, %30 + // let t16 = &t6 * &t15; // 4 (U_P W_P) ((U_P - W_P)^2 + (A + 2) U_P W_P) + mul %16, %6, %15 + // let t17 = affine_PmQ * &t12; // U_D * 4 (W_P U_Q - U_P W_Q)^2 + mul %17, %24, %12 // affine_PmQ loaded into %24 + + ///// these can be eliminated down the road, but included for 1:1 algorithm correspondence to reference in early testing + // P.U = t14; // U_{P'} = (U_P + W_P)^2 (U_P - W_P)^2 + psa %20, %14 + // P.W = t16; // W_{P'} = (4 U_P W_P) ((U_P - W_P)^2 + ((A + 2)/4) 4 U_P W_P) + psa %21, %16 + // let t18 = t11; // W_D * 4 (U_P U_Q - W_P W_Q)^2 + // Q.U = t18; // U_{Q'} = W_D * 4 (U_P U_Q - W_P W_Q)^2 + psa %22, %11 // collapsed two to save a register + // Q.W = t17; // W_{Q'} = U_D * 4 (W_P U_Q - U_P W_Q)^2 + psa %23, %17 + + ///// 'return' arguments for next iteration, can be optimized out later + psa %25, %20 + psa %26, %21 + psa %27, %22 + psa %28, %23 + + brz end, %19 // if loop counter is 0, quit + sub %19, %19, #1 // subtract one from the loop counter and run again + brz mainloop, #0 // go back to the top + end: + // ProjectivePoint::conditional_swap(&mut x0, &mut x1, Choice::from(bits[0] as u8)); + // cswap x0.U (%25), x1.U (%27) + xor %30, %25, %27 + msk %30, %18, %30 + xor %25, %30, %25 + xor %27, %30, %27 + // cswap x0.W (%26), x1.W (%28) + xor %30, %26, %28 + msk %30, %18, %30 + xor %26, %30, %26 + xor %28, %30, %28 + + // AFFINE SPLICE -- pass arguments to the affine block + psa %29, %25 + psa %30, %26 + // W.invert() in %21 + // U in %29 + // W in %30 + // result in %31 + // loop counter in %28 + + // from FieldElement.invert() + // let (t19, t3) = self.pow22501(); // t19: 249..0 ; t3: 3,1,0 + // let t0 = self.square(); // 1 e_0 = 2^1 + mul %0, %30, %30 // self is W, e.g.
%30 + // let t1 = t0.square().square(); // 3 e_1 = 2^3 + mul %1, %0, %0 + mul %1, %1, %1 + // let t2 = self * &t1; // 3,0 e_2 = 2^3 + 2^0 + mul %2, %30, %1 + // let t3 = &t0 * &t2; // 3,1,0 + mul %3, %0, %2 + // let t4 = t3.square(); // 4,2,1 + mul %4, %3, %3 + // let t5 = &t2 * &t4; // 4,3,2,1,0 + mul %5, %2, %4 + + // let t6 = t5.pow2k(5); // 9,8,7,6,5 + psa %28, #5 // coincidentally, constant #5 is the number 5 + mul %6, %5, %5 + pow2k_5: + sub %28, %28, #1 // %28 = %28 - 1 + brz pow2k_5_exit, %28 + mul %6, %6, %6 + brz pow2k_5, #0 + pow2k_5_exit: + // let t7 = &t6 * &t5; // 9,8,7,6,5,4,3,2,1,0 + mul %7, %6, %5 + + // let t8 = t7.pow2k(10); // 19..10 + psa %28, #6 // constant #6 is the number 10 + mul %8, %7, %7 + pow2k_10: + sub %28, %28, #1 + brz pow2k_10_exit, %28 + mul %8, %8, %8 + brz pow2k_10, #0 + pow2k_10_exit: + // let t9 = &t8 * &t7; // 19..0 + mul %9, %8, %7 + + // let t10 = t9.pow2k(20); // 39..20 + psa %28, #7 // constant #7 is the number 20 + mul %10, %9, %9 + pow2k_20: + sub %28, %28, #1 + brz pow2k_20_exit, %28 + mul %10, %10, %10 + brz pow2k_20, #0 + pow2k_20_exit: + // let t11 = &t10 * &t9; // 39..0 + mul %11, %10, %9 + + // let t12 = t11.pow2k(10); // 49..10 + psa %28, #6 // constant #6 is the number 10 + mul %12, %11, %11 + pow2k_10b: + sub %28, %28, #1 + brz pow2k_10b_exit, %28 + mul %12, %12, %12 + brz pow2k_10b, #0 + pow2k_10b_exit: + // let t13 = &t12 * &t7; // 49..0 + mul %13, %12, %7 + + // let t14 = t13.pow2k(50); // 99..50 + psa %28, #8 // constant #8 is the number 50 + mul %14, %13, %13 + pow2k_50a: + sub %28, %28, #1 + brz pow2k_50a_exit, %28 + mul %14, %14, %14 + brz pow2k_50a, #0 + pow2k_50a_exit: + // let t15 = &t14 * &t13; // 99..0 + mul %15, %14, %13 + + // let t16 = t15.pow2k(100); // 199..100 + psa %28, #9 // constant #9 is the number 100 + mul %16, %15, %15 + pow2k_100: + sub %28, %28, #1 + brz pow2k_100_exit, %28 + mul %16, %16, %16 + brz pow2k_100, #0 + pow2k_100_exit: + // let t17 = &t16 * &t15; // 199..0 + mul %17, %16, 
%15 + + // let t18 = t17.pow2k(50); // 249..50 + psa %28, #8 // constant #8 is the number 50 + mul %18, %17, %17 + pow2k_50b: + sub %28, %28, #1 + brz pow2k_50b_exit, %28 + mul %18, %18, %18 + brz pow2k_50b, #0 + pow2k_50b_exit: + // let t19 = &t18 * &t13; // 249..0 + mul %19, %18, %13 + //(t19, t3) // just a return value, values are already there, do nothing + + //let t20 = t19.pow2k(5); // 254..5 + psa %28, #5 + mul %20, %19, %19 + pow2k_5_last: + sub %28, %28, #1 + brz pow2k_5_last_exit, %28 + mul %20, %20, %20 + brz pow2k_5_last, #0 + pow2k_5_last_exit: + + //let t21 = &t20 * &t3; // 254..5,3,1,0 + mul %21, %20, %3 + + // u = &self.U * &self.W.invert() + mul %31, %29, %21 + fin // finish execution + ); + let mcode_upd = assemble_engine25519!( + start: + // P.U in %20 + // P.W in %21 + // Q.U in %22 + // Q.W in %23 + // affine_PmQ in %24 // I + // %30 is the TRD scratch register and cswap dummy + // %29 is the subtraction temporary value register and k_t + // x0.U in %25 // !I + // x0.W in %26 // !I + // x1.U in %27 // !I + // x1.W in %28 // !I + // %19 is the loop counter, starts with 254 (if 0, loop runs exactly once) // I + // %31 is the scalar // I + // %18 is the swap variable + psa %25, #1 + psa %26, #0 + psa %27, %24 + psa %28, #1 + // #10 is 254 in my Engine + psa %19, #10 + psa %18, #0 + + // for i in (0..255).rev() + mainloop: + // let choice: u8 = (bits[i + 1] ^ bits[i]) as u8; + // ProjectivePoint::conditional_swap(&mut x0, &mut x1, choice.into()); + xbt %29, %31 // originally[k_t = (k>>t) & 1] now[k_t = k[254]] + shl %31, %31 // k = k<<1 + xor %18, %18, %29 // swap ^= k_t + + // cswap x0.U (%25), x1.U (%27) + xor %30, %25, %27 + msk %30, %18, %30 + xor %25, %30, %25 + xor %27, %30, %27 + // cswap x0.W (%26), x1.W (%28) + xor %30, %26, %28 + msk %30, %18, %30 + xor %26, %30, %26 + xor %28, %30, %28 + + psa %18, %29 // swap = k_t + + // differential_add_and_double(&mut x0, &mut x1, &affine_u); + // affine_u is already in %24 + + // let t0 = &P.U +
&P.W; + add %0, %25, %26 + trd %30, %0 + sub %0, %0, %30 + // let t1 = &P.U - &P.W; + sub %26, #3, %26 // negate &P.W using #FIELDPRIME (#3) + add %1, %25, %26 + trd %30, %1 + sub %1, %1, %30 + // let t2 = &Q.U + &Q.W; + add %2, %27, %28 + trd %30, %2 + sub %2, %2, %30 + // let t3 = &Q.U - &Q.W; + sub %28, #3, %28 + add %3, %27, %28 + trd %30, %3 + sub %3, %3, %30 + // let t4 = t0.square(); // (U_P + W_P)^2 = U_P^2 + 2 U_P W_P + W_P^2 + mul %4, %0, %0 + // let t5 = t1.square(); // (U_P - W_P)^2 = U_P^2 - 2 U_P W_P + W_P^2 + mul %5, %1, %1 + // let t6 = &t4 - &t5; // 4 U_P W_P + sub %29, #3, %5 + add %6, %4, %29 + trd %30, %6 + sub %6, %6, %30 + // let t7 = &t0 * &t3; // (U_P + W_P) (U_Q - W_Q) = U_P U_Q + W_P U_Q - U_P W_Q - W_P W_Q + mul %7, %0, %3 + // let t8 = &t1 * &t2; // (U_P - W_P) (U_Q + W_Q) = U_P U_Q - W_P U_Q + U_P W_Q - W_P W_Q + mul %8, %1, %2 + // let t9 = &t7 + &t8; // 2 (U_P U_Q - W_P W_Q) + add %9, %7, %8 + trd %30, %9 + sub %9, %9, %30 + // let t10 = &t7 - &t8; // 2 (W_P U_Q - U_P W_Q) + sub %29, #3, %8 + add %10, %7, %29 + trd %30, %10 + sub %10, %10, %30 + // let t11 = t9.square(); // 4 (U_P U_Q - W_P W_Q)^2 + mul %27, %9, %9 + // let t12 = t10.square(); // 4 (W_P U_Q - U_P W_Q)^2 + mul %12, %10, %10 + // let t13 = &APLUS2_OVER_FOUR * &t6; // (A + 2) U_P U_Q + mul %13, #4, %6 // #4 is A+2/4 + // let t14 = &t4 * &t5; // ((U_P + W_P)(U_P - W_P))^2 = (U_P^2 - W_P^2)^2 + mul %25, %4, %5 + // let t15 = &t13 + &t5; // (U_P - W_P)^2 + (A + 2) U_P W_P + add %15, %13, %5 + trd %30, %15 + sub %15, %15, %30 + // let t16 = &t6 * &t15; // 4 (U_P W_P) ((U_P - W_P)^2 + (A + 2) U_P W_P) + mul %26, %6, %15 + // let t17 = affine_PmQ * &t12; // U_D * 4 (W_P U_Q - U_P W_Q)^2 + mul %28, %24, %12 // affine_PmQ loaded into %24 + + brz end, %19 // if loop counter is 0, quit + sub %19, %19, #1 // subtract one from the loop counter and run again + brz mainloop, #0 // go back to the top + end: + // ProjectivePoint::conditional_swap(&mut x0, &mut x1, Choice::from(bits[0] 
as u8)); + // cswap x0.U (%25), x1.U (%27) + xor %30, %25, %27 + msk %30, %18, %30 + xor %25, %30, %25 + xor %27, %30, %27 + // cswap x0.W (%26), x1.W (%28) + xor %30, %26, %28 + msk %30, %18, %30 + xor %26, %30, %26 + xor %28, %30, %28 + + // AFFINE SPLICE -- pass arguments to the affine block + psa %29, %25 + psa %30, %26 + // W.invert() in %21 + // U in %29 + // W in %30 + // result in %31 + // loop counter in %28 + + // from FieldElement.invert() + // let (t19, t3) = self.pow22501(); // t19: 249..0 ; t3: 3,1,0 + // let t0 = self.square(); // 1 e_0 = 2^1 + mul %0, %30, %30 // self is W, e.g. %30 + // let t1 = t0.square().square(); // 3 e_1 = 2^3 + mul %1, %0, %0 + mul %1, %1, %1 + // let t2 = self * &t1; // 3,0 e_2 = 2^3 + 2^0 + mul %2, %30, %1 + // let t3 = &t0 * &t2; // 3,1,0 + mul %3, %0, %2 + // let t4 = t3.square(); // 4,2,1 + mul %4, %3, %3 + // let t5 = &t2 * &t4; // 4,3,2,1,0 + mul %5, %2, %4 + + // let t6 = t5.pow2k(5); // 9,8,7,6,5 + psa %28, #5 // coincidentally, constant #5 is the number 5 + mul %6, %5, %5 + pow2k_5: + sub %28, %28, #1 // %28 = %28 - 1 + brz pow2k_5_exit, %28 + mul %6, %6, %6 + brz pow2k_5, #0 + pow2k_5_exit: + // let t7 = &t6 * &t5; // 9,8,7,6,5,4,3,2,1,0 + mul %7, %6, %5 + + // let t8 = t7.pow2k(10); // 19..10 + psa %28, #6 // constant #6 is the number 10 + mul %8, %7, %7 + pow2k_10: + sub %28, %28, #1 + brz pow2k_10_exit, %28 + mul %8, %8, %8 + brz pow2k_10, #0 + pow2k_10_exit: + // let t9 = &t8 * &t7; // 19..0 + mul %9, %8, %7 + + // let t10 = t9.pow2k(20); // 39..20 + psa %28, #7 // constant #7 is the number 20 + mul %10, %9, %9 + pow2k_20: + sub %28, %28, #1 + brz pow2k_20_exit, %28 + mul %10, %10, %10 + brz pow2k_20, #0 + pow2k_20_exit: + // let t11 = &t10 * &t9; // 39..0 + mul %11, %10, %9 + + // let t12 = t11.pow2k(10); // 49..10 + psa %28, #6 // constant #6 is the number 10 + mul %12, %11, %11 + pow2k_10b: + sub %28, %28, #1 + brz pow2k_10b_exit, %28 + mul %12, %12, %12 + brz pow2k_10b, #0 + pow2k_10b_exit: + // let t13 = 
&t12 * &t7; // 49..0 + mul %13, %12, %7 + + // let t14 = t13.pow2k(50); // 99..50 + psa %28, #8 // constant #8 is the number 50 + mul %14, %13, %13 + pow2k_50a: + sub %28, %28, #1 + brz pow2k_50a_exit, %28 + mul %14, %14, %14 + brz pow2k_50a, #0 + pow2k_50a_exit: + // let t15 = &t14 * &t13; // 99..0 + mul %15, %14, %13 + + // let t16 = t15.pow2k(100); // 199..100 + psa %28, #9 // constant #9 is the number 100 + mul %16, %15, %15 + pow2k_100: + sub %28, %28, #1 + brz pow2k_100_exit, %28 + mul %16, %16, %16 + brz pow2k_100, #0 + pow2k_100_exit: + // let t17 = &t16 * &t15; // 199..0 + mul %17, %16, %15 + + // let t18 = t17.pow2k(50); // 249..50 + psa %28, #8 // constant #8 is the number 50 + mul %18, %17, %17 + pow2k_50b: + sub %28, %28, #1 + brz pow2k_50b_exit, %28 + mul %18, %18, %18 + brz pow2k_50b, #0 + pow2k_50b_exit: + // let t19 = &t18 * &t13; // 249..0 + mul %19, %18, %13 + //(t19, t3) // just a return value, values are already there, do nothing + + //let t20 = t19.pow2k(5); // 254..5 + psa %28, #5 + mul %20, %19, %19 + pow2k_5_last: + sub %28, %28, #1 + brz pow2k_5_last_exit, %28 + mul %20, %20, %20 + brz pow2k_5_last, #0 + pow2k_5_last_exit: + + //let t21 = &t20 * &t3; // 254..5,3,1,0 + mul %21, %20, %3 + + // u = &self.U * &self.W.invert() + mul %31, %29, %21 + fin // finish execution + ); + + let mcode2 = assemble_engine25519!( + start: + // P.U in %20 + // P.W in %21 + // Q.U in %22 + // Q.W in %23 + // affine_PmQ in %24 // I + // %30 is the TRD scratch register and cswap dummy + // %29 is the subtraction temporary value register and k_t + // x0.U in %25 // I + // x0.W in %26 // I + // x1.U in %27 // I + // x1.W in %28 /// I + // %19 is the loop counter, starts with 254 (if 0, loop runs exactly once) // I + // %31 is the scalar // I + // %18 is the swap variable + psa %25, #9 + psa %26, #1 + mul %27, %25, %26 + mul %28, %25, %25 + mul %31, %24, %24 + fin + ); + + let gcmcode_test = assemble_engine25519!( + start: + // A in %0 + // B in %1 + clmul %4, %0, 
%1, #0 + clmul %5, %0, %1, #1 + clmul %6, %0, %1, #2 + clmul %7, %0, %1, #3 + //gcm_sl1ai %8, %0, %1 + //gcm_sl1ai %9, %0, #0 + //gcm_sl1ai %10, %1, %0 + //gcm_sl1ai %11, %1, #0 + gcm_cmpd %12, %0 + gcm_cmpd %13, %1 + //gcm_sri %14, %0, #0 + //gcm_sri %15, %0, #1 + //gcm_sri %16, %0, #2 + //gcm_sri %17, %0, #3 + //gcm_sri %18, %0, #4 + //gcm_sri %19, %0, #5 + //gcm_sri %20, %0, #6 + //gcm_sri %21, %0, #7 + fin + ); + let gcmcode = assemble_engine25519!( + start: + // A in %0 + // B in %1 + + // // poly mult + // C + clmul %4, %0, %1, #0 + // E + clmul %5, %0, %1, #1 + // F + clmul %6, %0, %1, #2 + // D + clmul %7, %0, %1, #3 + // E ^ F + xor %6, %5, %6 + // put low64 of E^F in high64 + gcm_swap64 %5, %6, #0 + // put high64 of E^F in low64 + gcm_swap64 %6, #0, %6 + // D xor low + xor %7, %7, %6 + // C xor high + xor %4, %4, %5 + + // // reduction + // X1:X0 in %4 + // X3:X2 in %7 + // shift everybody by 1 to the left + // high shifting in 1 bit from low + gcm_shlmi %1, %7, %4, #1 + // low + gcm_shlmi %0, %4, #0, #1 + // post-shift + // X1:X0 in %0 + // X3:X2 in %1 + // compute D + gcm_cmpd %2, %0 + // compute E, F, G + gcm_shrmi %3, %2, #0, #1 + gcm_shrmi %4, %2, #0, #2 + gcm_shrmi %5, %2, #0, #7 + // XOR everybody + xor %2, %2, %3 + xor %4, %4, %5 + xor %2, %2, %4 + xor %0, %2, %1 + // output in %0 + fin + ); + let aescode = assemble_engine25519!( + start: + // X in %0 + // KEY in %31-%17 (backward) + // one full round demo + xor %0, %0, %31 + + aesesmi %1, %0, %30 + + aesesmi %0, %1, %29 + + aesesmi %1, %0, %28 + + aesesmi %0, %1, %27 + + aesesmi %1, %0, %26 + + aesesmi %0, %1, %25 + + aesesmi %1, %0, %24 + + aesesmi %0, %1, %23 + + aesesmi %1, %0, %22 + + aesesmi %0, %1, %21 + + aesesmi %1, %0, %20 + + aesesmi %0, %1, %19 + + aesesmi %1, %0, %18 + + aesesi %0, %1, %17 + + fin + ); + let gcm_pfx_code = assemble_engine25519!( + start: + // Input: rkeys in %31-%17 (backward, LE) + // pub in %16 (0-11, 12-15 are ctr so 0, LE) + // RD_PTR in %3 + // ADLEN in %12 (in 
16-byte-blocks) + // Transient: + // %0, %1, %2 are tmp + // Output: + // all inputs preserved + // H will go in %15 (byte-reverted) + // T will go in %14 + // accum (0) will go in %13 + gcm_brev32 %16, %16 + // use %2 as a flag + psa %2, #1 + psa %1, #0 + genht: + xor %0, %1, %31 + + aesesmi %1, %0, %30 + + aesesmi %0, %1, %29 + + aesesmi %1, %0, %28 + + aesesmi %0, %1, %27 + + aesesmi %1, %0, %26 + + aesesmi %0, %1, %25 + + aesesmi %1, %0, %24 + + aesesmi %0, %1, %23 + + aesesmi %1, %0, %22 + + aesesmi %0, %1, %21 + + aesesmi %1, %0, %20 + + aesesmi %0, %1, %19 + + aesesmi %1, %0, %18 + + aesesi %0, %1, %17 + + // if the %2 flag is cleared, we've just computed T + brz afterht, %2 + // store H in %15 + psa %15, %0 + // increment counter; should we have a gcm_inc_be ? + // for now byterev + special constant + gcm_brev32 %16, %16 + add %16, %16, #11 + gcm_brev32 %16, %16 + // clear flag & go encrypt t + psa %2, #0 + psa %1, %16 + brz genht, #0 + + afterht: + // store T in %14 + psa %14, %0 + + // fully byte-revert H (first byte-in-dword, then dword-in-64bit) + gcm_brev64 %15, %15 + gcm_swap64 %15, %15, %15 + + psa %13, #0 + + // no fin; we fall directly into the AD code + //fin + ); + let gcm_ad_code = assemble_engine25519!( + // Input: rkeys in %31-%17 (backward, LE) + // pub in %16 (0-11, 12-15 are ctr so 0, LE) + // RD_PTR in %3 + // ADLEN in %12 (in 16-byte-blocks) + // H in %15 (byte-reverted) + // T in %14 + // accum in %13 + // Transient: + // %0, %1, %4, %5, %6, %7 are tmp + // Output: + // all inputs preserved except ADLEN (%12) & RD_PTR (%3) + // Updated accum is in %13 + + // if no ad, finish + brz done, %12 + // do one block, repeat + do_ad: load %0, %3 + gcm_brev64 %0, %0 + gcm_swap64 %0, %0, %0 + + xor %0, %0, %13 + add %3, %3, #16 + sub %12, %12, #1 + + // // poly mult accum = ((accum^ad) * H) + // C + clmul %4, %0, %15, #0 + // E + clmul %5, %0, %15, #1 + // F + clmul %6, %0, %15, #2 + // D + clmul %7, %0, %15, #3 + // E ^ F + xor %6, %5, %6 + // put 
low64 of E^F in high64 + gcm_swap64 %5, %6, #0 + // put high64 of E^F in low64 + gcm_swap64 %6, #0, %6 + // D xor low + xor %7, %7, %6 + // C xor high + xor %4, %4, %5 + + // // reduction + // X1:X0 in %4 + // X3:X2 in %7 + // shift everybody by 1 to the left + // high shifting in 1 bit from low + gcm_shlmi %1, %7, %4, #1 + // low + gcm_shlmi %0, %4, #0, #1 + // post-shift + // X1:X0 in %0 + // X3:X2 in %1 + // compute D + gcm_cmpd %2, %0 + // compute E, F, G + gcm_shrmi %6, %2, #0, #1 + gcm_shrmi %4, %2, #0, #2 + gcm_shrmi %5, %2, #0, #7 + // XOR everybody + xor %2, %2, %6 + xor %4, %4, %5 + xor %2, %2, %4 + xor %13, %2, %1 + + brz done, %12 + brz do_ad, #0 + + done: + fin + ); + let gcm_aes_code = assemble_engine25519!( + // pub in %16 (0-11, 12-15 are ctr so 0, LE) + // RD_PTR in %3 + // WR_PTR in %11 + // MLEN in %12 (in *complete* 16-byte-blocks) + // H in %15 (byte-reverted) + // T in %14 + // accum in %13 + // Transient: + // %0, %1, %4, %5, %6, %7 are tmp + // Output: + // all inputs preserved except RD_PTR (%3), WR_PTR (%11), MLEN (%12) + // accum is in %13 + + // if no msg, finish + brz done, %12 + // do one block, repeat + do_msg: + // increment counter + gcm_brev32 %16, %16 + add %16, %16, #11 + gcm_brev32 %16, %16 + + xor %0, %16, %31 + + aesesmi %1, %0, %30 + + aesesmi %0, %1, %29 + + aesesmi %1, %0, %28 + + aesesmi %0, %1, %27 + + aesesmi %1, %0, %26 + + aesesmi %0, %1, %25 + + aesesmi %1, %0, %24 + + aesesmi %0, %1, %23 + + aesesmi %1, %0, %22 + + aesesmi %0, %1, %21 + + aesesmi %1, %0, %20 + + aesesmi %0, %1, %19 + + aesesmi %1, %0, %18 + + aesesi %1, %1, %17 + + //gcm_brev64 %1, %0 + //gcm_swap64 %1, %1, %1 + + load %0, %3 + xor %0, %0, %1 + store %11, %11, %0 + + gcm_brev64 %0, %0 + gcm_swap64 %0, %0, %0 + + xor %0, %0, %13 + add %3, %3, #16 + add %11, %11, #16 + + sub %12, %12, #1 + + // // poly mult accum = ((accum^ad) * H) + // C + clmul %4, %0, %15, #0 + // E + clmul %5, %0, %15, #1 + // F + clmul %6, %0, %15, #2 + // D + clmul %7, %0, %15, 
#3 + // E ^ F + xor %6, %5, %6 + // put low64 of E^F in high64 + gcm_swap64 %5, %6, #0 + // put high64 of E^F in low64 + gcm_swap64 %6, #0, %6 + // D xor low + xor %7, %7, %6 + // C xor high + xor %4, %4, %5 + + // // reduction + // X1:X0 in %4 + // X3:X2 in %7 + // shift everybody by 1 to the left + // high shifting in 1 bit from low + gcm_shlmi %1, %7, %4, #1 + // low + gcm_shlmi %0, %4, #0, #1 + // post-shift + // X1:X0 in %0 + // X3:X2 in %1 + // compute D + gcm_cmpd %2, %0 + // compute E, F, G + gcm_shrmi %6, %2, #0, #1 + gcm_shrmi %4, %2, #0, #2 + gcm_shrmi %5, %2, #0, #7 + // XOR everybody + xor %2, %2, %6 + xor %4, %4, %5 + xor %2, %2, %4 + xor %13, %2, %1 + + brz done, %12 + brz do_msg, #0 + done: + fin + + ); + let gcm_finish_code = assemble_engine25519!( + // pub in %16 (0-11, 12-15 are ctr so 0, LE) + // RD_PTR in %3 + // WR_PTR in %11 + // MLEN in %12 (do one *partial* 16-byte-blocks, so 0 or non-zero) + // MMASK in %10 (could be computed from MLEN%16 but we don't have an instruction for it yet) + // finalblock in %9 (could be computed but we'd need to know the exact value of adlen) + // H in %15 (byte-reverted) + // T in %14 + // accum in %13 + // Transient: + // %0, %1, %4, %5, %6, %7 are tmp + // Output: + // all inputs preserved except RD_PTR (%3), WR_PTR (%11), MLEN (%12) + // accum is in %13 + // accum ^ T is in %8 + brz last, %12 + + finish_mlen: + // increment counter + gcm_brev32 %16, %16 + add %16, %16, #11 + gcm_brev32 %16, %16 + + xor %0, %16, %31 + + aesesmi %1, %0, %30 + + aesesmi %0, %1, %29 + + aesesmi %1, %0, %28 + + aesesmi %0, %1, %27 + + aesesmi %1, %0, %26 + + aesesmi %0, %1, %25 + + aesesmi %1, %0, %24 + + aesesmi %0, %1, %23 + + aesesmi %1, %0, %22 + + aesesmi %0, %1, %21 + + aesesmi %1, %0, %20 + + aesesmi %0, %1, %19 + + aesesmi %1, %0, %18 + + aesesi %1, %1, %17 + + //gcm_brev64 %1, %0 + //gcm_swap64 %1, %1, %1 + + and %1, %1, %10 + load %0, %3 + xor %0, %0, %1 + + store %11, %11, %0 + + gcm_brev64 %0, %0 + gcm_swap64 %0, %0, 
%0 + + xor %0, %0, %13 + //add %3, %3, #16 + //add %11, %11, #16 + + //sub %12, %12, #1 + + // // poly mult accum = ((accum^ad) * H) + // C + clmul %4, %0, %15, #0 + // E + clmul %5, %0, %15, #1 + // F + clmul %6, %0, %15, #2 + // D + clmul %7, %0, %15, #3 + // E ^ F + xor %6, %5, %6 + // put low64 of E^F in high64 + gcm_swap64 %5, %6, #0 + // put high64 of E^F in low64 + gcm_swap64 %6, #0, %6 + // D xor low + xor %7, %7, %6 + // C xor high + xor %4, %4, %5 + + // // reduction + // X1:X0 in %4 + // X3:X2 in %7 + // shift everybody by 1 to the left + // high shifting in 1 bit from low + gcm_shlmi %1, %7, %4, #1 + // low + gcm_shlmi %0, %4, #0, #1 + // post-shift + // X1:X0 in %0 + // X3:X2 in %1 + // compute D + gcm_cmpd %2, %0 + // compute E, F, G + gcm_shrmi %6, %2, #0, #1 + gcm_shrmi %4, %2, #0, #2 + gcm_shrmi %5, %2, #0, #7 + // XOR everybody + xor %2, %2, %6 + xor %4, %4, %5 + xor %2, %2, %4 + xor %13, %2, %1 + last: + // addmul of finalblock + + gcm_brev64 %9, %9 + gcm_swap64 %9, %9, %9 + xor %0, %9, %13 + //add %3, %3, #16 + //add %11, %11, #16 + //sub %12, %12, #1 + + // // poly mult accum = ((accum^ad) * H) + // C + clmul %4, %0, %15, #0 + // E + clmul %5, %0, %15, #1 + // F + clmul %6, %0, %15, #2 + // D + clmul %7, %0, %15, #3 + // E ^ F + xor %6, %5, %6 + // put low64 of E^F in high64 + gcm_swap64 %5, %6, #0 + // put high64 of E^F in low64 + gcm_swap64 %6, #0, %6 + // D xor low + xor %7, %7, %6 + // C xor high + xor %4, %4, %5 + + // // reduction + // X1:X0 in %4 + // X3:X2 in %7 + // shift everybody by 1 to the left + // high shifting in 1 bit from low + gcm_shlmi %1, %7, %4, #1 + // low + gcm_shlmi %0, %4, #0, #1 + // post-shift + // X1:X0 in %0 + // X3:X2 in %1 + // compute D + gcm_cmpd %2, %0 + // compute E, F, G + gcm_shrmi %6, %2, #0, #1 + gcm_shrmi %4, %2, #0, #2 + gcm_shrmi %5, %2, #0, #7 + // XOR everybody + xor %2, %2, %6 + xor %4, %4, %5 + xor %2, %2, %4 + xor %13, %2, %1 + + gcm_brev64 %13, %13 + gcm_swap64 %13, %13, %13 + + xor %8, %13, %14 
+ + fin + ); + + + let mut pos = 0; + + pos = 0; + println!("test AES:"); + while pos < aescode.len() { + print!("0x{:08x},", aescode[pos]); + pos = pos + 1; + } + println!(""); + println!("-> {}", aescode.len()); + + pos = 0; + println!("GCM PFX:"); + while pos < gcm_pfx_code.len() { + print!("0x{:08x},", gcm_pfx_code[pos]); + pos = pos + 1; + } + println!(""); + println!("-> {}", gcm_pfx_code.len()); + + pos = 0; + println!("GCM AD:"); + while pos < gcm_ad_code.len() { + print!("0x{:08x},", gcm_ad_code[pos]); + pos = pos + 1; + } + println!(""); + println!("-> {}", gcm_ad_code.len()); + + pos = 0; + println!("GCM AES:"); + while pos < gcm_aes_code.len() { + print!("0x{:08x},", gcm_aes_code[pos]); + pos = pos + 1; + } + println!(""); + println!("-> {}", gcm_aes_code.len()); + + pos = 0; + println!("GCM FINISH:"); + while pos < gcm_finish_code.len() { + print!("0x{:08x},", gcm_finish_code[pos]); + pos = pos + 1; + } + println!(""); + println!("-> {}", gcm_finish_code.len()); + + + Ok(()) +} diff --git a/sbus-to-ztex-gateware-migen/forth_to_migen_rom.sh b/sbus-to-ztex-gateware-migen/forth_to_migen_rom.sh new file mode 100755 index 0000000..5bcd747 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/forth_to_migen_rom.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +PFX=prom_migen + +rm -f ${PFX}.fc + +# (export BP=~/SPARC/SBusFPGA/sbus-to-ztex/openfirmware ; toke ${PFX}.forth ) + +( export BP=`pwd`/openfirmware ; openfirmware/cpu/x86/Linux/forth openfirmware/cpu/x86/build/builder.dic prom_migen.bth ) 2>&1 | tee forth.log + +rm -f /tmp/${PFX}.hexa + +od --endian=big -w4 -x ${PFX}.fc | awk '{ print $2,$3"," }' >| /tmp/${PFX}.hexa + +rm -f /tmp/${PFX}.txt_hexa + +cat /tmp/${PFX}.hexa | sed -e 's/^\([a-f0-9][a-f0-9][a-f0-9][a-f0-9]\) \([a-f0-9][a-f0-9][a-f0-9][a-f0-9]\),/0x\1\2,/g' -e 's/^\([a-f0-9][a-f0-9]*\) ,/0x\10000,/' -e 's/^ ,/0x00000000,/' -e 's/\(0x[0-9a-fA-F]*\),/if (idx == 0):\n\treturn \1;/' > /tmp/${PFX}.txt_hexa + +#echo "rom = [" +#cat /tmp/${PFX}.txt_hexa +#echo "]" diff 
--git a/sbus-to-ztex-gateware-migen/neorv32_trng_patched.vhd b/sbus-to-ztex-gateware-migen/neorv32_trng_patched.vhd new file mode 100644 index 0000000..5934a41 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/neorv32_trng_patched.vhd @@ -0,0 +1,382 @@ +-- # THIS IS NOT THE ORIGINAL FILE +-- # THIS WAS MODIFIED TO EXPOSE THE TRNG IN LITEX +-- # See the link in the copyright header to find the original file +-- +-- +-- ################################################################################################# +-- # << NEORV32 - True Random Number Generator (TRNG) >> # +-- # ********************************************************************************************* # +-- # This unit implements a *true* random number generator which uses several ring oscillators as # +-- # entropy source. The outputs of all chains are XORed and de-biased using a John von Neumann # +-- # randomness extractor. The de-biased signal is further processed by a simple LFSR for improved # +-- # whitening. # +-- # ********************************************************************************************* # +-- # BSD 3-Clause License # +-- # # +-- # Copyright (c) 2021, Stephan Nolting. All rights reserved. # +-- # # +-- # Redistribution and use in source and binary forms, with or without modification, are # +-- # permitted provided that the following conditions are met: # +-- # # +-- # 1. Redistributions of source code must retain the above copyright notice, this list of # +-- # conditions and the following disclaimer. # +-- # # +-- # 2. Redistributions in binary form must reproduce the above copyright notice, this list of # +-- # conditions and the following disclaimer in the documentation and/or other materials # +-- # provided with the distribution. # +-- # # +-- # 3. Neither the name of the copyright holder nor the names of its contributors may be used to # +-- # endorse or promote products derived from this software without specific prior written # +-- # permission. 
# +-- # # +-- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS # +-- # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # +-- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE # +-- # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # +-- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE # +-- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED # +-- # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # +-- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED # +-- # OF THE POSSIBILITY OF SUCH DAMAGE. # +-- # ********************************************************************************************* # +-- # The NEORV32 Processor - https://github.com/stnolting/neorv32 (c) Stephan Nolting # +-- ################################################################################################# + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library neorv32; +-- use neorv32.neorv32_package.all; + +entity neorv32_trng is + port ( + -- host access -- + clk_i : in std_ulogic; -- global clock line +-- addr_i : in std_ulogic_vector(31 downto 0); -- address + rden_i : in std_ulogic; -- read enable + wren_i : in std_ulogic; -- write enable + data_i : in std_ulogic_vector(31 downto 0); -- data in + data_o : out std_ulogic_vector(31 downto 0)--; -- data out +-- ack_o : out std_ulogic -- transfer acknowledge + ); +end neorv32_trng; + +architecture neorv32_trng_rtl of neorv32_trng is + + -- Advanced Configuration -------------------------------------------------------------------------------- + constant num_roscs_c : natural := 4; -- total number of ring oscillators + constant num_inv_start_c : natural := 5; -- number of 
inverters in FIRST ring oscillator (has to be odd) + constant num_inv_inc_c : natural := 2; -- number of inverters increment for each next ring oscillator (has to be even) + constant lfsr_en_c : boolean := true; -- use LFSR-based post-processing + constant lfsr_taps_c : std_ulogic_vector(7 downto 0) := "10111000"; -- Fibonacci post-processing LFSR feedback taps + -- ------------------------------------------------------------------------------------------------------- + + -- control register bits -- + constant ctrl_data_lsb_c : natural := 0; -- r/-: Random data byte LSB + constant ctrl_data_msb_c : natural := 7; -- r/-: Random data byte MSB + -- + constant ctrl_en_c : natural := 30; -- r/w: TRNG enable + constant ctrl_valid_c : natural := 31; -- r/-: Output data valid + + -- IO space: module base address -- +-- constant hi_abb_c : natural := index_size_f(io_size_c)-1; -- high address boundary bit +-- constant lo_abb_c : natural := index_size_f(trng_size_c); -- low address boundary bit + +-- XOR of all bits of <a> ('0' for a null range); adapted from rtl/core/neorv32_package.vhd + function xor_reduce_f(a : std_ulogic_vector) return std_ulogic is + variable tmp_v : std_ulogic; + begin + tmp_v := '0'; + if (a'low <= a'high) then -- non-empty range (one-element vectors included)
+ for i in a'low to a'high loop + tmp_v := tmp_v xor a(i); + end loop; -- i + end if; + return tmp_v; + end function xor_reduce_f; + + + -- Component: Ring-Oscillator -- + component neorv32_trng_ring_osc + generic ( + NUM_INV : natural := 16 -- number of inverters in chain + ); + port ( + clk_i : in std_ulogic; + enable_i : in std_ulogic; -- enable chain input + enable_o : out std_ulogic; -- enable chain output + data_o : out std_ulogic -- sync random bit + ); + end component; + + -- access control -- + signal acc_en : std_ulogic; -- module access enable +-- signal wren : std_ulogic; -- full word write enable +-- signal rden : std_ulogic; -- read enable + + -- ring-oscillator array -- + signal osc_array_en_in : std_ulogic_vector(num_roscs_c-1 downto 0); + signal osc_array_en_out : std_ulogic_vector(num_roscs_c-1 downto 0); + signal osc_array_data : std_ulogic_vector(num_roscs_c-1 downto 0); + + -- von-Neumann de-biasing -- + type debiasing_t is record + sreg : std_ulogic_vector(1 downto 0); + state : std_ulogic; -- process de-biasing every second cycle + valid : std_ulogic; -- de-biased data valid + data : std_ulogic; -- de-biased data + end record; + signal debiasing : debiasing_t; + + -- (post-)processing core -- + type processing_t is record + enable : std_ulogic; -- TRNG enable flag + cnt : std_ulogic_vector(3 downto 0); -- bit counter + sreg : std_ulogic_vector(7 downto 0); -- data shift register + output : std_ulogic_vector(7 downto 0); -- output register + valid : std_ulogic; -- data output valid flag + end record; + signal processing : processing_t; + +begin + + -- Sanity Checks -------------------------------------------------------------------------- + -- ------------------------------------------------------------------------------------------- + assert not (num_roscs_c = 0) report "NEORV32 PROCESSOR CONFIG ERROR: TRNG - Total number of ring-oscillators has to be >0."
severity error; + assert not ((num_inv_start_c mod 2) = 0) report "NEORV32 PROCESSOR CONFIG ERROR: TRNG - Number of inverters in first ring has to be odd." severity error; + assert not ((num_inv_inc_c mod 2) /= 0) report "NEORV32 PROCESSOR CONFIG ERROR: TRNG - Number of inverters increment for each next ring has to be even." severity error; + + + -- Access Control ------------------------------------------------------------------------- + -- ------------------------------------------------------------------------------------------- +-- acc_en <= '1' when (addr_i(hi_abb_c downto lo_abb_c) = trng_base_c(hi_abb_c downto lo_abb_c)) else '0'; +-- wren <= acc_en and wren_i; +-- rden <= acc_en and rden_i; + + -- Read/Write Access ---------------------------------------------------------------------- + -- ------------------------------------------------------------------------------------------- + rw_access: process(clk_i) + begin + if rising_edge(clk_i) then +-- ack_o <= wren_i or rden_i; + -- write access -- + if (wren_i = '1') then + processing.enable <= data_i(ctrl_en_c); + end if; + -- read access: data/enable/valid fields are captured on rden_i and held until the next read -- + data_o(ctrl_en_c-1 downto ctrl_data_msb_c+1) <= (others => '0'); -- reserved bits always read as zero (never left undriven) + if (rden_i = '1') then + data_o(ctrl_data_msb_c downto ctrl_data_lsb_c) <= processing.output; + data_o(ctrl_en_c) <= processing.enable; + data_o(ctrl_valid_c) <= processing.valid; + end if; + end if; + end process rw_access; + + + -- Entropy Source ------------------------------------------------------------------------- + -- ------------------------------------------------------------------------------------------- + neorv32_trng_ring_osc_inst: + for i in 0 to num_roscs_c-1 generate + neorv32_trng_ring_osc_inst_i: neorv32_trng_ring_osc + generic map ( + NUM_INV => num_inv_start_c + (i*num_inv_inc_c) -- number of inverters in chain + ) + port map ( + clk_i => clk_i, + enable_i => osc_array_en_in(i), + enable_o => osc_array_en_out(i), + data_o => osc_array_data(i) + ); + end generate; + + -- RO enable chain -- + array_intercon:
process(processing.enable, osc_array_en_out) + begin + for i in 0 to num_roscs_c-1 loop + if (i = 0) then -- start of enable chain + osc_array_en_in(i) <= processing.enable; + else + osc_array_en_in(i) <= osc_array_en_out(i-1); + end if; + end loop; -- i + end process array_intercon; + + + -- John von Neumann De-Biasing ------------------------------------------------------------ + -- ------------------------------------------------------------------------------------------- + neumann_debiasing_sync: process(clk_i) + begin + if rising_edge(clk_i) then + debiasing.sreg <= debiasing.sreg(debiasing.sreg'left-1 downto 0) & xor_reduce_f(osc_array_data); + debiasing.state <= (not debiasing.state) and osc_array_en_out(num_roscs_c-1); -- start toggling when last RO is enabled -> process in every second cycle + end if; + end process neumann_debiasing_sync; + + -- Edge detector -- + neumann_debiasing_comb: process(debiasing) + variable tmp_v : std_ulogic_vector(2 downto 0); + begin + -- check groups of two non-overlapping bits from the input stream + tmp_v := debiasing.state & debiasing.sreg; + case tmp_v is + when "101" => debiasing.valid <= '1'; debiasing.data <= '1'; -- rising edge -> '1' + when "110" => debiasing.valid <= '1'; debiasing.data <= '0'; -- falling edge -> '0' + when others => debiasing.valid <= '0'; debiasing.data <= '0'; -- no valid data + end case; + end process neumann_debiasing_comb; + + + -- Processing Core ------------------------------------------------------------------------ + -- ------------------------------------------------------------------------------------------- + processing_core: process(clk_i) + begin + if rising_edge(clk_i) then + -- sample random data bit and apply post-processing -- + if (processing.enable = '0') then + processing.cnt <= (others => '0'); + processing.sreg <= (others => '0'); + elsif (debiasing.valid = '1') then -- valid random sample? 
+ if (processing.cnt = "1000") then + processing.cnt <= (others => '0'); + else + processing.cnt <= std_ulogic_vector(unsigned(processing.cnt) + 1); + end if; + if (lfsr_en_c = true) then -- LFSR post-processing + processing.sreg <= processing.sreg(processing.sreg'left-1 downto 0) & ((not xor_reduce_f(processing.sreg and lfsr_taps_c)) xnor debiasing.data); + else -- NO post-processing + processing.sreg <= processing.sreg(processing.sreg'left-1 downto 0) & debiasing.data; + end if; + end if; + + -- data output register -- + if (processing.cnt = "1000") then + processing.output <= processing.sreg; + end if; + + -- data ready/valid flag -- + if (processing.cnt = "1000") then -- new sample ready? + processing.valid <= '1'; + elsif (processing.enable = '0') or (rden_i = '1') then -- clear when deactivated or on data read + processing.valid <= '0'; + end if; + end if; + end process processing_core; + + +end neorv32_trng_rtl; + + +-- ############################################################################################################################ +-- ############################################################################################################################ + + +-- ################################################################################################# +-- # << NEORV32 - True Random Number Generator (TRNG) - Ring-Oscillator-Based Entropy Source >> # +-- # ********************************************************************************************* # +-- # An inverter chain (ring oscillator) is used as entropy source. # +-- # The inverter chain is constructed as an "asynchronous" LFSR. The single inverters are # +-- # connected via latches that are used to enable/disable the TRNG. Also, these latches are used # +-- # as additional delay element. By using unique enable signals for each latch, the synthesis # +-- # tool cannot "optimize" (=remove) any of the inverters out of the design. 
Furthermore, the # +-- # latches prevent the synthesis tool from detecting combinatorial loops. # +-- # ********************************************************************************************* # +-- # BSD 3-Clause License # +-- # # +-- # Copyright (c) 2021, Stephan Nolting. All rights reserved. # +-- # # +-- # Redistribution and use in source and binary forms, with or without modification, are # +-- # permitted provided that the following conditions are met: # +-- # # +-- # 1. Redistributions of source code must retain the above copyright notice, this list of # +-- # conditions and the following disclaimer. # +-- # # +-- # 2. Redistributions in binary form must reproduce the above copyright notice, this list of # +-- # conditions and the following disclaimer in the documentation and/or other materials # +-- # provided with the distribution. # +-- # # +-- # 3. Neither the name of the copyright holder nor the names of its contributors may be used to # +-- # endorse or promote products derived from this software without specific prior written # +-- # permission. # +-- # # +-- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS # +-- # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # +-- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE # +-- # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # +-- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE # +-- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED # +-- # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # +-- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED # +-- # OF THE POSSIBILITY OF SUCH DAMAGE. 
# +-- # ********************************************************************************************* # +-- # The NEORV32 Processor - https://github.com/stnolting/neorv32 (c) Stephan Nolting # +-- ################################################################################################# + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library neorv32; +-- use neorv32.neorv32_package.all; + +entity neorv32_trng_ring_osc is + generic ( + NUM_INV : natural := 15 -- number of inverters in chain + ); + port ( + clk_i : in std_ulogic; + enable_i : in std_ulogic; -- enable chain input + enable_o : out std_ulogic; -- enable chain output + data_o : out std_ulogic -- sync random bit + ); +end neorv32_trng_ring_osc; + +architecture neorv32_trng_ring_osc_rtl of neorv32_trng_ring_osc is + + signal inv_chain : std_ulogic_vector(NUM_INV-1 downto 0); -- oscillator chain + signal enable_sreg : std_ulogic_vector(NUM_INV-1 downto 0); -- enable shift register + signal sync_ff : std_ulogic_vector(1 downto 0); -- output signal synchronizer + +begin + + -- Ring Oscillator ------------------------------------------------------------------------ + -- ------------------------------------------------------------------------------------------- + ring_osc: process(enable_i, enable_sreg, inv_chain) + begin + -- Using individual enable signals for each inverter - derived from a shift register - to prevent the synthesis tool + -- from removing all but one inverter (since they implement "logical identical functions"). + -- This also allows to make the TRNG platform independent. + for i in 0 to NUM_INV-1 loop -- inverters in chain + if (enable_i = '0') then -- start with a defined state (latch reset) + inv_chain(i) <= '0'; + elsif (enable_sreg(i) = '1') then + -- here we have the inverter chain -- + if (i = NUM_INV-1) then -- left-most inverter? 
+ inv_chain(i) <= not inv_chain(0); + else + inv_chain(i) <= not inv_chain(i+1); + end if; + end if; + end loop; -- i + end process ring_osc; + + + -- Control -------------------------------------------------------------------------------- + -- ------------------------------------------------------------------------------------------- + ctrl_unit: process(clk_i) + begin + if rising_edge(clk_i) then + enable_sreg <= enable_sreg(enable_sreg'left-1 downto 0) & enable_i; -- activate right-most inverter first + sync_ff <= sync_ff(0) & inv_chain(0); -- synchronize to prevent metastability + end if; + end process ctrl_unit; + + -- output for "enable chain" -- + enable_o <= enable_sreg(enable_sreg'left); + + -- rnd output -- + data_o <= sync_ff(1); + + +end neorv32_trng_ring_osc_rtl; diff --git a/sbus-to-ztex-gateware-migen/netbsd_csr.h b/sbus-to-ztex-gateware-migen/netbsd_csr.h new file mode 100644 index 0000000..01b3798 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/netbsd_csr.h @@ -0,0 +1,1125 @@ +//-------------------------------------------------------------------------------- +// Auto-generated by Migen (3ffd64c) & LiteX (8a644c90) on 2021-09-03 09:40:05 +//-------------------------------------------------------------------------------- +#ifndef __GENERATED_CSR_H +#define __GENERATED_CSR_H +#ifndef CSR_BASE +#define CSR_BASE 0x40000L +#endif + +/* leds */ +#ifndef CSR_LEDS_BASE +#define CSR_LEDS_BASE (CSR_BASE + 0x0L) +#define CSR_LEDS_OUT_ADDR (CSR_LEDS_BASE + 0x0L) +#define CSR_LEDS_OUT_SIZE 1 +static inline uint32_t leds_out_read(struct sbusfpga_leds_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_leds, 0x0L); +} +static inline void leds_out_write(struct sbusfpga_leds_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_leds, 0x0L, v); +} +#endif // CSR_LEDS_BASE + +/* curve25519engine */ +#ifndef CSR_CURVE25519ENGINE_BASE +#define CSR_CURVE25519ENGINE_BASE (CSR_BASE + 0x1000L) +#define CSR_CURVE25519ENGINE_WINDOW_ADDR 
(CSR_CURVE25519ENGINE_BASE + 0x0L) +#define CSR_CURVE25519ENGINE_WINDOW_SIZE 1 +static inline uint32_t curve25519engine_window_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x0L); +} +static inline void curve25519engine_window_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x0L, v); +} +#define CSR_CURVE25519ENGINE_WINDOW_WINDOW_OFFSET 0 +#define CSR_CURVE25519ENGINE_WINDOW_WINDOW_SIZE 4 +static inline uint32_t curve25519engine_window_window_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 4)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t curve25519engine_window_window_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_window_read(sc); + return curve25519engine_window_window_extract(sc, word); +} +static inline uint32_t curve25519engine_window_window_replace(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 4)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} +static inline void curve25519engine_window_window_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_window_read(sc); + uint32_t newword = curve25519engine_window_window_replace(sc, oldword, plain_value); + curve25519engine_window_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_MPSTART_ADDR (CSR_CURVE25519ENGINE_BASE + 0x4L) +#define CSR_CURVE25519ENGINE_MPSTART_SIZE 1 +static inline uint32_t curve25519engine_mpstart_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x4L); +} +static inline void curve25519engine_mpstart_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, 
sc->sc_bhregs_curve25519engine, 0x4L, v); +} +#define CSR_CURVE25519ENGINE_MPSTART_MPSTART_OFFSET 0 +#define CSR_CURVE25519ENGINE_MPSTART_MPSTART_SIZE 10 +static inline uint32_t curve25519engine_mpstart_mpstart_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 10)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t curve25519engine_mpstart_mpstart_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_mpstart_read(sc); + return curve25519engine_mpstart_mpstart_extract(sc, word); +} +static inline uint32_t curve25519engine_mpstart_mpstart_replace(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 10)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} +static inline void curve25519engine_mpstart_mpstart_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_mpstart_read(sc); + uint32_t newword = curve25519engine_mpstart_mpstart_replace(sc, oldword, plain_value); + curve25519engine_mpstart_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_MPLEN_ADDR (CSR_CURVE25519ENGINE_BASE + 0x8L) +#define CSR_CURVE25519ENGINE_MPLEN_SIZE 1 +static inline uint32_t curve25519engine_mplen_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x8L); +} +static inline void curve25519engine_mplen_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x8L, v); +} +#define CSR_CURVE25519ENGINE_MPLEN_MPLEN_OFFSET 0 +#define CSR_CURVE25519ENGINE_MPLEN_MPLEN_SIZE 10 +static inline uint32_t curve25519engine_mplen_mplen_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 10)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t 
curve25519engine_mplen_mplen_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_mplen_read(sc); + return curve25519engine_mplen_mplen_extract(sc, word); +} +static inline uint32_t curve25519engine_mplen_mplen_replace(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 10)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} +static inline void curve25519engine_mplen_mplen_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_mplen_read(sc); + uint32_t newword = curve25519engine_mplen_mplen_replace(sc, oldword, plain_value); + curve25519engine_mplen_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_CONTROL_ADDR (CSR_CURVE25519ENGINE_BASE + 0xcL) +#define CSR_CURVE25519ENGINE_CONTROL_SIZE 1 +static inline uint32_t curve25519engine_control_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0xcL); +} +static inline void curve25519engine_control_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0xcL, v); +} +#define CSR_CURVE25519ENGINE_CONTROL_GO_OFFSET 0 +#define CSR_CURVE25519ENGINE_CONTROL_GO_SIZE 1 +static inline uint32_t curve25519engine_control_go_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t curve25519engine_control_go_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_control_read(sc); + return curve25519engine_control_go_extract(sc, word); +} +static inline uint32_t curve25519engine_control_go_replace(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} 
+static inline void curve25519engine_control_go_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_control_read(sc); + uint32_t newword = curve25519engine_control_go_replace(sc, oldword, plain_value); + curve25519engine_control_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_MPRESUME_ADDR (CSR_CURVE25519ENGINE_BASE + 0x10L) +#define CSR_CURVE25519ENGINE_MPRESUME_SIZE 1 +static inline uint32_t curve25519engine_mpresume_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x10L); +} +#define CSR_CURVE25519ENGINE_MPRESUME_MPRESUME_OFFSET 0 +#define CSR_CURVE25519ENGINE_MPRESUME_MPRESUME_SIZE 10 +static inline uint32_t curve25519engine_mpresume_mpresume_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 10)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t curve25519engine_mpresume_mpresume_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_mpresume_read(sc); + return curve25519engine_mpresume_mpresume_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_POWER_ADDR (CSR_CURVE25519ENGINE_BASE + 0x14L) +#define CSR_CURVE25519ENGINE_POWER_SIZE 1 +static inline uint32_t curve25519engine_power_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x14L); +} +static inline void curve25519engine_power_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x14L, v); +} +#define CSR_CURVE25519ENGINE_POWER_ON_OFFSET 0 +#define CSR_CURVE25519ENGINE_POWER_ON_SIZE 1 +static inline uint32_t curve25519engine_power_on_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t 
curve25519engine_power_on_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_power_read(sc); + return curve25519engine_power_on_extract(sc, word); +} +static inline uint32_t curve25519engine_power_on_replace(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} +static inline void curve25519engine_power_on_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_power_read(sc); + uint32_t newword = curve25519engine_power_on_replace(sc, oldword, plain_value); + curve25519engine_power_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_POWER_PAUSE_REQ_OFFSET 1 +#define CSR_CURVE25519ENGINE_POWER_PAUSE_REQ_SIZE 1 +static inline uint32_t curve25519engine_power_pause_req_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 1) & mask ); +} +static inline uint32_t curve25519engine_power_pause_req_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_power_read(sc); + return curve25519engine_power_pause_req_extract(sc, word); +} +static inline uint32_t curve25519engine_power_pause_req_replace(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 1))) | (mask & plain_value)<< 1 ; +} +static inline void curve25519engine_power_pause_req_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_power_read(sc); + uint32_t newword = curve25519engine_power_pause_req_replace(sc, oldword, plain_value); + curve25519engine_power_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_STATUS_ADDR (CSR_CURVE25519ENGINE_BASE + 0x18L) +#define CSR_CURVE25519ENGINE_STATUS_SIZE 1 +static inline uint32_t 
curve25519engine_status_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x18L); +} +#define CSR_CURVE25519ENGINE_STATUS_RUNNING_OFFSET 0 +#define CSR_CURVE25519ENGINE_STATUS_RUNNING_SIZE 1 +static inline uint32_t curve25519engine_status_running_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t curve25519engine_status_running_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_status_read(sc); + return curve25519engine_status_running_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_STATUS_MPC_OFFSET 1 +#define CSR_CURVE25519ENGINE_STATUS_MPC_SIZE 10 +static inline uint32_t curve25519engine_status_mpc_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 10)-1); + return ( (oldword >> 1) & mask ); +} +static inline uint32_t curve25519engine_status_mpc_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_status_read(sc); + return curve25519engine_status_mpc_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_STATUS_PAUSE_GNT_OFFSET 11 +#define CSR_CURVE25519ENGINE_STATUS_PAUSE_GNT_SIZE 1 +static inline uint32_t curve25519engine_status_pause_gnt_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 11) & mask ); +} +static inline uint32_t curve25519engine_status_pause_gnt_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_status_read(sc); + return curve25519engine_status_pause_gnt_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_STATUS_SIGILL_OFFSET 12 +#define CSR_CURVE25519ENGINE_STATUS_SIGILL_SIZE 1 +static inline uint32_t curve25519engine_status_sigill_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + 
return ( (oldword >> 12) & mask ); +} +static inline uint32_t curve25519engine_status_sigill_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_status_read(sc); + return curve25519engine_status_sigill_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_STATUS_ABORT_OFFSET 13 +#define CSR_CURVE25519ENGINE_STATUS_ABORT_SIZE 1 +static inline uint32_t curve25519engine_status_abort_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 13) & mask ); +} +static inline uint32_t curve25519engine_status_abort_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_status_read(sc); + return curve25519engine_status_abort_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_STATUS_FINISHED_OFFSET 14 +#define CSR_CURVE25519ENGINE_STATUS_FINISHED_SIZE 1 +static inline uint32_t curve25519engine_status_finished_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 14) & mask ); +} +static inline uint32_t curve25519engine_status_finished_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_status_read(sc); + return curve25519engine_status_finished_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_EV_STATUS_ADDR (CSR_CURVE25519ENGINE_BASE + 0x1cL) +#define CSR_CURVE25519ENGINE_EV_STATUS_SIZE 1 +static inline uint32_t curve25519engine_ev_status_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x1cL); +} +#define CSR_CURVE25519ENGINE_EV_STATUS_FINISHED_OFFSET 0 +#define CSR_CURVE25519ENGINE_EV_STATUS_FINISHED_SIZE 1 +static inline uint32_t curve25519engine_ev_status_finished_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t 
curve25519engine_ev_status_finished_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_ev_status_read(sc); + return curve25519engine_ev_status_finished_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_EV_STATUS_ILLEGAL_OPCODE_OFFSET 1 +#define CSR_CURVE25519ENGINE_EV_STATUS_ILLEGAL_OPCODE_SIZE 1 +static inline uint32_t curve25519engine_ev_status_illegal_opcode_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 1) & mask ); +} +static inline uint32_t curve25519engine_ev_status_illegal_opcode_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_ev_status_read(sc); + return curve25519engine_ev_status_illegal_opcode_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_EV_PENDING_ADDR (CSR_CURVE25519ENGINE_BASE + 0x20L) +#define CSR_CURVE25519ENGINE_EV_PENDING_SIZE 1 +static inline uint32_t curve25519engine_ev_pending_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x20L); +} +static inline void curve25519engine_ev_pending_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x20L, v); +} +#define CSR_CURVE25519ENGINE_EV_PENDING_FINISHED_OFFSET 0 +#define CSR_CURVE25519ENGINE_EV_PENDING_FINISHED_SIZE 1 +static inline uint32_t curve25519engine_ev_pending_finished_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t curve25519engine_ev_pending_finished_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_ev_pending_read(sc); + return curve25519engine_ev_pending_finished_extract(sc, word); +} +static inline uint32_t curve25519engine_ev_pending_finished_replace(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t 
plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} +static inline void curve25519engine_ev_pending_finished_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_ev_pending_read(sc); + uint32_t newword = curve25519engine_ev_pending_finished_replace(sc, oldword, plain_value); + curve25519engine_ev_pending_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_EV_PENDING_ILLEGAL_OPCODE_OFFSET 1 +#define CSR_CURVE25519ENGINE_EV_PENDING_ILLEGAL_OPCODE_SIZE 1 +static inline uint32_t curve25519engine_ev_pending_illegal_opcode_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 1) & mask ); +} +static inline uint32_t curve25519engine_ev_pending_illegal_opcode_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_ev_pending_read(sc); + return curve25519engine_ev_pending_illegal_opcode_extract(sc, word); +} +static inline uint32_t curve25519engine_ev_pending_illegal_opcode_replace(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 1))) | (mask & plain_value)<< 1 ; +} +static inline void curve25519engine_ev_pending_illegal_opcode_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_ev_pending_read(sc); + uint32_t newword = curve25519engine_ev_pending_illegal_opcode_replace(sc, oldword, plain_value); + curve25519engine_ev_pending_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_EV_ENABLE_ADDR (CSR_CURVE25519ENGINE_BASE + 0x24L) +#define CSR_CURVE25519ENGINE_EV_ENABLE_SIZE 1 +static inline uint32_t curve25519engine_ev_enable_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x24L); +} +static inline void 
curve25519engine_ev_enable_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x24L, v); +} +#define CSR_CURVE25519ENGINE_EV_ENABLE_FINISHED_OFFSET 0 +#define CSR_CURVE25519ENGINE_EV_ENABLE_FINISHED_SIZE 1 +static inline uint32_t curve25519engine_ev_enable_finished_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t curve25519engine_ev_enable_finished_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_ev_enable_read(sc); + return curve25519engine_ev_enable_finished_extract(sc, word); +} +static inline uint32_t curve25519engine_ev_enable_finished_replace(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} +static inline void curve25519engine_ev_enable_finished_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_ev_enable_read(sc); + uint32_t newword = curve25519engine_ev_enable_finished_replace(sc, oldword, plain_value); + curve25519engine_ev_enable_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_EV_ENABLE_ILLEGAL_OPCODE_OFFSET 1 +#define CSR_CURVE25519ENGINE_EV_ENABLE_ILLEGAL_OPCODE_SIZE 1 +static inline uint32_t curve25519engine_ev_enable_illegal_opcode_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 1) & mask ); +} +static inline uint32_t curve25519engine_ev_enable_illegal_opcode_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_ev_enable_read(sc); + return curve25519engine_ev_enable_illegal_opcode_extract(sc, word); +} +static inline uint32_t curve25519engine_ev_enable_illegal_opcode_replace(struct 
sbusfpga_curve25519engine_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 1))) | (mask & plain_value)<< 1 ; +} +static inline void curve25519engine_ev_enable_illegal_opcode_write(struct sbusfpga_curve25519engine_softc *sc, uint32_t plain_value) { + uint32_t oldword = curve25519engine_ev_enable_read(sc); + uint32_t newword = curve25519engine_ev_enable_illegal_opcode_replace(sc, oldword, plain_value); + curve25519engine_ev_enable_write(sc, newword); +} +#define CSR_CURVE25519ENGINE_INSTRUCTION_ADDR (CSR_CURVE25519ENGINE_BASE + 0x28L) +#define CSR_CURVE25519ENGINE_INSTRUCTION_SIZE 1 +static inline uint32_t curve25519engine_instruction_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x28L); +} +#define CSR_CURVE25519ENGINE_INSTRUCTION_OPCODE_OFFSET 0 +#define CSR_CURVE25519ENGINE_INSTRUCTION_OPCODE_SIZE 6 +static inline uint32_t curve25519engine_instruction_opcode_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 6)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t curve25519engine_instruction_opcode_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_instruction_read(sc); + return curve25519engine_instruction_opcode_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_INSTRUCTION_RA_OFFSET 6 +#define CSR_CURVE25519ENGINE_INSTRUCTION_RA_SIZE 5 +static inline uint32_t curve25519engine_instruction_ra_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 5)-1); + return ( (oldword >> 6) & mask ); +} +static inline uint32_t curve25519engine_instruction_ra_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_instruction_read(sc); + return curve25519engine_instruction_ra_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_INSTRUCTION_CA_OFFSET 11 +#define 
CSR_CURVE25519ENGINE_INSTRUCTION_CA_SIZE 1 +static inline uint32_t curve25519engine_instruction_ca_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 11) & mask ); +} +static inline uint32_t curve25519engine_instruction_ca_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_instruction_read(sc); + return curve25519engine_instruction_ca_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_INSTRUCTION_RB_OFFSET 12 +#define CSR_CURVE25519ENGINE_INSTRUCTION_RB_SIZE 5 +static inline uint32_t curve25519engine_instruction_rb_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 5)-1); + return ( (oldword >> 12) & mask ); +} +static inline uint32_t curve25519engine_instruction_rb_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_instruction_read(sc); + return curve25519engine_instruction_rb_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_INSTRUCTION_CB_OFFSET 17 +#define CSR_CURVE25519ENGINE_INSTRUCTION_CB_SIZE 1 +static inline uint32_t curve25519engine_instruction_cb_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 17) & mask ); +} +static inline uint32_t curve25519engine_instruction_cb_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_instruction_read(sc); + return curve25519engine_instruction_cb_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_INSTRUCTION_WD_OFFSET 18 +#define CSR_CURVE25519ENGINE_INSTRUCTION_WD_SIZE 5 +static inline uint32_t curve25519engine_instruction_wd_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 5)-1); + return ( (oldword >> 18) & mask ); +} +static inline uint32_t curve25519engine_instruction_wd_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = 
curve25519engine_instruction_read(sc); + return curve25519engine_instruction_wd_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_INSTRUCTION_IMMEDIATE_OFFSET 23 +#define CSR_CURVE25519ENGINE_INSTRUCTION_IMMEDIATE_SIZE 9 +static inline uint32_t curve25519engine_instruction_immediate_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 9)-1); + return ( (oldword >> 23) & mask ); +} +static inline uint32_t curve25519engine_instruction_immediate_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_instruction_read(sc); + return curve25519engine_instruction_immediate_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_LS_STATUS_ADDR (CSR_CURVE25519ENGINE_BASE + 0x2cL) +#define CSR_CURVE25519ENGINE_LS_STATUS_SIZE 1 +static inline uint32_t curve25519engine_ls_status_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x2cL); +} +#endif // CSR_CURVE25519ENGINE_BASE + +/* ddrphy */ +#ifndef CSR_DDRPHY_BASE +#define CSR_DDRPHY_BASE (CSR_BASE + 0x2000L) +#define CSR_DDRPHY_RST_ADDR (CSR_DDRPHY_BASE + 0x0L) +#define CSR_DDRPHY_RST_SIZE 1 +static inline uint32_t ddrphy_rst_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x0L); +} +static inline void ddrphy_rst_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x0L, v); +} +#define CSR_DDRPHY_HALF_SYS8X_TAPS_ADDR (CSR_DDRPHY_BASE + 0x4L) +#define CSR_DDRPHY_HALF_SYS8X_TAPS_SIZE 1 +static inline uint32_t ddrphy_half_sys8x_taps_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x4L); +} +static inline void ddrphy_half_sys8x_taps_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x4L, v); +} +#define CSR_DDRPHY_WLEVEL_EN_ADDR (CSR_DDRPHY_BASE + 0x8L) 
+#define CSR_DDRPHY_WLEVEL_EN_SIZE 1 +static inline uint32_t ddrphy_wlevel_en_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x8L); +} +static inline void ddrphy_wlevel_en_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x8L, v); +} +#define CSR_DDRPHY_WLEVEL_STROBE_ADDR (CSR_DDRPHY_BASE + 0xcL) +#define CSR_DDRPHY_WLEVEL_STROBE_SIZE 1 +static inline uint32_t ddrphy_wlevel_strobe_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0xcL); +} +static inline void ddrphy_wlevel_strobe_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0xcL, v); +} +#define CSR_DDRPHY_DLY_SEL_ADDR (CSR_DDRPHY_BASE + 0x10L) +#define CSR_DDRPHY_DLY_SEL_SIZE 1 +static inline uint32_t ddrphy_dly_sel_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x10L); +} +static inline void ddrphy_dly_sel_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x10L, v); +} +#define CSR_DDRPHY_RDLY_DQ_RST_ADDR (CSR_DDRPHY_BASE + 0x14L) +#define CSR_DDRPHY_RDLY_DQ_RST_SIZE 1 +static inline uint32_t ddrphy_rdly_dq_rst_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x14L); +} +static inline void ddrphy_rdly_dq_rst_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x14L, v); +} +#define CSR_DDRPHY_RDLY_DQ_INC_ADDR (CSR_DDRPHY_BASE + 0x18L) +#define CSR_DDRPHY_RDLY_DQ_INC_SIZE 1 +static inline uint32_t ddrphy_rdly_dq_inc_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x18L); +} +static inline void ddrphy_rdly_dq_inc_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + 
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x18L, v); +} +#define CSR_DDRPHY_RDLY_DQ_BITSLIP_RST_ADDR (CSR_DDRPHY_BASE + 0x1cL) +#define CSR_DDRPHY_RDLY_DQ_BITSLIP_RST_SIZE 1 +static inline uint32_t ddrphy_rdly_dq_bitslip_rst_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x1cL); +} +static inline void ddrphy_rdly_dq_bitslip_rst_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x1cL, v); +} +#define CSR_DDRPHY_RDLY_DQ_BITSLIP_ADDR (CSR_DDRPHY_BASE + 0x20L) +#define CSR_DDRPHY_RDLY_DQ_BITSLIP_SIZE 1 +static inline uint32_t ddrphy_rdly_dq_bitslip_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x20L); +} +static inline void ddrphy_rdly_dq_bitslip_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x20L, v); +} +#define CSR_DDRPHY_WDLY_DQ_BITSLIP_RST_ADDR (CSR_DDRPHY_BASE + 0x24L) +#define CSR_DDRPHY_WDLY_DQ_BITSLIP_RST_SIZE 1 +static inline uint32_t ddrphy_wdly_dq_bitslip_rst_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x24L); +} +static inline void ddrphy_wdly_dq_bitslip_rst_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x24L, v); +} +#define CSR_DDRPHY_WDLY_DQ_BITSLIP_ADDR (CSR_DDRPHY_BASE + 0x28L) +#define CSR_DDRPHY_WDLY_DQ_BITSLIP_SIZE 1 +static inline uint32_t ddrphy_wdly_dq_bitslip_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x28L); +} +static inline void ddrphy_wdly_dq_bitslip_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x28L, v); +} +#define CSR_DDRPHY_RDPHASE_ADDR (CSR_DDRPHY_BASE + 0x2cL) +#define CSR_DDRPHY_RDPHASE_SIZE 1 +static inline uint32_t 
ddrphy_rdphase_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x2cL); +} +static inline void ddrphy_rdphase_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x2cL, v); +} +#define CSR_DDRPHY_WRPHASE_ADDR (CSR_DDRPHY_BASE + 0x30L) +#define CSR_DDRPHY_WRPHASE_SIZE 1 +static inline uint32_t ddrphy_wrphase_read(struct sbusfpga_ddrphy_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x30L); +} +static inline void ddrphy_wrphase_write(struct sbusfpga_ddrphy_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_ddrphy, 0x30L, v); +} +#endif // CSR_DDRPHY_BASE + +/* exchange_with_mem */ +#ifndef CSR_EXCHANGE_WITH_MEM_BASE +#define CSR_EXCHANGE_WITH_MEM_BASE (CSR_BASE + 0x3000L) +#define CSR_EXCHANGE_WITH_MEM_BLK_SIZE_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x0L) +#define CSR_EXCHANGE_WITH_MEM_BLK_SIZE_SIZE 1 +static inline uint32_t exchange_with_mem_blk_size_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x0L); +} +#define CSR_EXCHANGE_WITH_MEM_BLK_BASE_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x4L) +#define CSR_EXCHANGE_WITH_MEM_BLK_BASE_SIZE 1 +static inline uint32_t exchange_with_mem_blk_base_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x4L); +} +#define CSR_EXCHANGE_WITH_MEM_MEM_SIZE_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x8L) +#define CSR_EXCHANGE_WITH_MEM_MEM_SIZE_SIZE 1 +static inline uint32_t exchange_with_mem_mem_size_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x8L); +} +#define CSR_EXCHANGE_WITH_MEM_BLK_ADDR_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0xcL) +#define CSR_EXCHANGE_WITH_MEM_BLK_ADDR_SIZE 1 +static inline uint32_t exchange_with_mem_blk_addr_read(struct 
sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0xcL); +} +static inline void exchange_with_mem_blk_addr_write(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0xcL, v); +} +#define CSR_EXCHANGE_WITH_MEM_DMA_ADDR_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x10L) +#define CSR_EXCHANGE_WITH_MEM_DMA_ADDR_SIZE 1 +static inline uint32_t exchange_with_mem_dma_addr_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x10L); +} +static inline void exchange_with_mem_dma_addr_write(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x10L, v); +} +#define CSR_EXCHANGE_WITH_MEM_BLK_CNT_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x14L) +#define CSR_EXCHANGE_WITH_MEM_BLK_CNT_SIZE 1 +static inline uint32_t exchange_with_mem_blk_cnt_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x14L); +} +static inline void exchange_with_mem_blk_cnt_write(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x14L, v); +} +#define CSR_EXCHANGE_WITH_MEM_BLK_CNT_BLK_CNT_OFFSET 0 +#define CSR_EXCHANGE_WITH_MEM_BLK_CNT_BLK_CNT_SIZE 16 +static inline uint32_t exchange_with_mem_blk_cnt_blk_cnt_extract(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 16)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t exchange_with_mem_blk_cnt_blk_cnt_read(struct sbusfpga_exchange_with_mem_softc *sc) { + uint32_t word = exchange_with_mem_blk_cnt_read(sc); + return exchange_with_mem_blk_cnt_blk_cnt_extract(sc, word); +} +static inline uint32_t exchange_with_mem_blk_cnt_blk_cnt_replace(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t 
oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 16)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} +static inline void exchange_with_mem_blk_cnt_blk_cnt_write(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t plain_value) { + uint32_t oldword = exchange_with_mem_blk_cnt_read(sc); + uint32_t newword = exchange_with_mem_blk_cnt_blk_cnt_replace(sc, oldword, plain_value); + exchange_with_mem_blk_cnt_write(sc, newword); +} +#define CSR_EXCHANGE_WITH_MEM_BLK_CNT_RSVD_OFFSET 16 +#define CSR_EXCHANGE_WITH_MEM_BLK_CNT_RSVD_SIZE 15 +static inline uint32_t exchange_with_mem_blk_cnt_rsvd_extract(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 15)-1); + return ( (oldword >> 16) & mask ); +} +static inline uint32_t exchange_with_mem_blk_cnt_rsvd_read(struct sbusfpga_exchange_with_mem_softc *sc) { + uint32_t word = exchange_with_mem_blk_cnt_read(sc); + return exchange_with_mem_blk_cnt_rsvd_extract(sc, word); +} +static inline uint32_t exchange_with_mem_blk_cnt_rsvd_replace(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 15)-1); + return (oldword & (~(mask << 16))) | (mask & plain_value)<< 16 ; +} +static inline void exchange_with_mem_blk_cnt_rsvd_write(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t plain_value) { + uint32_t oldword = exchange_with_mem_blk_cnt_read(sc); + uint32_t newword = exchange_with_mem_blk_cnt_rsvd_replace(sc, oldword, plain_value); + exchange_with_mem_blk_cnt_write(sc, newword); +} +#define CSR_EXCHANGE_WITH_MEM_BLK_CNT_RD_WR_OFFSET 31 +#define CSR_EXCHANGE_WITH_MEM_BLK_CNT_RD_WR_SIZE 1 +static inline uint32_t exchange_with_mem_blk_cnt_rd_wr_extract(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 31) & mask ); +} +static inline uint32_t exchange_with_mem_blk_cnt_rd_wr_read(struct sbusfpga_exchange_with_mem_softc *sc) { + uint32_t word = 
exchange_with_mem_blk_cnt_read(sc); + return exchange_with_mem_blk_cnt_rd_wr_extract(sc, word); +} +static inline uint32_t exchange_with_mem_blk_cnt_rd_wr_replace(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 31))) | (mask & plain_value)<< 31 ; +} +static inline void exchange_with_mem_blk_cnt_rd_wr_write(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t plain_value) { + uint32_t oldword = exchange_with_mem_blk_cnt_read(sc); + uint32_t newword = exchange_with_mem_blk_cnt_rd_wr_replace(sc, oldword, plain_value); + exchange_with_mem_blk_cnt_write(sc, newword); +} +#define CSR_EXCHANGE_WITH_MEM_LAST_BLK_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x18L) +#define CSR_EXCHANGE_WITH_MEM_LAST_BLK_SIZE 1 +static inline uint32_t exchange_with_mem_last_blk_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x18L); +} +#define CSR_EXCHANGE_WITH_MEM_LAST_DMA_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x1cL) +#define CSR_EXCHANGE_WITH_MEM_LAST_DMA_SIZE 1 +static inline uint32_t exchange_with_mem_last_dma_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x1cL); +} +#define CSR_EXCHANGE_WITH_MEM_DMA_WRDONE_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x20L) +#define CSR_EXCHANGE_WITH_MEM_DMA_WRDONE_SIZE 1 +static inline uint32_t exchange_with_mem_dma_wrdone_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x20L); +} +#define CSR_EXCHANGE_WITH_MEM_BLK_REM_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x24L) +#define CSR_EXCHANGE_WITH_MEM_BLK_REM_SIZE 1 +static inline uint32_t exchange_with_mem_blk_rem_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x24L); +} +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_ADDR 
(CSR_EXCHANGE_WITH_MEM_BASE + 0x28L) +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_SIZE 1 +static inline uint32_t exchange_with_mem_dma_status_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x28L); +} +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_RD_FSM_BUSY_OFFSET 0 +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_RD_FSM_BUSY_SIZE 1 +static inline uint32_t exchange_with_mem_dma_status_rd_fsm_busy_extract(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t exchange_with_mem_dma_status_rd_fsm_busy_read(struct sbusfpga_exchange_with_mem_softc *sc) { + uint32_t word = exchange_with_mem_dma_status_read(sc); + return exchange_with_mem_dma_status_rd_fsm_busy_extract(sc, word); +} +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_WR_FSM_BUSY_OFFSET 1 +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_WR_FSM_BUSY_SIZE 1 +static inline uint32_t exchange_with_mem_dma_status_wr_fsm_busy_extract(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 1) & mask ); +} +static inline uint32_t exchange_with_mem_dma_status_wr_fsm_busy_read(struct sbusfpga_exchange_with_mem_softc *sc) { + uint32_t word = exchange_with_mem_dma_status_read(sc); + return exchange_with_mem_dma_status_wr_fsm_busy_extract(sc, word); +} +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_HAS_WR_DATA_OFFSET 2 +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_HAS_WR_DATA_SIZE 1 +static inline uint32_t exchange_with_mem_dma_status_has_wr_data_extract(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 2) & mask ); +} +static inline uint32_t exchange_with_mem_dma_status_has_wr_data_read(struct sbusfpga_exchange_with_mem_softc *sc) { + uint32_t word = exchange_with_mem_dma_status_read(sc); + return 
exchange_with_mem_dma_status_has_wr_data_extract(sc, word); +} +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_HAS_REQUESTS_OFFSET 3 +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_HAS_REQUESTS_SIZE 1 +static inline uint32_t exchange_with_mem_dma_status_has_requests_extract(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 3) & mask ); +} +static inline uint32_t exchange_with_mem_dma_status_has_requests_read(struct sbusfpga_exchange_with_mem_softc *sc) { + uint32_t word = exchange_with_mem_dma_status_read(sc); + return exchange_with_mem_dma_status_has_requests_extract(sc, word); +} +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_HAS_RD_DATA_OFFSET 4 +#define CSR_EXCHANGE_WITH_MEM_DMA_STATUS_HAS_RD_DATA_SIZE 1 +static inline uint32_t exchange_with_mem_dma_status_has_rd_data_extract(struct sbusfpga_exchange_with_mem_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 4) & mask ); +} +static inline uint32_t exchange_with_mem_dma_status_has_rd_data_read(struct sbusfpga_exchange_with_mem_softc *sc) { + uint32_t word = exchange_with_mem_dma_status_read(sc); + return exchange_with_mem_dma_status_has_rd_data_extract(sc, word); +} +#define CSR_EXCHANGE_WITH_MEM_WR_TOSDRAM_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x2cL) +#define CSR_EXCHANGE_WITH_MEM_WR_TOSDRAM_SIZE 1 +static inline uint32_t exchange_with_mem_wr_tosdram_read(struct sbusfpga_exchange_with_mem_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_exchange_with_mem, 0x2cL); +} +#define CSR_EXCHANGE_WITH_MEM_CHECKSUM_ADDR (CSR_EXCHANGE_WITH_MEM_BASE + 0x30L) +#define CSR_EXCHANGE_WITH_MEM_CHECKSUM_SIZE 8 +#endif // CSR_EXCHANGE_WITH_MEM_BASE + +/* sbus_bus_stat */ +#ifndef CSR_SBUS_BUS_STAT_BASE +#define CSR_SBUS_BUS_STAT_BASE (CSR_BASE + 0x4000L) +#define CSR_SBUS_BUS_STAT_STAT_CTRL_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x0L) +#define CSR_SBUS_BUS_STAT_STAT_CTRL_SIZE 1 +static inline uint32_t 
sbus_bus_stat_stat_ctrl_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x0L); +} +static inline void sbus_bus_stat_stat_ctrl_write(struct sbusfpga_sbus_bus_stat_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x0L, v); +} +#define CSR_SBUS_BUS_STAT_STAT_CTRL_UPDATE_OFFSET 0 +#define CSR_SBUS_BUS_STAT_STAT_CTRL_UPDATE_SIZE 1 +static inline uint32_t sbus_bus_stat_stat_ctrl_update_extract(struct sbusfpga_sbus_bus_stat_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 0) & mask ); +} +static inline uint32_t sbus_bus_stat_stat_ctrl_update_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + uint32_t word = sbus_bus_stat_stat_ctrl_read(sc); + return sbus_bus_stat_stat_ctrl_update_extract(sc, word); +} +static inline uint32_t sbus_bus_stat_stat_ctrl_update_replace(struct sbusfpga_sbus_bus_stat_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} +static inline void sbus_bus_stat_stat_ctrl_update_write(struct sbusfpga_sbus_bus_stat_softc *sc, uint32_t plain_value) { + uint32_t oldword = sbus_bus_stat_stat_ctrl_read(sc); + uint32_t newword = sbus_bus_stat_stat_ctrl_update_replace(sc, oldword, plain_value); + sbus_bus_stat_stat_ctrl_write(sc, newword); +} +#define CSR_SBUS_BUS_STAT_LIVE_STAT_CYCLE_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x4L) +#define CSR_SBUS_BUS_STAT_LIVE_STAT_CYCLE_COUNTER_SIZE 1 +static inline uint32_t sbus_bus_stat_live_stat_cycle_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x4L); +} +#define CSR_SBUS_BUS_STAT_STAT_CYCLE_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x8L) +#define CSR_SBUS_BUS_STAT_STAT_CYCLE_COUNTER_SIZE 1 +static inline uint32_t sbus_bus_stat_stat_cycle_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return 
bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x8L); +} +#define CSR_SBUS_BUS_STAT_STAT_SLAVE_START_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0xcL) +#define CSR_SBUS_BUS_STAT_STAT_SLAVE_START_COUNTER_SIZE 1 +static inline uint32_t sbus_bus_stat_stat_slave_start_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0xcL); +} +#define CSR_SBUS_BUS_STAT_STAT_SLAVE_DONE_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x10L) +#define CSR_SBUS_BUS_STAT_STAT_SLAVE_DONE_COUNTER_SIZE 1 +static inline uint32_t sbus_bus_stat_stat_slave_done_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x10L); +} +#define CSR_SBUS_BUS_STAT_STAT_SLAVE_RERUN_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x14L) +#define CSR_SBUS_BUS_STAT_STAT_SLAVE_RERUN_COUNTER_SIZE 1 +static inline uint32_t sbus_bus_stat_stat_slave_rerun_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x14L); +} +#define CSR_SBUS_BUS_STAT_STAT_SLAVE_EARLY_ERROR_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x18L) +#define CSR_SBUS_BUS_STAT_STAT_SLAVE_EARLY_ERROR_COUNTER_SIZE 1 +static inline uint32_t sbus_bus_stat_stat_slave_early_error_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x18L); +} +#define CSR_SBUS_BUS_STAT_STAT_MASTER_START_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x1cL) +#define CSR_SBUS_BUS_STAT_STAT_MASTER_START_COUNTER_SIZE 1 +static inline uint32_t sbus_bus_stat_stat_master_start_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x1cL); +} +#define CSR_SBUS_BUS_STAT_STAT_MASTER_DONE_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x20L) +#define CSR_SBUS_BUS_STAT_STAT_MASTER_DONE_COUNTER_SIZE 1 +static inline uint32_t 
sbus_bus_stat_stat_master_done_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x20L); +} +#define CSR_SBUS_BUS_STAT_STAT_MASTER_ERROR_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x24L) +#define CSR_SBUS_BUS_STAT_STAT_MASTER_ERROR_COUNTER_SIZE 1 +static inline uint32_t sbus_bus_stat_stat_master_error_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x24L); +} +#define CSR_SBUS_BUS_STAT_STAT_MASTER_RERUN_COUNTER_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x28L) +#define CSR_SBUS_BUS_STAT_STAT_MASTER_RERUN_COUNTER_SIZE 1 +static inline uint32_t sbus_bus_stat_stat_master_rerun_counter_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x28L); +} +#define CSR_SBUS_BUS_STAT_SBUS_MASTER_ERROR_VIRTUAL_ADDR (CSR_SBUS_BUS_STAT_BASE + 0x2cL) +#define CSR_SBUS_BUS_STAT_SBUS_MASTER_ERROR_VIRTUAL_SIZE 1 +static inline uint32_t sbus_bus_stat_sbus_master_error_virtual_read(struct sbusfpga_sbus_bus_stat_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sbus_bus_stat, 0x2cL); +} +#endif // CSR_SBUS_BUS_STAT_BASE + +/* sdram */ +#ifndef CSR_SDRAM_BASE +#define CSR_SDRAM_BASE (CSR_BASE + 0x5000L) +#define CSR_SDRAM_DFII_CONTROL_ADDR (CSR_SDRAM_BASE + 0x0L) +#define CSR_SDRAM_DFII_CONTROL_SIZE 1 +static inline uint32_t sdram_dfii_control_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x0L); +} +static inline void sdram_dfii_control_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x0L, v); +} +#define CSR_SDRAM_DFII_CONTROL_SEL_OFFSET 0 +#define CSR_SDRAM_DFII_CONTROL_SEL_SIZE 1 +static inline uint32_t sdram_dfii_control_sel_extract(struct sbusfpga_sdram_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 0) & mask ); 
+} +static inline uint32_t sdram_dfii_control_sel_read(struct sbusfpga_sdram_softc *sc) { + uint32_t word = sdram_dfii_control_read(sc); + return sdram_dfii_control_sel_extract(sc, word); +} +static inline uint32_t sdram_dfii_control_sel_replace(struct sbusfpga_sdram_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 0))) | (mask & plain_value)<< 0 ; +} +static inline void sdram_dfii_control_sel_write(struct sbusfpga_sdram_softc *sc, uint32_t plain_value) { + uint32_t oldword = sdram_dfii_control_read(sc); + uint32_t newword = sdram_dfii_control_sel_replace(sc, oldword, plain_value); + sdram_dfii_control_write(sc, newword); +} +#define CSR_SDRAM_DFII_CONTROL_CKE_OFFSET 1 +#define CSR_SDRAM_DFII_CONTROL_CKE_SIZE 1 +static inline uint32_t sdram_dfii_control_cke_extract(struct sbusfpga_sdram_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 1) & mask ); +} +static inline uint32_t sdram_dfii_control_cke_read(struct sbusfpga_sdram_softc *sc) { + uint32_t word = sdram_dfii_control_read(sc); + return sdram_dfii_control_cke_extract(sc, word); +} +static inline uint32_t sdram_dfii_control_cke_replace(struct sbusfpga_sdram_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 1))) | (mask & plain_value)<< 1 ; +} +static inline void sdram_dfii_control_cke_write(struct sbusfpga_sdram_softc *sc, uint32_t plain_value) { + uint32_t oldword = sdram_dfii_control_read(sc); + uint32_t newword = sdram_dfii_control_cke_replace(sc, oldword, plain_value); + sdram_dfii_control_write(sc, newword); +} +#define CSR_SDRAM_DFII_CONTROL_ODT_OFFSET 2 +#define CSR_SDRAM_DFII_CONTROL_ODT_SIZE 1 +static inline uint32_t sdram_dfii_control_odt_extract(struct sbusfpga_sdram_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 2) & mask ); +} +static inline uint32_t sdram_dfii_control_odt_read(struct 
sbusfpga_sdram_softc *sc) { + uint32_t word = sdram_dfii_control_read(sc); + return sdram_dfii_control_odt_extract(sc, word); +} +static inline uint32_t sdram_dfii_control_odt_replace(struct sbusfpga_sdram_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 2))) | (mask & plain_value)<< 2 ; +} +static inline void sdram_dfii_control_odt_write(struct sbusfpga_sdram_softc *sc, uint32_t plain_value) { + uint32_t oldword = sdram_dfii_control_read(sc); + uint32_t newword = sdram_dfii_control_odt_replace(sc, oldword, plain_value); + sdram_dfii_control_write(sc, newword); +} +#define CSR_SDRAM_DFII_CONTROL_RESET_N_OFFSET 3 +#define CSR_SDRAM_DFII_CONTROL_RESET_N_SIZE 1 +static inline uint32_t sdram_dfii_control_reset_n_extract(struct sbusfpga_sdram_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 3) & mask ); +} +static inline uint32_t sdram_dfii_control_reset_n_read(struct sbusfpga_sdram_softc *sc) { + uint32_t word = sdram_dfii_control_read(sc); + return sdram_dfii_control_reset_n_extract(sc, word); +} +static inline uint32_t sdram_dfii_control_reset_n_replace(struct sbusfpga_sdram_softc *sc, uint32_t oldword, uint32_t plain_value) { + uint32_t mask = ((1 << 1)-1); + return (oldword & (~(mask << 3))) | (mask & plain_value)<< 3 ; +} +static inline void sdram_dfii_control_reset_n_write(struct sbusfpga_sdram_softc *sc, uint32_t plain_value) { + uint32_t oldword = sdram_dfii_control_read(sc); + uint32_t newword = sdram_dfii_control_reset_n_replace(sc, oldword, plain_value); + sdram_dfii_control_write(sc, newword); +} +#define CSR_SDRAM_DFII_PI0_COMMAND_ADDR (CSR_SDRAM_BASE + 0x4L) +#define CSR_SDRAM_DFII_PI0_COMMAND_SIZE 1 +static inline uint32_t sdram_dfii_pi0_command_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x4L); +} +static inline void sdram_dfii_pi0_command_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + 
bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x4L, v); +} +#define CSR_SDRAM_DFII_PI0_COMMAND_ISSUE_ADDR (CSR_SDRAM_BASE + 0x8L) +#define CSR_SDRAM_DFII_PI0_COMMAND_ISSUE_SIZE 1 +static inline uint32_t sdram_dfii_pi0_command_issue_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x8L); +} +static inline void sdram_dfii_pi0_command_issue_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x8L, v); +} +#define CSR_SDRAM_DFII_PI0_ADDRESS_ADDR (CSR_SDRAM_BASE + 0xcL) +#define CSR_SDRAM_DFII_PI0_ADDRESS_SIZE 1 +static inline uint32_t sdram_dfii_pi0_address_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0xcL); +} +static inline void sdram_dfii_pi0_address_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0xcL, v); +} +#define CSR_SDRAM_DFII_PI0_BADDRESS_ADDR (CSR_SDRAM_BASE + 0x10L) +#define CSR_SDRAM_DFII_PI0_BADDRESS_SIZE 1 +static inline uint32_t sdram_dfii_pi0_baddress_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x10L); +} +static inline void sdram_dfii_pi0_baddress_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x10L, v); +} +#define CSR_SDRAM_DFII_PI0_WRDATA_ADDR (CSR_SDRAM_BASE + 0x14L) +#define CSR_SDRAM_DFII_PI0_WRDATA_SIZE 1 +static inline uint32_t sdram_dfii_pi0_wrdata_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x14L); +} +static inline void sdram_dfii_pi0_wrdata_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x14L, v); +} +#define CSR_SDRAM_DFII_PI0_RDDATA_ADDR (CSR_SDRAM_BASE + 0x18L) +#define CSR_SDRAM_DFII_PI0_RDDATA_SIZE 1 +static inline uint32_t sdram_dfii_pi0_rddata_read(struct 
sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x18L); +} +#define CSR_SDRAM_DFII_PI1_COMMAND_ADDR (CSR_SDRAM_BASE + 0x1cL) +#define CSR_SDRAM_DFII_PI1_COMMAND_SIZE 1 +static inline uint32_t sdram_dfii_pi1_command_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x1cL); +} +static inline void sdram_dfii_pi1_command_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x1cL, v); +} +#define CSR_SDRAM_DFII_PI1_COMMAND_ISSUE_ADDR (CSR_SDRAM_BASE + 0x20L) +#define CSR_SDRAM_DFII_PI1_COMMAND_ISSUE_SIZE 1 +static inline uint32_t sdram_dfii_pi1_command_issue_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x20L); +} +static inline void sdram_dfii_pi1_command_issue_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x20L, v); +} +#define CSR_SDRAM_DFII_PI1_ADDRESS_ADDR (CSR_SDRAM_BASE + 0x24L) +#define CSR_SDRAM_DFII_PI1_ADDRESS_SIZE 1 +static inline uint32_t sdram_dfii_pi1_address_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x24L); +} +static inline void sdram_dfii_pi1_address_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x24L, v); +} +#define CSR_SDRAM_DFII_PI1_BADDRESS_ADDR (CSR_SDRAM_BASE + 0x28L) +#define CSR_SDRAM_DFII_PI1_BADDRESS_SIZE 1 +static inline uint32_t sdram_dfii_pi1_baddress_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x28L); +} +static inline void sdram_dfii_pi1_baddress_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x28L, v); +} +#define CSR_SDRAM_DFII_PI1_WRDATA_ADDR (CSR_SDRAM_BASE + 0x2cL) +#define CSR_SDRAM_DFII_PI1_WRDATA_SIZE 1 +static inline uint32_t 
sdram_dfii_pi1_wrdata_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x2cL); +} +static inline void sdram_dfii_pi1_wrdata_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x2cL, v); +} +#define CSR_SDRAM_DFII_PI1_RDDATA_ADDR (CSR_SDRAM_BASE + 0x30L) +#define CSR_SDRAM_DFII_PI1_RDDATA_SIZE 1 +static inline uint32_t sdram_dfii_pi1_rddata_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x30L); +} +#define CSR_SDRAM_DFII_PI2_COMMAND_ADDR (CSR_SDRAM_BASE + 0x34L) +#define CSR_SDRAM_DFII_PI2_COMMAND_SIZE 1 +static inline uint32_t sdram_dfii_pi2_command_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x34L); +} +static inline void sdram_dfii_pi2_command_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x34L, v); +} +#define CSR_SDRAM_DFII_PI2_COMMAND_ISSUE_ADDR (CSR_SDRAM_BASE + 0x38L) +#define CSR_SDRAM_DFII_PI2_COMMAND_ISSUE_SIZE 1 +static inline uint32_t sdram_dfii_pi2_command_issue_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x38L); +} +static inline void sdram_dfii_pi2_command_issue_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x38L, v); +} +#define CSR_SDRAM_DFII_PI2_ADDRESS_ADDR (CSR_SDRAM_BASE + 0x3cL) +#define CSR_SDRAM_DFII_PI2_ADDRESS_SIZE 1 +static inline uint32_t sdram_dfii_pi2_address_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x3cL); +} +static inline void sdram_dfii_pi2_address_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x3cL, v); +} +#define CSR_SDRAM_DFII_PI2_BADDRESS_ADDR (CSR_SDRAM_BASE + 0x40L) +#define 
CSR_SDRAM_DFII_PI2_BADDRESS_SIZE 1 +static inline uint32_t sdram_dfii_pi2_baddress_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x40L); +} +static inline void sdram_dfii_pi2_baddress_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x40L, v); +} +#define CSR_SDRAM_DFII_PI2_WRDATA_ADDR (CSR_SDRAM_BASE + 0x44L) +#define CSR_SDRAM_DFII_PI2_WRDATA_SIZE 1 +static inline uint32_t sdram_dfii_pi2_wrdata_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x44L); +} +static inline void sdram_dfii_pi2_wrdata_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x44L, v); +} +#define CSR_SDRAM_DFII_PI2_RDDATA_ADDR (CSR_SDRAM_BASE + 0x48L) +#define CSR_SDRAM_DFII_PI2_RDDATA_SIZE 1 +static inline uint32_t sdram_dfii_pi2_rddata_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x48L); +} +#define CSR_SDRAM_DFII_PI3_COMMAND_ADDR (CSR_SDRAM_BASE + 0x4cL) +#define CSR_SDRAM_DFII_PI3_COMMAND_SIZE 1 +static inline uint32_t sdram_dfii_pi3_command_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x4cL); +} +static inline void sdram_dfii_pi3_command_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x4cL, v); +} +#define CSR_SDRAM_DFII_PI3_COMMAND_ISSUE_ADDR (CSR_SDRAM_BASE + 0x50L) +#define CSR_SDRAM_DFII_PI3_COMMAND_ISSUE_SIZE 1 +static inline uint32_t sdram_dfii_pi3_command_issue_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x50L); +} +static inline void sdram_dfii_pi3_command_issue_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x50L, v); +} +#define CSR_SDRAM_DFII_PI3_ADDRESS_ADDR 
(CSR_SDRAM_BASE + 0x54L) +#define CSR_SDRAM_DFII_PI3_ADDRESS_SIZE 1 +static inline uint32_t sdram_dfii_pi3_address_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x54L); +} +static inline void sdram_dfii_pi3_address_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x54L, v); +} +#define CSR_SDRAM_DFII_PI3_BADDRESS_ADDR (CSR_SDRAM_BASE + 0x58L) +#define CSR_SDRAM_DFII_PI3_BADDRESS_SIZE 1 +static inline uint32_t sdram_dfii_pi3_baddress_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x58L); +} +static inline void sdram_dfii_pi3_baddress_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x58L, v); +} +#define CSR_SDRAM_DFII_PI3_WRDATA_ADDR (CSR_SDRAM_BASE + 0x5cL) +#define CSR_SDRAM_DFII_PI3_WRDATA_SIZE 1 +static inline uint32_t sdram_dfii_pi3_wrdata_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x5cL); +} +static inline void sdram_dfii_pi3_wrdata_write(struct sbusfpga_sdram_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x5cL, v); +} +#define CSR_SDRAM_DFII_PI3_RDDATA_ADDR (CSR_SDRAM_BASE + 0x60L) +#define CSR_SDRAM_DFII_PI3_RDDATA_SIZE 1 +static inline uint32_t sdram_dfii_pi3_rddata_read(struct sbusfpga_sdram_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_sdram, 0x60L); +} +#endif // CSR_SDRAM_BASE + +/* trng */ +#ifndef CSR_TRNG_BASE +#define CSR_TRNG_BASE (CSR_BASE + 0x6000L) +#define CSR_TRNG_CTRL_ADDR (CSR_TRNG_BASE + 0x0L) +#define CSR_TRNG_CTRL_SIZE 1 +static inline uint32_t trng_ctrl_read(struct sbusfpga_trng_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_trng, 0x0L); +} +static inline void trng_ctrl_write(struct sbusfpga_trng_softc *sc, uint32_t v) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_trng, 
0x0L, v); +} +#define CSR_TRNG_DATA_ADDR (CSR_TRNG_BASE + 0x4L) +#define CSR_TRNG_DATA_SIZE 1 +static inline uint32_t trng_data_read(struct sbusfpga_trng_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_trng, 0x4L); +} +#endif // CSR_TRNG_BASE + +#endif diff --git a/sbus-to-ztex-gateware-migen/prom_csr.fth b/sbus-to-ztex-gateware-migen/prom_csr.fth new file mode 100644 index 0000000..39cc271 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/prom_csr.fth @@ -0,0 +1,14 @@ +\ auto-generated base regions for CSRs in the PROM +h# 40000 constant sbusfpga_csraddr_leds +h# 41000 constant sbusfpga_csraddr_curve25519engine +h# 42000 constant sbusfpga_csraddr_ddrphy +h# 43000 constant sbusfpga_csraddr_exchange_with_mem +h# 44000 constant sbusfpga_csraddr_sbus_bus_stat +h# 45000 constant sbusfpga_csraddr_sdram +h# 46000 constant sbusfpga_csraddr_trng +h# 80000 constant sbusfpga_regionaddr_usb_host_ctrl +h# 0 constant sbusfpga_regionaddr_prom +h# 80000000 constant sbusfpga_regionaddr_main_ram +h# fc000000 constant sbusfpga_regionaddr_usb_fake_dma +h# a0000 constant sbusfpga_regionaddr_curve25519engine +h# 40000 constant sbusfpga_regionaddr_csr diff --git a/sbus-to-ztex-gateware-migen/prom_migen.bth b/sbus-to-ztex-gateware-migen/prom_migen.bth new file mode 100644 index 0000000..2285c83 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/prom_migen.bth @@ -0,0 +1,31 @@ +purpose: Load file for SBusFPGA + +command: &builder &this + +\ in: ${BP}/dev/usb2/device/hub/build/hub.fc +\ in: ${BP}/dev/usb2/device/generic/build/generic.fc +\ in: ${BP}/dev/usb2/device/net/build/usbnet.fc +\ in: ${BP}/dev/usb2/device/serial/build/usbserial.fc +\ in: ${BP}/dev/usb2/device/storage/build/usbstorage.fc +\ in: ${BP}/dev/usb2/device/keyboard/build/usbkbd.fc +\ in: ${BP}/dev/usb2/device/mouse/build/usbmouse.fc + +build-now + +\ silent on + +begin-tokenizing prom_migen.fc + +fload prom_migen.fth + +end-tokenizing + +\ h# 8000 to reserved-start +\ h# f000 to reserved-end +\ " 
${BP}/dev/usb2/device/hub/build/hub.fc" " usb,class9" $add-dropin +\ " ${BP}/dev/usb2/device/generic/build/generic.fc" " usbdevice" $add-deflated-dropin +\ " ${BP}/dev/usb2/device/net/build/usbnet.fc" " usbnet" $add-deflated-dropin +\ " ${BP}/dev/usb2/device/keyboard/build/usbkbd.fc" " usb,class3,1,1" $add-deflated-dropin +\ " ${BP}/dev/usb2/device/mouse/build/usbmouse.fc" " usb,class3,1,2" $add-deflated-dropin +\ " ${BP}/dev/usb2/device/serial/build/usbserial.fc" " usbserial" $add-deflated-dropin +\ " ${BP}/dev/usb2/device/storage/build/usbstorage.fc" " usbstorage" $add-deflated-dropin diff --git a/sbus-to-ztex-gateware-migen/prom_migen.fth b/sbus-to-ztex-gateware-migen/prom_migen.fth new file mode 100644 index 0000000..a85283b --- /dev/null +++ b/sbus-to-ztex-gateware-migen/prom_migen.fth @@ -0,0 +1,247 @@ +fcode-version2 + +\ loads constants +fload prom_csr.fth + +\ fload v2compat.fth + +\ \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ LEDs +\ Absolute minimal stuff; name & registers def. +" RDOL,led" device-name +my-address sbusfpga_csraddr_leds + my-space h# 4 reg +\ we don't support ET or HWORD +h# 7d xdrint " slave-burst-sizes" attribute +h# 7d xdrint " burst-sizes" attribute + +headers +-1 instance value led-virt +my-address constant my-sbus-address +my-space constant my-sbus-space + +: map-in ( adr space size -- virt ) " map-in" $call-parent ; +: map-out ( virt size -- ) " map-out" $call-parent ; + +: map-in-led ( -- ) my-sbus-address sbusfpga_csraddr_leds + my-sbus-space h# 4 map-in is led-virt ; +: map-out-led ( -- ) led-virt h# 4 map-out ; + +: setled! ( pattern -- ) + map-in-led + led-virt l! ( pattern virt -- ) + map-out-led +; + +\ h# a5 setled! 
+
+\ OpenBIOS tokenizer won't accept finish-device without new-device
+\ Cheat by using the tokenizer so we can do OpenBoot 2.x siblings
+\ tokenizer[ 01 emit-byte h# 27 emit-byte h# 01 emit-byte h# 1f emit-byte ]tokenizer
+\ The OpenFirmware tokenizer does accept the 'clean' syntax
+finish-device \ ends the preceding (LEDs) device node
+\ \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ USB OHCI
+new-device \ begins the node describing the USB OHCI host controller
+
+\ Absolute minimal stuff; name & registers def.
+" generic-ohci" device-name \ same string that ohci_sbus_match() compares against sa_name
+
+\ USB registers are in the device space, not the CSR space (region base h# 80000 in prom_csr.fth, h# 1000 bytes long)
+my-address sbusfpga_regionaddr_usb_host_ctrl + my-space h# 1000 reg
+\ we don't support ET or anything non-32bits (h# 7c has bits 2..6 set and bits 0..1 clear)
+h# 7c xdrint " slave-burst-sizes" attribute
+h# 7c xdrint " burst-sizes" attribute
+
+1 xdrint " interrupts" attribute \ presumably the SBus interrupt level used by the controller - TODO confirm against the gateware
+
+headers
+-1 instance value regs-virt \ virtual address of the mapped registers; stays -1 until map-in-regs stores into it
+my-address constant my-sbus-address
+my-space constant my-sbus-space
+
+: map-in ( adr space size -- virt ) " map-in" $call-parent ; \ forwards to the parent node's map-in method
+: map-out ( virt size -- ) " map-out" $call-parent ; \ forwards to the parent node's map-out method
+
+: map-in-regs ( -- ) my-sbus-address sbusfpga_regionaddr_usb_host_ctrl + my-sbus-space h# 1000 map-in is regs-virt ; \ maps the h# 1000-byte register window and records it in regs-virt
+: map-out-regs ( -- ) regs-virt h# 1000 map-out ; \ unmaps the window; regs-virt itself is left unchanged
+
+: my-reset! ( -- ) \ maps the registers, writes 1 at offset 4, zeroes offsets h# 18..h# 30, then unmaps
+ map-in-regs
+ 00000001 regs-virt h# 4 + l! ( -- ) ( reset the HC ) \ NOTE(review): offset 4 is HcControl in the OHCI 1.0a register map, so value 1 leaves HCFS at 0 (UsbReset); the HCR software-reset bit is in HcCommandStatus at offset 8 - confirm which reset is intended
+ 00000000 regs-virt h# 18 + l! ( -- ) ( reset HCCA & friends ) ( h# 18 is HcHCCA per the OHCI 1.0a register map )
+ 00000000 regs-virt h# 1c + l! ( -- ) ( HcPeriodCurrentED per OHCI 1.0a )
+ 00000000 regs-virt h# 20 + l! ( -- ) ( HcControlHeadED per OHCI 1.0a )
+ 00000000 regs-virt h# 24 + l! ( -- ) ( HcControlCurrentED per OHCI 1.0a )
+ 00000000 regs-virt h# 28 + l! ( -- ) ( HcBulkHeadED per OHCI 1.0a )
+ 00000000 regs-virt h# 2c + l! ( -- ) ( HcBulkCurrentED per OHCI 1.0a )
+ 00000000 regs-virt h# 30 + l! ( -- ) ( HcDoneHead per OHCI 1.0a )
+ map-out-regs
+;
+
+my-reset! \ executed while the FCode is being evaluated, i.e. at probe time
+ +\ " ohci" encode-string " device_type" property +\ fload openfirmware/dev/usb2/hcd/ohci/loadpkg-sbus.fth +\ open + +\ OpenBIOS tokenizer won't accept finish-device without new-device +\ Cheat by using the tokenizer so we can do OpenBoot 2.x siblings +\ tokenizer[ 01 emit-byte h# 27 emit-byte h# 01 emit-byte h# 1f emit-byte ]tokenizer +\ The OpenFirmware tokenizer does accept the 'clean' syntax +finish-device +\ \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ SDRAM +new-device + +\ Absolute minimal stuff; name & registers def. +" RDOL,sdram" device-name +\ three pages of registers: +my-address sbusfpga_csraddr_ddrphy + my-space xdrphys \ Offset#1 +h# 1000 xdrint xdr+ \ Merge size#1 +my-address sbusfpga_csraddr_sdram + my-space xdrphys xdr+ \ Merge offset#2 +h# 1000 xdrint xdr+ \ Merge size#2 +my-address sbusfpga_csraddr_exchange_with_mem + my-space xdrphys xdr+ \ Merge offset#3 +h# 1000 xdrint xdr+ \ Merge size#3 +\ my-address sbusfpga_regionaddr_main_ram + my-space xdrphys xdr+ \ Merge offset#4 +\ h# 10000 xdrint xdr+ \ Merge size#4 +" reg" attribute + +\ we don't support ET or anything non-32bits +h# 7c xdrint " slave-burst-sizes" attribute +h# 7c xdrint " burst-sizes" attribute + +headers +-1 instance value mregs-ddrphy-virt +-1 instance value mregs-sdram-virt +-1 instance value mregs-exchange_with_mem-virt +my-address constant my-sbus-address +my-space constant my-sbus-space +: map-in ( adr space size -- virt ) " map-in" $call-parent ; +: map-out ( virt size -- ) " map-out" $call-parent ; + +: map-in-mregs ( -- ) + my-sbus-address sbusfpga_csraddr_ddrphy + my-sbus-space h# 1000 map-in is mregs-ddrphy-virt + my-sbus-address sbusfpga_csraddr_sdram + my-sbus-space h# 1000 map-in is mregs-sdram-virt + my-sbus-address sbusfpga_csraddr_exchange_with_mem + my-sbus-space h# 1000 map-in is mregs-exchange_with_mem-virt +; +: map-out-mregs ( -- ) + mregs-ddrphy-virt h# 1000 map-out + mregs-sdram-virt h# 1000 
map-out + mregs-exchange_with_mem-virt h# 1000 map-out +; + +\ fload sdram_init.fth + +\ init! + + +\ OpenBIOS tokenizer won't accept finish-device without new-device +\ Cheat by using the tokenizer so we can do OpenBoot 2.x siblings +\ tokenizer[ 01 emit-byte h# 27 emit-byte h# 01 emit-byte h# 1f emit-byte ]tokenizer +\ The OpenFirmware tokenizer does accept the 'clean' syntax +finish-device +\ \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ TRNG +new-device + +\ Absolute minimal stuff; name & registers def. +" RDOL,neorv32trng" device-name + +my-address sbusfpga_csraddr_trng + my-space h# 8 reg +\ we don't support ET or HWORD +h# 7d xdrint " slave-burst-sizes" attribute +h# 7d xdrint " burst-sizes" attribute + +headers +-1 instance value trng-virt +my-address constant my-sbus-address +my-space constant my-sbus-space + +: map-in ( adr space size -- virt ) " map-in" $call-parent ; +: map-out ( virt size -- ) " map-out" $call-parent ; + +: map-in-trng ( -- ) my-sbus-address sbusfpga_csraddr_trng + my-sbus-space h# 8 map-in is trng-virt ; +: map-out-trng ( -- ) trng-virt h# 8 map-out ; + +: disabletrng! ( -- ) + map-in-trng + 1 trng-virt l! ( pattern virt -- ) + map-out-trng +; + +disabletrng! + + +\ OpenBIOS tokenizer won't accept finish-device without new-device +\ Cheat by using the tokenizer so we can do OpenBoot 2.x siblings +\ tokenizer[ 01 emit-byte h# 27 emit-byte h# 01 emit-byte h# 1f emit-byte ]tokenizer +\ The OpenFirmware tokenizer does accept the 'clean' syntax +finish-device +\ \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ CURVE25519 +new-device + +\ Absolute minimal stuff; name & registers def. +" betrustedc25519e" device-name + +\ one page of CSR registers, plus the memory +\ we might want to replace the slave memory access +\ by another instance of exchange_with_mem ? 
+\ we split the memory space in two +\ 0x1000 @ 0x0 for the microcode +\ 0x10000 @ 0x10000 for the register file +my-address sbusfpga_csraddr_curve25519engine + my-space xdrphys \ Offset#1 +h# 1000 xdrint xdr+ \ Merge size#1 +my-address sbusfpga_regionaddr_curve25519engine + my-space xdrphys xdr+ \ Merge offset#2 +h# 1000 xdrint xdr+ \ Merge size#2 +my-address sbusfpga_regionaddr_curve25519engine h# 10000 + + my-space xdrphys xdr+ \ Merge offset#3 +h# 10000 xdrint xdr+ \ Merge size#3 +" reg" attribute + +\ we don't support ET or HWORD +h# 7d xdrint " slave-burst-sizes" attribute +h# 7d xdrint " burst-sizes" attribute + +headers +-1 instance value curve25519engine-virt +-1 instance value curve25519engine-microcode-virt +-1 instance value curve25519engine-regfile-virt +my-address constant my-sbus-address +my-space constant my-sbus-space + +: map-in ( adr space size -- virt ) " map-in" $call-parent ; +: map-out ( virt size -- ) " map-out" $call-parent ; + +: map-in-curve25519engine ( -- ) + my-sbus-address sbusfpga_csraddr_curve25519engine + my-sbus-space h# 1000 map-in is curve25519engine-virt + my-sbus-address sbusfpga_regionaddr_curve25519engine + my-sbus-space h# 1000 map-in is curve25519engine-microcode-virt + my-sbus-address sbusfpga_regionaddr_curve25519engine h# 10000 + + my-sbus-space h# 10000 map-in is curve25519engine-regfile-virt +; +: map-out-curve25519engine ( -- ) + curve25519engine-virt h# 1000 map-out + curve25519engine-microcode-virt h# 1000 map-out + curve25519engine-regfile-virt h# 10000 map-out +; + +\ OpenBIOS tokenizer won't accept finish-device without new-device +\ Cheat by using the tokenizer so we can do OpenBoot 2.x siblings +\ tokenizer[ 01 emit-byte h# 27 emit-byte h# 01 emit-byte h# 1f emit-byte ]tokenizer +\ The OpenFirmware tokenizer does accept the 'clean' syntax +finish-device +\ \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ STAT +new-device + +\ Absolute minimal stuff; 
name & registers def. +" RDOL,sbusstat" device-name + +my-address sbusfpga_csraddr_sbus_bus_stat + my-space h# 100 reg +\ we don't support ET or HWORD +h# 7d xdrint " slave-burst-sizes" attribute +h# 7d xdrint " burst-sizes" attribute + +headers +-1 instance value sbus_bus_stat-virt +my-address constant my-sbus-address +my-space constant my-sbus-space + +: map-in ( adr space size -- virt ) " map-in" $call-parent ; +: map-out ( virt size -- ) " map-out" $call-parent ; + +: map-in-sbus_bus_stat ( -- ) my-sbus-address sbusfpga_csraddr_sbus_bus_stat + my-sbus-space h# 100 map-in is sbus_bus_stat-virt ; +: map-out-sbus_bus_stat ( -- ) sbus_bus_stat-virt h# 100 map-out ; + +end0 diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_blk_dma.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_blk_dma.py new file mode 100644 index 0000000..af0fe63 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_blk_dma.py @@ -0,0 +1,239 @@ +from migen import * +from migen.genlib.fifo import * +from migen.genlib.cdc import BusSynchronizer +from litex.soc.interconnect.csr import * +from litex.soc.interconnect import wishbone + +# width of towrite_fifo is '32'+'burst_size * 32' (vaddr + data) +# so the SBus DMA has all the needed info +# width of fromsbus_req_fifo is 'blk_addr_width' + 'vaddr' (blk_addr + vaddr) +# width of fromsbus_fifo is 'blk_addr_width' + 'burst_size * 32' (blk_addr + data) +# the blk_addr does the round-trip to accompany the data +class ExchangeWithMem(Module, AutoCSR): + def __init__(self, soc, tosbus_fifo, fromsbus_fifo, fromsbus_req_fifo, dram_dma_writer, dram_dma_reader, burst_size = 8, do_checksum = False): + #self.wishbone_r_slave = wishbone.Interface(data_width=soc.bus.data_width) + #self.wishbone_w_slave = wishbone.Interface(data_width=soc.bus.data_width) + self.tosbus_fifo = tosbus_fifo + self.fromsbus_fifo = fromsbus_fifo + self.fromsbus_req_fifo = fromsbus_req_fifo + self.dram_dma_writer = dram_dma_writer + self.dram_dma_reader = dram_dma_reader + + 
data_width = burst_size * 4 + data_width_bits = burst_size * 32 + blk_addr_width = 32 - log2_int(data_width) # 27 for burst_size == 8 + + assert(len(self.dram_dma_writer.sink.data) == data_width_bits) + assert(len(self.dram_dma_reader.source.data) == data_width_bits) + + #self.wishbone_r_master = wishbone.Interface(data_width=data_width_bits) + #self.wishbone_w_master = wishbone.Interface(data_width=data_width_bits) + + #self.submodules += wishbone.Converter(self.wishbone_r_master, self.wishbone_r_slave) + #self.submodules += wishbone.Converter(self.wishbone_w_master, self.wishbone_w_slave) + + print("ExchangeWithMem: data_width = {}, data_width_bits = {}, blk_addr_width = {}\n".format(data_width, data_width_bits, blk_addr_width)) + print("ExchangeWithMem: tosbus_fifo width = {}, fromsbus_fifo width = {}, fromsbus_req_fifo width = {}\n".format(len(tosbus_fifo.din), len(fromsbus_fifo.dout), len(fromsbus_req_fifo.din))) + + local_r_addr = Signal(blk_addr_width) + dma_r_addr = Signal(32) + #local_r_widx = Signal(log2_int(burst_size)) # so width is 3 for burst_size == 8 + #local_r_buffer = Signal(data_width_bits) + + local_w_addr = Signal(blk_addr_width) + dma_w_addr = Signal(32) + #local_w_widx = Signal(log2_int(burst_size)) # so width is 3 for burst_size == 8 + #local_w_buffer = Signal(data_width_bits) + + max_block_bits=16 + + # CSRConstant do not seem to appear in the CSR Map, but they need to be accessible to the OS driver + #self.blk_size = CSRConstant(value=data_width) # report the block size to the SW layer + #self.blk_base = CSRConstant(value=soc.wb_mem_map["main_ram"] >> log2_int(data_width)) # report where the blk starts + self.blk_size = CSRStatus(32) # report the block size to the SW layer + self.blk_base = CSRStatus(32) # report where the blk starts + self.mem_size = CSRStatus(32) # report how much memory we have + self.comb += self.blk_size.status.eq(data_width) + self.comb += self.blk_base.status.eq(soc.wb_mem_map["main_ram"] >> log2_int(data_width)) + 
self.comb += self.mem_size.status.eq((256 * 1024 * 1024) >> log2_int(data_width)) # is it already available from mem_regions ? + + self.blk_addr = CSRStorage(32, description = "SDRAM Block address to read/write from Wishbone memory (block of size {})".format(data_width)) + self.dma_addr = CSRStorage(32, description = "Host Base address where to write/read data (i.e. SPARC Virtual addr)") + #self.blk_cnt = CSRStorage(32, write_from_dev=True, description = "How many blk to read/write (max 2^{}-1); bit 31 is RD".format(max_block_bits), reset = 0) + self.blk_cnt = CSRStorage(write_from_dev=True, fields = [CSRField("blk_cnt", max_block_bits, description = "How many blk to read/write (max 2^{}-1)".format(max_block_bits)), + CSRField("rsvd", 32 - (max_block_bits + 1), description = "Reserved"), + CSRField("rd_wr", 1, description = "Read/Write selector"), + ]) + self.last_blk = CSRStatus(32, description = "Last Blk addr finished on WB side") + self.last_dma = CSRStatus(32, description = "Last DMA addr finished on WB side") + self.dma_wrdone = CSRStatus(32, description = "DMA Block written to SDRAM", reset = 0) + self.blk_rem = CSRStatus(32, description = "How many block remaining; bit 31 is RD", reset = 0) + self.dma_status = CSRStatus(fields = [CSRField("rd_fsm_busy", 1, description = "Read FSM is doing some work"), + CSRField("wr_fsm_busy", 1, description = "Write FSM is doing some work"), + CSRField("has_wr_data", 1, description = "Data available to write to SDRAM"), + CSRField("has_requests", 1, description = "There's outstanding requests to the SBus"), + CSRField("has_rd_data", 1, description = "Data available to write to SBus"), + ]) + self.wr_tosdram = CSRStatus(32, description = "Last address written to SDRAM") + + if (do_checksum): + self.checksum = CSRStorage(data_width_bits, write_from_dev=True, description = "checksum (XOR)"); + + self.submodules.req_r_fsm = req_r_fsm = FSM(reset_state="Reset") + self.submodules.req_w_fsm = req_w_fsm = FSM(reset_state="Reset") 
+ + self.comb += self.dma_status.fields.rd_fsm_busy.eq(~req_r_fsm.ongoing("Idle")) # Read FSM Busy + self.comb += self.dma_status.fields.wr_fsm_busy.eq(~req_w_fsm.ongoing("Idle")) # Write FSM Busy + self.comb += self.dma_status.fields.has_wr_data.eq(self.fromsbus_fifo.readable) # Some data available to write to memory + + # The next two status bits reflect stats in the SBus clock domain + self.submodules.fromsbus_req_fifo_readable_sync = BusSynchronizer(width = 1, idomain = "sbus", odomain = "sys") + fromsbus_req_fifo_readable_in_sys = Signal() + self.comb += self.fromsbus_req_fifo_readable_sync.i.eq(self.fromsbus_req_fifo.readable) + self.comb += fromsbus_req_fifo_readable_in_sys.eq(self.fromsbus_req_fifo_readable_sync.o) + + # w/o this extra delay, the driver sees an outdated checksum for some reason... + # there's probably a more fundamental issue :-( + # note: replaced PulseSynchronizer with BusSynchronizer, should I retry w/o this ? + fromsbus_req_fifo_readable_in_sys_cnt = Signal(5) + self.sync += If(fromsbus_req_fifo_readable_in_sys, + fromsbus_req_fifo_readable_in_sys_cnt.eq(0x1F) + ).Else( + If(fromsbus_req_fifo_readable_in_sys_cnt > 0, + fromsbus_req_fifo_readable_in_sys_cnt.eq(fromsbus_req_fifo_readable_in_sys_cnt - 1) + ) + ) + #self.comb += self.dma_status.fields.has_requests.eq(fromsbus_req_fifo_readable_in_sys) # we still have outstanding requests + self.comb += self.dma_status.fields.has_requests.eq(fromsbus_req_fifo_readable_in_sys | (fromsbus_req_fifo_readable_in_sys_cnt != 0)) # we still have outstanding requests, or had recently + + self.submodules.tosbus_fifo_readable_sync = BusSynchronizer(width = 1, idomain = "sbus", odomain = "sys") + tosbus_fifo_readable_in_sys = Signal() + self.comb += self.tosbus_fifo_readable_sync.i.eq(self.tosbus_fifo.readable) + self.comb += tosbus_fifo_readable_in_sys.eq(self.tosbus_fifo_readable_sync.o) + self.comb += self.dma_status.fields.has_rd_data.eq(tosbus_fifo_readable_in_sys) # there's still data to be sent 
to memory; this will drop before the last SBus Master Cycle is finished, but then the SBus is busy so the host won't be able to read the status before the cycle is finished so we're good + + #self.comb += self.dma_status.status[16:17].eq(self.wishbone_w_master.cyc) # show the WB iface status (W) + #self.comb += self.dma_status.status[17:18].eq(self.wishbone_w_master.stb) + #self.comb += self.dma_status.status[18:19].eq(self.wishbone_w_master.we) + #self.comb += self.dma_status.status[19:20].eq(self.wishbone_w_master.ack) + #self.comb += self.dma_status.status[20:21].eq(self.wishbone_w_master.err) + + #self.comb += self.dma_status.status[24:25].eq(self.wishbone_r_master.cyc) # show the WB iface status (R) + #self.comb += self.dma_status.status[25:26].eq(self.wishbone_r_master.stb) + #self.comb += self.dma_status.status[26:27].eq(self.wishbone_r_master.we) + #self.comb += self.dma_status.status[27:28].eq(self.wishbone_r_master.ack) + #self.comb += self.dma_status.status[28:29].eq(self.wishbone_r_master.err) + + req_r_fsm.act("Reset", + NextState("Idle") + ) + req_r_fsm.act("Idle", + If(((self.blk_cnt.fields.blk_cnt != 0) & # checking self.blk_cnt.re might be too transient ? -> need to auto-reset + (~self.blk_cnt.fields.rd_wr)), # !read -> write + NextValue(local_r_addr, self.blk_addr.storage), + NextValue(dma_r_addr, self.dma_addr.storage), + NextValue(self.blk_rem.status, Cat(self.blk_cnt.fields.blk_cnt, Signal(32-max_block_bits, reset = 0))), + NextState("ReqFromMemory") + ).Elif(((self.blk_cnt.fields.blk_cnt != 0) & # checking self.blk_cnt.re might be too transient ? 
-> need to auto-reset + (self.blk_cnt.fields.rd_wr)), # read + NextValue(local_r_addr, self.blk_addr.storage), + NextValue(dma_r_addr, self.dma_addr.storage), + NextValue(self.blk_rem.status, Cat(self.blk_cnt.fields.blk_cnt, Signal(32-max_block_bits, reset = 0))), + NextState("QueueReqToMemory") + ) + ) + req_r_fsm.act("ReqFromMemory", + self.dram_dma_reader.sink.address.eq(local_r_addr), + self.dram_dma_reader.sink.valid.eq(1), + If(self.dram_dma_reader.sink.ready, + NextState("WaitForData") + ) + ) + req_r_fsm.act("WaitForData", + If(self.dram_dma_reader.source.valid & self.tosbus_fifo.writable, + self.tosbus_fifo.we.eq(1), + self.tosbus_fifo.din.eq(Cat(dma_r_addr, self.dram_dma_reader.source.data)), + If(do_checksum, + self.checksum.we.eq(1), + self.checksum.dat_w.eq(self.checksum.storage ^ self.dram_dma_reader.source.data), + ), + self.dram_dma_reader.source.ready.eq(1), + NextValue(self.last_blk.status, local_r_addr), + NextValue(self.last_dma.status, dma_r_addr), + NextValue(self.blk_rem.status, self.blk_rem.status - 1), + If(self.blk_rem.status[0:max_block_bits] <= 1, + self.blk_cnt.we.eq(1), ## auto-reset + self.blk_cnt.dat_w.eq(0), + NextState("Idle"), + ).Else( + NextValue(local_r_addr, local_r_addr + 1), + NextValue(dma_r_addr, dma_r_addr + data_width), + NextState("ReqFromMemory"), + ) + ) + ) + req_r_fsm.act("QueueReqToMemory", + If(self.fromsbus_req_fifo.writable, + self.fromsbus_req_fifo.we.eq(1), + self.fromsbus_req_fifo.din.eq(Cat(local_r_addr, dma_r_addr)), + NextValue(self.last_blk.status, local_r_addr), + NextValue(self.last_dma.status, dma_r_addr), + NextValue(self.blk_rem.status, self.blk_rem.status - 1), + If(self.blk_rem.status[0:max_block_bits] <= 1, + self.blk_cnt.we.eq(1), ## auto-reset + self.blk_cnt.dat_w.eq(0), + NextState("Idle"), + ).Else( + NextValue(local_r_addr, local_r_addr + 1), + NextValue(dma_r_addr, dma_r_addr + data_width), + NextValue(self.blk_rem.status, self.blk_rem.status - 1), + NextState("QueueReqToMemory"), #redundant 
+ ) + ) + ) + + +# req_w_fsm.act("Reset", +# NextState("Idle") +# ) +# req_w_fsm.act("Idle", +# If(self.fromsbus_fifo.readable & +# ~self.wishbone_w_master.ack, +# self.fromsbus_fifo.re.eq(1), +# NextValue(self.wishbone_w_master.cyc, 1), +# NextValue(self.wishbone_w_master.stb, 1), +# NextValue(self.wishbone_w_master.sel, 2**len(self.wishbone_w_master.sel)-1), +# NextValue(self.wishbone_w_master.we, 1), +# NextValue(self.wishbone_w_master.adr, self.fromsbus_fifo.dout[0:blk_addr_width]), +# NextValue(self.wishbone_w_master.dat_w, self.fromsbus_fifo.dout[blk_addr_width:(blk_addr_width + data_width_bits)]), +# NextValue(self.wr_tosdram.status, self.fromsbus_fifo.dout[0:blk_addr_width]), +# NextState("WaitForAck") +# ) +# ) +# req_w_fsm.act("WaitForAck", +# If(self.wishbone_w_master.ack, +# NextValue(self.wishbone_w_master.cyc, 0), +# NextValue(self.wishbone_w_master.stb, 0), +# NextState("Idle"), +# ) +# ) + + req_w_fsm.act("Reset", + NextState("Idle") + ) + req_w_fsm.act("Idle", + If(self.fromsbus_fifo.readable, + self.dram_dma_writer.sink.address.eq(self.fromsbus_fifo.dout[0:blk_addr_width]), + self.dram_dma_writer.sink.data.eq(self.fromsbus_fifo.dout[blk_addr_width:(blk_addr_width + data_width_bits)]), + self.dram_dma_writer.sink.valid.eq(1), + NextValue(self.wr_tosdram.status, self.fromsbus_fifo.dout[0:blk_addr_width]), + If(self.dram_dma_writer.sink.ready, + self.fromsbus_fifo.re.eq(1), + NextValue(self.dma_wrdone.status, self.dma_wrdone.status + 1), + If(do_checksum, + self.checksum.we.eq(1), + self.checksum.dat_w.eq(self.checksum.storage ^ self.fromsbus_fifo.dout[blk_addr_width:(blk_addr_width + data_width_bits)]), + ) + ) + ) + ) diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_export.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_export.py new file mode 100644 index 0000000..e6a2f1a --- /dev/null +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_export.py @@ -0,0 +1,133 @@ +import os +import json +import inspect +from shutil import which +from sysconfig 
import get_platform + +from migen import * + +from litex.soc.interconnect.csr import CSRStatus + +from litex.build.tools import generated_banner + +from litex.soc.doc.rst import reflow +from litex.soc.doc.module import gather_submodules, ModuleNotDocumented, DocumentedModule, DocumentedInterrupts +from litex.soc.doc.csr import DocumentedCSRRegion +from litex.soc.interconnect.csr import _CompoundCSR + +# for generating a timestamp in the description field, if none is otherwise given +import datetime +import time + +def _get_rw_functions_c(name, csr_name, reg_base, area_base, nwords, busword, alignment, read_only, with_access_functions): + reg_name = name + "_" + csr_name + r = "" + + addr_str = "CSR_{}_ADDR".format(reg_name.upper()) + size_str = "CSR_{}_SIZE".format(reg_name.upper()) + r += "#define {} (CSR_{}_BASE + {}L)\n".format(addr_str, name.upper(), hex(reg_base - area_base)) + r += "#define {} {}\n".format(size_str, nwords) + + size = nwords*busword//8 + if size > 8: + # downstream should select appropriate `csr_[rd|wr]_buf_uintX()` pair! 
+ return r + elif size > 4: + ctype = "uint64_t" + elif size > 2: + ctype = "uint32_t" + elif size > 1: + ctype = "uint16_t" + else: + ctype = "uint8_t" + + stride = alignment//8; + if with_access_functions: + r += "static inline {} {}_read(struct sbusfpga_{}_softc *sc) {{\n".format(ctype, reg_name, name) + if nwords > 1: + r += "\t{} r = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_{}, {}L);\n".format(ctype, name, hex(reg_base - area_base)) + for sub in range(1, nwords): + r += "\tr <<= {};\n".format(busword) + r += "\tr |= bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_{}, {}L);\n".format(name, hex(reg_base - area_base + sub*stride)) + r += "\treturn r;\n}\n" + else: + r += "\treturn bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_{}, {}L);\n}}\n".format(name, hex(reg_base - area_base)) + + if not read_only: + r += "static inline void {}_write(struct sbusfpga_{}_softc *sc, {} v) {{\n".format(reg_name, name, ctype) + for sub in range(nwords): + shift = (nwords-sub-1)*busword + if shift: + v_shift = "v >> {}".format(shift) + else: + v_shift = "v" + r += "\tbus_space_write_4(sc->sc_bustag, sc->sc_bhregs_{}, {}L, {});\n".format(name, hex(reg_base - area_base + sub*stride), v_shift) + r += "}\n" + return r + + +def get_csr_header(regions, constants, csr_base=None, with_access_functions=True): + alignment = constants.get("CONFIG_CSR_ALIGNMENT", 32) + r = generated_banner("//") + #if with_access_functions: # FIXME + # r += "#include \n" + r += "#ifndef __GENERATED_CSR_H\n#define __GENERATED_CSR_H\n" + #if with_access_functions: + # r += "#include \n" + # r += "#include \n" + # r += "#ifndef CSR_ACCESSORS_DEFINED\n" + # r += "#include \n" + # r += "#endif /* ! 
CSR_ACCESSORS_DEFINED */\n" + csr_base = csr_base if csr_base is not None else regions[next(iter(regions))].origin + r += "#ifndef CSR_BASE\n" + r += "#define CSR_BASE {}L\n".format(hex(csr_base)) + r += "#endif\n" + for name, region in regions.items(): + origin = region.origin - csr_base + r += "\n/* "+name+" */\n" + r += "#ifndef CSR_"+name.upper()+"_BASE\n" + r += "#define CSR_"+name.upper()+"_BASE (CSR_BASE + "+hex(origin)+"L)\n" + if not isinstance(region.obj, Memory): + for csr in region.obj: + nr = (csr.size + region.busword - 1)//region.busword + r += _get_rw_functions_c(name, csr.name, origin, region.origin - csr_base, nr, region.busword, alignment, + getattr(csr, "read_only", False), with_access_functions) + origin += alignment//8*nr + if hasattr(csr, "fields"): + for field in csr.fields.fields: + offset = str(field.offset) + size = str(field.size) + r += "#define CSR_"+name.upper()+"_"+csr.name.upper()+"_"+field.name.upper()+"_OFFSET "+offset+"\n" + r += "#define CSR_"+name.upper()+"_"+csr.name.upper()+"_"+field.name.upper()+"_SIZE "+size+"\n" + if with_access_functions and csr.size <= 32: # FIXME: Implement extract/read functions for csr.size > 32-bit. 
+ reg_name = name + "_" + csr.name.lower() + field_name = reg_name + "_" + field.name.lower() + r += "static inline uint32_t " + field_name + "_extract(struct sbusfpga_" + name + "_softc *sc, uint32_t oldword) {\n" + r += "\tuint32_t mask = ((1 << " + size + ")-1);\n" + r += "\treturn ( (oldword >> " + offset + ") & mask );\n}\n" + r += "static inline uint32_t " + field_name + "_read(struct sbusfpga_" + name + "_softc *sc) {\n" + r += "\tuint32_t word = " + reg_name + "_read(sc);\n" + r += "\treturn " + field_name + "_extract(sc, word);\n" + r += "}\n" + if not getattr(csr, "read_only", False): + r += "static inline uint32_t " + field_name + "_replace(struct sbusfpga_" + name + "_softc *sc, uint32_t oldword, uint32_t plain_value) {\n" + r += "\tuint32_t mask = ((1 << " + size + ")-1);\n" + r += "\treturn (oldword & (~(mask << " + offset + "))) | (mask & plain_value)<< " + offset + " ;\n}\n" + r += "static inline void " + field_name + "_write(struct sbusfpga_" + name + "_softc *sc, uint32_t plain_value) {\n" + r += "\tuint32_t oldword = " + reg_name + "_read(sc);\n" + r += "\tuint32_t newword = " + field_name + "_replace(sc, oldword, plain_value);\n" + r += "\t" + reg_name + "_write(sc, newword);\n" + r += "}\n" + r += "#endif // CSR_"+name.upper()+"_BASE\n" + + r += "\n#endif\n" + return r + + +def get_csr_forth_header(csr_regions, mem_regions, constants, csr_base=None): + r = "\\ auto-generated base regions for CSRs in the PROM\n" + for name, region in csr_regions.items(): + r += "h# " + hex(region.origin).replace("0x", "") + " constant " + "sbusfpga_csraddr_{}".format(name) + "\n" + for name, region in mem_regions.items(): + r += "h# " + hex(region.origin).replace("0x", "") + " constant " + "sbusfpga_regionaddr_{}".format(name) + "\n" + return r diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py new file mode 100644 index 0000000..e3204f0 --- /dev/null +++ 
b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py @@ -0,0 +1,1829 @@ + +from migen import * +from migen.fhdl.specials import Tristate + +SIZ_WORD = 0x0 +SIZ_BYTE = 0x1 +SIZ_HWORD = 0x2 +SIZ_EXT = 0x3 +SIZ_BURST4 = 0x4 +SIZ_BURST8 = 0x5 +SIZ_BURST16 = 0x6 +SIZ_BURST2 = 0x7 + +ACK_IDLE = 0x7 +ACK_ERR = 0x6 +ACK_BYTE = 0x5 +ACK_RERUN = 0x4 +ACK_WORD = 0x3 +ACK_DWORD = 0x2 +ACK_HWORD = 0x1 +ACK_RECV = 0x0 + +ADDR_PHYS_HIGH = 27 +ADDR_PHYS_LOW = 0 +ADDR_PFX_HIGH = ADDR_PHYS_HIGH +ADDR_PFX_LOW = 16 ## 64 KiB per prefix +ADDR_PFX_LENGTH = 12 #(1 + ADDR_PFX_HIGH - ADDR_PFX_LOW) +ROM_ADDR_PFX = Signal(12, reset = 0x000) +WISHBONE_CSR_ADDR_PFX = Signal(12, reset = 0x004) +USBOHCI_ADDR_PFX = Signal(12, reset = 0x008) +SRAM_ADDR_PFX = Signal(12, reset = 0x009) +ENGINE_ADDR_PFXA = Signal(12, reset = 0x00a) +ENGINE_ADDR_PFXB = Signal(12, reset = 0x00b) +#SDRAM_ADDR_PFX = Signal(12, reset = 2048) + +wishbone_default_timeout = 120 ## +sbus_default_timeout = 50 ## must be below 255 +sbus_default_master_throttle = 3 + +def siz_is_word(siz): + return (SIZ_WORD == siz) | (SIZ_BURST2 == siz) | (SIZ_BURST4 == siz) | (SIZ_BURST8 == siz) | (SIZ_BURST16 == siz) + +# FIXME: this doesn't work. Verilog always uses value[0:4] +#def _index_with_wrap(counter, limit_m1, value): +# if (limit_m1 == 0): +# return value[0:4] +# elif (limit_m1 == 1): +# return Cat((value + counter)[0:1], value[1:4]) +# elif (limit_m1 == 3): +# return Cat((value + counter)[0:2], value[2:4]) +# elif (limit_m1 == 7): +# return Cat((value + counter)[0:3], value[3:4]) +# elif (limit_m1 == 15): +# return (value + counter)[0:4] +# return value[0:4] + +def index_with_wrap(counter, limit_m1, value): + return ((value+counter) & limit_m1)[0:4] | (value&(~limit_m1))[0:4] + +# FIXME: this doesn't work.
Verilog always uses 1 +def siz_to_burst_size_m1(siz): + if (SIZ_WORD == siz): + return 0 + elif (SIZ_BURST2 == siz): + return 1 + elif (SIZ_BURST4 == siz): + return 3 + elif (SIZ_BURST8 == siz): + return 7 + elif (SIZ_BURST16 == siz): + return 15 + return 1 + +class LedDisplay(Module): + def __init__(self, pads): + n = len(pads) + self.value = Signal(40, reset = 0x0018244281) + old_value = Signal(40) + self.display = Signal(8) + self.comb += pads.eq(self.display) + + self.submodules.fsm = fsm = FSM(reset_state="Reset") + time_counter = Signal(32, reset = 0) + blink_counter = Signal(4, reset = 0) + fsm.act("Reset", + NextValue(time_counter, 25000000//2), + NextValue(blink_counter, 0), + NextValue(self.display, self.value[0:8]), + NextValue(old_value, self.value), + NextState("Byte0")) + fsm.act("Quick", + If(old_value != self.value, + NextState("Reset") + ).Elif(time_counter == 0, + If(blink_counter == 0, + NextValue(time_counter, 25000000//2), + NextValue(self.display, self.value[0:8]), + NextState("Byte0") + ).Else( + NextValue(self.display, ~self.display), + NextValue(time_counter, 25000000//10), + NextValue(blink_counter, blink_counter - 1) + ) + ).Else( + NextValue(time_counter, time_counter - 1) + ) + ) + fsm.act("Byte0", + If(old_value != self.value, + NextState("Reset") + ).Elif(time_counter == 0, + NextValue(time_counter, 25000000//2), + NextValue(self.display, self.value[8:16]), + NextState("Byte1") + ).Else( + NextValue(time_counter, time_counter - 1) + ) + ) + fsm.act("Byte1", + If(old_value != self.value, + NextState("Reset") + ).Elif(time_counter == 0, + NextValue(time_counter, 25000000//2), + NextValue(self.display, self.value[16:24]), + NextState("Byte2") + ).Else( + NextValue(time_counter, time_counter - 1) + ) + ) + fsm.act("Byte2", + If(old_value != self.value, + NextState("Reset") + ).Elif(time_counter == 0, + NextValue(time_counter, 25000000//2), + NextValue(self.display, self.value[24:32]), + NextState("Byte3") + ).Else( + NextValue(time_counter,
time_counter - 1) + ) + ) + fsm.act("Byte3", + If(old_value != self.value, + NextState("Reset") + ).Elif(time_counter == 0, + NextValue(time_counter, 25000000//2), + NextValue(self.display, self.value[32:40]), + NextState("Byte4") + ).Else( + NextValue(time_counter, time_counter - 1) + ) + ) + fsm.act("Byte4", + If(old_value != self.value, + NextState("Reset") + ).Elif(time_counter == 0, + NextValue(time_counter, 25000000//10), + NextValue(blink_counter, 10), + NextValue(self.display, 0x00), + NextState("Quick") + ).Else( + NextValue(time_counter, time_counter - 1) + ) + ) + +LED_PARITY=0x11 +LED_ADDRESS=0x12 +LED_UNKNOWNREQ=0x14 +LED_RERUN=0x8 +LED_RERUN_WRITE=0x4 +LED_RERUN_WORD=0x2 +LED_RERUN_LATE=0x1 + +LED_M_WRITE = 0x10 +LED_M_READ = 0x20 +LED_M_CACHE = 0x40 + +class SBusFPGABus(Module): + def __init__(self, platform, hold_reset, wishbone_slave, wishbone_master, tosbus_fifo, fromsbus_fifo, fromsbus_req_fifo, burst_size = 8): + self.platform = platform + self.hold_reset = hold_reset + + self.wishbone_slave = wishbone_slave + self.wishbone_master = wishbone_master + + self.tosbus_fifo = tosbus_fifo + self.fromsbus_fifo = fromsbus_fifo + self.fromsbus_req_fifo = fromsbus_req_fifo + + data_width = burst_size * 4 + data_width_bits = burst_size * 32 + blk_addr_width = 32 - log2_int(data_width) # 27 for burst_size == 8 + + fifo_blk_addr = Signal(blk_addr_width) + fifo_buffer = Signal(data_width_bits) + + pad_SBUS_DATA_OE_LED = platform.request("SBUS_DATA_OE_LED") + SBUS_DATA_OE_LED_o = Signal() + self.comb += pad_SBUS_DATA_OE_LED.eq(SBUS_DATA_OE_LED_o) + ##pad_SBUS_DATA_OE_LED_2 = platform.request("SBUS_DATA_OE_LED_2") + ##SBUS_DATA_OE_LED_2_o = Signal() + ##self.comb += pad_SBUS_DATA_OE_LED_2.eq(SBUS_DATA_OE_LED_2_o) + + #leds = Signal(7, reset=0x00) + #self.comb += platform.request("user_led", 0).eq(leds[0]) + #self.comb += platform.request("user_led", 1).eq(leds[1]) + #self.comb += platform.request("user_led", 2).eq(leds[2]) + #self.comb += 
platform.request("user_led", 3).eq(leds[3]) + #self.comb += platform.request("user_led", 4).eq(leds[4]) + #self.comb += platform.request("user_led", 5).eq(leds[5]) + #self.comb += platform.request("user_led", 6).eq(leds[6]) + ##self.comb += platform.request("user_led", 7).eq(leds[7]) + + #pad_SBUS_3V3_CLK = platform.request("SBUS_3V3_CLK") + pad_SBUS_3V3_ASs = platform.request("SBUS_3V3_ASs") + pad_SBUS_3V3_BGs = platform.request("SBUS_3V3_BGs") + pad_SBUS_3V3_BRs = platform.request("SBUS_3V3_BRs") + pad_SBUS_3V3_ERRs = platform.request("SBUS_3V3_ERRs") + #pad_SBUS_3V3_RSTs = platform.request("SBUS_3V3_RSTs") + pad_SBUS_3V3_SELs = platform.request("SBUS_3V3_SELs") + #pad_SBUS_3V3_INT1s = platform.request("SBUS_3V3_INT1s") + pad_SBUS_3V3_INT7s = platform.request("SBUS_3V3_INT7s") + pad_SBUS_3V3_PPRD = platform.request("SBUS_3V3_PPRD") + pad_SBUS_OE = platform.request("SBUS_OE") + pad_SBUS_3V3_ACKs = platform.request("SBUS_3V3_ACKs") + pad_SBUS_3V3_SIZ = platform.request("SBUS_3V3_SIZ") + pad_SBUS_3V3_D = platform.request("SBUS_3V3_D") + pad_SBUS_3V3_PA = platform.request("SBUS_3V3_PA") + assert len(pad_SBUS_3V3_D) == 32, "len(pad_SBUS_3V3_D) should be 32" + assert len(pad_SBUS_3V3_PA) == 28, "len(pad_SBUS_3V3_PA) should be 28" + + sbus_oe_data = Signal(reset=0) + sbus_oe_slave_in = Signal(reset=0) + sbus_oe_master_in = Signal(reset=0) + #sbus_oe_int1 = Signal(reset=0) + sbus_oe_int7 = Signal(reset=0) + #sbus_oe_master_br = Signal(reset=0) + + sbus_last_pa = Signal(28) + burst_index = Signal(4) + burst_counter = Signal(4) + burst_limit_m1 = Signal(4) + + #SBUS_3V3_CLK = Signal() + SBUS_3V3_ASs_i = Signal(reset=1) + self.comb += SBUS_3V3_ASs_i.eq(pad_SBUS_3V3_ASs) + SBUS_3V3_BGs_i = Signal(reset=1) + self.comb += SBUS_3V3_BGs_i.eq(pad_SBUS_3V3_BGs) + SBUS_3V3_BRs_o = Signal(reset=1) + #self.specials += Tristate(pad_SBUS_3V3_BRs, SBUS_3V3_BRs_o, sbus_oe_master_br, None) + self.comb += pad_SBUS_3V3_BRs.eq(SBUS_3V3_BRs_o) + SBUS_3V3_ERRs_i = Signal() + SBUS_3V3_ERRs_o = 
Signal() + self.specials += Tristate(pad_SBUS_3V3_ERRs, SBUS_3V3_ERRs_o, sbus_oe_master_in, SBUS_3V3_ERRs_i) + #SBUS_3V3_RSTs = Signal() + SBUS_3V3_SELs_i = Signal(reset=1) + self.comb += SBUS_3V3_SELs_i.eq(pad_SBUS_3V3_SELs) + #SBUS_3V3_INT1s_o = Signal(reset=1) + #self.specials += Tristate(pad_SBUS_3V3_INT1s, SBUS_3V3_INT1s_o, sbus_oe_int1, None) + SBUS_3V3_INT7s_o = Signal(reset=1) + self.specials += Tristate(pad_SBUS_3V3_INT7s, SBUS_3V3_INT7s_o, sbus_oe_int7, None) + SBUS_3V3_PPRD_i = Signal() + SBUS_3V3_PPRD_o = Signal() + self.specials += Tristate(pad_SBUS_3V3_PPRD, SBUS_3V3_PPRD_o, sbus_oe_slave_in, SBUS_3V3_PPRD_i) + #SBUS_OE_o = Signal() + self.comb += pad_SBUS_OE.eq(self.hold_reset) + SBUS_3V3_ACKs_i = Signal(3) + SBUS_3V3_ACKs_o = Signal(3) + self.specials += Tristate(pad_SBUS_3V3_ACKs, SBUS_3V3_ACKs_o, sbus_oe_master_in, SBUS_3V3_ACKs_i) + SBUS_3V3_SIZ_i = Signal(3) + SBUS_3V3_SIZ_o = Signal(3) + self.specials += Tristate(pad_SBUS_3V3_SIZ, SBUS_3V3_SIZ_o, sbus_oe_slave_in, SBUS_3V3_SIZ_i) + SBUS_3V3_D_i = Signal(32) + SBUS_3V3_D_o = Signal(32) + self.specials += Tristate(pad_SBUS_3V3_D, SBUS_3V3_D_o, sbus_oe_data, SBUS_3V3_D_i) + SBUS_3V3_PA_i = Signal(28) + self.comb += SBUS_3V3_PA_i.eq(pad_SBUS_3V3_PA) + + p_data = Signal(32) # data to read/write in Slave mode + + # buffers when someone inside issues a DMA write request to go over SBus + master_data = Signal(32) # could be merged with p_data + master_addr = Signal(30) # could be meged with data_read_addr + + MASTER_SRC_INV = 0 + MASTER_SRC_BLKDMAFIFO = 1 + MASTER_SRC_WISHBONE = 2 + MASTER_SRC_WISHBONEBUF = 3 + master_src = Signal(2) + master_src_retry = Signal(1) # reset after each successful master cycle + + master_size = Signal(4) + master_idx = Signal(2) + + master_we = Signal() + + sbus_wishbone_le = Signal() + + wishbone_master_timeout = Signal(log2_int(wishbone_default_timeout, False)) + wishbone_slave_timeout = Signal(log2_int(wishbone_default_timeout, False)) + sbus_slave_timeout = 
Signal(log2_int(sbus_default_timeout, False)) + + sbus_master_throttle = Signal(log2_int(sbus_default_master_throttle, False)) + + #self.submodules.led_display = LedDisplay(platform.request_all("user_led")) + + #self.sync += platform.request("user_led", 4).eq(self.wishbone_slave.cyc) + #self.sync += platform.request("user_led", 5).eq(self.wishbone_slave.stb) + #self.sync += platform.request("user_led", 6).eq(self.wishbone_slave.we) + #self.sync += platform.request("user_led", 7).eq(self.wishbone_slave.ack) + #self.sync += platform.request("user_led", 0).eq(self.wishbone_slave.err) + #led4 = platform.request("user_led", 4) + #led5 = platform.request("user_led", 5) + #led6 = platform.request("user_led", 6) + #led7 = platform.request("user_led", 7) + + #led0123 = Signal(4) + #self.sync += platform.request("user_led", 0).eq(led0123[0]) + #self.sync += platform.request("user_led", 1).eq(led0123[1]) + #self.sync += platform.request("user_led", 2).eq(led0123[2]) + #self.sync += platform.request("user_led", 3).eq(led0123[3]) + + #self.sync += platform.request("user_led", 0).eq(self.wishbone_master.cyc) + #self.sync += platform.request("user_led", 1).eq(self.wishbone_master.stb) + #self.sync += platform.request("user_led", 2).eq(self.wishbone_master.we) + #self.sync += platform.request("user_led", 3).eq(self.wishbone_master.ack) + #self.sync += platform.request("user_led", 4).eq(~SBUS_3V3_SELs_i) + + #self.sync += platform.request("user_led", 4).eq(self.wishbone_master.cyc) + #self.sync += platform.request("user_led", 5).eq(~SBUS_3V3_ASs_i) + #self.sync += platform.request("user_led", 6).eq(wishbone_master_timeout == 0) + #led7 = platform.request("user_led", 7) + + #self.sync += platform.request("user_led", 5).eq(self.wishbone_slave.cyc) + #self.sync += platform.request("user_led", 6).eq(~SBUS_3V3_BRs_o) + #self.sync += platform.request("user_led", 7).eq(~SBUS_3V3_BGs_i) + self.sync += SBUS_DATA_OE_LED_o.eq(~SBUS_3V3_BGs_i), + + #cycle_counter = Signal(8, reset = 0) + 
#self.sync += cycle_counter.eq(cycle_counter + 1) + #cycle_busmaster = Signal(8, reset = 0) + #self.sync += If(cycle_counter != 0, + # cycle_busmaster.eq(cycle_busmaster + ~SBUS_3V3_BGs_i)).Else( + # cycle_busmaster.eq(0)) + #self.sync += If(cycle_counter == 0, + # platform.request("user_led", 0).eq(cycle_busmaster[4]), + # platform.request("user_led", 1).eq(cycle_busmaster[5]), + # platform.request("user_led", 2).eq(cycle_busmaster[6]), + # platform.request("user_led", 3).eq(cycle_busmaster[7])) + + # Read buffering when a DMA read request is issued by Wishbone + self.master_read_buffer_data = Array(Signal(32) for a in range(4)) + self.master_read_buffer_addr = Signal(28) + self.master_read_buffer_done = Array(Signal() for a in range(4)) + self.master_read_buffer_read = Array(Signal() for a in range(4)) + self.master_read_buffer_start = Signal(reset = 0) + + #self.sync += platform.request("user_led", 1).eq(self.master_read_buffer_start) + + #self.master_write_buffer_data = Array(Signal(32) for a in range(4)) + #self.master_write_buffer_addr = Signal(28) + #self.master_write_buffer_todo = Array(Signal() for a in range(4)) + #self.master_write_buffer_start = Signal() + + self.submodules.slave_fsm = slave_fsm = FSM(reset_state="Reset") + + #self.sync += platform.request("user_led", 0).eq(slave_fsm.ongoing("Master_Translation")) + #self.sync += platform.request("user_led", 1).eq(slave_fsm.ongoing("Master_Read") | + # slave_fsm.ongoing("Master_Read_Ack") | + # slave_fsm.ongoing("Master_Read_Finish") | + # slave_fsm.ongoing("Master_Write") | + # slave_fsm.ongoing("Master_Write_Final")) + #self.sync += platform.request("user_led", 2).eq(slave_fsm.ongoing("Slave_Do_Read") | + # slave_fsm.ongoing("Slave_Ack_Read_Reg_Burst") | + # slave_fsm.ongoing("Slave_Ack_Read_Reg_Burst_Wait_For_Data") | + # slave_fsm.ongoing("Slave_Ack_Read_Reg_Burst_Wait_For_Wishbone") | + # slave_fsm.ongoing("Slave_Ack_Read_Reg_HWord") | + # slave_fsm.ongoing("Slave_Ack_Read_Reg_HWord_Wait_For_Data") 
| + # slave_fsm.ongoing("Slave_Ack_Read_Reg_HWord_Wait_For_Wishbone") | + # slave_fsm.ongoing("Slave_Ack_Read_Reg_Byte") | + # slave_fsm.ongoing("Slave_Ack_Read_Reg_Byte_Wait_For_Data") | + # slave_fsm.ongoing("Slave_Ack_Read_Reg_Byte_Wait_For_Wishbone")) + #self.sync += platform.request("user_led", 3).eq(slave_fsm.ongoing("Slave_Ack_Reg_Write_Burst") | + # slave_fsm.ongoing("Slave_Ack_Reg_Write_Final") | + # slave_fsm.ongoing("Slave_Ack_Reg_Write_Burst_Wait_For_Wishbone") | + # slave_fsm.ongoing("Slave_Ack_Reg_Write_HWord") | + # slave_fsm.ongoing("Slave_Ack_Reg_Write_HWord_Wait_For_Wishbone") | + # slave_fsm.ongoing("Slave_Ack_Reg_Write_Byte") | + # slave_fsm.ongoing("Slave_Ack_Reg_Write_Byte_Wait_For_Wishbone")) + + #self.sync += platform.request("user_led", 5).eq(~slave_fsm.ongoing("Idle")) + + stat_slave_start_counter = Signal(32) + stat_slave_done_counter = Signal(32) + stat_slave_rerun_counter = Signal(32) + stat_slave_early_error_counter = Signal(32) + + stat_master_start_counter = Signal(32) + stat_master_done_counter = Signal(32) + stat_master_error_counter = Signal(32) + stat_master_rerun_counter = Signal(32) + + sbus_master_last_virtual = Signal(32) # last VDMA address put on the bus in master mode + sbus_master_error_virtual = Signal(32) + + slave_fsm.act("Reset", + #NextValue(self.led_display.value, 0x0000000000), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(p_data, 0), + NextState("Start"), + NextValue(self.wishbone_master.we, 0), + NextValue(self.wishbone_master.cyc, 0), + NextValue(self.wishbone_master.stb, 0), + NextValue(self.wishbone_slave.ack, 0), + NextValue(self.wishbone_slave.err, 0), + NextValue(wishbone_master_timeout, 0), + NextValue(wishbone_slave_timeout, 0), + NextValue(sbus_slave_timeout, 0) + ) + slave_fsm.act("Start", + #NextValue(self.led_display.value, 0x0FF0000000), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + 
NextValue(p_data, 0), + If((self.hold_reset == 0), NextState("Idle")) + ) + slave_fsm.act("Idle", + If(((SBUS_3V3_SELs_i == 0) & + (SBUS_3V3_ASs_i == 0) & + (siz_is_word(SBUS_3V3_SIZ_i)) & + (SBUS_3V3_PPRD_i == 1)), + NextValue(sbus_oe_master_in, 1), + NextValue(sbus_last_pa, SBUS_3V3_PA_i), + NextValue(burst_counter, 0), + Case(SBUS_3V3_SIZ_i, { + SIZ_WORD: NextValue(burst_limit_m1, 0), + SIZ_BURST2: NextValue(burst_limit_m1, 1), + SIZ_BURST4: NextValue(burst_limit_m1, 3), + SIZ_BURST8: NextValue(burst_limit_m1, 7), + SIZ_BURST16: NextValue(burst_limit_m1, 15)}), + If(SBUS_3V3_PA_i[0:2] != 0, + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_PARITY), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ).Elif(((SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ROM_ADDR_PFX) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == WISHBONE_CSR_ADDR_PFX) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == USBOHCI_ADDR_PFX) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXA) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXB)), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), # need to wait for data, don't ACK yet + NextValue(SBUS_3V3_ERRs_o, 1), + NextValue(sbus_wishbone_le, (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(stat_slave_start_counter, stat_slave_start_counter + 1), + If(self.wishbone_master.cyc == 0, + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + NextValue(self.wishbone_master.sel, 2**len(self.wishbone_master.sel)-1), + NextValue(self.wishbone_master.we, 0), + NextValue(self.wishbone_master.adr, Cat(SBUS_3V3_PA_i[2:28], Signal(4, reset = 0))), + NextValue(wishbone_master_timeout, 
wishbone_default_timeout), + NextValue(sbus_slave_timeout, sbus_default_timeout), + #NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextState("Slave_Ack_Read_Reg_Burst_Wait_For_Data") + ).Else( + NextValue(sbus_slave_timeout, sbus_default_timeout), + NextState("Slave_Ack_Read_Reg_Burst_Wait_For_Wishbone") + ) + ).Else( + #NextValue(self.led_display.value, 0x0000000020 | 0x0000000001), + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_ADDRESS), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ) + ).Elif(((SBUS_3V3_SELs_i == 0) & + (SBUS_3V3_ASs_i == 0) & + (SIZ_BYTE == SBUS_3V3_SIZ_i) & + (SBUS_3V3_PPRD_i == 1)), + NextValue(sbus_oe_master_in, 1), + NextValue(sbus_last_pa, SBUS_3V3_PA_i), + If(((SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ROM_ADDR_PFX) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), # need to wait for data, don't ACK yet + NextValue(SBUS_3V3_ERRs_o, 1), + NextValue(sbus_wishbone_le, (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(stat_slave_start_counter, stat_slave_start_counter + 1), + If(self.wishbone_master.cyc == 0, + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + NextValue(self.wishbone_master.sel, 2**len(self.wishbone_master.sel)-1), + NextValue(self.wishbone_master.we, 0), + NextValue(self.wishbone_master.adr, Cat(SBUS_3V3_PA_i[2:28], Signal(4, reset = 0))), + NextValue(wishbone_master_timeout, wishbone_default_timeout), + NextValue(sbus_slave_timeout, sbus_default_timeout), + #NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextState("Slave_Ack_Read_Reg_Byte_Wait_For_Data") + ).Else( + NextValue(sbus_slave_timeout, 
sbus_default_timeout), + NextState("Slave_Ack_Read_Reg_Byte_Wait_For_Wishbone") + ) + ).Else( + #NextValue(self.led_display.value, 0x0000000040 | 0x0000000001), + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_ADDRESS), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ) + ).Elif(((SBUS_3V3_SELs_i == 0) & + (SBUS_3V3_ASs_i == 0) & + (SIZ_HWORD == SBUS_3V3_SIZ_i) & + (SBUS_3V3_PPRD_i == 1)), + NextValue(sbus_oe_master_in, 1), + NextValue(sbus_last_pa, SBUS_3V3_PA_i), + If(SBUS_3V3_PA_i[0:1] != 0, + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_PARITY), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ).Elif(((SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ROM_ADDR_PFX) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), # need to wait for data, don't ACK yet + NextValue(SBUS_3V3_ERRs_o, 1), + NextValue(sbus_wishbone_le, (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(stat_slave_start_counter, stat_slave_start_counter + 1), + If(self.wishbone_master.cyc == 0, + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + NextValue(self.wishbone_master.sel, 2**len(self.wishbone_master.sel)-1), + NextValue(self.wishbone_master.we, 0), + NextValue(self.wishbone_master.adr, Cat(SBUS_3V3_PA_i[2:28], Signal(4, reset = 0))), + NextValue(wishbone_master_timeout, wishbone_default_timeout), + NextValue(sbus_slave_timeout, sbus_default_timeout), + #NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextState("Slave_Ack_Read_Reg_HWord_Wait_For_Data") + ).Else( + NextValue(sbus_slave_timeout, sbus_default_timeout), + 
NextState("Slave_Ack_Read_Reg_HWord_Wait_For_Wishbone") + ) + ).Else( + #NextValue(self.led_display.value, 0x0000000040 | 0x0000000001), + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_ADDRESS), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ) + ).Elif(((SBUS_3V3_SELs_i == 0) & + (SBUS_3V3_ASs_i == 0) & + (siz_is_word(SBUS_3V3_SIZ_i)) & + (SBUS_3V3_PPRD_i == 0)), + NextValue(sbus_oe_master_in, 1), + NextValue(sbus_last_pa, SBUS_3V3_PA_i), + NextValue(burst_counter, 0), + Case(SBUS_3V3_SIZ_i, { + SIZ_WORD: NextValue(burst_limit_m1, 0), + SIZ_BURST2: NextValue(burst_limit_m1, 1), + SIZ_BURST4: NextValue(burst_limit_m1, 3), + SIZ_BURST8: NextValue(burst_limit_m1, 7), + SIZ_BURST16: NextValue(burst_limit_m1, 15) + }), + If(SBUS_3V3_PA_i[0:2] != 0, + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_PARITY), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ).Elif(((SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == WISHBONE_CSR_ADDR_PFX) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == USBOHCI_ADDR_PFX) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXA) | + (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == ENGINE_ADDR_PFXB)), + NextValue(sbus_wishbone_le, (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(stat_slave_start_counter, stat_slave_start_counter + 1), + If(~self.wishbone_master.cyc, + NextValue(SBUS_3V3_ACKs_o, ACK_WORD), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(self.led_display.value, 0x0000000010 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextValue(sbus_slave_timeout, sbus_default_timeout), + 
NextState("Slave_Ack_Reg_Write_Burst") + ).Else( + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextValue(SBUS_3V3_ERRs_o, 1), + NextValue(sbus_slave_timeout, sbus_default_timeout), + NextState("Slave_Ack_Reg_Write_Burst_Wait_For_Wishbone") + ) + ).Else( + #NextValue(self.led_display.value, 0x0000000060 | 0x0000000001), + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_ADDRESS), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ) + ).Elif(((SBUS_3V3_SELs_i == 0) & + (SBUS_3V3_ASs_i == 0) & + (SIZ_BYTE == SBUS_3V3_SIZ_i) & + (SBUS_3V3_PPRD_i == 0)), + NextValue(sbus_oe_master_in, 1), + NextValue(sbus_last_pa, SBUS_3V3_PA_i), + If(((SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(sbus_wishbone_le, (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(stat_slave_start_counter, stat_slave_start_counter + 1), + If(~self.wishbone_master.cyc, + NextValue(SBUS_3V3_ACKs_o, ACK_BYTE), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(self.led_display.value, 0x0000000010 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextValue(sbus_slave_timeout, sbus_default_timeout), + NextState("Slave_Ack_Reg_Write_Byte") + ).Else( + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextValue(SBUS_3V3_ERRs_o, 1), + NextValue(sbus_slave_timeout, sbus_default_timeout), + NextState("Slave_Ack_Reg_Write_Byte_Wait_For_Wishbone") + ) + ).Else( + #NextValue(self.led_display.value, 0x0000000060 | 0x0000000001), + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_ADDRESS), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ) + ).Elif(((SBUS_3V3_SELs_i == 0) & + (SBUS_3V3_ASs_i == 0) & + (SIZ_HWORD == SBUS_3V3_SIZ_i) & + (SBUS_3V3_PPRD_i == 0)), + NextValue(sbus_oe_master_in, 1), + 
NextValue(sbus_last_pa, SBUS_3V3_PA_i), + If(SBUS_3V3_PA_i[0:1] != 0, + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_PARITY), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ).Elif(((SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(sbus_wishbone_le, (SBUS_3V3_PA_i[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH] == SRAM_ADDR_PFX)), + NextValue(stat_slave_start_counter, stat_slave_start_counter + 1), + If(~self.wishbone_master.cyc, + NextValue(SBUS_3V3_ACKs_o, ACK_HWORD), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(self.led_display.value, 0x0000000010 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextValue(sbus_slave_timeout, sbus_default_timeout), + NextState("Slave_Ack_Reg_Write_HWord") + ).Else( + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextValue(SBUS_3V3_ERRs_o, 1), + NextValue(sbus_slave_timeout, sbus_default_timeout), + NextState("Slave_Ack_Reg_Write_HWord_Wait_For_Wishbone") + ) + ).Else( + #NextValue(self.led_display.value, 0x0000000060 | 0x0000000001), + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(led0123, led0123 | LED_ADDRESS), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ) + ).Elif(self.wishbone_slave.cyc & + self.wishbone_slave.stb & + ~self.wishbone_slave.ack & + ~self.wishbone_slave.err & + self.wishbone_slave.we & + (self.wishbone_slave.sel == 0) & + (wishbone_slave_timeout == 0), + ## sel == 0 so nothing to write, don't acquire the SBus + NextValue(self.wishbone_slave.ack, 1), + ).Elif(SBUS_3V3_BGs_i & ## highest priority are retries, otherwise we'd lose the data + master_src_retry & + (master_we == 0) & + (master_src == MASTER_SRC_BLKDMAFIFO) & + (sbus_master_throttle == 0), + NextValue(SBUS_3V3_BRs_o, 0) + ).Elif(~SBUS_3V3_BGs_i & ## highest priority are retries, 
otherwise we'd lose the data + master_src_retry & + (master_we == 0) & + (master_src == MASTER_SRC_BLKDMAFIFO), + NextValue(sbus_wishbone_le, 0), # checkme + NextValue(SBUS_3V3_BRs_o, 1), # relinquish the request + NextValue(sbus_oe_data, 1), ## output data (at least for @ during translation) + NextValue(sbus_oe_slave_in, 1), ## PPRD, SIZ becomes output + NextValue(sbus_oe_master_in, 0), ## ERRs, ACKs are input + NextValue(burst_counter, 0), + NextValue(SBUS_3V3_D_o, sbus_master_last_virtual), + NextValue(SBUS_3V3_PPRD_o, 1), + #NextValue(stat_master_start_counter, stat_master_start_counter + 1), + NextState("Master_Translation"), + ).Elif(SBUS_3V3_BGs_i & + self.wishbone_slave.cyc & + self.wishbone_slave.stb & + ~self.wishbone_slave.ack & + ~self.wishbone_slave.err & + self.wishbone_slave.we & + (sbus_master_throttle == 0) & + (wishbone_slave_timeout == 0), + NextValue(SBUS_3V3_BRs_o, 0) + ).Elif(~SBUS_3V3_BGs_i & + self.wishbone_slave.cyc & + self.wishbone_slave.stb & + ~self.wishbone_slave.ack & + ~self.wishbone_slave.err & + self.wishbone_slave.we, + NextValue(sbus_wishbone_le, 1), # checkme + NextValue(SBUS_3V3_BRs_o, 1), # relinquish the request + NextValue(sbus_oe_data, 1), ## output data (at least for @ during translation) + NextValue(sbus_oe_slave_in, 1), ## PPRD, SIZ becomes output + NextValue(sbus_oe_master_in, 0), ## ERRs, ACKs are input + NextValue(burst_counter, 0), + NextValue(burst_limit_m1, 0), ## only single word for now + NextValue(master_addr, self.wishbone_slave.adr), + NextValue(master_data, Cat(self.wishbone_slave.dat_w[24:32], ## LE + self.wishbone_slave.dat_w[16:24], + self.wishbone_slave.dat_w[ 8:16], + self.wishbone_slave.dat_w[ 0: 8])), + NextValue(master_src, MASTER_SRC_WISHBONE), + Case(self.wishbone_slave.sel, { + 0xf: [NextValue(burst_counter, 0), + NextValue(burst_limit_m1, 0), ## only single word for now + NextValue(master_size, SIZ_WORD), + NextValue(SBUS_3V3_SIZ_o, SIZ_WORD), + NextValue(SBUS_3V3_D_o, Cat(Signal(2, reset = 0), 
self.wishbone_slave.adr)), + NextValue(sbus_master_last_virtual, Cat(Signal(2, reset = 0), self.wishbone_slave.adr)), + ], + 0x1: [NextValue(master_idx, 3), + NextValue(master_size, SIZ_BYTE), + NextValue(SBUS_3V3_SIZ_o, SIZ_BYTE), + NextValue(SBUS_3V3_D_o, Cat(Signal(2, reset = 0), self.wishbone_slave.adr)), + NextValue(sbus_master_last_virtual, Cat(Signal(2, reset = 0), self.wishbone_slave.adr)), + ], + 0x2: [NextValue(master_idx, 2), + NextValue(master_size, SIZ_BYTE), + NextValue(SBUS_3V3_SIZ_o, SIZ_BYTE), + NextValue(SBUS_3V3_D_o, Cat(Signal(2, reset = 1), self.wishbone_slave.adr)), + NextValue(sbus_master_last_virtual, Cat(Signal(2, reset = 1), self.wishbone_slave.adr)), + ], + 0x4: [NextValue(master_idx, 1), + NextValue(master_size, SIZ_BYTE), + NextValue(SBUS_3V3_SIZ_o, SIZ_BYTE), + NextValue(SBUS_3V3_D_o, Cat(Signal(2, reset = 2), self.wishbone_slave.adr)), + NextValue(sbus_master_last_virtual, Cat(Signal(2, reset = 2), self.wishbone_slave.adr)), + ], + 0x8: [NextValue(master_idx, 0), + NextValue(master_size, SIZ_BYTE), + NextValue(SBUS_3V3_SIZ_o, SIZ_BYTE), + NextValue(SBUS_3V3_D_o, Cat(Signal(2, reset = 3), self.wishbone_slave.adr)), + NextValue(sbus_master_last_virtual, Cat(Signal(2, reset = 3), self.wishbone_slave.adr)), + ], + 0x3: [NextValue(master_idx, 2), + NextValue(master_size, SIZ_HWORD), + NextValue(SBUS_3V3_SIZ_o, SIZ_HWORD), + NextValue(SBUS_3V3_D_o, Cat(Signal(2, reset = 0), self.wishbone_slave.adr)), + NextValue(sbus_master_last_virtual, Cat(Signal(2, reset = 0), self.wishbone_slave.adr)), + ], + 0xc: [NextValue(master_idx, 0), + NextValue(master_size, SIZ_HWORD), + NextValue(SBUS_3V3_SIZ_o, SIZ_HWORD), + NextValue(SBUS_3V3_D_o, Cat(Signal(2, reset = 2), self.wishbone_slave.adr)), + NextValue(sbus_master_last_virtual, Cat(Signal(2, reset = 2), self.wishbone_slave.adr)), + ], + "default":[NextValue(burst_counter, 0), # FIXME if it happens! 
+ NextValue(burst_limit_m1, 0), ## only single word for now + NextValue(master_size, SIZ_WORD), + NextValue(SBUS_3V3_SIZ_o, SIZ_WORD), + #NextValue(led0123, self.wishbone_slave.sel) + ] + }), + NextValue(self.wishbone_slave.ack, 1), + NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(SBUS_3V3_PPRD_o, 0), + NextValue(master_we, 1), + #NextValue(self.led_display.value, 0x0000000010 | Cat(Signal(8, reset = 0x00), self.wishbone_slave.adr)), + #NextValue(self.led_display.value, Cat(Signal(8, reset = LED_M_WRITE), Signal(2, reset = 0), self.wishbone_slave.adr)), + NextValue(stat_master_start_counter, stat_master_start_counter + 1), + NextState("Master_Translation") + ).Elif(SBUS_3V3_BGs_i & + self.master_read_buffer_start & + (sbus_master_throttle == 0) & + (wishbone_slave_timeout == 0), + NextValue(SBUS_3V3_BRs_o, 0) + ).Elif(~SBUS_3V3_BGs_i & + self.master_read_buffer_start, + NextValue(sbus_wishbone_le, 1), # checkme + NextValue(SBUS_3V3_BRs_o, 1), # relinquish the request + NextValue(sbus_oe_data, 1), ## output data (at least for @ during translation) + NextValue(sbus_oe_slave_in, 1), ## PPRD, SIZ becomes output + NextValue(sbus_oe_master_in, 0), ## ERRs, ACKs are input + NextValue(burst_counter, 0), + NextValue(burst_limit_m1, 3), ## only quadword word for now + NextValue(SBUS_3V3_D_o, Cat(Signal(4, reset = 0), self.master_read_buffer_addr)), + NextValue(sbus_master_last_virtual, Cat(Signal(4, reset = 0), self.master_read_buffer_addr)), + NextValue(master_src, MASTER_SRC_WISHBONEBUF), + NextValue(SBUS_3V3_PPRD_o, 1), + NextValue(SBUS_3V3_SIZ_o, SIZ_BURST4), + NextValue(master_we, 0), + #NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0x00), self.wishbone_slave.adr)), + #NextValue(self.led_display.value, Cat(Signal(8, reset = LED_M_READ), Signal(2, reset = 0), self.master_read_buffer_addr)), + NextValue(stat_master_start_counter, stat_master_start_counter + 1), + NextState("Master_Translation") + ).Elif(SBUS_3V3_BGs_i & + 
self.tosbus_fifo.readable & + (sbus_master_throttle == 0), + NextValue(SBUS_3V3_BRs_o, 0) + ).Elif(~SBUS_3V3_BGs_i & + self.tosbus_fifo.readable, + NextValue(sbus_wishbone_le, 0), # checkme + NextValue(SBUS_3V3_BRs_o, 1), # relinquish the request + NextValue(sbus_oe_data, 1), ## output data (at least for @ during translation) + NextValue(sbus_oe_slave_in, 1), ## PPRD, SIZ becomes output + NextValue(sbus_oe_master_in, 0), ## ERRs, ACKs are input + NextValue(burst_counter, 0), + NextValue(burst_limit_m1, burst_size - 1), + NextValue(SBUS_3V3_D_o, self.tosbus_fifo.dout[0:32]), + NextValue(sbus_master_last_virtual, self.tosbus_fifo.dout[0:32]), + NextValue(master_addr, self.tosbus_fifo.dout[2:32]), + NextValue(master_data, self.tosbus_fifo.dout[32:64]), + NextValue(fifo_buffer, self.tosbus_fifo.dout[32:]), + NextValue(master_src, MASTER_SRC_BLKDMAFIFO), + self.tosbus_fifo.re.eq(1), + Case(burst_size, { + 2 : [NextValue(SBUS_3V3_SIZ_o, SIZ_BURST2), + NextValue(master_size, SIZ_BURST2)], + 4 : [NextValue(SBUS_3V3_SIZ_o, SIZ_BURST4), + NextValue(master_size, SIZ_BURST4)], + 8 : [NextValue(SBUS_3V3_SIZ_o, SIZ_BURST8), + NextValue(master_size, SIZ_BURST8)], + 16 : [NextValue(SBUS_3V3_SIZ_o, SIZ_BURST16), + NextValue(master_size, SIZ_BURST16)], + }), + NextValue(SBUS_3V3_PPRD_o, 0), + NextValue(master_we, 1), + NextValue(stat_master_start_counter, stat_master_start_counter + 1), + NextState("Master_Translation") + ).Elif(SBUS_3V3_BGs_i & + self.fromsbus_req_fifo.readable & + self.fromsbus_fifo.writable & + (sbus_master_throttle == 0), + NextValue(SBUS_3V3_BRs_o, 0) + ).Elif(~SBUS_3V3_BGs_i & + self.fromsbus_req_fifo.readable & + self.fromsbus_fifo.writable, + NextValue(sbus_wishbone_le, 0), # checkme + NextValue(SBUS_3V3_BRs_o, 1), # relinquish the request + NextValue(sbus_oe_data, 1), ## output data (at least for @ during translation) + NextValue(sbus_oe_slave_in, 1), ## PPRD, SIZ becomes output + NextValue(sbus_oe_master_in, 0), ## ERRs, ACKs are input + 
NextValue(burst_counter, 0), + NextValue(burst_limit_m1, burst_size - 1), + NextValue(SBUS_3V3_D_o, self.fromsbus_req_fifo.dout[blk_addr_width:blk_addr_width+32]), + NextValue(sbus_master_last_virtual, self.fromsbus_req_fifo.dout[blk_addr_width:blk_addr_width+32]), + NextValue(fifo_blk_addr, self.fromsbus_req_fifo.dout[0:blk_addr_width]), + NextValue(master_src, MASTER_SRC_BLKDMAFIFO), + self.fromsbus_req_fifo.re.eq(1), + Case(burst_size, { + 2 : [NextValue(SBUS_3V3_SIZ_o, SIZ_BURST2), + NextValue(master_size, SIZ_BURST2)], + 4 : [NextValue(SBUS_3V3_SIZ_o, SIZ_BURST4), + NextValue(master_size, SIZ_BURST4)], + 8 : [NextValue(SBUS_3V3_SIZ_o, SIZ_BURST8), + NextValue(master_size, SIZ_BURST8)], + 16 : [NextValue(SBUS_3V3_SIZ_o, SIZ_BURST16), + NextValue(master_size, SIZ_BURST16)], + }), + NextValue(SBUS_3V3_PPRD_o, 1), + NextValue(master_we, 0), + NextValue(stat_master_start_counter, stat_master_start_counter + 1), + NextState("Master_Translation") + ).Elif(((SBUS_3V3_SELs_i == 0) & + (SBUS_3V3_ASs_i == 0)), + NextValue(sbus_oe_master_in, 1), + NextValue(SBUS_3V3_ACKs_o, ACK_ERR), + NextValue(SBUS_3V3_ERRs_o, 1), + #NextValue(self.led_display.value, 0x000000000F | Cat(Signal(8, reset = 0x00), SBUS_3V3_PA_i, SBUS_3V3_SIZ_i, SBUS_3V3_PPRD_i)), + #NextValue(led0123, led0123 | LED_UNKNOWNREQ), + NextValue(stat_slave_early_error_counter, stat_slave_early_error_counter + 1), + NextState("Slave_Error") + ).Elif(~SBUS_3V3_BGs_i, + ### ouch we got the bus but nothing more to do ?!? 
+ NextValue(SBUS_3V3_BRs_o, 1), + ).Else( + # FIXME: handle error + ) + ) + # ##### SLAVE READ ##### + # ## BURST (1->16 words) ## + slave_fsm.act("Slave_Do_Read", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x04), self.led_display.value[8:40])), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + If(((SBUS_3V3_ASs_i == 1) | ((SBUS_3V3_ASs_i == 0) & (SBUS_3V3_SELs_i == 1))), + NextValue(stat_slave_done_counter, stat_slave_done_counter + 1), + NextState("Idle") + ) + ) + slave_fsm.act("Slave_Ack_Read_Reg_Burst", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x05), self.led_display.value[8:40])), + NextValue(sbus_oe_data, 1), + NextValue(SBUS_3V3_D_o, p_data), + If((burst_counter == burst_limit_m1), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextState("Slave_Do_Read") + ).Else( + NextValue(burst_counter, burst_counter + 1), + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + NextValue(self.wishbone_master.sel, 2**len(self.wishbone_master.sel)-1), + NextValue(self.wishbone_master.we, 0), + NextValue(wishbone_master_timeout, wishbone_default_timeout), + NextValue(self.wishbone_master.adr, Cat(index_with_wrap(burst_counter+1, burst_limit_m1, sbus_last_pa[ADDR_PHYS_LOW+2:ADDR_PHYS_LOW+6]), # 4 bits, adr FIXME + sbus_last_pa[ADDR_PHYS_LOW+6:ADDR_PFX_LOW], # 10 bits, adr + sbus_last_pa[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH], # 12 bits, adr + Signal(4, reset = 0))), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextState("Slave_Ack_Read_Reg_Burst_Wait_For_Data") + ) + ) + slave_fsm.act("Slave_Ack_Read_Reg_Burst_Wait_For_Data", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x06), self.led_display.value[8:40])), + If(self.wishbone_master.ack, + Case(sbus_wishbone_le, { + 0: NextValue(p_data,self.wishbone_master.dat_r), + 1: NextValue(p_data, Cat(self.wishbone_master.dat_r[24:32], + self.wishbone_master.dat_r[16:24], + self.wishbone_master.dat_r[ 8:16], + 
self.wishbone_master.dat_r[ 0: 8])) + }), + NextValue(self.wishbone_master.cyc, 0), + NextValue(self.wishbone_master.stb, 0), + NextValue(wishbone_master_timeout, 0), + NextValue(sbus_slave_timeout, 0), + NextValue(SBUS_3V3_ACKs_o, ACK_WORD), + NextState("Slave_Ack_Read_Reg_Burst") + ).Elif(sbus_slave_timeout == 0, ### this is taking too long + NextValue(self.wishbone_master.cyc, 0), ## abort transaction + NextValue(self.wishbone_master.stb, 0), + NextValue(wishbone_master_timeout, 0), + NextValue(SBUS_3V3_ACKs_o, ACK_RERUN), + #NextValue(led0123, LED_RERUN | LED_RERUN_WORD | LED_RERUN_LATE), + NextValue(stat_slave_rerun_counter, stat_slave_rerun_counter + 1), + NextState("Slave_Error") + ) + ) + slave_fsm.act("Slave_Ack_Read_Reg_Burst_Wait_For_Wishbone", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x68), self.led_display.value[8:40])), + If(self.wishbone_master.cyc == 0, + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + NextValue(self.wishbone_master.sel, 2**len(self.wishbone_master.sel)-1), + NextValue(self.wishbone_master.we, 0), + NextValue(self.wishbone_master.adr, Cat(sbus_last_pa[2:28], Signal(4, reset = 0))), + NextValue(wishbone_master_timeout, wishbone_default_timeout), + #NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextState("Slave_Ack_Read_Reg_Burst_Wait_For_Data") + ).Elif(sbus_slave_timeout == 0, ### this is taking too long + NextValue(SBUS_3V3_ACKs_o, ACK_RERUN), + #NextValue(led0123, LED_RERUN | LED_RERUN_WORD), + NextValue(stat_slave_rerun_counter, stat_slave_rerun_counter + 1), + NextState("Slave_Error") + ) + ) + # ## HWORD + slave_fsm.act("Slave_Ack_Read_Reg_HWord", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x05), self.led_display.value[8:40])), + NextValue(sbus_oe_data, 1), + NextValue(SBUS_3V3_D_o, p_data), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextState("Slave_Do_Read") + ) + 
slave_fsm.act("Slave_Ack_Read_Reg_HWord_Wait_For_Data", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x06), self.led_display.value[8:40])), + If(self.wishbone_master.ack, + Case(sbus_wishbone_le, { + 0: Case(sbus_last_pa[ADDR_PHYS_LOW+1:ADDR_PHYS_LOW+2], { + 0: NextValue(p_data, Cat(Signal(16, reset = 0), + self.wishbone_master.dat_r[16:32])), + 1: NextValue(p_data, Cat(Signal(16, reset = 0), + self.wishbone_master.dat_r[ 0:16])), + }), + 1: Case(sbus_last_pa[ADDR_PHYS_LOW+1:ADDR_PHYS_LOW+2], { + 1: NextValue(p_data, Cat(Signal(16, reset = 0), + self.wishbone_master.dat_r[24:32], + self.wishbone_master.dat_r[16:24])), + 0: NextValue(p_data, Cat(Signal(16, reset = 0), + self.wishbone_master.dat_r[ 8:16], + self.wishbone_master.dat_r[ 0: 8])), + }) + }), + NextValue(self.wishbone_master.cyc, 0), + NextValue(self.wishbone_master.stb, 0), + NextValue(wishbone_master_timeout, 0), + NextValue(sbus_slave_timeout, 0), + NextValue(SBUS_3V3_ACKs_o, ACK_HWORD), + NextState("Slave_Ack_Read_Reg_HWord") + ).Elif(sbus_slave_timeout == 0, ### this is taking too long + NextValue(self.wishbone_master.cyc, 0), ## abort transaction + NextValue(self.wishbone_master.stb, 0), + NextValue(wishbone_master_timeout, 0), + NextValue(SBUS_3V3_ACKs_o, ACK_RERUN), + #NextValue(led0123, LED_RERUN | LED_RERUN_LATE), + NextValue(stat_slave_rerun_counter, stat_slave_rerun_counter + 1), + NextState("Slave_Error") + ) + ) + slave_fsm.act("Slave_Ack_Read_Reg_HWord_Wait_For_Wishbone", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x68), self.led_display.value[8:40])), + If(self.wishbone_master.cyc == 0, + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + NextValue(self.wishbone_master.sel, 2**len(self.wishbone_master.sel)-1), + NextValue(self.wishbone_master.we, 0), + NextValue(self.wishbone_master.adr, Cat(sbus_last_pa[2:28], Signal(4, reset = 0))), + NextValue(wishbone_master_timeout, wishbone_default_timeout), + 
#NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextState("Slave_Ack_Read_Reg_HWord_Wait_For_Data") + ).Elif(sbus_slave_timeout == 0, ### this is taking too long + NextValue(SBUS_3V3_ACKs_o, ACK_RERUN), + #NextValue(led0123, LED_RERUN), + NextValue(stat_slave_rerun_counter, stat_slave_rerun_counter + 1), + NextState("Slave_Error") + ) + ) + # ## BYTE + slave_fsm.act("Slave_Ack_Read_Reg_Byte", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x05), self.led_display.value[8:40])), + NextValue(sbus_oe_data, 1), + NextValue(SBUS_3V3_D_o, p_data), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextState("Slave_Do_Read") + ) + slave_fsm.act("Slave_Ack_Read_Reg_Byte_Wait_For_Data", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x06), self.led_display.value[8:40])), + If(self.wishbone_master.ack, + Case(sbus_wishbone_le, { + 0: Case(sbus_last_pa[ADDR_PHYS_LOW:ADDR_PHYS_LOW+2], { + 0: NextValue(p_data, Cat(Signal(24, reset = 0), self.wishbone_master.dat_r[24:32])), + 1: NextValue(p_data, Cat(Signal(24, reset = 0), self.wishbone_master.dat_r[16:24])), + 2: NextValue(p_data, Cat(Signal(24, reset = 0), self.wishbone_master.dat_r[ 8:16])), + 3: NextValue(p_data, Cat(Signal(24, reset = 0), self.wishbone_master.dat_r[ 0: 8])), + }), + 1: Case(sbus_last_pa[ADDR_PHYS_LOW:ADDR_PHYS_LOW+2], { + 3: NextValue(p_data, Cat(Signal(24, reset = 0), self.wishbone_master.dat_r[24:32])), + 2: NextValue(p_data, Cat(Signal(24, reset = 0), self.wishbone_master.dat_r[16:24])), + 1: NextValue(p_data, Cat(Signal(24, reset = 0), self.wishbone_master.dat_r[ 8:16])), + 0: NextValue(p_data, Cat(Signal(24, reset = 0), self.wishbone_master.dat_r[ 0: 8])), + }) + }), + NextValue(self.wishbone_master.cyc, 0), + NextValue(self.wishbone_master.stb, 0), + NextValue(wishbone_master_timeout, 0), + NextValue(sbus_slave_timeout, 0), + NextValue(SBUS_3V3_ACKs_o, ACK_BYTE), + NextState("Slave_Ack_Read_Reg_Byte") + 
).Elif(sbus_slave_timeout == 0, ### this is taking too long + NextValue(self.wishbone_master.cyc, 0), ## abort transaction + NextValue(self.wishbone_master.stb, 0), + NextValue(wishbone_master_timeout, 0), + NextValue(SBUS_3V3_ACKs_o, ACK_RERUN), + #NextValue(led0123, LED_RERUN | LED_RERUN_LATE), + NextValue(stat_slave_rerun_counter, stat_slave_rerun_counter + 1), + NextState("Slave_Error") + ) + ) + slave_fsm.act("Slave_Ack_Read_Reg_Byte_Wait_For_Wishbone", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x68), self.led_display.value[8:40])), + If(self.wishbone_master.cyc == 0, + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + NextValue(self.wishbone_master.sel, 2**len(self.wishbone_master.sel)-1), + NextValue(self.wishbone_master.we, 0), + NextValue(self.wishbone_master.adr, Cat(sbus_last_pa[2:28], Signal(4, reset = 0))), + NextValue(wishbone_master_timeout, wishbone_default_timeout), + #NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextState("Slave_Ack_Read_Reg_Byte_Wait_For_Data") + ).Elif(sbus_slave_timeout == 0, ### this is taking too long + NextValue(SBUS_3V3_ACKs_o, ACK_RERUN), + #NextValue(led0123, LED_RERUN), + NextValue(stat_slave_rerun_counter, stat_slave_rerun_counter + 1), + NextState("Slave_Error") + ) + ) + # ##### SLAVE WRITE ##### + # ## BURST (1->16 words) ## + slave_fsm.act("Slave_Ack_Reg_Write_Burst", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x07), self.led_display.value[8:40])), + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + NextValue(self.wishbone_master.sel, 2**len(self.wishbone_master.sel)-1), + NextValue(self.wishbone_master.adr, Cat(index_with_wrap(burst_counter, burst_limit_m1, sbus_last_pa[ADDR_PHYS_LOW+2:ADDR_PHYS_LOW+6]), # 4 bits, adr FIXME + sbus_last_pa[ADDR_PHYS_LOW+6:ADDR_PFX_LOW], # 10 bits, adr + sbus_last_pa[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH], # 12 
bits, adr + Signal(4, reset = 0))), + Case(sbus_wishbone_le, { + 0: NextValue(self.wishbone_master.dat_w, Cat(SBUS_3V3_D_i)), + 1: NextValue(self.wishbone_master.dat_w, Cat(SBUS_3V3_D_i[24:32], + SBUS_3V3_D_i[16:24], + SBUS_3V3_D_i[ 8:16], + SBUS_3V3_D_i[ 0: 8])) + }), + NextValue(self.wishbone_master.we, 1), + NextValue(wishbone_master_timeout, wishbone_default_timeout), + If((burst_counter == burst_limit_m1), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextState("Slave_Ack_Reg_Write_Final") + ).Else( + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextValue(burst_counter, burst_counter + 1), + NextState("Slave_Ack_Reg_Write_Burst_Wait_For_Wishbone"), + ) + ) + slave_fsm.act("Slave_Ack_Reg_Write_Final", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x08), self.led_display.value[8:40])), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + If(((SBUS_3V3_ASs_i == 1) | ((SBUS_3V3_ASs_i == 0) & (SBUS_3V3_SELs_i == 1))), + NextValue(stat_slave_done_counter, stat_slave_done_counter + 1), + NextState("Idle") + ) + ) + slave_fsm.act("Slave_Ack_Reg_Write_Burst_Wait_For_Wishbone", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x68), self.led_display.value[8:40])), + If(self.wishbone_master.cyc == 0, + NextValue(sbus_slave_timeout, 0), + NextValue(SBUS_3V3_ACKs_o, ACK_WORD), + NextState("Slave_Ack_Reg_Write_Burst") + ).Elif(sbus_slave_timeout == 0, ### this is taking too long + NextValue(SBUS_3V3_ACKs_o, ACK_RERUN), + #NextValue(self.led_display.value, Cat(Signal(8, reset = LED_RERUN | LED_RERUN_WRITE | LED_RERUN_WORD), sbus_last_pa, Signal(4, reset = 0))), + #NextValue(led0123, LED_RERUN | LED_RERUN_WRITE | LED_RERUN_WORD), + NextValue(stat_slave_rerun_counter, stat_slave_rerun_counter + 1), + NextState("Slave_Error") + ) + ) + # ## HWORD + slave_fsm.act("Slave_Ack_Reg_Write_HWord", + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + Case(sbus_wishbone_le, { + 0: 
Case(sbus_last_pa[ADDR_PHYS_LOW+1:ADDR_PHYS_LOW+2], { + 0: NextValue(self.wishbone_master.sel, 0xc), + 1: NextValue(self.wishbone_master.sel, 0x3), + }), + 1: Case(sbus_last_pa[ADDR_PHYS_LOW+1:ADDR_PHYS_LOW+2], { + 1: NextValue(self.wishbone_master.sel, 0xc), + 0: NextValue(self.wishbone_master.sel, 0x3), + }), + }), + NextValue(self.wishbone_master.adr, Cat(sbus_last_pa[ADDR_PHYS_LOW+2:ADDR_PHYS_LOW+6], # 4 bits, adr FIXME + sbus_last_pa[ADDR_PHYS_LOW+6:ADDR_PFX_LOW], # 10 bits, adr + sbus_last_pa[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH], # 12 bits, adr + Signal(4, reset = 0))), + Case(sbus_wishbone_le, { + 0: NextValue(self.wishbone_master.dat_w, Cat(SBUS_3V3_D_i[16:32], + SBUS_3V3_D_i[16:32])), + 1: NextValue(self.wishbone_master.dat_w, Cat(SBUS_3V3_D_i[24:32], + SBUS_3V3_D_i[16:24], + SBUS_3V3_D_i[24:32], + SBUS_3V3_D_i[16:24])), + }), + NextValue(self.wishbone_master.we, 1), + NextValue(wishbone_master_timeout, wishbone_default_timeout), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextState("Slave_Ack_Reg_Write_Final") + ) + slave_fsm.act("Slave_Ack_Reg_Write_HWord_Wait_For_Wishbone", + If(self.wishbone_master.cyc == 0, + NextValue(sbus_slave_timeout, 0), + NextValue(SBUS_3V3_ACKs_o, ACK_HWORD), + NextState("Slave_Ack_Reg_Write_HWord") + ).Elif(sbus_slave_timeout == 0, ### this is taking too long + NextValue(SBUS_3V3_ACKs_o, ACK_RERUN), + #NextValue(led0123, LED_RERUN | LED_RERUN_WRITE), + NextValue(stat_slave_rerun_counter, stat_slave_rerun_counter + 1), + NextState("Slave_Error") + ) + ) + # ## BYTE + slave_fsm.act("Slave_Ack_Reg_Write_Byte", + NextValue(self.wishbone_master.cyc, 1), + NextValue(self.wishbone_master.stb, 1), + Case(sbus_wishbone_le, { + 0: Case(sbus_last_pa[ADDR_PHYS_LOW:ADDR_PHYS_LOW+2], { + 0: NextValue(self.wishbone_master.sel, 0x8), + 1: NextValue(self.wishbone_master.sel, 0x4), + 2: NextValue(self.wishbone_master.sel, 0x2), + 3: NextValue(self.wishbone_master.sel, 0x1), + }), + 1: Case(sbus_last_pa[ADDR_PHYS_LOW:ADDR_PHYS_LOW+2], { + 3: 
NextValue(self.wishbone_master.sel, 0x8), + 2: NextValue(self.wishbone_master.sel, 0x4), + 1: NextValue(self.wishbone_master.sel, 0x2), + 0: NextValue(self.wishbone_master.sel, 0x1), + }), + }), + NextValue(self.wishbone_master.adr, Cat(sbus_last_pa[ADDR_PHYS_LOW+2:ADDR_PHYS_LOW+6], # 4 bits, adr FIXME + sbus_last_pa[ADDR_PHYS_LOW+6:ADDR_PFX_LOW], # 10 bits, adr + sbus_last_pa[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH], # 12 bits, adr + Signal(4, reset = 0))), + NextValue(self.wishbone_master.dat_w, Cat(SBUS_3V3_D_i[24:32], # LE/BE identical + SBUS_3V3_D_i[24:32], + SBUS_3V3_D_i[24:32], + SBUS_3V3_D_i[24:32])), + NextValue(self.wishbone_master.we, 1), + NextValue(wishbone_master_timeout, wishbone_default_timeout), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextState("Slave_Ack_Reg_Write_Final") + ) + slave_fsm.act("Slave_Ack_Reg_Write_Byte_Wait_For_Wishbone", + If(self.wishbone_master.cyc == 0, + NextValue(sbus_slave_timeout, 0), + NextValue(SBUS_3V3_ACKs_o, ACK_BYTE), + NextState("Slave_Ack_Reg_Write_Byte") + ).Elif(sbus_slave_timeout == 0, ### this is taking too long + NextValue(SBUS_3V3_ACKs_o, ACK_RERUN), + #NextValue(led0123, LED_RERUN | LED_RERUN_WRITE), + NextValue(stat_slave_rerun_counter, stat_slave_rerun_counter + 1), + NextState("Slave_Error") + ) + ) + # ##### SLAVE ERROR ##### + slave_fsm.act("Slave_Error", + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + #NextValue(self.led_display.value, 0x0000000080 | self.led_display.value), + If(((SBUS_3V3_ASs_i == 1) | ((SBUS_3V3_ASs_i == 0) & (SBUS_3V3_SELs_i == 1))), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(sbus_slave_timeout, 0), + NextState("Idle") + ) + ) + # ##### MASTER ##### + slave_fsm.act("Master_Translation", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x09), self.led_display.value[8:40])), + If(master_we, + NextValue(sbus_oe_data, 1), + Case(master_size, { + SIZ_BURST2: NextValue(SBUS_3V3_D_o, master_data), + SIZ_BURST4: 
NextValue(SBUS_3V3_D_o, master_data), + SIZ_BURST8: NextValue(SBUS_3V3_D_o, master_data), + SIZ_BURST16: NextValue(SBUS_3V3_D_o, master_data), + SIZ_WORD: NextValue(SBUS_3V3_D_o, master_data), + SIZ_BYTE: Case(master_idx, { + 0: NextValue(SBUS_3V3_D_o, Cat(master_data[ 0: 8], + master_data[ 0: 8], + master_data[ 0: 8], + master_data[ 0: 8],)), + 1: NextValue(SBUS_3V3_D_o, Cat(master_data[ 8:16], + master_data[ 8:16], + master_data[ 8:16], + master_data[ 8:16],)), + 2: NextValue(SBUS_3V3_D_o, Cat(master_data[16:24], + master_data[16:24], + master_data[16:24], + master_data[16:24],)), + 3: NextValue(SBUS_3V3_D_o, Cat(master_data[24:32], + master_data[24:32], + master_data[24:32], + master_data[24:32],)), + }), + SIZ_HWORD: Case(master_idx, { + 0: NextValue(SBUS_3V3_D_o, Cat(master_data[ 0:16], + master_data[ 0:16],)), + 2: NextValue(SBUS_3V3_D_o, Cat(master_data[16:32], + master_data[16:32],)), + }) + }), + Case(master_src, { + MASTER_SRC_BLKDMAFIFO: + [NextValue(master_data, fifo_buffer[32:64]), # 0:32 is on the bus already + ], + }), + ).Else( + NextValue(sbus_oe_data, 0) + ), + Case(SBUS_3V3_ACKs_i, { + ACK_ERR: ## ouch + [Case(master_src, { + MASTER_SRC_WISHBONE: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(self.wishbone_slave.err, 1), + ], + MASTER_SRC_WISHBONEBUF: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(self.wishbone_slave.err, 1), + ], + }), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_error_counter, stat_master_error_counter + 1), + NextValue(sbus_master_error_virtual, sbus_master_last_virtual), + NextState("Idle")], + ACK_RERUN: ### dunno how to handle that yet, + [Case(master_src, { + MASTER_SRC_WISHBONE: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(self.wishbone_slave.err, 1), + ], + MASTER_SRC_WISHBONEBUF: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + 
NextValue(self.wishbone_slave.err, 1), + ], + }), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_rerun_counter, stat_master_rerun_counter + 1), + NextState("Idle")], + ACK_IDLE: + [If(master_we, + NextState("Master_Write"), + ## FIXME: in burst mode, should update master_data with the next value + ## FIXME: we don't do burst mode yet + ## FIXME: actually now from FIFO is handled above + ).Else( + NextState("Master_Read") + )], + "default": + [If(SBUS_3V3_BGs_i, ## oups, we lost our bus access without error ?!? + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextState("Idle") + )], + }) + ) + slave_fsm.act("Master_Read", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x0a), self.led_display.value[8:40])), + Case(SBUS_3V3_ACKs_i, { + ACK_WORD: + [NextState("Master_Read_Ack") + ], + ACK_IDLE: + [NextState("Master_Read") ## redundant + ], + ACK_RERUN: ### burst not handled + [Case(master_src, { + MASTER_SRC_WISHBONE: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(self.wishbone_slave.err, 1), + ], + MASTER_SRC_WISHBONEBUF: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(self.wishbone_slave.err, 1), + ], + MASTER_SRC_BLKDMAFIFO: + [NextValue(master_src_retry, 1), + ], + }), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_rerun_counter, stat_master_rerun_counter + 1), + NextState("Idle") + ], + ACK_ERR: ## ### burst not handled + [Case(master_src, { + MASTER_SRC_WISHBONE: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(self.wishbone_slave.err, 1), + ], + MASTER_SRC_WISHBONEBUF: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(self.wishbone_slave.err, 1), + ], + MASTER_SRC_BLKDMAFIFO: + [NextValue(master_src_retry, ~master_src_retry), # only retry if 
this wasn't a retry + ], + }), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_error_counter, stat_master_error_counter + 1), + NextValue(sbus_master_error_virtual, sbus_master_last_virtual), + NextState("Idle") + ], + "default": ## other ### burst not handled + [Case(master_src, { + MASTER_SRC_WISHBONE: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(self.wishbone_slave.err, 1), + ], + MASTER_SRC_WISHBONEBUF: + [NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextValue(self.wishbone_slave.err, 1), + ], + }), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_error_counter, stat_master_error_counter + 1), + NextState("Idle") + ], + }) + ) + slave_fsm.act("Master_Read_Ack", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x0b), self.led_display.value[8:40])), + Case(master_src, { + MASTER_SRC_BLKDMAFIFO: + [Case(burst_counter, { + 0: NextValue(fifo_buffer[0:32], SBUS_3V3_D_i), + 1: NextValue(fifo_buffer[32:64], SBUS_3V3_D_i), + 2: NextValue(fifo_buffer[64:96], SBUS_3V3_D_i), + 3: NextValue(fifo_buffer[96:128], SBUS_3V3_D_i), + 4: NextValue(fifo_buffer[128:160], SBUS_3V3_D_i), + 5: NextValue(fifo_buffer[160:192], SBUS_3V3_D_i), + 6: NextValue(fifo_buffer[192:224], SBUS_3V3_D_i), + 7: NextValue(fifo_buffer[224:256], SBUS_3V3_D_i), +# 8: NextValue(fifo_buffer[256:288], SBUS_3V3_D_i), +# 9: NextValue(fifo_buffer[288:320], SBUS_3V3_D_i), +# 10: NextValue(fifo_buffer[320:352], SBUS_3V3_D_i), +# 11: NextValue(fifo_buffer[352:384], SBUS_3V3_D_i), +# 12: NextValue(fifo_buffer[384:416], SBUS_3V3_D_i), +# 13: NextValue(fifo_buffer[416:448], SBUS_3V3_D_i), +# 14: NextValue(fifo_buffer[448:480], SBUS_3V3_D_i), +# 15: NextValue(fifo_buffer[480:512], SBUS_3V3_D_i), + }), + ], + MASTER_SRC_WISHBONEBUF: + [NextValue(self.master_read_buffer_data[burst_counter[0:2]], SBUS_3V3_D_i), + 
NextValue(self.master_read_buffer_done[burst_counter[0:2]], 1), + ], + }), + NextValue(burst_counter, burst_counter + 1), + If(burst_counter == burst_limit_m1, + Case(master_src, { + MASTER_SRC_WISHBONEBUF: + [NextValue(self.master_read_buffer_start, 0), + ], + }), + NextState("Master_Read_Finish") + ).Else( + Case(SBUS_3V3_ACKs_i, { + ACK_WORD: NextState("Master_Read_Ack"), ## redundant + ACK_IDLE: NextState("Master_Read"), + ACK_RERUN: ### dunno how to handle that yet + [NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_rerun_counter, stat_master_rerun_counter + 1), + NextState("Idle") + ], + ACK_ERR: + [NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_error_counter, stat_master_error_counter + 1), + NextValue(sbus_master_error_virtual, sbus_master_last_virtual), + NextState("Idle") + ], + "default": + [NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_error_counter, stat_master_error_counter + 1), + NextState("Idle") + ], + }), + ) + ) + slave_fsm.act("Master_Read_Finish", ## missing the handling of late error + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x0c), self.led_display.value[8:40])), + Case(master_src, { + MASTER_SRC_BLKDMAFIFO: + [fromsbus_fifo.we.eq(1), + fromsbus_fifo.din.eq(Cat(fifo_blk_addr, fifo_buffer)), + ], + }), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(sbus_master_throttle, sbus_default_master_throttle), + NextValue(stat_master_done_counter, stat_master_done_counter + 1), + NextValue(master_src_retry, 0), + NextState("Idle") + ) + slave_fsm.act("Master_Write", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x0d), self.led_display.value[8:40])), + Case(SBUS_3V3_ACKs_i, { + ACK_WORD: # FIXME: check againt master_size ? 
+ [If(burst_counter == burst_limit_m1, + NextState("Master_Write_Final"), + ).Else( + NextValue(SBUS_3V3_D_o, master_data), + NextValue(burst_counter, burst_counter + 1), + Case(master_src, { + MASTER_SRC_BLKDMAFIFO: + [Case(burst_counter, { #0:32 just ack'd, 32:64 is on the bus now, burst_counter will only increment for the next cycle, so we're two steps ahead + 0: NextValue(master_data, fifo_buffer[64:96]), + 1: NextValue(master_data, fifo_buffer[96:128]), + 2: NextValue(master_data, fifo_buffer[128:160]), + 3: NextValue(master_data, fifo_buffer[160:192]), + 4: NextValue(master_data, fifo_buffer[192:224]), + 5: NextValue(master_data, fifo_buffer[224:256]), +# 6: NextValue(master_data, fifo_buffer[256:288]), +# 7: NextValue(master_data, fifo_buffer[288:320]), +# 8: NextValue(master_data, fifo_buffer[320:352]), +# 9: NextValue(master_data, fifo_buffer[352:384]), +# 10: NextValue(master_data, fifo_buffer[384:416]), +# 11: NextValue(master_data, fifo_buffer[416:448]), +# 12: NextValue(master_data, fifo_buffer[448:480]), +# 13: NextValue(master_data, fifo_buffer[480:512]), + #14: NextValue(master_data, fifo_buffer[512:544]), + #15: NextValue(master_data, fifo_buffer[544:576]), + "default": NextValue(master_data, 0), + }) + ], + }), + )], + ACK_BYTE: # FIXME: check againt master_size ? + [NextState("Master_Write_Final"), + ], + ACK_HWORD: # FIXME: check againt master_size ? 
+ [NextState("Master_Write_Final"), + ], + ACK_IDLE: + [NextState("Master_Write") ## redundant + ], + ACK_RERUN: ### dunno how to handle that yet + [NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_rerun_counter, stat_master_rerun_counter + 1), + NextState("Idle") + ], + ACK_ERR: ## ACK_ERRS or other + [NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_error_counter, stat_master_error_counter + 1), + NextValue(sbus_master_error_virtual, sbus_master_last_virtual), + NextState("Idle"), + ], + "default": ## other + [NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(stat_master_error_counter, stat_master_error_counter + 1), + NextState("Idle"), + ], + }) + ) + slave_fsm.act("Master_Write_Final", + #NextValue(self.led_display.value, Cat(Signal(8, reset = 0x0e), self.led_display.value[8:40])), + NextValue(sbus_oe_data, 0), + NextValue(sbus_oe_slave_in, 0), + NextValue(sbus_oe_master_in, 0), + NextValue(sbus_master_throttle, sbus_default_master_throttle), + NextValue(stat_master_done_counter, stat_master_done_counter + 1), + NextValue(master_src_retry, 0), + NextState("Idle") + ) + # ##### FINISHED ##### + + + # ##### FSMs to finish wishbone transactions asynchronously + + self.submodules.wishbone_master_wait_fsm = wishbone_master_wait_fsm = FSM(reset_state="Reset") + wishbone_master_wait_fsm.act("Reset", + NextState("Idle") + ) + wishbone_master_wait_fsm.act("Idle", + If(wishbone_master_timeout != 0, + NextValue(wishbone_master_timeout, wishbone_master_timeout -1) + ), + If(self.wishbone_master.cyc & self.wishbone_master.stb & self.wishbone_master.we, + If(self.wishbone_master.ack,# | (wishbone_master_timeout == 0), + #If(~self.wishbone_master.ack, + # NextValue(led7, 1) + #), + NextValue(self.wishbone_master.cyc, 0), + NextValue(self.wishbone_master.stb, 0), + 
NextValue(self.wishbone_master.we, 0), + NextValue(wishbone_master_timeout, 0) + ) + ) + ) + + + self.submodules.wishbone_slave_wait_fsm = wishbone_slave_wait_fsm = FSM(reset_state="Reset") + wishbone_slave_wait_fsm.act("Reset", + NextState("Idle") + ) + wishbone_slave_wait_fsm.act("Idle", + If(wishbone_slave_timeout != 0, + NextValue(wishbone_slave_timeout, wishbone_slave_timeout -1) + ), + If(self.wishbone_slave.ack & self.wishbone_slave.we, + If((~self.wishbone_slave.stb), # | (wishbone_slave_timeout == 0), #~self.wishbone_slave.cyc & + NextValue(self.wishbone_slave.ack, 0), + NextValue(wishbone_slave_timeout, 0) + ) + ), + If(self.wishbone_slave.ack & ~self.wishbone_slave.we, + If((~self.wishbone_slave.stb), # | (wishbone_slave_timeout == 0), #~self.wishbone_slave.cyc & + NextValue(self.wishbone_slave.ack, 0), + NextValue(wishbone_slave_timeout, 0) + ) + ), + If(self.wishbone_slave.err, + If((~self.wishbone_slave.stb), # | (wishbone_slave_timeout == 0), #~self.wishbone_slave.cyc & + NextValue(self.wishbone_slave.err, 0), + NextValue(wishbone_slave_timeout, 0) + ) + ) + ) + + self.submodules.sbus_slave_wait_fsm = sbus_slave_wait_fsm = FSM(reset_state="Reset") + sbus_slave_wait_fsm.act("Reset", + NextState("Idle") + ) + sbus_slave_wait_fsm.act("Idle", + If(sbus_slave_timeout != 0, + NextValue(sbus_slave_timeout, sbus_slave_timeout -1) + ), + ) + + # ##### FIXME: debug only? 
+ self.submodules.sbus_master_throttle_fsm = sbus_master_throttle_fsm = FSM(reset_state="Reset") + sbus_master_throttle_fsm.act("Reset", + NextState("Idle") + ) + sbus_master_throttle_fsm.act("Idle", + If(sbus_master_throttle != 0, + NextValue(sbus_master_throttle, sbus_master_throttle -1) + ), + ) + + # ##### Slave read buffering FSM #### + last_read_word_idx = Signal(2) + self.submodules.wishbone_slave_read_buffering_fsm = wishbone_slave_read_buffering_fsm = FSM(reset_state="Reset") + #self.sync += platform.request("user_led", 0).eq(~wishbone_slave_read_buffering_fsm.ongoing("Idle")) + #self.sync += platform.request("user_led", 1).eq(self.master_read_buffer_done[last_read_word_idx]) + wishbone_slave_read_buffering_fsm.act("Reset", + NextState("Idle") + ) + wishbone_slave_read_buffering_fsm.act("Idle", + If(self.wishbone_slave.cyc & + self.wishbone_slave.stb & + ~self.wishbone_slave.ack & + ~self.wishbone_slave.err & + ~self.wishbone_slave.we & + (wishbone_slave_timeout == 0), + #led3.eq(1), + If((self.master_read_buffer_addr == self.wishbone_slave.adr[2:30]) & + (self.master_read_buffer_done[self.wishbone_slave.adr[0:2]]) & + (~self.master_read_buffer_read[self.wishbone_slave.adr[0:2]]), + ## use cache + NextValue(self.wishbone_slave.ack, 1), + NextValue(self.wishbone_slave.dat_r, Cat(self.master_read_buffer_data[self.wishbone_slave.adr[0:2]][24:32], # LE + self.master_read_buffer_data[self.wishbone_slave.adr[0:2]][16:24], + self.master_read_buffer_data[self.wishbone_slave.adr[0:2]][ 8:16], + self.master_read_buffer_data[self.wishbone_slave.adr[0:2]][ 0: 8])), +# NextValue(self.wishbone_slave.dat_r, self.master_read_buffer_data[self.wishbone_slave.adr[0:2]]), + #NextValue(self.led_display.value, Cat(Signal(8, reset = LED_M_READ | LED_M_CACHE), Signal(2, reset = 0), self.wishbone_slave.adr)), + NextValue(self.master_read_buffer_read[self.wishbone_slave.adr[0:2]], 1), + NextValue(wishbone_slave_timeout, wishbone_default_timeout) + 
).Elif(~self.master_read_buffer_start, + #led2.eq(1), + NextValue(self.master_read_buffer_addr, self.wishbone_slave.adr[2:30]), + NextValue(self.master_read_buffer_done[0], 0), + NextValue(self.master_read_buffer_done[1], 0), + NextValue(self.master_read_buffer_done[2], 0), + NextValue(self.master_read_buffer_done[3], 0), + NextValue(self.master_read_buffer_read[0], 0), + NextValue(self.master_read_buffer_read[1], 0), + NextValue(self.master_read_buffer_read[2], 0), + NextValue(self.master_read_buffer_read[3], 0), + NextValue(last_read_word_idx, self.wishbone_slave.adr[0:2]), + NextValue(self.master_read_buffer_start, 1), + NextState("WaitForData") + ).Else( + #led1.eq(self.master_read_buffer_start) + ) + ) + ) + wishbone_slave_read_buffering_fsm.act("WaitForData", + #led2.eq(1), + If(self.master_read_buffer_done[last_read_word_idx], + NextValue(self.wishbone_slave.ack, 1), + NextValue(self.wishbone_slave.dat_r, Cat(self.master_read_buffer_data[last_read_word_idx][24:32], # LE + self.master_read_buffer_data[last_read_word_idx][16:24], + self.master_read_buffer_data[last_read_word_idx][ 8:16], + self.master_read_buffer_data[last_read_word_idx][ 0: 8])), +# NextValue(self.wishbone_slave.dat_r, self.master_read_buffer_data[last_read_word_idx]), + NextValue(self.master_read_buffer_read[last_read_word_idx], 1), + NextValue(wishbone_slave_timeout, wishbone_default_timeout), + NextState("Idle") + ), + If(self.wishbone_slave.err, + NextState("Idle") + ) + ) + + + #last_write_word_idx = Signal(2) + #last_write_timeout = Signal(3) + #self.submodules.wishbone_slave_write_buffering_fsm = wishbone_slave_write_buffering_fsm = FSM(reset_state="Reset") + #wishbone_slave_write_buffering_fsm.act("Reset", + # NextState("Idle") + #) + #wishbone_slave_write_buffering_fsm.act("Idle", + # If(self.wishbone_slave.cyc & + # self.wishbone_slave.stb & + # ~self.wishbone_slave.ack & + # ~self.wishbone_slave.err & + # (self.wishbone_slave.sel == 0xf) & # Full Words Only + # 
self.wishbone_slave.we, + # NextValue(self.master_write_buffer_addr, self.wishbone_slave.adr[2:30]), + # NextValue(self.master_write_buffer_data[self.wishbone_slave.adr[0:2]], + # Cat(self.wishbone_slave.dat_w[24:32], # LE + # self.wishbone_slave.dat_w[16:24], + # self.wishbone_slave.dat_w[ 8:16], + # self.wishbone_slave.dat_w[ 0: 8])), + # NextValue(self.master_write_buffer_todo[self.wishbone_slave.adr[0:2]], 1), + # NextValue(self.wishbone_slave.ack, 1), + # NextValue(last_write_word_idx, self.wishbone_slave.adr[0:2]), + # NextValue(wishbone_slave_timeout, wishbone_default_timeout), + # If(self.wishbone_slave.adr[0:2] == 0, + # NextValue(last_write_timeout, 5), # CHECKME: 5 is arbitrary + # NextState("WaitForMoreData"), + # ).Else( + # NextValue(self.master_write_buffer_start, 1), + # NextState("WaitForWrite"), + # ) + # ) + #) + #wishbone_slave_write_buffering_fsm.act("WaitForMoreData", + # If(last_write_timeout > 0, + # NextValue(last_write_timeout, last_write_timeout - 1), + # ), + # If(self.wishbone_slave.cyc & + # self.wishbone_slave.stb & + # ~self.wishbone_slave.ack & + # ~self.wishbone_slave.err & + # self.wishbone_slave.we, + # If(((self.wishbone_slave.adr[2:30] != self.master_write_buffer_addr) | + # (self.wishbone_slave.sel != 0xf)), + # NextValue(self.master_write_buffer_start, 1), + # NextState("WaitForWrite"), + # ).Else( + # NextValue(self.master_write_buffer_data[self.wishbone_slave.adr[0:2]], + # Cat(self.wishbone_slave.dat_w[24:32], # LE + # self.wishbone_slave.dat_w[16:24], + # self.wishbone_slave.dat_w[ 8:16], + # self.wishbone_slave.dat_w[ 0: 8])), + # NextValue(self.master_write_buffer_todo[self.wishbone_slave.adr[0:2]], 1), + # NextValue(self.wishbone_slave.ack, 1), + # NextValue(last_write_word_idx, self.wishbone_slave.adr[0:2]), + # NextValue(wishbone_slave_timeout, wishbone_default_timeout), + # NextValue(last_write_timeout, 5), # CHECKME: 5 is arbitrary + # ) + # ).Elif(self.master_write_buffer_todo[0] & + # 
self.master_write_buffer_todo[1] & + # self.master_write_buffer_todo[2] & + # self.master_write_buffer_todo[3], + # NextValue(self.master_write_buffer_start, 1), + # NextState("WaitForWrite"), + # ).Elif(last_write_timeout == 0, + # NextValue(self.master_write_buffer_start, 1), + # NextState("WaitForWrite"), + # ) + #) + #wishbone_slave_write_buffering_fsm.act("WaitForWrite", + # If(self.master_write_buffer_start == 0, + # NextState("Idle"), + # ) + #) + + + self.stat_cycle_counter = Signal(32) + self.buf_stat_cycle_counter = Signal(32) + self.buf_stat_slave_start_counter = Signal(32) + self.buf_stat_slave_done_counter = Signal(32) + self.buf_stat_slave_rerun_counter = Signal(32) + self.buf_stat_slave_early_error_counter = Signal(32) + self.buf_stat_master_start_counter = Signal(32) + self.buf_stat_master_done_counter = Signal(32) + self.buf_stat_master_error_counter = Signal(32) + self.buf_stat_master_rerun_counter = Signal(32) + self.buf_sbus_master_error_virtual = Signal(32) + self.stat_update = Signal() + stat_update_prev = Signal() + + self.sync += stat_update_prev.eq(self.stat_update) + + self.sync += self.stat_cycle_counter.eq(self.stat_cycle_counter + 1) + self.sync += If(~stat_update_prev & self.stat_update, ## raising edge: copy to buffer and reset active + self.buf_stat_cycle_counter.eq(self.stat_cycle_counter), + self.buf_stat_slave_start_counter.eq(stat_slave_start_counter), + self.buf_stat_slave_done_counter.eq(stat_slave_done_counter), + self.buf_stat_slave_rerun_counter.eq(stat_slave_rerun_counter), + self.buf_stat_slave_early_error_counter.eq(stat_slave_early_error_counter), + self.buf_stat_master_start_counter.eq(stat_master_start_counter), + self.buf_stat_master_done_counter.eq(stat_master_done_counter), + self.buf_stat_master_error_counter.eq(stat_master_error_counter), + self.buf_stat_master_rerun_counter.eq(stat_master_rerun_counter), + self.buf_sbus_master_error_virtual.eq(sbus_master_error_virtual), + self.stat_cycle_counter.eq(0), + 
stat_slave_start_counter.eq(0), + stat_slave_done_counter.eq(0), + stat_slave_rerun_counter.eq(0), + stat_slave_early_error_counter.eq(0), + stat_master_start_counter.eq(0), + stat_master_done_counter.eq(0), + stat_master_error_counter.eq(0), + stat_master_rerun_counter.eq(0), + sbus_master_error_virtual.eq(0), + ) + self.sync += If(stat_update_prev & ~self.stat_update, ## falling edge: reset buffer + self.buf_stat_cycle_counter.eq(0), + self.buf_stat_slave_start_counter.eq(0), + self.buf_stat_slave_done_counter.eq(0), + self.buf_stat_slave_rerun_counter.eq(0), + self.buf_stat_slave_early_error_counter.eq(0), + self.buf_stat_master_start_counter.eq(0), + self.buf_stat_master_done_counter.eq(0), + self.buf_stat_master_error_counter.eq(0), + self.buf_stat_master_rerun_counter.eq(0), + self.buf_sbus_master_error_virtual.eq(0), + ) diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsmstat.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsmstat.py new file mode 100644 index 0000000..6ae0230 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsmstat.py @@ -0,0 +1,59 @@ +from migen import * +from migen.genlib.cdc import BusSynchronizer +from litex.soc.interconnect.csr import * +from litex.soc.interconnect import wishbone + +class SBusFPGABusStat(Module, AutoCSR): + def __init__(self, sbus_bus): + self.stat_ctrl = CSRStorage(fields = [CSRField("update", 1, description = "update")]) + self.submodules.sync_update = BusSynchronizer(width = 1, idomain="sys", odomain="sbus") + self.comb += self.sync_update.i.eq(self.stat_ctrl.fields.update) + self.comb += sbus_bus.stat_update.eq(self.sync_update.o) + + self.live_stat_cycle_counter = CSRStatus(32, description="live_stat_cycle_counter") + self.stat_cycle_counter = CSRStatus(32, description="stat_cycle_counter") + self.stat_slave_start_counter = CSRStatus(32, description="stat_slave_start_counter") + self.stat_slave_done_counter = CSRStatus(32, description="stat_slave_done_counter") + self.stat_slave_rerun_counter = 
CSRStatus(32, description="stat_slave_rerun_counter") + self.stat_slave_early_error_counter = CSRStatus(32, description="stat_slave_early_error_counter") + self.stat_master_start_counter = CSRStatus(32, description="stat_master_start_counter") + self.stat_master_done_counter = CSRStatus(32, description="stat_master_done_counter") + self.stat_master_error_counter = CSRStatus(32, description="stat_master_error_counter") + self.stat_master_rerun_counter = CSRStatus(32, description="stat_master_rerun_counter") + self.sbus_master_error_virtual = CSRStatus(32, description="sbus_master_error_virtual") + + self.submodules.sync_live_stat_cycle_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys") + self.comb += self.sync_live_stat_cycle_counter.i.eq(sbus_bus.stat_cycle_counter) + self.comb += self.live_stat_cycle_counter.status.eq(self.sync_live_stat_cycle_counter.o) + + self.submodules.sync_stat_cycle_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys") + self.comb += self.sync_stat_cycle_counter.i.eq(sbus_bus.buf_stat_cycle_counter) + self.comb += self.stat_cycle_counter.status.eq(self.sync_stat_cycle_counter.o) + + self.submodules.sync_stat_slave_start_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys"); + self.comb += self.sync_stat_slave_start_counter.i.eq(sbus_bus.buf_stat_slave_start_counter) + self.comb += self.stat_slave_start_counter.status.eq(self.sync_stat_slave_start_counter.o) + self.submodules.sync_stat_slave_done_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys"); + self.comb += self.sync_stat_slave_done_counter.i.eq(sbus_bus.buf_stat_slave_done_counter) + self.comb += self.stat_slave_done_counter.status.eq(self.sync_stat_slave_done_counter.o) + self.submodules.sync_stat_slave_rerun_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys"); + self.comb += self.sync_stat_slave_rerun_counter.i.eq(sbus_bus.buf_stat_slave_rerun_counter) + self.comb += 
self.stat_slave_rerun_counter.status.eq(self.sync_stat_slave_rerun_counter.o) + self.submodules.sync_stat_slave_early_error_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys"); + self.comb += self.sync_stat_slave_early_error_counter.i.eq(sbus_bus.buf_stat_slave_early_error_counter) + self.comb += self.stat_slave_early_error_counter.status.eq(self.sync_stat_slave_early_error_counter.o) + self.submodules.sync_stat_master_start_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys"); + self.comb += self.sync_stat_master_start_counter.i.eq(sbus_bus.buf_stat_master_start_counter) + self.comb += self.stat_master_start_counter.status.eq(self.sync_stat_master_start_counter.o) + self.submodules.sync_stat_master_done_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys"); + self.comb += self.sync_stat_master_done_counter.i.eq(sbus_bus.buf_stat_master_done_counter) + self.comb += self.stat_master_done_counter.status.eq(self.sync_stat_master_done_counter.o) + self.submodules.sync_stat_master_error_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys"); + self.comb += self.sync_stat_master_error_counter.i.eq(sbus_bus.buf_stat_master_error_counter) + self.comb += self.stat_master_error_counter.status.eq(self.sync_stat_master_error_counter.o) + self.submodules.sync_stat_master_rerun_counter = BusSynchronizer(width = 32, idomain="sbus", odomain="sys"); + self.comb += self.sync_stat_master_rerun_counter.i.eq(sbus_bus.buf_stat_master_rerun_counter) + self.comb += self.stat_master_rerun_counter.status.eq(self.sync_stat_master_rerun_counter.o) + self.submodules.sync_sbus_master_error_virtual = BusSynchronizer(width = 32, idomain="sbus", odomain="sys"); + self.comb += self.sync_sbus_master_error_virtual.i.eq(sbus_bus.buf_sbus_master_error_virtual) + self.comb += self.sbus_master_error_virtual.status.eq(self.sync_sbus_master_error_virtual.o) diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py 
b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py new file mode 100644 index 0000000..7467382 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py @@ -0,0 +1,350 @@ +import os +import argparse +from migen import * +import litex +from litex.build.generic_platform import * +from litex.build.xilinx.vivado import vivado_build_args, vivado_build_argdict +from litex.soc.integration.soc import * +from litex.soc.integration.soc_core import * +from litex.soc.integration.builder import * +from litex.soc.interconnect import wishbone +from litex.soc.cores.clock import * +from litex.soc.cores.led import LedChaser +import ztex213_sbus +from migen.genlib.fifo import * + +from litedram.modules import MT41J128M16 +from litedram.phy import s7ddrphy + +from sbus_to_fpga_fsm import * +from sbus_to_fpga_fsmstat import * +from sbus_to_fpga_blk_dma import * +from sbus_to_fpga_trng import * + +from litedram.frontend.dma import * + +from engine import Engine; +from migen.genlib.cdc import BusSynchronizer +from migen.genlib.resetsync import AsyncResetSynchronizer; + +import sbus_to_fpga_export; + +# CRG ---------------------------------------------------------------------------------------------- + +class _CRG(Module): + def __init__(self, platform, sys_clk_freq, usb=True): + self.clock_domains.cd_sys = ClockDomain() # 100 MHz PLL, reset'ed by SBus (via pll), SoC/Wishbone main clock + self.clock_domains.cd_sys4x = ClockDomain(reset_less=True) + self.clock_domains.cd_sys4x_dqs = ClockDomain(reset_less=True) + self.clock_domains.cd_idelay = ClockDomain() +## self.clock_domains.cd_sys = ClockDomain() # 16.67-25 MHz SBus, reset'ed by SBus, native SBus & SYS clock domain + self.clock_domains.cd_native = ClockDomain(reset_less=True) # 48MHz native, non-reset'ed (for power-on long delay, never reset, we don't want the delay after a warm reset) + self.clock_domains.cd_sbus = ClockDomain() # 16.67-25 MHz SBus, reset'ed by SBus, native SBus clock domain +# self.clock_domains.cd_por = 
ClockDomain() # 48 MHz native, reset'ed by SBus, power-on-reset timer + if (usb): + self.clock_domains.cd_usb = ClockDomain() # 48 MHZ PLL, reset'ed by SBus (via pll), for USB controller + self.clock_domains.cd_clk50 = ClockDomain() # 50 MHz (gated) for curve25519engine -> eng_clk + #self.clock_domains.cd_clk100 = ClockDomain() # 100 MHz for curve25519engine -> sys_clk + self.clock_domains.cd_clk100_gated = ClockDomain() # 100 MHz (gated) for curve25519engine -> mul_clk + self.clock_domains.cd_clk200 = ClockDomain() # 200 MHz (gated) for curve25519engine -> rf_clk + + # # # + clk48 = platform.request("clk48") + ###### explanations from betrusted-io/betrusted-soc/betrusted_soc.py + # Note: below feature cannot be used because Litex appends this *after* platform commands! This causes the generated + # clock derived constraints immediately below to fail, because .xdc file is parsed in-order, and the main clock needs + # to be created before the derived clocks. Instead, we use the line afterwards. 
+ platform.add_platform_command("create_clock -name clk48 -period 20.8333 [get_nets clk48]") + # The above constraint must strictly precede the below create_generated_clock constraints in the .XDC file + # This allows PLLs/MMCMEs to be placed anywhere and reference the input clock + self.clk48_bufg = Signal() + self.specials += Instance("BUFG", i_I=clk48, o_O=self.clk48_bufg) + self.comb += self.cd_native.clk.eq(self.clk48_bufg) + #self.cd_native.clk = clk48 + + clk_sbus = platform.request("SBUS_3V3_CLK") + self.cd_sbus.clk = clk_sbus + rst_sbus = platform.request("SBUS_3V3_RSTs") + self.comb += self.cd_sbus.rst.eq(~rst_sbus) + ##self.cd_sys.clk = clk_sbus + ##self.comb += self.cd_sys.rst.eq(~rst_sbus) + + self.curve25519_on = Signal() + + self.submodules.pll = pll = S7MMCM(speedgrade=-1) + #pll.register_clkin(clk48, 48e6) + pll.register_clkin(self.clk48_bufg, 48e6) + pll.create_clkout(self.cd_sys, sys_clk_freq, gated_replicas={self.cd_clk100_gated : pll.locked & self.curve25519_on}) + platform.add_platform_command("create_generated_clock -name sysclk [get_pins {{MMCME2_ADV/CLKOUT0}}]") + pll.create_clkout(self.cd_sys4x, 4*sys_clk_freq) + platform.add_platform_command("create_generated_clock -name sys4xclk [get_pins {{MMCME2_ADV/CLKOUT1}}]") + pll.create_clkout(self.cd_sys4x_dqs, 4*sys_clk_freq, phase=90) + platform.add_platform_command("create_generated_clock -name sys4x90clk [get_pins {{MMCME2_ADV/CLKOUT2}}]") + self.comb += pll.reset.eq(~rst_sbus) # | ~por_done + platform.add_false_path_constraints(self.cd_native.clk, self.cd_sbus.clk) + platform.add_false_path_constraints(self.cd_sbus.clk, self.cd_native.clk) + #platform.add_false_path_constraints(self.cd_sys.clk, self.cd_sbus.clk) + #platform.add_false_path_constraints(self.cd_sbus.clk, self.cd_sys.clk) + ##platform.add_false_path_constraints(self.cd_native.clk, self.cd_sys.clk) + + pll.create_clkout(self.cd_clk50, sys_clk_freq/2, ce=pll.locked & self.curve25519_on) + 
platform.add_platform_command("create_generated_clock -name clk50 [get_pins {{MMCME2_ADV/CLKOUT3}}]") + pll.create_clkout(self.cd_clk200, sys_clk_freq*2, ce=pll.locked & self.curve25519_on) + platform.add_platform_command("create_generated_clock -name clk200 [get_pins {{MMCME2_ADV/CLKOUT4}}]") + + #self.submodules.curve25519_pll = curve25519_pll = S7MMCM(speedgrade=-1) + #curve25519_clk_freq = 90e6 + ##self.curve25519_on = Signal() + ##curve25519_pll.register_clkin(clk48, 48e6) + #curve25519_pll.register_clkin(self.clk48_bufg, 48e6) + #curve25519_pll.create_clkout(self.cd_clk50, curve25519_clk_freq/2, margin=0, ce=curve25519_pll.locked & self.curve25519_on) + #platform.add_platform_command("create_generated_clock -name clk50 [get_pins {{MMCME2_ADV_1/CLKOUT0}}]") + #curve25519_pll.create_clkout(self.cd_clk100, curve25519_clk_freq, margin=0, ce=curve25519_pll.locked, + # gated_replicas={self.cd_clk100_gated : curve25519_pll.locked & self.curve25519_on}) + #platform.add_platform_command("create_generated_clock -name clk100 [get_pins {{MMCME2_ADV_1/CLKOUT1}}]") + #curve25519_pll.create_clkout(self.cd_clk200, curve25519_clk_freq*2, margin=0, ce=curve25519_pll.locked & self.curve25519_on) + #platform.add_platform_command("create_generated_clock -name clk200 [get_pins {{MMCME2_ADV_1/CLKOUT2}}]") + ##self.comb += curve25519_pll.reset.eq(~rst_sbus) # | ~por_done + #platform.add_false_path_constraints(self.cd_sys.clk, self.cd_clk50.clk) + #platform.add_false_path_constraints(self.cd_sys.clk, self.cd_clk100.clk) + #platform.add_false_path_constraints(self.cd_sys.clk, self.cd_clk200.clk) + #platform.add_false_path_constraints(self.cd_clk50.clk, self.cd_sys.clk) + #platform.add_false_path_constraints(self.cd_clk100.clk, self.cd_sys.clk) + #platform.add_false_path_constraints(self.cd_clk200.clk, self.cd_sys.clk) + + # Power on reset, reset propagate from SBus to SYS +# por_count = Signal(16, reset=2**16-1) +# por_done = Signal() +# self.comb += self.cd_por.clk.eq(clk48) +# 
self.comb += por_done.eq(por_count == 0) +# self.sync.por += If(~por_done, por_count.eq(por_count - 1)) +# self.comb += self.cd_por.rst.eq(~rst_sbus) +# self.comb += pll.reset.eq(~por_done | ~rst_sbus) + + # USB + if (usb): + self.submodules.usb_pll = usb_pll = S7MMCM(speedgrade=-1) + #usb_pll.register_clkin(clk48, 48e6) + usb_pll.register_clkin(self.clk48_bufg, 48e6) + usb_pll.create_clkout(self.cd_usb, 48e6, margin = 0) + platform.add_platform_command("create_generated_clock -name usbclk [get_pins {{MMCME2_ADV_2/CLKOUT0}}]") + self.comb += usb_pll.reset.eq(~rst_sbus) # | ~por_done + platform.add_false_path_constraints(self.cd_sys.clk, self.cd_usb.clk) + + self.submodules.pll_idelay = pll_idelay = S7MMCM(speedgrade=-1) + #pll_idelay.register_clkin(clk48, 48e6) + pll_idelay.register_clkin(self.clk48_bufg, 48e6) + pll_idelay.create_clkout(self.cd_idelay, 200e6, margin = 0) + platform.add_platform_command("create_generated_clock -name idelayclk [get_pins {{MMCME2_ADV_3/CLKOUT0}}]") + self.comb += pll_idelay.reset.eq(~rst_sbus) # | ~por_done + + self.submodules.idelayctrl = S7IDELAYCTRL(self.cd_idelay) + +class SBusFPGA(SoCCore): + def __init__(self, version, usb, **kwargs): + print(f"Building SBusFPGA for board version {version}") + + kwargs["cpu_type"] = "None" + kwargs["integrated_sram_size"] = 0 + kwargs["with_uart"] = False + kwargs["with_timer"] = False + + self.sys_clk_freq = sys_clk_freq = 100e6 ## 25e6 + + self.platform = platform = ztex213_sbus.Platform(variant="ztex2.13a", version = version) + + if (version == "V1.0"): + self.platform.add_extension(ztex213_sbus._usb_io_v1_0) + + SoCCore.__init__(self, + platform=platform, + sys_clk_freq=sys_clk_freq, + clk_freq=sys_clk_freq, + csr_paging=0x1000, # default is 0x800 + **kwargs) + + # This mem-map is also exposed in the FSM (matched prefixes) + # and in the PROM (to tell NetBSD where everything is) + # Currently it is a straight mapping between the two: + # the physical address here are used as offset in the 
SBus + # reserved area of 256 MiB + # Anything at 0x10000000 is therefore unreachable directly + # The position of the 'usb_fake_dma' is so it overlaps + # the virtual address space used by NetBSD DMA allocators + # (themselves constrained by the SBus MMU capabilities) + self.wb_mem_map = wb_mem_map = { + "prom": 0x00000000, + "csr" : 0x00040000, + "usb_host": 0x00080000, + "usb_shared_mem": 0x00090000, # unused + "curve25519engine": 0x000a0000, + "main_ram": 0x80000000, + "usb_fake_dma": 0xfc000000, + } + self.mem_map.update(wb_mem_map) + self.submodules.crg = _CRG(platform=platform, sys_clk_freq=sys_clk_freq, usb=usb) + self.platform.add_period_constraint(self.platform.lookup_request("SBUS_3V3_CLK", loose=True), 1e9/25e6) # SBus max + + if (version == "V1.0"): + self.submodules.leds = LedChaser( + pads = platform.request("SBUS_DATA_OE_LED_2"), #platform.request("user_led", 7), + sys_clk_freq = sys_clk_freq) + self.add_csr("leds") + + if (usb): + self.add_usb_host(pads=platform.request("usb"), usb_clk_freq=48e6) + if (version == "V1.0"): + pad_usb_interrupt = platform.request("SBUS_3V3_INT1s") ## only one usable + elif (version == "V1.2"): + pad_usb_interrupt = platform.request("SBUS_3V3_INT3s") ## can be 1-6, beware others + sig_usb_interrupt = Signal(reset=1) + # the 74LVC2G07 takes care of the Z state: 1 -> Z on the bus, 0 -> 0 on the bus (asserted interrupt) + self.comb += pad_usb_interrupt.eq(sig_usb_interrupt) + self.comb += sig_usb_interrupt.eq(~self.usb_host.interrupt) ## + + + #pad_SBUS_DATA_OE_LED = platform.request("SBUS_DATA_OE_LED") + #SBUS_DATA_OE_LED_o = Signal() + #self.comb += pad_SBUS_DATA_OE_LED.eq(SBUS_DATA_OE_LED_o) + #pad_SBUS_DATA_OE_LED_2 = platform.request("SBUS_DATA_OE_LED_2") + #SBUS_DATA_OE_LED_2_o = Signal() + #self.comb += pad_SBUS_DATA_OE_LED_2.eq(SBUS_DATA_OE_LED_2_o) + #self.comb += SBUS_DATA_OE_LED_o.eq(~SBUS_3V3_INT1s_o) + + prom_file = "prom_migen.fc" + prom_data = soc_core.get_mem_data(prom_file, "big") + # prom = 
Array(prom_data) + #print("\n****************************************\n") + #for i in range(len(prom)): + # print(hex(prom[i])) + #print("\n****************************************\n") + self.add_ram("prom", origin=self.mem_map["prom"], size=2**16, contents=prom_data, mode="r") + #getattr(self,"prom").mem.init = prom_data + #getattr(self,"prom").mem.depth = 2**14 + + self.submodules.ddrphy = s7ddrphy.A7DDRPHY(platform.request("ddram"), + memtype = "DDR3", + nphases = 4, + sys_clk_freq = sys_clk_freq) + self.add_sdram("sdram", + phy = self.ddrphy, + module = MT41J128M16(sys_clk_freq, "1:4"), + l2_cache_size = 0, + ) + # don't enable anything on the SBus side for 20 seconds after power up + # this avoids FPGA initialization messing with the cold boot process + # requires us to reset the SPARCstation afterward so the FPGA board + # is properly identified + # This is in the 'native' ClockDomain that is never reset + hold_reset_ctr = Signal(30, reset=960000000) + self.sync.native += If(hold_reset_ctr>0, hold_reset_ctr.eq(hold_reset_ctr - 1)) + hold_reset = Signal(reset=1) + self.comb += hold_reset.eq(~(hold_reset_ctr == 0)) + + # Interface SBus to wishbone + # we need to cross clock domains + wishbone_slave_sbus = wishbone.Interface(data_width=self.bus.data_width) + wishbone_master_sys = wishbone.Interface(data_width=self.bus.data_width) + self.submodules.wishbone_master_sbus = wishbone.WishboneDomainCrossingMaster(platform=self.platform, slave=wishbone_master_sys, cd_master="sbus", cd_slave="sys") + self.submodules.wishbone_slave_sys = wishbone.WishboneDomainCrossingMaster(platform=self.platform, slave=wishbone_slave_sbus, cd_master="sys", cd_slave="sbus") + + # SPARCstation 20 slave interface to the main memory are limited to 32-bytes burst (32-bits wide, 8 word long) + # burst_size=16 should work on Ultra systems, but then they probably should go for 64-bits ET as well... 
+ # Older systems are probably limited to burst_size=4, (it should always be available) + burst_size=8 + self.submodules.tosbus_fifo = ClockDomainsRenamer({"read": "sbus", "write": "sys"})(AsyncFIFOBuffered(width=(32+burst_size*32), depth=burst_size)) + self.submodules.fromsbus_fifo = ClockDomainsRenamer({"write": "sbus", "read": "sys"})(AsyncFIFOBuffered(width=((30-log2_int(burst_size))+burst_size*32), depth=burst_size)) + self.submodules.fromsbus_req_fifo = ClockDomainsRenamer({"read": "sbus", "write": "sys"})(AsyncFIFOBuffered(width=((30-log2_int(burst_size))+32), depth=burst_size)) + + self.submodules.dram_dma_writer = LiteDRAMDMAWriter(port=self.sdram.crossbar.get_port(mode="write", data_width=burst_size*32), + fifo_depth=4, + fifo_buffered=True) + + self.submodules.dram_dma_reader = LiteDRAMDMAReader(port=self.sdram.crossbar.get_port(mode="read", data_width=burst_size*32), + fifo_depth=4, + fifo_buffered=True) + + self.submodules.exchange_with_mem = ExchangeWithMem(soc=self, + tosbus_fifo=self.tosbus_fifo, + fromsbus_fifo=self.fromsbus_fifo, + fromsbus_req_fifo=self.fromsbus_req_fifo, + dram_dma_writer=self.dram_dma_writer, + dram_dma_reader=self.dram_dma_reader, + burst_size=burst_size, + do_checksum = True) + + _sbus_bus = SBusFPGABus(platform=self.platform, + hold_reset=hold_reset, + wishbone_slave=wishbone_slave_sbus, + wishbone_master=self.wishbone_master_sbus, + tosbus_fifo=self.tosbus_fifo, + fromsbus_fifo=self.fromsbus_fifo, + fromsbus_req_fifo=self.fromsbus_req_fifo, + burst_size=burst_size) + #self.submodules.sbus_bus = _sbus_bus + self.submodules.sbus_bus = ClockDomainsRenamer("sbus")(_sbus_bus) + self.submodules.sbus_bus_stat = SBusFPGABusStat(sbus_bus = self.sbus_bus) + + self.bus.add_master(name="SBusBridgeToWishbone", master=wishbone_master_sys) + + if (usb): + self.bus.add_slave(name="usb_fake_dma", slave=self.wishbone_slave_sys, region=SoCRegion(origin=self.mem_map.get("usb_fake_dma", None), size=0x03ffffff, cached=False)) + 
#self.bus.add_master(name="mem_read_master", master=self.exchange_with_mem.wishbone_r_slave) + #self.bus.add_master(name="mem_write_master", master=self.exchange_with_mem.wishbone_w_slave) + + #self.add_sdcard() + + self.submodules.trng = NeoRV32TrngWrapper(platform=platform) + + # beware the naming, as 'clk50' 'sysclk' 'clk200' are used in the original platform constraints + # the local engine.py was slightly modified to have configurable names, so we can have 'clk50', 'clk100', 'clk200' + # Beware that Engine implicitely runs in 'sys' by default, need to rename that one as well + # Actually renaming 'sys' doesn't work - unless we can CDC the CSRs as well + self.submodules.curve25519engine = ClockDomainsRenamer({"eng_clk":"clk50", "rf_clk":"clk200", "mul_clk":"clk100_gated"})(Engine(platform=platform,prefix=self.mem_map.get("curve25519engine", None))) # , "sys":"clk100" + #self.submodules.curve25519engine_wishbone_cdc = wishbone.WishboneDomainCrossingMaster(platform=self.platform, slave=self.curve25519engine.bus, cd_master="sys", cd_slave="clk100") + #self.bus.add_slave("curve25519engine", self.curve25519engine_wishbone_cdc, SoCRegion(origin=self.mem_map.get("curve25519engine", None), size=0x20000, cached=False)) + self.bus.add_slave("curve25519engine", self.curve25519engine.bus, SoCRegion(origin=self.mem_map.get("curve25519engine", None), size=0x20000, cached=False)) + self.bus.add_master(name="curve25519engineLS", master=self.curve25519engine.busls) + #self.submodules.curve25519_on_sync = BusSynchronizer(width = 1, idomain = "clk100", odomain = "sys") + #self.comb += self.curve25519_on_sync.i.eq(self.curve25519engine.power.fields.on) + #self.comb += self.crg.curve25519_on.eq(self.curve25519_on_sync.o) + self.comb += self.crg.curve25519_on.eq(self.curve25519engine.power.fields.on) + +def main(): + parser = argparse.ArgumentParser(description="SbusFPGA") + parser.add_argument("--build", action="store_true", help="Build bitstream") + 
parser.add_argument("--version", default="V1.0", help="SBusFPGA board version (default V1.0)") + parser.add_argument("--usb", action="store_true", help="add a USB OHCI controller") + builder_args(parser) + vivado_build_args(parser) + args = parser.parse_args() + + soc = SBusFPGA(**soc_core_argdict(args), + version=args.version, + usb=args.usb) + #soc.add_uart(name="uart", baudrate=115200, fifo_depth=16) + + builder = Builder(soc, **builder_argdict(args)) + builder.build(**vivado_build_argdict(args), run=args.build) + + # Generate modified CSR registers definitions/access functions to netbsd_csr.h. + # should be split per-device (and without base) to still work if we have identical devices in different configurations on multiple boards + csr_contents = sbus_to_fpga_export.get_csr_header( + regions = soc.csr_regions, + constants = soc.constants, + csr_base = soc.mem_regions['csr'].origin) + write_to_file(os.path.join("netbsd_csr.h"), csr_contents) + + # tells the prom where to find what + # just one, as that is board-specific + # BEWARE! then need to run 'forth_to_migen_rom.sh' *and* regenerate the bitstream with the proper PROM built-in! + # (there's surely a better way...) 
+ csr_forth_contents = sbus_to_fpga_export.get_csr_forth_header( + csr_regions = soc.csr_regions, + mem_regions = soc.mem_regions, + constants = soc.constants, + csr_base = soc.mem_regions['csr'].origin) + write_to_file(os.path.join("prom_csr.fth"), csr_forth_contents) + +if __name__ == "__main__": + main() diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_trng.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_trng.py new file mode 100644 index 0000000..5db0f8e --- /dev/null +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_trng.py @@ -0,0 +1,94 @@ +from migen import * +from migen.genlib.fifo import * +from litex.soc.interconnect.csr import * + +class NeoRV32TrngWrapper(Module, AutoCSR): + def __init__(self, platform): + self.add_sources(platform) + + rden_i = Signal() + wren_i = Signal() + data_i = Signal(32) + data_o = Signal(32) + + self.ctrl = CSRStorage(32, description = "CTRL register; bit 0 : disable ; bit 1 : enable") + self.data = CSRStatus(32, description = "Rnd Data or 0") + + self.submodules.ctrl_fsm = ctrl_fsm = FSM(reset_state = "Reset") + ctrl_fsm.act("Reset", + NextState("Idle") + ) + ctrl_fsm.act("Idle", + If(self.ctrl.re, # someone has written control + If(self.ctrl.storage[0], + data_i.eq(0), + wren_i.eq(1), + ).Elif(self.ctrl.storage[1], + data_i.eq(0xffffffff), + wren_i.eq(1), + ) + ), + If(self.data.we, # someone has read the data, reset so that the same value is never read twice + NextValue(self.data.status, 0), + ) + ) + + # fill out an intermediate buffer, one byte every 11 cycles + # then copy the 4 bytes to data CST and do it all over again + buf = Array(Signal(8) for a in range(4)) + idx = Signal(2) + cnt = Signal(4) + self.submodules.upd_fsm = upd_fsm = FSM(reset_state = "Reset") + upd_fsm.act("Reset", + NextValue(cnt, 11), + NextValue(idx, 0), + NextState("ByteWait") + ) + upd_fsm.act("ByteWait", + If(cnt == 0, + rden_i.eq(1), + NextState("ByteWrite"), + ).Else( + NextValue(cnt, cnt - 1) + ) + ) + upd_fsm.act("ByteWrite", + If (data_o[31] 
& data_o[30], + NextValue(buf[idx], data_o[0:8]), + NextValue(cnt, 11), + NextValue(idx, idx + 1), + If(idx == 3, + NextState("Copy"), + ).Else( + NextState("ByteWait"), + ) + ).Else( # try again + NextValue(cnt, 11), + NextState("ByteWait"), + ) + ) + upd_fsm.act("Copy", + NextValue(self.data.status, Cat(buf[0], buf[1], buf[2], buf[3])), + NextValue(buf[0], 0), + NextValue(buf[1], 0), + NextValue(buf[2], 0), + NextValue(buf[3], 0), + NextState("ByteWait") + ) + + + + + self.specials += Instance(self.get_netlist_name(), + i_clk_i = ClockSignal("sys"), + i_rden_i = rden_i, + i_wren_i = wren_i, + i_data_i = data_i, + o_data_o = data_o) + + def get_netlist_name(self): + return "neorv32_trng" + + def add_sources(self, platform): + platform.add_source("neorv32_trng_patched.vhd", "vhdl") + diff --git a/sbus-to-ztex-gateware-migen/sbusfpga_stat_ctl.c b/sbus-to-ztex-gateware-migen/sbusfpga_stat_ctl.c new file mode 100644 index 0000000..fe9d08b --- /dev/null +++ b/sbus-to-ztex-gateware-migen/sbusfpga_stat_ctl.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define SBUSFPGA_STAT_ON _IO(0, 1) +#define SBUSFPGA_STAT_OFF _IO(0, 0) + +/* + * Turn SBusFPGA statistics on or off by issuing the SBUSFPGA_STAT_ON / + * SBUSFPGA_STAT_OFF ioctl on /dev/sbusfpga_stat0. + * Usage: sbusfpga_stat_ctl on|off -- returns 0 on success, -1 on bad usage or failure. + */ +int main(int argc, char **argv) { + const char * const device = "/dev/sbusfpga_stat0"; + int devfd; + int onoff; + + if (argc != 2) { + fprintf(stderr, "Usage: %s on|off\n", argv[0]); + return -1; + } + + /* exact match: a prefix test would also accept e.g. "only" or "offline" */ + if (strcmp(argv[1], "on") == 0) { + onoff = 1; + } else if (strcmp(argv[1], "off") == 0) { + onoff = 0; + } else { + fprintf(stderr, "Usage: %s on|off\n", argv[0]); + return -1; + } + + if ( (devfd = open(device, O_RDWR)) == -1) { + perror("can't open device file"); + return -1; + } + + switch (onoff) { + case 0: + if (ioctl(devfd, SBUSFPGA_STAT_OFF, NULL)) { + perror("Turning statistics off failed."); + close(devfd); + return -1; + } + break; + case 1: + if (ioctl(devfd, SBUSFPGA_STAT_ON, NULL)) { + perror("Turning statistics on failed."); + close(devfd); + return -1; + } + break; + } 
+ + close(devfd); + return 0; +} diff --git a/sbus-to-ztex-gateware-migen/sdram_csr.fth b/sbus-to-ztex-gateware-migen/sdram_csr.fth new file mode 100644 index 0000000..84275c7 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/sdram_csr.fth @@ -0,0 +1,228 @@ +: dphy_rst_rd ( -- csr_value ) + mregs-virt h# 1000 + l@ +; +: dphy_half_sys8x_taps_rd ( -- csr_value ) + mregs-virt h# 1004 + l@ +; +: dphy_wlevel_en_rd ( -- csr_value ) + mregs-virt h# 1008 + l@ +; +: dphy_wlevel_strobe_rd ( -- csr_value ) + mregs-virt h# 100c + l@ +; +: dphy_dly_sel_rd ( -- csr_value ) + mregs-virt h# 1010 + l@ +; +: dphy_rdly_dq_rst_rd ( -- csr_value ) + mregs-virt h# 1014 + l@ +; +: dphy_rdly_dq_inc_rd ( -- csr_value ) + mregs-virt h# 1018 + l@ +; +: dphy_rdly_dq_bitslip_rst_rd ( -- csr_value ) + mregs-virt h# 101c + l@ +; +: dphy_rdly_dq_bitslip_rd ( -- csr_value ) + mregs-virt h# 1020 + l@ +; +: dphy_wdly_dq_bitslip_rst_rd ( -- csr_value ) + mregs-virt h# 1024 + l@ +; +: dphy_wdly_dq_bitslip_rd ( -- csr_value ) + mregs-virt h# 1028 + l@ +; +: dphy_rdphase_rd ( -- csr_value ) + mregs-virt h# 102c + l@ +; +: dphy_wrphase_rd ( -- csr_value ) + mregs-virt h# 1030 + l@ +; +: sdr_dfii_control_rd ( -- csr_value ) + mregs-virt h# 2000 + l@ +; +: sdr_dfii_pi0_command_rd ( -- csr_value ) + mregs-virt h# 2004 + l@ +; +: sdr_dfii_pi0_command_issue_rd ( -- csr_value ) + mregs-virt h# 2008 + l@ +; +: sdr_dfii_pi0_address_rd ( -- csr_value ) + mregs-virt h# 200c + l@ +; +: sdr_dfii_pi0_baddress_rd ( -- csr_value ) + mregs-virt h# 2010 + l@ +; +: sdr_dfii_pi0_wrdata_rd ( -- csr_value ) + mregs-virt h# 2014 + l@ +; +: sdr_dfii_pi0_rddata_rd ( -- csr_value ) + mregs-virt h# 2018 + l@ +; +: sdr_dfii_pi1_command_rd ( -- csr_value ) + mregs-virt h# 201c + l@ +; +: sdr_dfii_pi1_command_issue_rd ( -- csr_value ) + mregs-virt h# 2020 + l@ +; +: sdr_dfii_pi1_address_rd ( -- csr_value ) + mregs-virt h# 2024 + l@ +; +: sdr_dfii_pi1_baddress_rd ( -- csr_value ) + mregs-virt h# 2028 + l@ +; +: sdr_dfii_pi1_wrdata_rd ( -- csr_value ) 
+ mregs-virt h# 202c + l@ +; +: sdr_dfii_pi1_rddata_rd ( -- csr_value ) + mregs-virt h# 2030 + l@ +; +: sdr_dfii_pi2_command_rd ( -- csr_value ) + mregs-virt h# 2034 + l@ +; +: sdr_dfii_pi2_command_issue_rd ( -- csr_value ) + mregs-virt h# 2038 + l@ +; +: sdr_dfii_pi2_address_rd ( -- csr_value ) + mregs-virt h# 203c + l@ +; +: sdr_dfii_pi2_baddress_rd ( -- csr_value ) + mregs-virt h# 2040 + l@ +; +: sdr_dfii_pi2_wrdata_rd ( -- csr_value ) + mregs-virt h# 2044 + l@ +; +: sdr_dfii_pi2_rddata_rd ( -- csr_value ) + mregs-virt h# 2048 + l@ +; +: sdr_dfii_pi3_command_rd ( -- csr_value ) + mregs-virt h# 204c + l@ +; +: sdr_dfii_pi3_command_issue_rd ( -- csr_value ) + mregs-virt h# 2050 + l@ +; +: sdr_dfii_pi3_address_rd ( -- csr_value ) + mregs-virt h# 2054 + l@ +; +: sdr_dfii_pi3_baddress_rd ( -- csr_value ) + mregs-virt h# 2058 + l@ +; +: sdr_dfii_pi3_wrdata_rd ( -- csr_value ) + mregs-virt h# 205c + l@ +; +: sdr_dfii_pi3_rddata_rd ( -- csr_value ) + mregs-virt h# 2060 + l@ +; +: dphy_rst_wr ( value -- ) + mregs-virt h# 1000 + l! +; +: dphy_half_sys8x_taps_wr ( value -- ) + mregs-virt h# 1004 + l! +; +: dphy_wlevel_en_wr ( value -- ) + mregs-virt h# 1008 + l! +; +: dphy_wlevel_strobe_wr ( value -- ) + mregs-virt h# 100c + l! +; +: dphy_dly_sel_wr ( value -- ) + mregs-virt h# 1010 + l! +; +: dphy_rdly_dq_rst_wr ( value -- ) + mregs-virt h# 1014 + l! +; +: dphy_rdly_dq_inc_wr ( value -- ) + mregs-virt h# 1018 + l! +; +: dphy_rdly_dq_bitslip_rst_wr ( value -- ) + mregs-virt h# 101c + l! +; +: dphy_rdly_dq_bitslip_wr ( value -- ) + mregs-virt h# 1020 + l! +; +: dphy_wdly_dq_bitslip_rst_wr ( value -- ) + mregs-virt h# 1024 + l! +; +: dphy_wdly_dq_bitslip_wr ( value -- ) + mregs-virt h# 1028 + l! +; +: dphy_rdphase_wr ( value -- ) + mregs-virt h# 102c + l! +; +: dphy_wrphase_wr ( value -- ) + mregs-virt h# 1030 + l! +; +: sdr_dfii_control_wr ( value -- ) + mregs-virt h# 2000 + l! +; +: sdr_dfii_pi0_command_wr ( value -- ) + mregs-virt h# 2004 + l! 
+; +: sdr_dfii_pi0_command_issue_wr ( value -- ) + mregs-virt h# 2008 + l! +; +: sdr_dfii_pi0_address_wr ( value -- ) + mregs-virt h# 200c + l! +; +: sdr_dfii_pi0_baddress_wr ( value -- ) + mregs-virt h# 2010 + l! +; +: sdr_dfii_pi0_wrdata_wr ( value -- ) + mregs-virt h# 2014 + l! +; +: sdr_dfii_pi0_rddata_wr ( value -- ) + mregs-virt h# 2018 + l! +; +: sdr_dfii_pi1_command_wr ( value -- ) + mregs-virt h# 201c + l! +; +: sdr_dfii_pi1_command_issue_wr ( value -- ) + mregs-virt h# 2020 + l! +; +: sdr_dfii_pi1_address_wr ( value -- ) + mregs-virt h# 2024 + l! +; +: sdr_dfii_pi1_baddress_wr ( value -- ) + mregs-virt h# 2028 + l! +; +: sdr_dfii_pi1_wrdata_wr ( value -- ) + mregs-virt h# 202c + l! +; +: sdr_dfii_pi1_rddata_wr ( value -- ) + mregs-virt h# 2030 + l! +; +: sdr_dfii_pi2_command_wr ( value -- ) + mregs-virt h# 2034 + l! +; +: sdr_dfii_pi2_command_issue_wr ( value -- ) + mregs-virt h# 2038 + l! +; +: sdr_dfii_pi2_address_wr ( value -- ) + mregs-virt h# 203c + l! +; +: sdr_dfii_pi2_baddress_wr ( value -- ) + mregs-virt h# 2040 + l! +; +: sdr_dfii_pi2_wrdata_wr ( value -- ) + mregs-virt h# 2044 + l! +; +: sdr_dfii_pi2_rddata_wr ( value -- ) + mregs-virt h# 2048 + l! +; +: sdr_dfii_pi3_command_wr ( value -- ) + mregs-virt h# 204c + l! +; +: sdr_dfii_pi3_command_issue_wr ( value -- ) + mregs-virt h# 2050 + l! +; +: sdr_dfii_pi3_address_wr ( value -- ) + mregs-virt h# 2054 + l! +; +: sdr_dfii_pi3_baddress_wr ( value -- ) + mregs-virt h# 2058 + l! +; +: sdr_dfii_pi3_wrdata_wr ( value -- ) + mregs-virt h# 205c + l! +; +: sdr_dfii_pi3_rddata_wr ( value -- ) + mregs-virt h# 2060 + l! 
+; diff --git a/sbus-to-ztex-gateware-migen/sdram_init.fth b/sbus-to-ztex-gateware-migen/sdram_init.fth new file mode 100644 index 0000000..e86ed69 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/sdram_init.fth @@ -0,0 +1,533 @@ +headers + +fload sdram_csr.fth + +external + +: popcnt ( n -- u) + 0 swap + BEGIN dup WHILE tuck 1 AND + swap 1 rshift REPEAT + DROP +; + +: cdelay ( count -- ) + \ Forth loop always have a least one iteration + dup 0<> if + 0 do noop loop + else drop then +; + +headers + +: sdram_software_control_on ( -- ) + sdr_dfii_control_rd + h# e <> if h# e sdr_dfii_control_wr then +; + +: sdram_software_control_off ( -- ) + sdr_dfii_control_rd + h# 1 <> if h# 1 sdr_dfii_control_wr then +; + +: command_p0 ( cmd -- ) + sdr_dfii_pi0_command_wr + 1 sdr_dfii_pi0_command_issue_wr +; +: command_p1 ( cmd -- ) + sdr_dfii_pi1_command_wr + 1 sdr_dfii_pi1_command_issue_wr +; +: command_p2 ( cmd -- ) + sdr_dfii_pi2_command_wr + 1 sdr_dfii_pi2_command_issue_wr +; +: command_p3 ( cmd -- ) + sdr_dfii_pi3_command_wr + 1 sdr_dfii_pi3_command_issue_wr +; + +: init_sequence ( -- ) + .( init_sequence ) cr + h# 0 sdr_dfii_pi0_address_wr + h# 0 sdr_dfii_pi0_baddress_wr + h# c sdr_dfii_control_wr + 50 ms + + h# 0 sdr_dfii_pi0_address_wr + h# 0 sdr_dfii_pi0_baddress_wr + h# e sdr_dfii_control_wr + 10 ms + + h# 200 sdr_dfii_pi0_address_wr + h# 2 sdr_dfii_pi0_baddress_wr + h# f command_p0 + + h# 0 sdr_dfii_pi0_address_wr + h# 3 sdr_dfii_pi0_baddress_wr + h# f command_p0 + + h# 6 sdr_dfii_pi0_address_wr + h# 1 sdr_dfii_pi0_baddress_wr + h# f command_p0 + + h# 920 sdr_dfii_pi0_address_wr + h# 0 sdr_dfii_pi0_baddress_wr + h# f command_p0 + 200 cdelay + + h# 400 sdr_dfii_pi0_address_wr + 0 sdr_dfii_pi0_baddress_wr + h# 3 command_p0 + 200 cdelay +; + +: sdram_read_leveling_rst_delay ( modulenum -- ) + h# 1 swap << dphy_dly_sel_wr + h# 1 dphy_rdly_dq_rst_wr + h# 0 dphy_dly_sel_wr +; + +: sdram_read_leveling_inc_delay ( modulenum -- ) + h# 1 swap << dphy_dly_sel_wr + h# 1 
dphy_rdly_dq_inc_wr + h# 0 dphy_dly_sel_wr +; + +: sdram_read_leveling_rst_bitslip ( modulenum -- ) + h# 1 swap << dphy_dly_sel_wr + h# 1 dphy_rdly_dq_bitslip_rst_wr + h# 0 dphy_dly_sel_wr +; + +: sdram_read_leveling_inc_bitslip ( modulenum -- ) + h# 1 swap << dphy_dly_sel_wr + h# 1 dphy_rdly_dq_bitslip_wr + h# 0 dphy_dly_sel_wr +; + +: lfsr ( bits prev -- res ) + dup 1 and not ( bits prev -- bits prev ~{prev&1} ) + swap 1 >> ( bits prev ~{prev&1} -- bits ~{prev&1} {prev>>1} ) + swap ( bits prev ~{prev&1} -- bits {prev>>1} ~{prev&1} ) + rot ( bits {prev>>1} ~{prev&1} -- {prev>>1} ~{prev&1} bits ) + \ assume bits is 32, 'cause it is + drop h# 80200003 ( {prev>>1} ~{prev&1} bits -- {prev>>1} ~{prev&1} lfsr_taps[bits] ) + and + xor +; + +: sdram_activate_test_row ( -- ) + h# 0 sdr_dfii_pi0_address_wr + h# 0 sdr_dfii_pi0_baddress_wr + h# 9 command_p0 + 15 cdelay +; + +: sdram_precharge_test_row ( -- ) + h# 0 sdr_dfii_pi0_address_wr + h# 0 sdr_dfii_pi0_baddress_wr + h# b command_p0 + 15 cdelay +; + +: command_px ( phase value -- ) + over 3 = if dup command_p3 then + over 2 = if dup command_p2 then + over 1 = if dup command_p1 then + over 0 = if dup command_p0 then + 2drop +; + +: command_prd ( value -- ) + dphy_rdphase_rd + swap command_px +; + +: command_pwr ( value -- ) + dphy_wrphase_rd + swap command_px +; + +: sdr_dfii_pix_address_wr ( phase value -- ) + over 3 = if dup sdr_dfii_pi3_address_wr then + over 2 = if dup sdr_dfii_pi2_address_wr then + over 1 = if dup sdr_dfii_pi1_address_wr then + over 0 = if dup sdr_dfii_pi0_address_wr then + 2drop +; + +: sdr_dfii_pird_address_wr ( value -- ) + dphy_rdphase_rd + swap sdr_dfii_pix_address_wr +; + +: sdr_dfii_piwr_address_wr ( value -- ) + dphy_wrphase_rd + swap sdr_dfii_pix_address_wr +; + +: sdr_dfii_pix_baddress_wr ( phase value -- ) + over 3 = if dup sdr_dfii_pi3_baddress_wr then + over 2 = if dup sdr_dfii_pi2_baddress_wr then + over 1 = if dup sdr_dfii_pi1_baddress_wr then + over 0 = if dup sdr_dfii_pi0_baddress_wr 
then + 2drop +; + +: sdr_dfii_pird_baddress_wr ( value -- ) + dphy_rdphase_rd + swap sdr_dfii_pix_baddress_wr +; + +: sdr_dfii_piwr_baddress_wr ( value -- ) + dphy_wrphase_rd + swap sdr_dfii_pix_baddress_wr +; + +: sdr_wr_rd_chk_tst_pat_get ( seed -- A B C D ) +\ .( sdr_wr_rd_chk_tst_pat_get ) cr + dup 42 = if h# 00000080 swap then + dup 42 = if h# 00000000 swap then + dup 42 = if h# 00000000 swap then + dup 42 = if h# 15090700 swap then + dup 84 = if h# 00000000 swap then + dup 84 = if h# 00000000 swap then + dup 84 = if h# 00000000 swap then + dup 84 = if h# 2a150907 swap then + drop +; + +: sdr_wr_rd_check_test_pattern ( modulenum seed -- errors ) +\ .( sdr_wr_rd_check_test_pattern ) cr + sdram_activate_test_row + dup sdr_wr_rd_chk_tst_pat_get + \ should have the 4 patterns on top of the stack: modulenum seed p0 p1 p2 p3 (top of stack goes to phase 0, next to phase 1, ...) + sdr_dfii_pi0_wrdata_wr + sdr_dfii_pi1_wrdata_wr + sdr_dfii_pi2_wrdata_wr + sdr_dfii_pi3_wrdata_wr + \ should be back at modulenum seed + h# 0 sdr_dfii_piwr_address_wr + h# 0 sdr_dfii_piwr_baddress_wr + h# 17 command_pwr + 15 cdelay + + h# 0 sdr_dfii_pird_address_wr + h# 0 sdr_dfii_pird_baddress_wr + h# 25 command_prd + 15 cdelay + + sdram_precharge_test_row + + sdr_wr_rd_chk_tst_pat_get + \ should have the 4 patterns on top of the stack: modulenum p0 p1 p2 p3; each one is compared with the read data of the phase it was written to + sdr_dfii_pi0_rddata_rd xor popcnt + \ should be at modulenum p0 p1 p2 errors + swap sdr_dfii_pi1_rddata_rd xor popcnt + + \ should be at modulenum p0 p1 errors + swap sdr_dfii_pi2_rddata_rd xor popcnt + + \ should be at modulenum p0 errors + swap sdr_dfii_pi3_rddata_rd xor popcnt + + \ should be at modulenum errors + \ drop modulenum + nip +; + +: sdram_read_leveling_scan_module ( modulenum bitslip -- score ) +\ .( sdram_read_leveling_scan_module ) cr + over sdram_read_leveling_rst_delay + \ push score + 0 + \ we should be at 'modulenum bitslip score' + 32 0 do +\ .( starting rd_lvl_scan loop with stack: ) .s cr + 2 pick 42 sdr_wr_rd_check_test_pattern + \ now we have an error count at
the top + 3 pick 84 sdr_wr_rd_check_test_pattern + \ merge both error count + + + \ we should be at 'modulenum bitslip score errorcount' + dup 0= + \ we should be at 'modulenum bitslip score errorcount working?' + if 16384 else 0 then + \ we should be at 'modulenum bitslip score errorcount (0|16384)' + swap 512 swap - + \ we should be at 'modulenum bitslip score (0|16384) (512-errorcount)' + + + + + \ we should be at 'modulenum bitslip score' + 2 pick sdram_read_leveling_inc_delay + loop + nip + nip +; + +: sdr_wr_lat_cal_bitslip_loop ( modulenum bestbitslip bestscore bitslip -- modulenum bestbitslip bestscore ) +\ .( sdr_wr_lat_cal_bitslip_loop for module: ) 3 pick . .( bitslip: ) dup . cr +\ .( sdr_wr_lat_cal_bitslip_loop, stack: ) .s cr + 1 4 pick << dphy_dly_sel_wr ( '4 pick' will extract modulenum, needed as we're stacking the '1' ) + 1 dphy_wdly_dq_bitslip_rst_wr + \ Forth loop always have a least one iteration + dup 0<> if + dup 0 do + 1 dphy_wdly_dq_bitslip_wr + loop + then + 0 dphy_dly_sel_wr +\ .( sdr_wr_lat_cal_bitslip_loop after bitslip init loop, stack: ) .s cr + \ push current score + 0 ( we should be at 'modulenum bestbitslip bestscore bitslip score' ) + 4 pick sdram_read_leveling_rst_bitslip + 8 0 do + 4 pick over sdram_read_leveling_scan_module + \ we should be at 'modulenum bestbitslip bestscore bitslip score score', max will merge scores + max + \ we should be at 'modulenum bestbitslip bestscore bitslip score' again + 4 pick sdram_read_leveling_inc_bitslip + loop + .( sdr_wr_lat_cal_bitslip_loop after bitslip check loop, stack: ) .s cr + dup 3 pick > + if +\ .( lat_cal best bitslip was: ) 3 pick . .( with score: ) 2 pick . cr + 2swap + .( lat_cal best bitslip now: ) 3 pick . .( with score: ) 2 pick . cr + then + 2drop +\ .( sdr_wr_lat_cal_bitslip_loop end, stack: ) .s cr +; + +: sdr_wr_lat_cal_module_loop ( modulenum -- ) + .( sdr_wr_lat_cal_module_loop for module: ) dup . 
cr + \ push best_bitslip + -1 + \ push best_score + 0 + \ we should have 'modulenum 1 0' + 8 0 do + i sdr_wr_lat_cal_bitslip_loop + 2 +loop + \ we should be at 'modulenum bestbitslip bestscore' + \ we don't need score anymore + drop + \ we should be at 'modulenum bestbitslip' + 1 2 pick << dphy_dly_sel_wr + 1 dphy_wdly_dq_bitslip_rst_wr + .( sdr_wr_lat_cal_module_loop: best bitslip: ) dup . cr + \ loop that consumes bestbitslip as the upper bound + \ Forth loop always have a least one iteration + dup 0<> if + 0 do + 1 dphy_wdly_dq_bitslip_wr + loop + else drop then + 0 dphy_dly_sel_wr + \ drop the modulenum + drop +; + +: sdram_write_latency_calibration ( -- ) + .( sdram_write_latency_calibration ) cr + 2 0 do + i sdr_wr_lat_cal_module_loop + loop +; + +: sdram_leveling_center_module ( modulenum -- ) + .( sdram_leveling_center_module ) cr + dup sdram_read_leveling_rst_delay + \ push delay_min + -1 + \ push delay + 0 + \ we should be at 'modulenum delay_min delay' + begin +\ .( starting lvl_center loop with stack: ) .s cr + 2 pick 42 sdr_wr_rd_check_test_pattern + .( we should be at 'modulenum delay_min delay error' stack: ) .s cr + 3 pick 84 sdr_wr_rd_check_test_pattern + .( we should be at 'modulenum delay_min delay error error' stack: ) .s cr + + + \ we should be at 'modulenum delay_min delay error' +\ .( we should be at 'modulenum delay_min delay error' stack: ) .s cr + 0= + \ we should be at 'modulenum delay_min delay working' +\ .( we should be at 'modulenum delay_min delay working' stack: ) .s cr + 2 pick 0< and + \ we should be at 'modulenum delay_min delay {working&delay_min<0}' +\ .( we should be at 'modulenum delay_min delay {working&delay_min<0}' stack: ) .s cr + dup if rot drop 2dup rot drop then + not + \ we should be at 'modulenum new_delay_min delay !{working&delay_min<0}' +\ .( we should be at 'modulenum new_delay_min delay !{working&delay_min<0}' stack: ) .s cr + \ test delay before incrementing, if already 31 no point in continuing/incrementing + 
over 31 < +\ .( we should be at 'modulenum new_delay_min delay !{working&delay_min<0} <31' stack: ) .s cr + dup if rot 1+ -rot then + dup if 4 pick sdram_read_leveling_inc_delay then + \ and the conditions to signal end-of-loop + and +\ .( we should be at 'modulenum new_delay_min delay !{working&delay_min<0}&<31' stack: ) .s cr +\ .( finishing lvl_center loop with stack: ) .s cr + not until + \ we should be at 'modulenum new_delay_min delay', the while has consumed the condition + .( we should be at 'modulenum new_delay_min delay' stack: ) .s cr + 1+ + 2 pick sdram_read_leveling_inc_delay + \ build a clean stack, starting with a copy of modulenum + 2 pick + \ push delay_max + -1 + \ we're at 'modulenum new_delay_min delay modulenum delay_max' + \ push delay + 2 pick + \ we're at 'modulenum new_delay_min delay modulenum delay_max delay' + .( we should be at 'modulenum new_delay_min delay modulenum delay_max delay ' stack: ) .s cr + \ this is almost the same loop, except with !working instead of working and delay_max instead of delay_min + begin + 2 pick 42 sdr_wr_rd_check_test_pattern + 3 pick 84 sdr_wr_rd_check_test_pattern + + + \ we should be at 'modulenum delay_max delay error' + 0<> + \ we should be at 'modulenum delay_max delay !working' + 2 pick 0< and + \ we should be at 'modulenum delay_max delay {!working&delay_max<0}' + dup if rot drop 2dup rot drop then + not + \ we should be at 'modulenum new_delay_max delay !{!working&delay_max<0}' + \ test delay before incrementing, if already 31 no point in continuing/incrementing (same guard as the first loop: step only while delay < 31, otherwise the loop would never advance) + over 31 < + dup if rot 1+ -rot then + dup if 4 pick sdram_read_leveling_inc_delay then + \ and the conditions to signal end-of-loop + and + not until + \ we should be at 'modulenum new_delay_min delay modulenum new_delay_max delay', the while has consumed the condition + .( we should be at 'modulenum new_delay_min delay modulenum new_delay_max delay ' stack: ) .s cr + \ keep delay if new_delay_max<0, new_delay_max otherwise + over
0< if nip else drop then + \ we should be at 'modulenum new_delay_min delay modulenum new_delay_max' + nip + nip + \ we should be at 'modulenum new_delay_min new_delay_max' + .( we should be at 'modulenum new_delay_min new_delay_max' stack: ) .s cr + \ compute delay_mid + 2dup + 2/ 32 mod + \ we should be at 'modulenum new_delay_min new_delay_max {{new_delay_min+new_delay_max}/2%32}' + \ compute delay_range + 3dup drop swap - 2/ + \ we should be at 'modulenum new_delay_min new_delay_max {{new_delay_min+new_delay_max}/2%32} {{new_delay_max-new_delay_min}/2}' + .( we should be at 'modulenum new_delay_min new_delay_max delay_mid delay_range ' stack: ) .s cr + 4 pick sdram_read_leveling_rst_delay + 100 cdelay + \ Forth loop always has at least one iteration + over 0<> if + over 0 do + 4 pick sdram_read_leveling_inc_delay + 100 cdelay + loop + then + drop + drop + drop + drop + drop +; + +: sdr_rd_lvl_bitslip_loop ( modulenum bestbitslip bestscore bitslip -- modulenum bestbitslip bestscore ) +\ .( sdr_rd_lvl_bitslip_loop, stack: ) .s cr + 3 pick over sdram_read_leveling_scan_module + \ we should be at 'modulenum bestbitslip bestscore bitslip score' + 4 pick sdram_leveling_center_module + \ preserve a bitslip for the later test + over + \ (we should be at 'modulenum bestbitslip bestscore bitslip score bitslip') move it out of the way: each '4 roll' cycles the five cells above modulenum, modulenum itself stays at the bottom + .( we should be at 'modulenum bestbitslip bestscore bitslip score bitslip' stack: ) .s cr + 4 roll ( 'modulenum bestscore bitslip score bitslip bestbitslip' ) + 4 roll ( 'modulenum bitslip score bitslip bestbitslip bestscore' ) + 4 roll ( 'modulenum score bitslip bestbitslip bestscore bitslip' ) + 4 roll ( 'modulenum bitslip bestbitslip bestscore bitslip score' ) + .( we should be at 'modulenum bitslip bestbitslip bestscore bitslip score' stack: ) .s cr + \ compare the score and bestscore + dup 3 pick > + if + 2swap + .( rd_lvl best bitslip now: ) 3 pick . .( with score: ) 2 pick .
cr + then + 2drop + \ we should be at 'modulenum bitslip bestbitslip bestscore' + rot + \ we should be at 'modulenum bestbitslip bestscore bitslip' + .( we should be at 'modulenum bestbitslip bestscore bitslip' stack: ) .s cr + 7 <> if 2 pick sdram_read_leveling_inc_bitslip then +; + +: sdr_rd_lvl_module_loop ( modulenum -- ) + .( sdr_rd_lvl_module_loop ) cr + dup sdram_read_leveling_rst_bitslip + \ push best_bitslip + 0 + \ push best_score + 0 + \ we should have 'modulenum 0 0' + 8 0 do + i sdr_rd_lvl_bitslip_loop + loop + \ don't need the score anymore + drop + over sdram_read_leveling_rst_bitslip + .( sdr_rd_lvl_module_loop, best bitslip: ) dup . cr + \ Forth loop always has at least one iteration + dup 0<> if + \ consume best_bitslip as loop upper bound: step the bitslip up to the best window, the delay is re-centered once after the loop + 0 do + dup sdram_read_leveling_inc_bitslip + loop + else drop then + sdram_leveling_center_module +; + +: sdram_read_leveling ( -- ) + .( sdram_read_leveling ) cr + 2 0 do + i sdr_rd_lvl_module_loop + loop +; + +: sdram_leveling ( -- ) + .( sdram_leveling ) cr + sdram_software_control_on + 2 0 do + i sdram_read_leveling_rst_delay + i sdram_read_leveling_rst_bitslip + loop + sdram_write_latency_calibration + sdram_read_leveling + sdram_software_control_off +; + +external + +: init_sdram ( -- ) + .( init_sdram ) cr + 2 dphy_rdphase_wr + 3 dphy_wrphase_wr + sdram_software_control_on + 1 dphy_rst_wr + 1 ms + 0 dphy_rst_wr + 1 ms + .( going to init_sequence ) cr + init_sequence + .( going to sdram_leveling ) cr + sdram_leveling + \ redundant + sdram_software_control_off +; + +: init! ( -- ) + .( init ) cr + map-in-mregs + init_sdram + map-out-mregs +; diff --git a/sbus-to-ztex-gateware-migen/ztex213_sbus.py b/sbus-to-ztex-gateware-migen/ztex213_sbus.py new file mode 100644 index 0000000..5ced7f6 --- /dev/null +++ b/sbus-to-ztex-gateware-migen/ztex213_sbus.py @@ -0,0 +1,237 @@ +# +# This file is part of LiteX-Boards.
+# +# Support for the ZTEX USB-FGPA Module 2.13: +# +# With (no-so-optional) expansion, either the ZTEX Debug board: +# +# Or the SBusFPGA adapter board: +# +# +# Copyright (c) 2015 Yann Sionneau +# Copyright (c) 2015-2019 Florent Kermarrec +# Copyright (c) 2020-2021 Romain Dolbeau +# SPDX-License-Identifier: BSD-2-Clause + +from litex.build.generic_platform import * +from litex.build.xilinx import XilinxPlatform +from litex.build.openocd import OpenOCD + +# IOs ---------------------------------------------------------------------------------------------- + +# FPGA daughterboard I/O + +_io = [ + ## 48 MHz clock reference + ("clk48", 0, Pins("P15"), IOStandard("LVCMOS33")), + ## embedded 256 MiB DDR3 DRAM + ("ddram", 0, + Subsignal("a", Pins("C5 B6 C7 D5 A3 E7 A4 C6", "A6 D8 B2 A5 B3 B7"), + IOStandard("SSTL135")), + Subsignal("ba", Pins("E5 A1 E6"), IOStandard("SSTL135")), + Subsignal("ras_n", Pins("E3"), IOStandard("SSTL135")), + Subsignal("cas_n", Pins("D3"), IOStandard("SSTL135")), + Subsignal("we_n", Pins("D4"), IOStandard("SSTL135")), +# Subsignal("cs_n", Pins(""), IOStandard("SSTL135")), + Subsignal("dm", Pins("G1 G6"), IOStandard("SSTL135")), + Subsignal("dq", Pins( + "H1 F1 E2 E1 F4 C1 F3 D2", + "G4 H5 G3 H6 J2 J3 K1 K2"), + IOStandard("SSTL135"), + Misc("IN_TERM=UNTUNED_SPLIT_40")), + Subsignal("dqs_p", Pins("H2 J4"), + IOStandard("DIFF_SSTL135"), + Misc("IN_TERM=UNTUNED_SPLIT_40")), + Subsignal("dqs_n", Pins("G2 H4"), + IOStandard("DIFF_SSTL135"), + Misc("IN_TERM=UNTUNED_SPLIT_40")), + Subsignal("clk_p", Pins("C4"), IOStandard("DIFF_SSTL135")), + Subsignal("clk_n", Pins("B4"), IOStandard("DIFF_SSTL135")), + Subsignal("cke", Pins("B1"), IOStandard("SSTL135")), + Subsignal("odt", Pins("F5"), IOStandard("SSTL135")), + Subsignal("reset_n", Pins("J5"), IOStandard("SSTL135")), + Misc("SLEW=FAST"), + ), +] + +# SBusFPGA I/O + +_sbus_io_v1_0 = [ + ## leds on the SBus board + ("user_led", 0, Pins("U8"), IOStandard("lvcmos33")), #LED0 + ("user_led", 1, Pins("U7"), 
IOStandard("lvcmos33")), #LED1 + ("user_led", 2, Pins("U6"), IOStandard("lvcmos33")), #LED2 + ("user_led", 3, Pins("T8"), IOStandard("lvcmos33")), #LED3 + ("user_led", 4, Pins("P4"), IOStandard("lvcmos33")), #LED4 + ("user_led", 5, Pins("P3"), IOStandard("lvcmos33")), #LED5 + ("user_led", 6, Pins("T1"), IOStandard("lvcmos33")), #LED6 + ("user_led", 7, Pins("R1"), IOStandard("lvcmos33")), #LED7 + #("user_led", 8, Pins("U1"), IOStandard("lvcmos33")), #SBUS_DATA_OE_LED + #("user_led", 9, Pins("T3"), IOStandard("lvcmos33")), #SBUS_DATA_OE_LED_2 + ## serial header for console + ("serial", 0, + Subsignal("tx", Pins("V9")), # FIXME: might be the other way round + Subsignal("rx", Pins("U9")), + IOStandard("LVCMOS33") + ), + ## sdcard connector + ("spisdcard", 0, + Subsignal("clk", Pins("R8")), + Subsignal("mosi", Pins("T5"), Misc("PULLUP")), + Subsignal("cs_n", Pins("V6"), Misc("PULLUP")), + Subsignal("miso", Pins("V5"), Misc("PULLUP")), + Misc("SLEW=FAST"), + IOStandard("LVCMOS33"), + ), + ("sdcard", 0, + Subsignal("data", Pins("V5 V4 V7 V6"), Misc("PULLUP")), + Subsignal("cmd", Pins("T5"), Misc("PULLUP")), + Subsignal("clk", Pins("R8")), + #Subsignal("cd", Pins("V6")), + Misc("SLEW=FAST"), + IOStandard("LVCMOS33"), + ), +] + +_sbus_io_v1_2 = [ + ## leds on the SBus board + ## serial header for console + ("serial", 0, + Subsignal("tx", Pins("V9")), # FIXME: might be the other way round + Subsignal("rx", Pins("U9")), + IOStandard("LVCMOS33") + ), + ## sdcard connector + ("spisdcard", 0, + Subsignal("clk", Pins("R8")), + Subsignal("mosi", Pins("T5"), Misc("PULLUP")), + Subsignal("cs_n", Pins("V6"), Misc("PULLUP")), + Subsignal("miso", Pins("V5"), Misc("PULLUP")), + Misc("SLEW=FAST"), + IOStandard("LVCMOS33"), + ), + ("sdcard", 0, + Subsignal("data", Pins("V5 V4 V7 V6"), Misc("PULLUP")), + Subsignal("cmd", Pins("T5"), Misc("PULLUP")), + Subsignal("clk", Pins("R8")), + #Subsignal("cd", Pins("V6")), + Misc("SLEW=FAST"), + IOStandard("LVCMOS33"), + ), + ## USB + ("usb", 0, + 
Subsignal("dp", Pins("U8")), # Serial TX + Subsignal("dm", Pins("U7")), # Serial RX + IOStandard("LVCMOS33")) +] + +_sbus_sbus_v1_0 = [ + ("SBUS_3V3_CLK", 0, Pins("D15"), IOStandard("lvttl")), + ("SBUS_3V3_ASs", 0, Pins("T4"), IOStandard("lvttl")), + ("SBUS_3V3_BGs", 0, Pins("T6"), IOStandard("lvttl")), + ("SBUS_3V3_BRs", 0, Pins("R6"), IOStandard("lvttl")), + ("SBUS_3V3_ERRs", 0, Pins("V2"), IOStandard("lvttl")), + ("SBUS_DATA_OE_LED", 0, Pins("U1"), IOStandard("lvttl")), + ("SBUS_DATA_OE_LED_2", 0, Pins("T3"), IOStandard("lvttl")), + ("SBUS_3V3_RSTs", 0, Pins("U2"), IOStandard("lvttl")), + ("SBUS_3V3_SELs", 0, Pins("K6"), IOStandard("lvttl")), + ("SBUS_3V3_INT1s", 0, Pins("R3"), IOStandard("lvttl")), + ("SBUS_3V3_INT7s", 0, Pins("N5"), IOStandard("lvttl")), + ("SBUS_3V3_PPRD", 0, Pins("N6"), IOStandard("lvttl")), + ("SBUS_OE", 0, Pins("P5"), IOStandard("lvttl")), + ("SBUS_3V3_ACKs", 0, Pins("M6 L6 N4"), IOStandard("lvttl")), + ("SBUS_3V3_SIZ", 0, Pins("R7 U3 V1"), IOStandard("lvttl")), + ("SBUS_3V3_D", 0, Pins("J18 K16 J17 K15 K13 J15 J13 J14 H14 H17 G14 G17 G16 G18 H16 F18 F16 E18 F15 D18 E17 G13 D17 F13 F14 E16 E15 C17 C16 A18 B18 C15"), IOStandard("lvttl")), + ("SBUS_3V3_PA", 0, Pins("B16 B17 D14 C14 D12 A16 A15 B14 B13 B12 C12 A14 A13 B11 A11 M4 R2 M3 P2 M2 N2 K5 N1 L4 M1 L3 L1 K3"), IOStandard("lvttl")), +] +_sbus_sbus_v1_2 = [ + ("SBUS_3V3_CLK", 0, Pins("D15"), IOStandard("lvttl")), + ("SBUS_3V3_ASs", 0, Pins("T4"), IOStandard("lvttl")), + ("SBUS_3V3_BGs", 0, Pins("R7"), IOStandard("lvttl")), # moved + ("SBUS_3V3_BRs", 0, Pins("R6"), IOStandard("lvttl")), + ("SBUS_3V3_ERRs", 0, Pins("D13"), IOStandard("lvttl")), # moved + ("SBUS_DATA_OE_LED", 0, Pins("U1"), IOStandard("lvttl")), + #("SBUS_DATA_OE_LED_2", 0, Pins("T3"), IOStandard("lvttl")), + ("SBUS_3V3_RSTs", 0, Pins("U2"), IOStandard("lvttl")), + ("SBUS_3V3_SELs", 0, Pins("K6"), IOStandard("lvttl")), + ("SBUS_3V3_INT1s", 0, Pins("R5"), IOStandard("lvttl")), # moved + ("SBUS_3V3_INT2s", 0, Pins("H15"), 
IOStandard("lvttl")), # added + ("SBUS_3V3_INT3s", 0, Pins("R3"), IOStandard("lvttl")), # added + ("SBUS_3V3_INT4s", 0, Pins("N5"), IOStandard("lvttl")), # added + ("SBUS_3V3_INT5s", 0, Pins("L5"), IOStandard("lvttl")), # added + ("SBUS_3V3_INT6s", 0, Pins("V2"), IOStandard("lvttl")), # added + #("SBUS_3V3_INT7s", 0, Pins("N5"), IOStandard("lvttl")), + ("SBUS_3V3_PPRD", 0, Pins("N6"), IOStandard("lvttl")), + ("SBUS_OE", 0, Pins("P5"), IOStandard("lvttl")), + ("SBUS_3V3_ACKs", 0, Pins("M6 L6 N4"), IOStandard("lvttl")), + ("SBUS_3V3_SIZ", 0, Pins("T6 U3 V1"), IOStandard("lvttl")), # 0 moved + ("SBUS_3V3_D", 0, Pins("J18 K16 J17 K15 K13 J15 J13 J14 H14 H17 G14 G17 G16 G18 H16 F18 F16 E18 F15 D18 E17 G13 D17 F13 F14 E16 E15 C17 C16 A18 B18 C15"), IOStandard("lvttl")), + ("SBUS_3V3_PA", 0, Pins("B16 B17 D14 C14 D12 A16 A15 B14 B13 B12 C12 A14 A13 B11 A11 M4 R2 M3 P2 M2 N2 K5 N1 L4 M1 L3 L1 K3"), IOStandard("lvttl")), +] + +# reusing the UART pins !!! +_usb_io_v1_0 = [ + ("usb", 0, + Subsignal("dp", Pins("V9")), # Serial TX + Subsignal("dm", Pins("U9")), # Serial RX + IOStandard("LVCMOS33")) +] + +# Connectors --------------------------------------------------------------------------------------- + +_connectors_v1_0 = [ +] +_connectors_v1_2 = [ + ("P1", "T8 U6 P3 P4 T1 U4 R1 T3"), +] + +# Platform ----------------------------------------------------------------------------------------- + +class Platform(XilinxPlatform): + default_clk_name = "clk48" + default_clk_period = 1e9/48e6 + + def __init__(self, variant="ztex2.13a", version="V1.0"): + device = { + "ztex2.13a": "xc7a35tcsg324-1", + "ztex2.13b": "xc7a50tcsg324-1", #untested + "ztex2.13b2": "xc7a50tcsg324-1", #untested + "ztex2.13c": "xc7a75tcsg324-2", #untested + "ztex2.13d": "xc7a100tcsg324-2" #untested + }[variant] + sbus_io = { + "V1.0" : _sbus_io_v1_0, + "V1.2" : _sbus_io_v1_2, + }[version] + sbus_sbus = { + "V1.0" : _sbus_sbus_v1_0, + "V1.2" : _sbus_sbus_v1_2, + }[version] + connectors = { + "V1.0" : 
_connectors_v1_0, + "V1.2" : _connectors_v1_2, + }[version] + + XilinxPlatform.__init__(self, device, _io, connectors, toolchain="vivado") + self.add_extension(sbus_io) + self.add_extension(sbus_sbus) + + self.toolchain.bitstream_commands = \ + ["set_property BITSTREAM.CONFIG.SPI_32BIT_ADDR No [current_design]", + "set_property BITSTREAM.CONFIG.SPI_BUSWIDTH 2 [current_design]", + "set_property BITSTREAM.CONFIG.CONFIGRATE 66 [current_design]", + "set_property BITSTREAM.GENERAL.COMPRESS true [current_design]", + "set_property BITSTREAM.GENERAL.CRC DISABLE [current_design]", + "set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true [get_runs synth_1]", + "set_property CONFIG_VOLTAGE 3.3 [current_design]", + "set_property CFGBVS VCCO [current_design]" +# , "set_property STEPS.SYNTH_DESIGN.ARGS.DIRECTIVE AreaOptimized_high [get_runs synth_1]" + ] + + def create_programmer(self): + bscan_spi = "bscan_spi_xc7a35t.bit" + return OpenOCD("openocd_xc7_ft2232.cfg", bscan_spi) #FIXME + + def do_finalize(self, fragment): + XilinxPlatform.do_finalize(self, fragment) + #self.add_period_constraint(self.lookup_request("clk48", loose=True), 1e9/48e6) diff --git a/sbus-to-ztex-gateware/README.md b/sbus-to-ztex-gateware/README.md new file mode 100644 index 0000000..738cf27 --- /dev/null +++ b/sbus-to-ztex-gateware/README.md @@ -0,0 +1,9 @@ +## Current status + +2021-03-21: The adapter board seems to work fine in two different SS20. Currently the embedded PROM code exposes three devices in the FPGA: + +* "RDOL,cryptoengine": exposes a (way too large) polynomial multiplier to implement GCM mode and a AES block. Currently used to implement DMA-based acceleration of AES-256-CBC through /dev/crypto. Unfortunately OpenSSL doesn't support AES-256-GCM in the cryptodev engine, and disagree with NetBSD's /dev/crypto on how to implement AES-256-CTR. And the default SSH cannot use cryptodev, it closes all file descriptors after cryptodev has opened /dev/crypto... still WiP. 
+ +* "RDOL,trng": exposes a 5 MHz counter (didn't realize the SS20 already had a good counter) and a so-far-not-true TRNG (implemented by a PRNG). The 'true' random generators I've found make Vivado scream very loudly when synthesizing... anyway both work fine in NetBSD 9.0 as a timecounter and an entropy source (which a PRNG really isn't, I know). Still WiP. + +* "RDOL,sdcard": trying to expose the micro-sd card slot as a storage device, at first using SPI mode. So far reading seems to work, and NetBSD can see a Sun disklabel on the micro-sd card if it has been partitioned that way. Mounting a FAT filesystem read-only now works (with very little testing as of yet). Writing is not working yet. Very much WiP.