diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c index bfbcc19..106c6d1 100644 --- a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c @@ -79,10 +79,10 @@ const struct cdevsw sbusfpga_c29e_cdevsw = { extern struct cfdriver sbusfpga_c29e_cd; struct sbusfpga_curve25519engine_montgomeryjob { - uint32_t x0_u[8]; - uint32_t x0_w[8]; - uint32_t x1_u[8]; - uint32_t x1_w[8]; + /* uint32_t x0_u[8]; */ + /* uint32_t x0_w[8]; */ + /* uint32_t x1_u[8]; */ + /* uint32_t x1_w[8]; */ uint32_t affine_u[8]; uint32_t scalar[8]; }; @@ -123,7 +123,7 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st err = start_job(sc); if (err) return err; - delay(10); + delay(1); err = wait_job(sc); if (err) return err; @@ -169,322 +169,8 @@ sbusfpga_curve25519engine_match(device_t parent, cfdata_t cf, void *aux) return (strcmp("betrustedc25519e", sa->sa_name) == 0); } -#if 1 -#if 0 -static const uint32_t program[192] = {0x00480800, - 0x007407cc, - 0x007c07cb, - 0x0049d483, - 0x0079b643, - 0x0079e482, - 0x00659783, - 0x006db783, - 0x0079c683, - 0x0079e482, - 0x0069a783, - 0x0071c783, - 0x00480740, - 0x00500640, - 0x00540680, - 0x005806c0, - 0x005c0700, - 0x00015505, - 0x00780008, - 0x0001e006, - 0x005558c6, - 0x00055505, - 0x00780048, - 0x0005e046, - 0x00097585, - 0x00780088, - 0x0009e086, - 0x005d78c6, - 0x000d7585, - 0x007800c8, - 0x000de0c6, - 0x00100007, - 0x00141047, - 0x007458c6, - 0x0019d105, - 0x00780188, - 0x0019e186, - 0x001c3007, - 0x00202047, - 0x002481c5, - 0x00780248, - 0x0025e246, - 0x007488c6, - 0x0029d1c5, - 0x00780288, - 0x0029e286, - 0x002c9247, - 0x0030a287, - 0x00346907, - 0x00385107, - 0x003c5345, - 0x007803c8, - 0x003de3c6, - 0x0040f187, - 0x0044c607, - 0x00500380, - 0x00540400, - 0x005802c0, - 0x005c0440, - 0x00640500, - 0x00680540, - 0x006c0580, - 0x007005c0, - 0x010004c9, - 
0x004e14c6, - 0xdf800809, - 0x0079b643, - 0x0079e482, - 0x00659783, - 0x006db783, - 0x0079c683, - 0x0079e482, - 0x0069a783, - 0x0071c783, - 0x00740640, - 0x00780680, - 0x0001e787, - 0x00040007, - 0x00041047, - 0x00081787, - 0x000c2007, - 0x001030c7, - 0x00144087, - 0x00700940, - 0x00185147, - 0x00721706, - 0x01000709, - 0x00186187, - 0xfe000809, - 0x001c5187, - 0x00700980, - 0x002071c7, - 0x00721706, - 0x01000709, - 0x00208207, - 0xfe000809, - 0x00247207, - 0x007009c0, - 0x00289247, - 0x00721706, - 0x01000709, - 0x0028a287, - 0xfe000809, - 0x002c9287, - 0x00700980, - 0x0030b2c7, - 0x00721706, - 0x01000709, - 0x0030c307, - 0xfe000809, - 0x00347307, - 0x00700a00, - 0x0038d347, - 0x00721706, - 0x01000709, - 0x0038e387, - 0xfe000809, - 0x003cd387, - 0x00700a40, - 0x0040f3c7, - 0x00721706, - 0x01000709, - 0x00410407, - 0xfe000809, - 0x0044f407, - 0x00700a00, - 0x00491447, - 0x00721706, - 0x01000709, - 0x00492487, - 0xfe000809, - 0x004cd487, - 0x00700940, - 0x005134c7, - 0x00721706, - 0x01000709, - 0x00514507, - 0xfe000809, - 0x00543507, - 0x007d5747, - 0x0000000a, - 0x0000000a, - 0x0000000a, - 0x0000000a, -}; -static const uint32_t program_len = 141; -#else -static const uint32_t program[192] = {0x00640840, - 0x00680800, - 0x006c0600, - 0x00700840, - 0x00500a40, - 0x00554505, - 0x00500a00, - 0x00554545, - 0x00500940, - 0x00554545, - 0x00500840, - 0x004d4546, - 0x00480800, - 0x007407cc, - 0x007c07cb, - 0x0049d483, - 0x0079b643, - 0x0079e482, - 0x00659783, - 0x006db783, - 0x0079c683, - 0x0079e482, - 0x0069a783, - 0x0071c783, - 0x00480740, - 0x00500640, - 0x00540680, - 0x005806c0, - 0x005c0700, - 0x00015505, - 0x00780008, - 0x0001e006, - 0x005558c6, - 0x00055505, - 0x00780048, - 0x0005e046, - 0x00097585, - 0x00780088, - 0x0009e086, - 0x005d78c6, - 0x000d7585, - 0x007800c8, - 0x000de0c6, - 0x00100007, - 0x00141047, - 0x007458c6, - 0x0019d105, - 0x00780188, - 0x0019e186, - 0x001c3007, - 0x00202047, - 0x002481c5, - 0x00780248, - 0x0025e246, - 0x007488c6, - 0x0029d1c5, - 
0x00780288, - 0x0029e286, - 0x002c9247, - 0x0030a287, - 0x00346907, - 0x00385107, - 0x003c5345, - 0x007803c8, - 0x003de3c6, - 0x0040f187, - 0x0044c607, - 0x00500380, - 0x00540400, - 0x005802c0, - 0x005c0440, - 0x00640500, - 0x00680540, - 0x006c0580, - 0x007005c0, - 0x010004c9, - 0x004e14c6, - 0xdf800809, - 0x0079b643, - 0x0079e482, - 0x00659783, - 0x006db783, - 0x0079c683, - 0x0079e482, - 0x0069a783, - 0x0071c783, - 0x00740640, - 0x00780680, - 0x0001e787, - 0x00040007, - 0x00041047, - 0x00081787, - 0x000c2007, - 0x001030c7, - 0x00144087, - 0x00700940, - 0x00185147, - 0x00721706, - 0x01000709, - 0x00186187, - 0xfe000809, - 0x001c5187, - 0x00700980, - 0x002071c7, - 0x00721706, - 0x01000709, - 0x00208207, - 0xfe000809, - 0x00247207, - 0x007009c0, - 0x00289247, - 0x00721706, - 0x01000709, - 0x0028a287, - 0xfe000809, - 0x002c9287, - 0x00700980, - 0x0030b2c7, - 0x00721706, - 0x01000709, - 0x0030c307, - 0xfe000809, - 0x00347307, - 0x00700a00, - 0x0038d347, - 0x00721706, - 0x01000709, - 0x0038e387, - 0xfe000809, - 0x003cd387, - 0x00700a40, - 0x0040f3c7, - 0x00721706, - 0x01000709, - 0x00410407, - 0xfe000809, - 0x0044f407, - 0x00700a00, - 0x00491447, - 0x00721706, - 0x01000709, - 0x00492487, - 0xfe000809, - 0x004cd487, - 0x00700940, - 0x005134c7, - 0x00721706, - 0x01000709, - 0x00514507, - 0xfe000809, - 0x00543507, - 0x007d5747, - 0x0000000a, - 0x0000000a, - 0x0000000a, -}; -static const uint32_t program_len = 153; -#endif -#else -static const uint32_t program[16] = { - 0x00640a40, - 0x00680840, - 0x0000000a, - 0x0000000a -}; -static const uint32_t program_len = 3; -#endif +static const uint32_t program[192] = {0x00640840, 0x00680800, 0x006c0600, 0x00700840, 0x004c0a80, 0x00480800, 0x007407cc, 0x007c07cb, 0x0049d483, 0x0079b643, 0x0079e482, 0x00659783, 0x006db783, 0x0079c683, 0x0079e482, 0x0069a783, 0x0071c783, 0x00480740, 0x0001a645, 0x00780008, 0x0001e006, 0x0069a8c6, 0x0005a645, 0x00780048, 0x0005e046, 0x0009c6c5, 0x00780088, 0x0009e086, 0x0071c8c6, 0x000dc6c5, 
0x007800c8, 0x000de0c6, 0x00100007, 0x00141047, 0x007458c6, 0x0019d105, 0x00780188, 0x0019e186, 0x001c3007, 0x00202047, 0x002481c5, 0x00780248, 0x0025e246, 0x007488c6, 0x0029d1c5, 0x00780288, 0x0029e286, 0x006c9247, 0x0030a287, 0x00346907, 0x00645107, 0x003c5345, 0x007803c8, 0x003de3c6, 0x0068f187, 0x0070c607, 0x010004c9, 0x004e14c6, 0xe5800809, 0x0079b643, 0x0079e482, 0x00659783, 0x006db783, 0x0079c683, 0x0079e482, 0x0069a783, 0x0071c783, 0x00740640, 0x00780680, 0x0001e787, 0x00040007, 0x00041047, 0x00081787, 0x000c2007, 0x001030c7, 0x00144087, 0x00700940, 0x00185147, 0x00721706, 0x01000709, 0x00186187, 0xfe000809, 0x001c5187, 0x00700980, 0x002071c7, 0x00721706, 0x01000709, 0x00208207, 0xfe000809, 0x00247207, 0x007009c0, 0x00289247, 0x00721706, 0x01000709, 0x0028a287, 0xfe000809, 0x002c9287, 0x00700980, 0x0030b2c7, 0x00721706, 0x01000709, 0x0030c307, 0xfe000809, 0x00347307, 0x00700a00, 0x0038d347, 0x00721706, 0x01000709, 0x0038e387, 0xfe000809, 0x003cd387, 0x00700a40, 0x0040f3c7, 0x00721706, 0x01000709, 0x00410407, 0xfe000809, 0x0044f407, 0x00700a00, 0x00491447, 0x00721706, 0x01000709, 0x00492487, 0xfe000809, 0x004cd487, 0x00700940, 0x005134c7, 0x00721706, 0x01000709, 0x00514507, 0xfe000809, 0x00543507, 0x007d5747, 0x0000000a, 0x0000000a, 0x0000000a}; +static const uint32_t program_len = 134; /* * Attach all the sub-devices we can find @@ -622,7 +308,7 @@ static int power_on(struct sbusfpga_curve25519engine_softc *sc) { int err = 0; if ((curve25519engine_power_read(sc) & 1) == 0) { curve25519engine_power_write(sc, 1); - delay(2); + delay(1); } return err; } @@ -698,18 +384,17 @@ static int write_inputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusf return -ENXIO; } - #define REG_BASE(reg) (base + (reg * 32)) #define SUBREG_ADDR(reg, off) (REG_BASE(reg) + (off)*4) for (i = 0 ; i < 8 ; i ++) { bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(24,i), job->affine_u[i]); - bus_space_write_4(sc->sc_bustag, 
sc->sc_bhregs_regfile,SUBREG_ADDR(25,i), job->x0_u[i]); - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(26,i), job->x0_w[i]); - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(27,i), job->x1_u[i]); - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(28,i), job->x1_w[i]); + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(25,i), job->x0_u[i]); */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(26,i), job->x0_w[i]); */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(27,i), job->x1_u[i]); */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(28,i), job->x1_w[i]); */ bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(31,i), job->scalar[i]); - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(19,i), ((i == 0) ? 254 : 0)); - delay(1); + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(19,i), ((i == 0) ? 
254 : 0)); */ + /* delay(1); */ } #undef SUBREG_ADDR #undef REG_BASE @@ -719,12 +404,12 @@ static int write_inputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusf #define SUBREG_ADDR(reg, off) (REG_BASE(reg) + (off)*4) for (i = 0 ; i < 8 && !err; i ++) { if (job->affine_u[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(24,i))) err = EIO; - if (job->x0_u[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(25,i))) err = EIO; - if (job->x0_w[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(26,i))) err = EIO; - if (job->x1_u[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(27,i))) err = EIO; - if (job->x1_w[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(28,i))) err = EIO; + /* if (job->x0_u[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(25,i))) err = EIO; */ + /* if (job->x0_w[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(26,i))) err = EIO; */ + /* if (job->x1_u[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(27,i))) err = EIO; */ + /* if (job->x1_w[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(28,i))) err = EIO; */ if (job->scalar[i] != bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(31,i))) err = EIO; - delay(1); + /* delay(1); */ } if (err) aprint_error_dev(sc->sc_dev, "WRITE - data did not read-write properly\n"); #undef SUBREG_ADDR @@ -752,7 +437,7 @@ static int wait_job(struct sbusfpga_curve25519engine_softc *sc) { while ((status & 1) && (count < 50)) { aprint_normal_dev(sc->sc_dev, "WAIT - ongoing, Curve25519Engine status: 0x%08x [%d]\n", status, count); count ++; - delay(20); + delay(1); status = curve25519engine_status_read(sc); } //curve25519engine_control_write(sc, 0); @@ -778,13 +463,13 @@ static int read_outputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusf #define REG_BASE(reg) (base + (reg * 32)) #define 
SUBREG_ADDR(reg, off) (REG_BASE(reg) + (off)*4) for (i = 0 ; i < 8 ; i ++) { - job->affine_u[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(24,i)); - job->x0_u[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(25,i)); - job->x0_w[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(26,i)); - job->x1_u[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(27,i)); - job->x1_w[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(28,i)); + /* job->affine_u[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(24,i)); */ + /* job->x0_u[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(25,i)); */ + /* job->x0_w[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(26,i)); */ + /* job->x1_u[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(27,i)); */ + /* job->x1_w[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(28,i)); */ job->scalar[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(31,i)); - delay(1); + /* delay(1); */ } aprint_normal_dev(sc->sc_dev, "READ - Curve25519Engine 19 low 32 bits: 0x%08x\n", bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(19,0))); #undef SUBREG_ADDR diff --git a/README.md b/README.md index 7f6d26c..a969c94 100644 --- a/README.md +++ b/README.md @@ -14,27 +14,44 @@ To save on PCB cost, the board is smaller than a 'true' SBus board; the hardware 2021-07-18: The old VHDL gateware has been replaced by a new Migen-based gateware, see below for details. -Short version: the board enables a 256 MiB SDRAM disk (for fast swapping) and a USH OHCI host controller (for USB peripherals). +2021-08-22: Short version: the board enables a 256 MiB SDRAM disk (for fast swapping), a TRNG, a USB OHCI host controller (for USB peripherals) and a Curve25519 accelerator. 
## The hardware Directory 'sbus-to-ztex' -The custom board is a SBus-compliant (I hope...) board, designed to receive a [ZTex USB-FPGA Module 2.13](https://www.ztex.de/usb-fpga-2/usb-fpga-2.13.e.html) as a daughterboard. The ZTex module contains the actual FPGA (Artix-7), some RAM, programming hardware, etc. The SBus board contains level-shifters ICs to interface between the SBus signals and the FPGA, a serial header, some Leds, a JTAG header, and a micro-sd card slot. +The custom board is a SBus-compliant (I hope...) board, designed to receive a [ZTex USB-FPGA Module 2.13](https://www.ztex.de/usb-fpga-2/usb-fpga-2.13.e.html) as a daughterboard. The ZTex module contains the actual FPGA (Artix-7), some RAM, programming hardware, etc. The SBus board contains level-shifters ICs to interface between the SBus signals and the FPGA, a serial header, some Leds, a JTAG header, and a micro-sd card slot. It only connects interrupt line 7 (highest priority) and 1 (lowest priority), which was a mistake (more interrupts are needed and 7 is too high-priority to use at this stage, so just the level 1 is usable), but otherwise supports every SBus feature except the optional parity (i.e. it can do both slave and master modes). The PCB was designed with Kicad 5.0 ## The gateware (Migen) -The gateware was rewritten from scrach in the Migen language, choosen because that's what [Litex](https://github.com/enjoy-digital/litex/) uses. +### Intro + +The gateware was rewritten from scratch in the Migen language, chosen because that's what [Litex](https://github.com/enjoy-digital/litex/) uses. It implements a simple CPU-less Litex SoC built around a Wishbone bus, with a bridge between the SBus and the Wishbone. -A ROM, a SDRAM controller (litedram to the on-board DDR3) and an USB OHCI (host controller, using the Litex wrapper around the [SpinalHDL](https://github.com/SpinalHDL/SpinalHDL) implementation) are connected to that bus. 
+A ROM, a SDRAM controller ([litedram](https://github.com/enjoy-digital/litedram) to the on-board DDR3), a TRNG (using the [NeoRV32](https://github.com/stnolting/neorv32) TRNG), a USB OHCI (host controller, using the Litex wrapper around the [SpinalHDL](https://github.com/SpinalHDL/SpinalHDL) implementation) and a Curve25519 Crypto Engine (taken from the [Betrusted.IO](https://betrusted.io/) project) are connected to that bus. + +### Details + Master access to the SBus by the host are routed to the Wishbone to access the various CSRs / control registers of the devices. -The USB OHCI DMA is bridged from the Wishbone to the SBus by having the physical addresses of the Wishbone (that match the virtual addresses from NetBSD DVMA allocations) to the bridge. Reads are buffered by block of 16 bytes; currently writes are unbuffered (and somwhat slow, as they need a full SBus master cycle for every transaction of 32 bits or less). The standard NetBSD OHCI driver is used, with just a small custom SBus-OHCI driver mirroring the PCI-OHCI one. +The ROM doesn't do much beyond exposing the devices' existence and specifications to the host. -The SDRAM has its own custom DMA controller, using native Litedram DMA to the memory, and some FIFO to/from the SBus. A custom NetBSD driver exposes it as a drive on which you can swap. It might also be usable as a 'fast', volatile disk, but I haven't tried that yet. +The SDRAM has its own custom DMA controller, using native Litedram DMA to the memory, and some FIFO to/from the SBus. A custom NetBSD driver exposes it as a drive on which you can swap. It's also usable as a 'fast', volatile disk (for e.g. /tmp or similar temporary filesystem). It could use an interrupt line, but the only usable one in the current HW design is in use by the USB. + +The TRNG has a NetBSD driver to add entropy to the entropy pool. 
+The USB OHCI DMA is bridged from the Wishbone to the SBus by having the physical addresses of the Wishbone (that match the virtual addresses from NetBSD DVMA allocations) to the bridge. Reads are buffered by block of 16 bytes; currently writes are unbuffered (and somewhat slow, as they need a full SBus master cycle for every transaction of 32 bits or less). The standard NetBSD OHCI driver is used, with just a small custom SBus-OHCI driver mirroring the PCI-OHCI one. It uses the interrupt level 1 available on the board. As the board has no USB connectors, the D+ and D- lines are routed to the Serial header pins, those (and GND) are connected to a pair of pins of [Dolu1990's USB PMod](https://github.com/Dolu1990/pmod_usb_host_x4), and the associated USB port is connected to an external self-powered USB hub (which is the one supplying the VBus). It's quite ugly but it works (of course I should redesign the PCB with a proper USB connector and a VBus). + +The Curve25519 Engine currently exposes an IOCTL to do the computation, which has yet to be integrated usefully in e.g. OpenSSL. It could use an interrupt line, but the only usable one in the current HW design is in use by the USB. + +### Special Notes + +Currently the design uses a Wishbone Crossbar Interconnect from Litex instead of a Shared Interconnect, as for some reason using a Shared Interconnect causes issues between devices (disabling the USB OHCI also seems to solve the issue, it generates a lot of cycles on the buses). I might be misusing Wishbone. With the Crossbar, all devices are usable simultaneously. + +As not everything lives in the same clock domain, the design also uses a Wishbone CDC, a wrapper around the one from [Verilog Wishbone Components](https://github.com/alexforencich/verilog-wishbone). 
## The gateware (VHDL, obsolete) diff --git a/sbus-to-ztex-gateware-migen/engine.py b/sbus-to-ztex-gateware-migen/engine.py index 9938a75..43c0626 100644 --- a/sbus-to-ztex-gateware-migen/engine.py +++ b/sbus-to-ztex-gateware-migen/engine.py @@ -228,6 +228,7 @@ class Curve25519Const(Module, AutoDoc): 7: [20, "twenty", "The number 20 (for pow22501)"], 8: [50, "fifty", "The number 50 (for pow22501)"], 9: [100, "one hundred", "The number 100 (for pow22501)"], + 10: [254, "two hundred fifty four", "The number 254 (iteration count)"], } self.adr = Signal(5) self.const = Signal(256) diff --git a/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs b/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs index 6f9dff0..0182e10 100644 --- a/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs +++ b/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs @@ -20,23 +20,6 @@ fn main() -> std::io::Result<()> { // %19 is the loop counter, starts with 254 (if 0, loop runs exactly once) // I // %31 is the scalar // I // %18 is the swap variable - // START NEW - psa %25, #1 - psa %26, #0 - psa %27, %24 - psa %28, #1 - // #9 is 100 - psa %20, #9 - add %21, %20, %20 - // #8 is 50 - psa %20, #8 - add %21, %21, %20 - // #5 is 5 - psa %20, #5 - add %21, %21, %20 - psa %20, #1 - sub %19, %21, %20 - // END NEW psa %18, #0 // for i in (0..255).rev() @@ -284,6 +267,255 @@ fn main() -> std::io::Result<()> { mul %31, %29, %21 fin // finish execution ); + let mcode_upd = assemble_engine25519!( + start: + // P.U in %20 + // P.W in %21 + // Q.U in %22 + // Q.W in %23 + // affine_PmQ in %24 // I + // %30 is the TRD scratch register and cswap dummy + // %29 is the subtraction temporary value register and k_t + // x0.U in %25 // !I + // x0.W in %26 // !I + // x1.U in %27 // !I + // x1.W in %28 // !I + // %19 is the loop counter, starts with 254 (if 0, loop runs exactly once) // I + // %31 is the scalar // I + // %18 is the swap variable + psa %25, #1 + psa %26, #0 + psa %27, %24 + psa %28, #1 + 
// #10 is 254 in my Engine + psa %19, #10 + psa %18, #0 + + // for i in (0..255).rev() + mainloop: + // let choice: u8 = (bits[i + 1] ^ bits[i]) as u8; + // ProjectivePoint::conditional_swap(&mut x0, &mut x1, choice.into()); + xbt %29, %31 // orignally[k_t = (k>>t) & 1] now[k_t = k[254]] + shl %31, %31 // k = k<<1 + xor %18, %18, %29 // swap ^= k_t + + // cswap x0.U (%25), x1.U (%27) + xor %30, %25, %27 + msk %30, %18, %30 + xor %25, %30, %25 + xor %27, %30, %27 + // cswap x0.W (%26), x1.W (%28) + xor %30, %26, %28 + msk %30, %18, %30 + xor %26, %30, %26 + xor %28, %30, %28 + + psa %18, %29 // swap = k_t + + // differential_add_and_double(&mut x0, &mut x1, &affine_u); + // affine_u is already in %24 + + // let t0 = &P.U + &P.W; + add %0, %25, %26 + trd %30, %0 + sub %0, %0, %30 + // let t1 = &P.U - &P.W; + sub %26, #3, %26 // negate &P.W using #FIELDPRIME (#3) + add %1, %25, %26 + trd %30, %1 + sub %1, %1, %30 + // let t2 = &Q.U + &Q.W; + add %2, %27, %28 + trd %30, %2 + sub %2, %2, %30 + // let t3 = &Q.U - &Q.W; + sub %28, #3, %28 + add %3, %27, %28 + trd %30, %3 + sub %3, %3, %30 + // let t4 = t0.square(); // (U_P + W_P)^2 = U_P^2 + 2 U_P W_P + W_P^2 + mul %4, %0, %0 + // let t5 = t1.square(); // (U_P - W_P)^2 = U_P^2 - 2 U_P W_P + W_P^2 + mul %5, %1, %1 + // let t6 = &t4 - &t5; // 4 U_P W_P + sub %29, #3, %5 + add %6, %4, %29 + trd %30, %6 + sub %6, %6, %30 + // let t7 = &t0 * &t3; // (U_P + W_P) (U_Q - W_Q) = U_P U_Q + W_P U_Q - U_P W_Q - W_P W_Q + mul %7, %0, %3 + // let t8 = &t1 * &t2; // (U_P - W_P) (U_Q + W_Q) = U_P U_Q - W_P U_Q + U_P W_Q - W_P W_Q + mul %8, %1, %2 + // let t9 = &t7 + &t8; // 2 (U_P U_Q - W_P W_Q) + add %9, %7, %8 + trd %30, %9 + sub %9, %9, %30 + // let t10 = &t7 - &t8; // 2 (W_P U_Q - U_P W_Q) + sub %29, #3, %8 + add %10, %7, %29 + trd %30, %10 + sub %10, %10, %30 + // let t11 = t9.square(); // 4 (U_P U_Q - W_P W_Q)^2 + mul %27, %9, %9 + // let t12 = t10.square(); // 4 (W_P U_Q - U_P W_Q)^2 + mul %12, %10, %10 + // let t13 = 
&APLUS2_OVER_FOUR * &t6; // (A + 2) U_P U_Q + mul %13, #4, %6 // #4 is A+2/4 + // let t14 = &t4 * &t5; // ((U_P + W_P)(U_P - W_P))^2 = (U_P^2 - W_P^2)^2 + mul %25, %4, %5 + // let t15 = &t13 + &t5; // (U_P - W_P)^2 + (A + 2) U_P W_P + add %15, %13, %5 + trd %30, %15 + sub %15, %15, %30 + // let t16 = &t6 * &t15; // 4 (U_P W_P) ((U_P - W_P)^2 + (A + 2) U_P W_P) + mul %26, %6, %15 + // let t17 = affine_PmQ * &t12; // U_D * 4 (W_P U_Q - U_P W_Q)^2 + mul %28, %24, %12 // affine_PmQ loaded into %24 + + brz end, %19 // if loop counter is 0, quit + sub %19, %19, #1 // subtract one from the loop counter and run again + brz mainloop, #0 // go back to the top + end: + // ProjectivePoint::conditional_swap(&mut x0, &mut x1, Choice::from(bits[0] as u8)); + // cswap x0.U (%25), x1.U (%27) + xor %30, %25, %27 + msk %30, %18, %30 + xor %25, %30, %25 + xor %27, %30, %27 + // cswap x0.W (%26), x1.W (%28) + xor %30, %26, %28 + msk %30, %18, %30 + xor %26, %30, %26 + xor %28, %30, %28 + + // AFFINE SPLICE -- pass arguments to the affine block + psa %29, %25 + psa %30, %26 + // W.invert() in %21 + // U in %29 + // W in %30 + // result in %31 + // loop counter in %28 + + // from FieldElement.invert() + // let (t19, t3) = self.pow22501(); // t19: 249..0 ; t3: 3,1,0 + // let t0 = self.square(); // 1 e_0 = 2^1 + mul %0, %30, %30 // self is W, e.g. 
%30 + // let t1 = t0.square().square(); // 3 e_1 = 2^3 + mul %1, %0, %0 + mul %1, %1, %1 + // let t2 = self * &t1; // 3,0 e_2 = 2^3 + 2^0 + mul %2, %30, %1 + // let t3 = &t0 * &t2; // 3,1,0 + mul %3, %0, %2 + // let t4 = t3.square(); // 4,2,1 + mul %4, %3, %3 + // let t5 = &t2 * &t4; // 4,3,2,1,0 + mul %5, %2, %4 + + // let t6 = t5.pow2k(5); // 9,8,7,6,5 + psa %28, #5 // coincidentally, constant #5 is the number 5 + mul %6, %5, %5 + pow2k_5: + sub %28, %28, #1 // %28 = %28 - 1 + brz pow2k_5_exit, %28 + mul %6, %6, %6 + brz pow2k_5, #0 + pow2k_5_exit: + // let t7 = &t6 * &t5; // 9,8,7,6,5,4,3,2,1,0 + mul %7, %6, %5 + + // let t8 = t7.pow2k(10); // 19..10 + psa %28, #6 // constant #6 is the number 10 + mul %8, %7, %7 + pow2k_10: + sub %28, %28, #1 + brz pow2k_10_exit, %28 + mul %8, %8, %8 + brz pow2k_10, #0 + pow2k_10_exit: + // let t9 = &t8 * &t7; // 19..0 + mul %9, %8, %7 + + // let t10 = t9.pow2k(20); // 39..20 + psa %28, #7 // constant #7 is the number 20 + mul %10, %9, %9 + pow2k_20: + sub %28, %28, #1 + brz pow2k_20_exit, %28 + mul %10, %10, %10 + brz pow2k_20, #0 + pow2k_20_exit: + // let t11 = &t10 * &t9; // 39..0 + mul %11, %10, %9 + + // let t12 = t11.pow2k(10); // 49..10 + psa %28, #6 // constant #6 is the number 10 + mul %12, %11, %11 + pow2k_10b: + sub %28, %28, #1 + brz pow2k_10b_exit, %28 + mul %12, %12, %12 + brz pow2k_10b, #0 + pow2k_10b_exit: + // let t13 = &t12 * &t7; // 49..0 + mul %13, %12, %7 + + // let t14 = t13.pow2k(50); // 99..50 + psa %28, #8 // constant #8 is the number 50 + mul %14, %13, %13 + pow2k_50a: + sub %28, %28, #1 + brz pow2k_50a_exit, %28 + mul %14, %14, %14 + brz pow2k_50a, #0 + pow2k_50a_exit: + // let t15 = &t14 * &t13; // 99..0 + mul %15, %14, %13 + + // let t16 = t15.pow2k(100); // 199..100 + psa %28, #9 // constant #9 is the number 100 + mul %16, %15, %15 + pow2k_100: + sub %28, %28, #1 + brz pow2k_100_exit, %28 + mul %16, %16, %16 + brz pow2k_100, #0 + pow2k_100_exit: + // let t17 = &t16 * &t15; // 199..0 + mul %17, %16, 
%15 + + // let t18 = t17.pow2k(50); // 249..50 + psa %28, #8 // constant #8 is the number 50 + mul %18, %17, %17 + pow2k_50b: + sub %28, %28, #1 + brz pow2k_50b_exit, %28 + mul %18, %18, %18 + brz pow2k_50b, #0 + pow2k_50b_exit: + // let t19 = &t18 * &t13; // 249..0 + mul %19, %18, %13 + //(t19, t3) // just a return value, values are already there, do nothing + + //let t20 = t19.pow2k(5); // 254..5 + psa %28, #5 + mul %20, %19, %19 + pow2k_5_last: + sub %28, %28, #1 + brz pow2k_5_last_exit, %28 + mul %20, %20, %20 + brz pow2k_5_last, #0 + pow2k_5_last_exit: + + //let t21 = &t20 * &t3; // 254..5,3,1,0 + mul %21, %20, %3 + + // u = &self.U * &self.W.invert() + mul %31, %29, %21 + fin // finish execution + ); + let mcode2 = assemble_engine25519!( start: // P.U in %20 @@ -302,11 +534,14 @@ fn main() -> std::io::Result<()> { // %18 is the swap variable psa %25, #9 psa %26, #1 + mul %27, %25, %26 + mul %28, %25, %25 + mul %31, %24, %24 fin ); let mut pos = 0; - while pos < mcode.len() { - println!("0x{:08x},", mcode[pos]); + while pos < mcode_upd.len() { + println!("0x{:08x},", mcode_upd[pos]); pos = pos + 1; } Ok(()) diff --git a/sbus-to-ztex-gateware-migen/netbsd_csr.h b/sbus-to-ztex-gateware-migen/netbsd_csr.h index 4f44e70..8fd6f36 100644 --- a/sbus-to-ztex-gateware-migen/netbsd_csr.h +++ b/sbus-to-ztex-gateware-migen/netbsd_csr.h @@ -1,5 +1,5 @@ //-------------------------------------------------------------------------------- -// Auto-generated by Migen (3ffd64c) & LiteX (8a644c90) on 2021-08-21 08:42:06 +// Auto-generated by Migen (3ffd64c) & LiteX (8a644c90) on 2021-08-22 03:23:02 //-------------------------------------------------------------------------------- #ifndef __GENERATED_CSR_H #define __GENERATED_CSR_H diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_blk_dma.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_blk_dma.py index 61d3eac..af0fe63 100644 --- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_blk_dma.py +++ 
b/sbus-to-ztex-gateware-migen/sbus_to_fpga_blk_dma.py @@ -25,8 +25,6 @@ class ExchangeWithMem(Module, AutoCSR): assert(len(self.dram_dma_writer.sink.data) == data_width_bits) assert(len(self.dram_dma_reader.source.data) == data_width_bits) - assert(len(self.dram_dma_writer.sink.address) == blk_addr_width) - assert(len(self.dram_dma_reader.sink.address) == blk_addr_width) #self.wishbone_r_master = wishbone.Interface(data_width=data_width_bits) #self.wishbone_w_master = wishbone.Interface(data_width=data_width_bits) diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py index 8b92f7f..ce1462c 100644 --- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_fsm.py @@ -33,8 +33,8 @@ ENGINE_ADDR_PFXA = Signal(12, reset = 0x00a) ENGINE_ADDR_PFXB = Signal(12, reset = 0x00b) #SDRAM_ADDR_PFX = Signal(12, reset = 2048) -wishbone_default_timeout = 120 ## must be > sbus_default_timeout -sbus_default_timeout = 100 ## must be below 127 as we can wait twice on it inside the 255 cycles +wishbone_default_timeout = 120 ## +sbus_default_timeout = 50 ## must be below 255 sbus_default_master_throttle = 3 def siz_is_word(siz): @@ -582,6 +582,7 @@ class SBusFPGABus(Module): NextValue(SBUS_3V3_ACKs_o, ACK_WORD), NextValue(SBUS_3V3_ERRs_o, 1), #NextValue(self.led_display.value, 0x0000000010 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextValue(sbus_slave_timeout, sbus_default_timeout), NextState("Slave_Ack_Reg_Write_Burst") ).Else( NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), @@ -610,6 +611,7 @@ class SBusFPGABus(Module): NextValue(SBUS_3V3_ACKs_o, ACK_BYTE), NextValue(SBUS_3V3_ERRs_o, 1), #NextValue(self.led_display.value, 0x0000000010 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextValue(sbus_slave_timeout, sbus_default_timeout), NextState("Slave_Ack_Reg_Write_Byte") ).Else( NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), @@ -644,6 +646,7 @@ class 
SBusFPGABus(Module): NextValue(SBUS_3V3_ACKs_o, ACK_HWORD), NextValue(SBUS_3V3_ERRs_o, 1), #NextValue(self.led_display.value, 0x0000000010 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), + NextValue(sbus_slave_timeout, sbus_default_timeout), NextState("Slave_Ack_Reg_Write_HWord") ).Else( NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), @@ -888,7 +891,6 @@ class SBusFPGABus(Module): NextValue(self.wishbone_master.sel, 2**len(self.wishbone_master.sel)-1), NextValue(self.wishbone_master.we, 0), NextValue(wishbone_master_timeout, wishbone_default_timeout), - NextValue(sbus_slave_timeout, sbus_default_timeout), NextValue(self.wishbone_master.adr, Cat(index_with_wrap(burst_counter+1, burst_limit_m1, sbus_last_pa[ADDR_PHYS_LOW+2:ADDR_PHYS_LOW+6]), # 4 bits, adr FIXME sbus_last_pa[ADDR_PHYS_LOW+6:ADDR_PFX_LOW], # 10 bits, adr sbus_last_pa[ADDR_PFX_LOW:ADDR_PFX_LOW+ADDR_PFX_LENGTH], # 12 bits, adr @@ -932,7 +934,6 @@ class SBusFPGABus(Module): NextValue(self.wishbone_master.we, 0), NextValue(self.wishbone_master.adr, Cat(sbus_last_pa[2:28], Signal(4, reset = 0))), NextValue(wishbone_master_timeout, wishbone_default_timeout), - NextValue(sbus_slave_timeout, sbus_slave_timeout), #NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), NextState("Slave_Ack_Read_Reg_Burst_Wait_For_Data") ).Elif(sbus_slave_timeout == 0, ### this is taking too long @@ -994,7 +995,6 @@ class SBusFPGABus(Module): NextValue(self.wishbone_master.we, 0), NextValue(self.wishbone_master.adr, Cat(sbus_last_pa[2:28], Signal(4, reset = 0))), NextValue(wishbone_master_timeout, wishbone_default_timeout), - NextValue(sbus_slave_timeout, sbus_slave_timeout), #NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), NextState("Slave_Ack_Read_Reg_HWord_Wait_For_Data") ).Elif(sbus_slave_timeout == 0, ### this is taking too long @@ -1054,7 +1054,6 @@ class SBusFPGABus(Module): 
NextValue(self.wishbone_master.we, 0), NextValue(self.wishbone_master.adr, Cat(sbus_last_pa[2:28], Signal(4, reset = 0))), NextValue(wishbone_master_timeout, wishbone_default_timeout), - NextValue(sbus_slave_timeout, sbus_slave_timeout), #NextValue(self.led_display.value, 0x0000000000 | Cat(Signal(8, reset = 0), SBUS_3V3_PA_i, Signal(4, reset = 0))), NextState("Slave_Ack_Read_Reg_Byte_Wait_For_Data") ).Elif(sbus_slave_timeout == 0, ### this is taking too long @@ -1088,8 +1087,9 @@ class SBusFPGABus(Module): NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), NextState("Slave_Ack_Reg_Write_Final") ).Else( - NextValue(SBUS_3V3_ACKs_o, ACK_WORD), - NextValue(burst_counter, burst_counter + 1) + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextValue(burst_counter, burst_counter + 1), + NextState("Slave_Ack_Reg_Write_Burst_Wait_For_Wishbone"), ) ) slave_fsm.act("Slave_Ack_Reg_Write_Final", @@ -1204,12 +1204,13 @@ class SBusFPGABus(Module): ) # ##### SLAVE ERROR ##### slave_fsm.act("Slave_Error", - NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), + NextValue(SBUS_3V3_ACKs_o, ACK_IDLE), #NextValue(self.led_display.value, 0x0000000080 | self.led_display.value), If(((SBUS_3V3_ASs_i == 1) | ((SBUS_3V3_ASs_i == 0) & (SBUS_3V3_SELs_i == 1))), NextValue(sbus_oe_data, 0), NextValue(sbus_oe_slave_in, 0), NextValue(sbus_oe_master_in, 0), + NextValue(sbus_slave_timeout, 0), NextState("Idle") ) ) diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py index 6b49585..777ae4d 100644 --- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py @@ -85,7 +85,7 @@ class _CRG(Module): ##platform.add_false_path_constraints(self.cd_native.clk, self.cd_sys.clk) self.submodules.curve25519_pll = curve25519_pll = S7MMCM(speedgrade=-1) - curve25519_clk_freq = 80e6 + curve25519_clk_freq = 90e6 self.curve25519_on = Signal() #curve25519_pll.register_clkin(clk48, 48e6) curve25519_pll.register_clkin(self.clk48_bufg, 48e6)