From e57cf9d9a880277ac23421d60fde247e73ecf859 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Sat, 4 Sep 2021 05:52:11 -0400 Subject: [PATCH] preliminary work on AES256-GCM in the Engine --- .../sys/dev/sbus/sbusfpga_curve25519engine.c | 464 +++++++++++++++-- sbus-to-ztex-gateware-migen/engine.py | 484 +++++++++++++----- .../engine_code/engine_code.rs | 484 +++++++++++++++++- sbus-to-ztex-gateware-migen/netbsd_csr.h | 21 +- .../sbus_to_fpga_soc.py | 21 +- 5 files changed, 1259 insertions(+), 215 deletions(-) diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c index 4c647ba..19155a0 100644 --- a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c @@ -43,6 +43,9 @@ __KERNEL_RCSID(0, "$NetBSD$"); #include #include +#include +#include +#include #include @@ -97,7 +100,7 @@ struct sbusfpga_curve25519engine_aesjob { static int init_programs(struct sbusfpga_curve25519engine_softc *sc); static int write_inputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusfpga_curve25519engine_montgomeryjob *job, const int window); static int start_job(struct sbusfpga_curve25519engine_softc *sc); -static int wait_job(struct sbusfpga_curve25519engine_softc *sc); +static int wait_job(struct sbusfpga_curve25519engine_softc *sc, uint32_t param); static int read_outputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusfpga_curve25519engine_montgomeryjob *job, const int window); static int dma_init(struct sbusfpga_curve25519engine_softc *sc); @@ -107,7 +110,17 @@ static int power_off(struct sbusfpga_curve25519engine_softc *sc); int sbusfpga_curve25519engine_open(dev_t dev, int flags, int mode, struct lwp *l) { - struct sbusfpga_curve25519engine_softc *sc = device_lookup_private(&sbusfpga_c29e_cd, minor(dev)); + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); + struct 
sbusfpga_curve25519engine_softc *sc = device_lookup_private(&sbusfpga_c29e_cd, driver); + + if (sc == NULL) + return ENODEV; + + if ((unit != 0) && ((sc->active_sessions & (1 << unit)) == 0)) { + return ENODEV; + } + /* first we need to turn the engine power on ... */ power_on(sc); @@ -117,7 +130,18 @@ sbusfpga_curve25519engine_open(dev_t dev, int flags, int mode, struct lwp *l) int sbusfpga_curve25519engine_close(dev_t dev, int flags, int mode, struct lwp *l) { - struct sbusfpga_curve25519engine_softc *sc = device_lookup_private(&sbusfpga_c29e_cd, minor(dev)); + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); + struct sbusfpga_curve25519engine_softc *sc = device_lookup_private(&sbusfpga_c29e_cd, driver); + + if (sc == NULL) + return ENODEV; + + if ((unit != 0) && (sc->active_sessions & (1 << unit))) { + device_printf(sc->sc_dev, "warning: close() on active session\n"); + sc->active_sessions &= ~(1 << unit); + sc->mapped_sessions &= ~(1 << unit); + } if (sc->active_sessions == 0) power_off(sc); @@ -147,11 +171,18 @@ static const uint32_t program_gcm[20] = {0x0010100d, 0x0094100d, 0x0118100d, 0x0 static const uint32_t program_aes[58] = {0x0001f003, 0x0005e012, 0x00841012, 0x01041012, 0x01841012, 0x0001d052, 0x00800052, 0x01000052, 0x01800052, 0x0005c012, 0x00841012, 0x01041012, 0x01841012, 0x0001b052, 0x00800052, 0x01000052, 0x01800052, 0x0005a012, 0x00841012, 0x01041012, 0x01841012, 0x00019052, 0x00800052, 0x01000052, 0x01800052, 0x00058012, 0x00841012, 0x01041012, 0x01841012, 0x00017052, 0x00800052, 0x01000052, 0x01800052, 0x00056012, 0x00841012, 0x01041012, 0x01841012, 0x00015052, 0x00800052, 0x01000052, 0x01800052, 0x00054012, 0x00841012, 0x01041012, 0x01841012, 0x00013052, 0x00800052, 0x01000052, 0x01800052, 0x00052012, 0x00841012, 0x01041012, 0x01841012, 0x02011052, 0x02800052, 0x03000052, 0x03800052, 0x0000000a }; -static const uint32_t program_gcm_ad[70] = {0x00400800, 0x00080840, 0x0001f403, 0x0005e012, 0x00841012, 
0x01041012, 0x01841012, 0x0001d052, 0x00800052, 0x01000052, 0x01800052, 0x0005c012, 0x00841012, 0x01041012, 0x01841012, 0x0001b052, 0x00800052, 0x01000052, 0x01800052, 0x0005a012, 0x00841012, 0x01041012, 0x01841012, 0x00019052, 0x00800052, 0x01000052, 0x01800052, 0x00058012, 0x00841012, 0x01041012, 0x01841012, 0x00017052, 0x00800052, 0x01000052, 0x01800052, 0x00056012, 0x00841012, 0x01041012, 0x01841012, 0x00015052, 0x00800052, 0x01000052, 0x01800052, 0x00054012, 0x00841012, 0x01041012, 0x01841012, 0x00013052, 0x00800052, 0x01000052, 0x01800052, 0x00052012, 0x00841012, 0x01041012, 0x01841012, 0x02011052, 0x02800052, 0x03000052, 0x03800052, 0x03000089, 0x003c0000, 0x01400411, 0x0042b405, 0x01400411, 0x00080800, 0xe0000809, 0x00380000, 0x01bc03d1, 0x003cf3d1, 0x0000000a }; +static const uint32_t program_gcm_pfx[72] = {0x01400411, 0x00080840, 0x00040800, 0x0001f043, 0x0005e012, 0x00841012, 0x01041012, 0x01841012, 0x0001d052, 0x00800052, 0x01000052, 0x01800052, 0x0005c012, 0x00841012, 0x01041012, 0x01841012, 0x0001b052, 0x00800052, 0x01000052, 0x01800052, 0x0005a012, 0x00841012, 0x01041012, 0x01841012, 0x00019052, 0x00800052, 0x01000052, 0x01800052, 0x00058012, 0x00841012, 0x01041012, 0x01841012, 0x00017052, 0x00800052, 0x01000052, 0x01800052, 0x00056012, 0x00841012, 0x01041012, 0x01841012, 0x00015052, 0x00800052, 0x01000052, 0x01800052, 0x00054012, 0x00841012, 0x01041012, 0x01841012, 0x00013052, 0x00800052, 0x01000052, 0x01800052, 0x00052012, 0x00841012, 0x01041012, 0x01841012, 0x02011052, 0x02800052, 0x03000052, 0x03800052, 0x03800089, 0x003c0000, 0x01400411, 0x0042b405, 0x01400411, 0x00080800, 0x00040400, 0xdf800809, 0x00380000, 0x01bc03d1, 0x003cf3d1, 0x00340800 }; -static const uint32_t* programs[5] = { program_ec25519, program_gcm, program_aes, program_gcm_ad, NULL }; -static const uint32_t program_len[5] = { 134, 20, 58, 70, 0 }; -static uint32_t program_offset[4]; +static const uint32_t program_gcm_ad[29] = {0x0d800309, 0x000000d3, 0x01800011, 0x00000011, 
0x0000d003, 0x000ec0c5, 0x0032d306, 0x0010f00d, 0x0094f00d, 0x0118f00d, 0x019cf00d, 0x00186143, 0x00160191, 0x00186811, 0x001c61c3, 0x00105103, 0x008441ce, 0x0082010e, 0x00080010, 0x009a008f, 0x0112008f, 0x0396008f, 0x00086083, 0x00105103, 0x00084083, 0x00341083, 0x00800309, 0xf2800809, 0x0000000a }; + +static const uint32_t program_gcm_aes[92] = {0x2d000309, 0x01400411, 0x0042b405, 0x01400411, 0x0001f403, 0x0005e012, 0x00841012, 0x01041012, 0x01841012, 0x0001d052, 0x00800052, 0x01000052, 0x01800052, 0x0005c012, 0x00841012, 0x01041012, 0x01841012, 0x0001b052, 0x00800052, 0x01000052, 0x01800052, 0x0005a012, 0x00841012, 0x01041012, 0x01841012, 0x00019052, 0x00800052, 0x01000052, 0x01800052, 0x00058012, 0x00841012, 0x01041012, 0x01841012, 0x00017052, 0x00800052, 0x01000052, 0x01800052, 0x00056012, 0x00841012, 0x01041012, 0x01841012, 0x00015052, 0x00800052, 0x01000052, 0x01800052, 0x00054012, 0x00841012, 0x01041012, 0x01841012, 0x00013052, 0x00800052, 0x01000052, 0x01800052, 0x00052012, 0x00841012, 0x01041012, 0x01841012, 0x02011052, 0x02800052, 0x03000052, 0x03840052, 0x000000d3, 0x00001003, 0x00ac02d3, 0x01800011, 0x00000011, 0x0000d003, 0x000ec0c5, 0x002ec2c5, 0x0032d306, 0x0010f00d, 0x0094f00d, 0x0118f00d, 0x019cf00d, 0x00186143, 0x00160191, 0x00186811, 0x001c61c3, 0x00105103, 0x008441ce, 0x0082010e, 0x00080010, 0x009a008f, 0x0112008f, 0x0396008f, 0x00086083, 0x00105103, 0x00084083, 0x00341083, 0x00800309, 0xd3000809, 0x0000000a }; + +static const uint32_t program_gcm_finish[113] = {0x2b000309, 0x01400411, 0x0042b405, 0x01400411, 0x0001f403, 0x0005e012, 0x00841012, 0x01041012, 0x01841012, 0x0001d052, 0x00800052, 0x01000052, 0x01800052, 0x0005c012, 0x00841012, 0x01041012, 0x01841012, 0x0001b052, 0x00800052, 0x01000052, 0x01800052, 0x0005a012, 0x00841012, 0x01041012, 0x01841012, 0x00019052, 0x00800052, 0x01000052, 0x01800052, 0x00058012, 0x00841012, 0x01041012, 0x01841012, 0x00017052, 0x00800052, 0x01000052, 0x01800052, 0x00056012, 0x00841012, 0x01041012, 0x01841012, 
0x00015052, 0x00800052, 0x01000052, 0x01800052, 0x00054012, 0x00841012, 0x01041012, 0x01841012, 0x00013052, 0x00800052, 0x01000052, 0x01800052, 0x00052012, 0x00841012, 0x01041012, 0x01841012, 0x02011052, 0x02800052, 0x03000052, 0x03840052, 0x0004a054, 0x000000d3, 0x00001003, 0x00ac02d3, 0x01800011, 0x00000011, 0x0000d003, 0x0010f00d, 0x0094f00d, 0x0118f00d, 0x019cf00d, 0x00186143, 0x00160191, 0x00186811, 0x001c61c3, 0x00105103, 0x008441ce, 0x0082010e, 0x00080010, 0x009a008f, 0x0112008f, 0x0396008f, 0x00086083, 0x00105103, 0x00084083, 0x00341083, 0x01a40251, 0x00249251, 0x0000d243, 0x0010f00d, 0x0094f00d, 0x0118f00d, 0x019cf00d, 0x00186143, 0x00160191, 0x00186811, 0x001c61c3, 0x00105103, 0x008441ce, 0x0082010e, 0x00080010, 0x009a008f, 0x0112008f, 0x0396008f, 0x00086083, 0x00105103, 0x00084083, 0x00341083, 0x01b40351, 0x0034d351, 0x0020e343, 0x0000000a }; + +// second and third are for testing and shall be removed +static const uint32_t* programs[8] = { program_ec25519, program_gcm, program_aes, program_gcm_pfx, program_gcm_ad, program_gcm_aes, program_gcm_finish, NULL }; +static const uint32_t program_len[8] = { 134, 20, 58, 72, 29, 92, 113, 0 }; +static uint32_t program_offset[8]; /* * Attach all the sub-devices we can find @@ -303,18 +334,55 @@ struct sbusfpga_curve25519engine_session { uint32_t session; uint32_t cookie; }; +struct sbusfpga_curve25519engine_session_len { + uint32_t session; + uint32_t cookie; + uint32_t len; +}; +struct sbusfpga_curve25519engine_session_len_data { + uint32_t session; + uint32_t cookie; + uint32_t len; + uint32_t data[8]; + uint32_t keys[60]; +}; +struct sbusfpga_curve25519engine_session_len_final { + uint32_t session; + uint32_t cookie; + uint32_t len; + uint32_t data[8]; +}; + +#define CHECKSESSION(ses) \ + do { \ + if ((ses->session >= MAX_ACTIVE_SESSION) || (ses->session >= MAX_SESSION)) \ + return EINVAL; \ + if (sc->sessions_cookies[ses->session] == 0) \ + return EINVAL; \ + if (sc->sessions_cookies[ses->session] != 
ses->cookie) \ + return EINVAL; \ + if (ses->session != unit) \ + return EINVAL; \ + if ((sc->active_sessions & (1 << ses->session)) == 0) \ + return EINVAL; \ + } while (0) #define SBUSFPGA_DO_MONTGOMERYJOB _IOWR(0, 0, struct sbusfpga_curve25519engine_montgomeryjob) #define SBUSFPGA_EC25519_CHECKGCM _IOW(0, 1, struct sbusfpga_curve25519engine_montgomeryjob) #define SBUSFPGA_EC25519_CHECKAES _IOW(0, 2, struct sbusfpga_curve25519engine_aesjob) -#define SBUSFPGA_EC25519_GCMAD _IOW(0, 3, struct sbusfpga_curve25519engine_aesjob) -#define SBUSFPGA_EC25519_OPENSESSION _IOR(1, 0, struct sbusfpga_curve25519engine_session) -#define SBUSFPGA_EC25519_CLOSESESSION _IOR(1, 1, struct sbusfpga_curve25519engine_session) +#define SBUSFPGA_EC25519_GETSESSION _IOR(1, 0, struct sbusfpga_curve25519engine_session) +#define SBUSFPGA_EC25519_OPENSESSION _IOW(1, 1, struct sbusfpga_curve25519engine_session) +#define SBUSFPGA_EC25519_CLOSESESSION _IOW(1, 2, struct sbusfpga_curve25519engine_session) +#define SBUSFPGA_EC25519_GCMPFX _IOW(1, 3, struct sbusfpga_curve25519engine_session_len_data) +#define SBUSFPGA_EC25519_GCMAD _IOW(1, 4, struct sbusfpga_curve25519engine_session_len) +#define SBUSFPGA_EC25519_GCMAES _IOW(1, 5, struct sbusfpga_curve25519engine_session_len) +#define SBUSFPGA_EC25519_GCMFINISH _IOWR(1, 6, struct sbusfpga_curve25519engine_session_len_final) static int get_session(struct sbusfpga_curve25519engine_softc *sc) { int i; /* don't use 0, we use it for testing */ + /* also minor 0 is used to request session, 1-7 to open/close/map using session # */ for (i = 1 ; (i < MAX_ACTIVE_SESSION) && (i < MAX_SESSION) ; i++) { if (((sc->active_sessions & (1<mapped_sessions & (1<active_sessions |= (1<initialized) { if (init_programs(sc)) { return ENXIO; @@ -339,6 +413,9 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st } switch (cmd) { case SBUSFPGA_DO_MONTGOMERYJOB: { + if (unit != 0) + return ENOTTY; + struct sbusfpga_curve25519engine_montgomeryjob* 
job = (struct sbusfpga_curve25519engine_montgomeryjob*)data; curve25519engine_mpstart_write(sc, program_offset[0]); /* EC25519 */ curve25519engine_mplen_write(sc, program_len[0]); /* EC25519 */ @@ -350,7 +427,7 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st if (err) return err; delay(1); - err = wait_job(sc); + err = wait_job(sc, 1); if (err) return err; err = read_outputs(sc, job, 0); @@ -359,6 +436,9 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st } break; case SBUSFPGA_EC25519_CHECKGCM: { + if (unit != 0) + return ENOTTY; + const uint32_t base = 0; struct sbusfpga_curve25519engine_montgomeryjob* job = (struct sbusfpga_curve25519engine_montgomeryjob*)data; int reg, i; @@ -376,7 +456,7 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st if (err) return err; delay(1); - err = wait_job(sc); + err = wait_job(sc, 1); /* if (err) */ /* return err; */ @@ -391,6 +471,9 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st } break; case SBUSFPGA_EC25519_CHECKAES: { + if (unit != 0) + return ENOTTY; + const uint32_t base = 0; struct sbusfpga_curve25519engine_aesjob* job = (struct sbusfpga_curve25519engine_aesjob*)data; int reg, i; @@ -410,7 +493,7 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st if (err) return err; delay(1); - err = wait_job(sc); + err = wait_job(sc, 1); /* if (err) */ /* return err; */ @@ -424,19 +507,50 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st } } break; - case SBUSFPGA_EC25519_GCMAD: { - const uint32_t base = 0; - struct sbusfpga_curve25519engine_aesjob* job = (struct sbusfpga_curve25519engine_aesjob*)data; - int reg, i; - curve25519engine_mpstart_write(sc, program_offset[3]); /* GCM_AD */ - curve25519engine_mplen_write(sc, program_len[3]); /* GCM_AD */ + case SBUSFPGA_EC25519_GCMPFX: { + if (unit == 0) + return ENOTTY; + + /* FIXME: need a 
lock!!! */ + + const uint32_t base = unit * 0x400; + struct sbusfpga_curve25519engine_session_len_data* job = (struct sbusfpga_curve25519engine_session_len_data*)data; + int reg, i; + void* rd_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) ); + //void* wr_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) + 2048); + + CHECKSESSION(job); + + if (job->len > 128) { + device_printf(sc->sc_dev, "job->len too big: %u", job->len); + return EINVAL; + } + + curve25519engine_mpstart_write(sc, program_offset[3]); /* GCM_PFX */ + curve25519engine_mplen_write(sc, program_len[3] + program_len[4]); /* GCM_PFX + GCM_AD */ + curve25519engine_window_write(sc, unit); /* to each session its own register file */ + + /* read_addr */ for (i = 0 ; i < 8 ; i ++) { - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(0,i), job->data[i]); + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); */ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); + } + /* write_addr */ + /* for (i = 0 ; i < 8 ; i ++) { */ + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,i), ((i & 3) == 0) ? ((uint32_t)wr_ptr) : 0); */ + /* } */ + /* write_len */ + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), ((i&3) == 0) ? 
((uint32_t)job->len) : 0); + } + /* data */ + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(16,i), job->data[i]); } for (reg = 31 ; reg > 16 ; reg--) { for (i = 0 ; i < 8 ; i ++) { - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i), job->keys[i+8*(31-reg)]); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i), job->keys[(i&3)+4*(31-reg)]); } } @@ -444,10 +558,64 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st if (err) return err; delay(1); - err = wait_job(sc); - /* if (err) */ - /* return err; */ + err = wait_job(sc, job->len); + if (err) + return err; +#if 0 + for (reg = 0 ; reg < 32 ; reg++) { + uint32_t buf[8]; + for (i = 0 ; i < 8 ; i ++) { + buf[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i)); + } + device_printf(sc->sc_dev, "GCM_PFX %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); + } +#endif + } + break; + + case SBUSFPGA_EC25519_GCMAD: { + if (unit == 0) + return ENOTTY; + + /* FIXME: need a lock!!! */ + + const uint32_t base = unit * 0x400; + struct sbusfpga_curve25519engine_session_len* job = (struct sbusfpga_curve25519engine_session_len*)data; + int i; + void* rd_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) ); + //void* wr_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) + 2048); + + CHECKSESSION(job); + + if (job->len > 128) + return EINVAL; + + curve25519engine_mpstart_write(sc, program_offset[4]); /* GCM_AES */ + curve25519engine_mplen_write(sc, program_len[4]); /* GCM_AES */ + curve25519engine_window_write(sc, unit); /* to each session its own register file */ + + /* read_addr */ + for (i = 0 ; i < 8 ; i ++) { + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? 
((uint32_t)rd_ptr) : 0); */ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); + } + /* write_len */ + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), ((i & 3) == 0) ? ((uint32_t)job->len) : 0); + } + + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, job->len); + if (err) + return err; + +#if 0 + int reg; for (reg = 0 ; reg < 32 ; reg++) { uint32_t buf[8]; for (i = 0 ; i < 8 ; i ++) { @@ -456,9 +624,153 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st device_printf(sc->sc_dev, "GCM_AD %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); } +#endif } break; - case SBUSFPGA_EC25519_OPENSESSION:{ + + case SBUSFPGA_EC25519_GCMAES: { + if (unit == 0) + return ENOTTY; + + /* FIXME: need a lock!!! */ + + const uint32_t base = unit * 0x400; + struct sbusfpga_curve25519engine_session_len* job = (struct sbusfpga_curve25519engine_session_len*)data; + int i; + void* rd_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) ); + void* wr_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) + 2048); + + CHECKSESSION(job); + + if (job->len > 128) + return EINVAL; + + curve25519engine_mpstart_write(sc, program_offset[5]); /* GCM_AES */ + curve25519engine_mplen_write(sc, program_len[5]); /* GCM_AES */ + curve25519engine_window_write(sc, unit); /* to each session its own register file */ + + /* read_addr */ + for (i = 0 ; i < 8 ; i ++) { + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); */ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? 
((uint32_t)rd_ptr) : 0); + } + /* write_addr */ + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(11,i), ((i & 3) == 0) ? ((uint32_t)wr_ptr) : 0); + } + /* write_len */ + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), ((i & 3) == 0) ? ((uint32_t)job->len) : 0); + } + + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, job->len); + if (err) + return err; +#if 0 + int reg; + for (reg = 0 ; reg < 32 ; reg++) { + uint32_t buf[8]; + for (i = 0 ; i < 8 ; i ++) { + buf[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i)); + } + device_printf(sc->sc_dev, "GCM_AES %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); + } +#endif + } + break; + + + case SBUSFPGA_EC25519_GCMFINISH: { + if (unit == 0) + return ENOTTY; + + /* FIXME: need a lock!!! */ + + const uint32_t base = unit * 0x400; + struct sbusfpga_curve25519engine_session_len_final* job = (struct sbusfpga_curve25519engine_session_len_final*)data; + int i; + void* rd_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) ); + void* wr_ptr = (void*)(((vaddr_t)sc->sc_dmamap->dm_segs[0].ds_addr) + (unit * 4096) + 2048); + + CHECKSESSION(job); + + if (job->len > 15) + return EINVAL; + + curve25519engine_mpstart_write(sc, program_offset[6]); /* GCM_FINISH */ + curve25519engine_mplen_write(sc, program_len[6]); /* GCM_FINISH */ + curve25519engine_window_write(sc, unit); /* to each session its own register file */ + + /* read_addr */ + for (i = 0 ; i < 8 ; i ++) { + /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); */ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? 
((uint32_t)rd_ptr) : 0); + } + /* write_addr */ + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(11,i), ((i & 3) == 0) ? ((uint32_t)wr_ptr) : 0); + } + /* write_len */ + for (i = 0 ; i < 8 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), ((i & 3) == 0) ? ((uint32_t)job->len) : 0); + } + /* final block */ + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(9,i), job->data[i]); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(9,i+4), job->data[i]); + } + /* create and generate MMASK */ + for (i = 0 ; i < 4 ; i ++) { + uint32_t mask; + int idx = i; + if (job->len <= (idx*4)) { + mask = 0; + } else if (job->len >= (idx+1)*4) { + mask = 0xFFFFFFFF; + } else { + mask = 0xFFFFFFFF >> (8*(4-(job->len%4))); + } + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(10,i), mask); + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(10,(i+4)), mask); + } + + + err = start_job(sc); + if (err) + return err; + delay(1); + err = wait_job(sc, job->len); + if (err) + return err; + + /* final accum */ + for (i = 0 ; i < 8 ; i ++) { + job->data[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(8,i)); + } + +#if 0 + int reg; + for (reg = 0 ; reg < 32 ; reg++) { + uint32_t buf[8]; + for (i = 0 ; i < 8 ; i ++) { + buf[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i)); + } + device_printf(sc->sc_dev, "GCM_FINISH %d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n", reg, + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); + } +#endif + } + break; + + case SBUSFPGA_EC25519_GETSESSION:{ + if (unit != 0) + return ENOTTY; + struct sbusfpga_curve25519engine_session* ses = (struct sbusfpga_curve25519engine_session*)data; int s = get_session(sc); if (s < 0) @@ -468,16 +780,29 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void 
*data, int flag, st ses->cookie = sc->sessions_cookies[s]; } break; - case SBUSFPGA_EC25519_CLOSESESSION:{ + case SBUSFPGA_EC25519_OPENSESSION:{ + if (unit == 0) + return ENOTTY; + struct sbusfpga_curve25519engine_session* ses = (struct sbusfpga_curve25519engine_session*)data; - if ((ses->session >= MAX_ACTIVE_SESSION) || (ses->session >= MAX_SESSION)) - return EINVAL; - if (sc->sessions_cookies[ses->session] != ses->cookie) - return EINVAL; + CHECKSESSION(ses); if ((sc->mapped_sessions & (1 << ses->session)) != 0) - return EBUSY; + return EINVAL; + } + break; + case SBUSFPGA_EC25519_CLOSESESSION:{ + if (unit == 0) + return ENOTTY; + + struct sbusfpga_curve25519engine_session* ses = (struct sbusfpga_curve25519engine_session*)data; + + CHECKSESSION(ses); + + /* if ((sc->mapped_sessions & (1 << ses->session)) != 0) */ + /* return EBUSY; */ sc->sessions_cookies[ses->session] = 0; sc->active_sessions &= ~(1 << ses->session); + sc->mapped_sessions &= ~(1 << ses->session); // FIXME } break; @@ -570,7 +895,7 @@ static int write_inputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusf int i; uint32_t status = curve25519engine_status_read(sc); int err = 0; - if (status & 1) { + if (status & (1<sc_dev, "WRITE - Curve25519Engine status: 0x%08x, still running?\n", status); return -ENXIO; } @@ -603,31 +928,49 @@ static int write_inputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusf static int start_job(struct sbusfpga_curve25519engine_softc *sc) { uint32_t status = curve25519engine_status_read(sc); - if (status & 1) { + if (status & (1<sc_dev, "START - Curve25519Engine status: 0x%08x, still running?\n", status); return -ENXIO; } curve25519engine_control_write(sc, 1); - aprint_normal_dev(sc->sc_dev, "START - Curve25519Engine status: 0x%08x\n", curve25519engine_status_read(sc)); + //aprint_normal_dev(sc->sc_dev, "START - Curve25519Engine status: 0x%08x\n", curve25519engine_status_read(sc)); return 0; } -static int wait_job(struct sbusfpga_curve25519engine_softc 
*sc) { +static int wait_job(struct sbusfpga_curve25519engine_softc *sc, uint32_t param) { uint32_t status = curve25519engine_status_read(sc); int count = 0; - while ((status & 1) && (count < 50)) { - aprint_normal_dev(sc->sc_dev, "WAIT - ongoing, Curve25519Engine status: 0x%08x [%d]\n", status, count); + int max_count = 50; + int del = 1; + const int max_del = 32; + static int max_del_seen = 1; + + while ((status & (1<sc_dev, "WAIT - ongoing, Curve25519Engine status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, ls_status); count ++; - delay(1); + delay(del); + del = del < max_del ? 2*del : del; status = curve25519engine_status_read(sc); } + if (del > max_del_seen) { + max_del_seen = del; + aprint_normal_dev(sc->sc_dev, "WAIT - new max delay %d after %d count (param was %u)\n", max_del_seen, count, param); + } + //curve25519engine_control_write(sc, 0); - if (status & 1) { - aprint_error_dev(sc->sc_dev, "WAIT - Curve25519Engine status: 0x%08x, did not finish in time? [0x%08x]\n", status, curve25519engine_instruction_read(sc)); + if (status & (1<sc_dev, "WAIT - Curve25519Engine status: 0x%08x, did not finish in time? 
[inst: 0x%08x ls_status: 0x%08x]\n", status, curve25519engine_instruction_read(sc), curve25519engine_ls_status_read(sc)); + return -ENXIO; + } else if (status & (1<sc_dev, "WAIT - Curve25519Engine status: 0x%08x, sigill [inst: 0x%08x ls_status: 0x%08x]\n", status, curve25519engine_instruction_read(sc), curve25519engine_ls_status_read(sc)); + return -ENXIO; + } else if (status & (1<sc_dev, "WAIT - Curve25519Engine status: 0x%08x, aborted [inst: 0x%08x ls_status: 0x%08x]\n", status, curve25519engine_instruction_read(sc), curve25519engine_ls_status_read(sc)); return -ENXIO; } else { - aprint_normal_dev(sc->sc_dev, "WAIT - Curve25519Engine status: 0x%08x [%d, 0x%08x]\n", status, count, curve25519engine_instruction_read(sc)); + //aprint_normal_dev(sc->sc_dev, "WAIT - Curve25519Engine status: 0x%08x [%d] ls_status: 0x%08x\n", status, count, curve25519engine_ls_status_read(sc)); } return 0; @@ -637,7 +980,7 @@ static int read_outputs(struct sbusfpga_curve25519engine_softc *sc, struct sbusf const uint32_t base = window * 0x400; int i; uint32_t status = curve25519engine_status_read(sc); - if (status & 1) { + if (status & (1<sc_dev, "READ - Curve25519Engine status: 0x%08x, still running?\n", status); return -ENXIO; } @@ -690,31 +1033,42 @@ dma_init(struct sbusfpga_curve25519engine_softc *sc) { return 0; } - aprint_normal_dev(sc->sc_dev, "DMA: SW -> kernel address is %p, dvma address is 0x%08llx\n", sc->sc_dma_kva, sc->sc_dmamap->dm_segs[0].ds_addr); + aprint_normal_dev(sc->sc_dev, "DMA: SW -> kernel address is %p, dvma address is 0x%08llx, seg %llx / %ld\n", sc->sc_dma_kva, sc->sc_dmamap->dm_segs[0].ds_addr, sc->sc_segs.ds_addr, sc->sc_segs.ds_len); return 1; } paddr_t sbusfpga_curve25519engine_mmap(dev_t dev, off_t offset, int prot) { - struct sbusfpga_curve25519engine_softc *sc = device_lookup_private(&sbusfpga_c29e_cd, minor(dev)); + int unit = minor(dev) & (MAX_SESSION - 1); + int driver = unit & ~(MAX_SESSION - 1); + struct sbusfpga_curve25519engine_softc *sc = 
device_lookup_private(&sbusfpga_c29e_cd, driver); paddr_t addr = -1; - int ses = offset / 4096; + + device_printf(sc->sc_dev, "%s:%d: %lld %d for %d / %d\n", __PRETTY_FUNCTION__, __LINE__, offset, prot, driver, unit); - if (offset % 4096) + if (offset != 0) return -1; if (prot & PROT_EXEC) return -1; - if (sc->mapped_sessions & (1 << ses)) + /* if (sc->mapped_sessions & (1 << unit)) */ + /* return -1; */ + if ((sc->active_sessions & (1 << unit)) == 0) return -1; - if ((sc->active_sessions & (1 << ses)) == 0) + if (unit >= MAX_ACTIVE_SESSION) + return -1; + if (unit <= 0) return -1; - - addr = bus_dmamem_mmap(sc->sc_dmatag, &sc->sc_segs, 1, offset, prot, BUS_DMA_NOWAIT); - device_printf(sc->sc_dev, "mapped page %d\n", ses); + // addr = bus_dmamem_mmap(sc->sc_dmatag, sc->sc_dmamap->dm_segs, 1, (off_t)(4096*unit), prot, BUS_DMA_NOWAIT); + if (pmap_extract(pmap_kernel(), ((vaddr_t)sc->sc_dma_kva) + (unit * 4096), &addr)) { + + device_printf(sc->sc_dev, "mapped page %d to 0x%08lx [0x%08lx], kernel is %p\n", unit, addr, atop(addr), (void*)(((vaddr_t)sc->sc_dma_kva) + (unit * 4096))); - if (addr != -1) - sc->mapped_sessions |= (1 << ses); + ((uint32_t*)(((vaddr_t)sc->sc_dma_kva) + (unit * 4096)))[0] = 0xDEADBEEF; + sc->mapped_sessions |= (1 << unit); + + return addr; + } - return addr; + return -1; } diff --git a/sbus-to-ztex-gateware-migen/engine.py b/sbus-to-ztex-gateware-migen/engine.py index 891fee6..9e69d18 100644 --- a/sbus-to-ztex-gateware-migen/engine.py +++ b/sbus-to-ztex-gateware-migen/engine.py @@ -30,8 +30,10 @@ opcodes = { # mnemonic : [bit coding, docstring] "GCM_SHRMI": [15, "Shift A right by imm, insert B LSB as dest MSB; reg-reg or reg-imm; per 128-bits block"], # "GCM_CMPD": [16, "Compute D:X0 from X1:X0; reg ; per 128-bits block"], # specific "GCM_SWAP64": [17, "Swap doubleword (64 bits) ; reg-reg or imm-reg or reg-imm; per 128-bits block ; imm != 0 -> BYTEREV*"], # - "AESESMI" : [18, "AES ; reg-reg ; per 128-bits block; imm[0:2] indicates sub-round (as 
in rv32's aes32esmi) ; imm[2] is 1 for aesesi (shared opcode)" ], - "MAX" : [19, "Maximum opcode number (for bounds checking)"], + "AESESMI" : [18, "AES ; reg-reg ; per 128-bits block; imm[0:2] indicates sub-round (as in rv32's aes32esmi) ; imm[2] is 1 for aesesi (shared opcode)" ], + "MEM" : [19, "MEM ; imm[0] == 0 for LOAD, imm[0] == 1 for STORE (beware, store copy the address in the output reg)" ], + "AND" : [20, "Wd $\gets$ Ra & Rb // bitwise AND"], + "MAX" : [21, "Maximum opcode number (for bounds checking)"], } num_registers = 32 @@ -236,6 +238,8 @@ class Curve25519Const(Module, AutoDoc): 9: [100, "one hundred", "The number 100 (for pow22501)"], 10: [254, "two hundred fifty four", "The number 254 (iteration count)"], 11: [0x00000001_00000000_00000000_00000000_00000001_00000000_00000000_00000000, "increment for GCM counter (LE)", "increment for GCM counter (LE)"], + 12: [0x00000000_00000000_00000000_00000010_00000000_00000000_00000000_00000010, "sixteen (twice)", "The number 16 (for block-size address increment)"], + 13: [0x00000000_00000000_00000000_00000001_00000000_00000000_00000000_00000001, "decrement for GCM dual-loops (LE)", "decrement for GCM dual-loops"] } self.adr = Signal(5) self.const = Signal(256) @@ -316,7 +320,7 @@ Here is an example of how to swap the contents of `ra` and `rb` based on the val class ExecLogic(ExecUnit): def __init__(self, width=256): - ExecUnit.__init__(self, width, ["XOR", "NOT", "PSA", "PSB", "XBT", "SHL"]) + ExecUnit.__init__(self, width, ["XOR", "NOT", "PSA", "PSB", "XBT", "SHL", "AND"]) self.intro = ModuleDoc(title="Logic ExecUnit Subclass", body=f""" This execution unit implements bit-wise logic operations: XOR, NOT, and passthrough. @@ -327,6 +331,7 @@ passthrough. * PSB returns the value of B * SHL returns A << 1 * XBT returns the 255th bit of A, reported in the 0th bit of the result +* AND returns the result of A&B """) @@ -348,6 +353,8 @@ passthrough. 
self.q.eq(Cat(self.a[254], zeros)) ).Elif(self.instruction.opcode == opcodes["SHL"][0], self.q.eq(Cat(0, self.a[:255])), + ).Elif(self.instruction.opcode == opcodes["AND"][0], + self.q.eq(self.a & self.b), ), ] @@ -1442,7 +1449,7 @@ class ExecClmul(ExecUnit, AutoDoc): clmul64_out = Signal(64) clmul64h_out = Signal(64) nlane = width // 128 - clmul_buf = Signal((nlane-1) * 128) ## width must be a multiple of 128... + clmul_buf = Signal(nlane * 128) ## width must be a multiple of 128... lanec = Signal(log2_int(nlane, False)) assert(nlane == 2) ## fixme @@ -1460,30 +1467,37 @@ class ExecClmul(ExecUnit, AutoDoc): self.submodules.seq = seq = ClockDomainsRenamer("eng_clk")(FSM(reset_state="IDLE")) seq.act("IDLE", If(self.start, + NextValue(lanec, 0), Case(self.instruction.immediate[0:2], { - 0x0: [ clmul64x_in1.eq(self.a[ 0: 64]), clmul64x_in2.eq(self.b[ 0: 64]) ], - 0x1: [ clmul64x_in1.eq(self.a[ 0: 64]), clmul64x_in2.eq(self.b[ 64:128]) ], - 0x2: [ clmul64x_in1.eq(self.a[ 64:128]), clmul64x_in2.eq(self.b[ 0: 64]) ], - 0x3: [ clmul64x_in1.eq(self.a[ 64:128]), clmul64x_in2.eq(self.b[ 64:128]) ], + 0x0: [ NextValue(clmul64x_in1, self.a[ 0: 64]), NextValue(clmul64x_in2, self.b[ 0: 64]) ], + 0x1: [ NextValue(clmul64x_in1, self.a[ 0: 64]), NextValue(clmul64x_in2, self.b[ 64:128]) ], + 0x2: [ NextValue(clmul64x_in1, self.a[ 64:128]), NextValue(clmul64x_in2, self.b[ 0: 64]) ], + 0x3: [ NextValue(clmul64x_in1, self.a[ 64:128]), NextValue(clmul64x_in2, self.b[ 64:128]) ], }), NextState("NEXT"))) seq.act("NEXT", + Case(self.instruction.immediate[0:2], { + 0x0: [ NextValue(clmul64x_in1, self.a[128:192]), NextValue(clmul64x_in2, self.b[128:192]) ], + 0x1: [ NextValue(clmul64x_in1, self.a[128:192]), NextValue(clmul64x_in2, self.b[192:256]) ], + 0x2: [ NextValue(clmul64x_in1, self.a[192:256]), NextValue(clmul64x_in2, self.b[128:192]) ], + 0x3: [ NextValue(clmul64x_in1, self.a[192:256]), NextValue(clmul64x_in2, self.b[192:256]) ], + }), + NextState("WRITE")) + seq.act("WRITE", 
Case(lanec, { 0: [ NextValue(clmul_buf[0:128], Cat(clmul64_out, clmul64h_out)), - Case(self.instruction.immediate[0:2], { - 0x0: [ clmul64x_in1.eq(self.a[128:192]), clmul64x_in2.eq(self.b[128:192]) ], - 0x1: [ clmul64x_in1.eq(self.a[128:192]), clmul64x_in2.eq(self.b[192:256]) ], - 0x2: [ clmul64x_in1.eq(self.a[192:256]), clmul64x_in2.eq(self.b[128:192]) ], - 0x3: [ clmul64x_in1.eq(self.a[192:256]), clmul64x_in2.eq(self.b[192:256]) ], - }), NextValue(lanec, 1), ], - 1: [ self.q_valid.eq(1), - self.q.eq(Cat(clmul_buf, clmul64_out, clmul64h_out)), - NextValue(lanec, 0), - NextState("IDLE") + 1: [ NextValue(clmul_buf[128:256], Cat(clmul64_out, clmul64h_out)), + NextState("OUT"), ], })) + seq.act("OUT", + self.q_valid.eq(1), + self.q.eq(clmul_buf), + NextState("IDLE"), + ); + class ExecGCMShifts(ExecUnit, AutoDoc): def __init__(self, width=256): @@ -1505,13 +1519,13 @@ class ExecGCMShifts(ExecUnit, AutoDoc): ).Elif(self.instruction.opcode == opcodes["GCM_SHRMI"][0], Case(self.instruction.immediate[0:3], { 0x0: self.q.eq(self.a), - 0x1: self.q.eq(Cat(self.a[1:128], self.b[0:1], self.a[129:256], self.b[0:1])), - 0x2: self.q.eq(Cat(self.a[2:128], self.b[0:2], self.a[130:256], self.b[0:2])), - 0x3: self.q.eq(Cat(self.a[3:128], self.b[0:3], self.a[131:256], self.b[0:3])), - 0x4: self.q.eq(Cat(self.a[4:128], self.b[0:4], self.a[132:256], self.b[0:4])), - 0x5: self.q.eq(Cat(self.a[5:128], self.b[0:5], self.a[133:256], self.b[0:5])), - 0x6: self.q.eq(Cat(self.a[6:128], self.b[0:6], self.a[134:256], self.b[0:6])), - 0x7: self.q.eq(Cat(self.a[7:128], self.b[0:7], self.a[135:256], self.b[0:7])), + 0x1: self.q.eq(Cat(self.a[1:128], self.b[0:1], self.a[129:256], self.b[128:129])), + 0x2: self.q.eq(Cat(self.a[2:128], self.b[0:2], self.a[130:256], self.b[128:130])), + 0x3: self.q.eq(Cat(self.a[3:128], self.b[0:3], self.a[131:256], self.b[128:131])), + 0x4: self.q.eq(Cat(self.a[4:128], self.b[0:4], self.a[132:256], self.b[128:132])), + 0x5: self.q.eq(Cat(self.a[5:128], self.b[0:5], 
self.a[133:256], self.b[128:133])), + 0x6: self.q.eq(Cat(self.a[6:128], self.b[0:6], self.a[134:256], self.b[128:134])), + 0x7: self.q.eq(Cat(self.a[7:128], self.b[0:7], self.a[135:256], self.b[128:135])), }) ).Elif(self.instruction.opcode == opcodes["GCM_SHLMI"][0], Case(self.instruction.immediate[0:3], { @@ -1525,12 +1539,33 @@ class ExecGCMShifts(ExecUnit, AutoDoc): 0x7: self.q.eq(Cat(self.b[121:128], self.a[0:121], self.b[249:256], self.a[128:249])), }) ).Elif(self.instruction.opcode == opcodes["GCM_SWAP64"][0], - # also gcm_brev* + # also gcm_brev*, gcm_swap32 Case(self.instruction.immediate[0:2], { - 0: self.q.eq(Cat(self.b[64:128], self.a[0:64], self.b[192:256], self.a[128:192])), - 1: self.q.eq(Cat(self.a[8:16], self.a[0:8], self.a[24:32], self.a[16:24], self.a[40:48], self.a[32:40], self.a[56:64], self.a[48:56], self.a[72:80], self.a[64:72], self.a[88:96], self.a[80:88], self.a[104:112], self.a[96:104], self.a[120:128], self.a[112:120], self.a[136:144], self.a[128:136], self.a[152:160], self.a[144:152], self.a[168:176], self.a[160:168], self.a[184:192], self.a[176:184], self.a[200:208], self.a[192:200], self.a[216:224], self.a[208:216], self.a[232:240], self.a[224:232], self.a[248:256], self.a[240:248])), - 2: self.q.eq(Cat(self.a[24:32], self.a[16:24], self.a[8:16], self.a[0:8], self.a[56:64], self.a[48:56], self.a[40:48], self.a[32:40], self.a[88:96], self.a[80:88], self.a[72:80], self.a[64:72], self.a[120:128], self.a[112:120], self.a[104:112], self.a[96:104], self.a[152:160], self.a[144:152], self.a[136:144], self.a[128:136], self.a[184:192], self.a[176:184], self.a[168:176], self.a[160:168], self.a[216:224], self.a[208:216], self.a[200:208], self.a[192:200], self.a[248:256], self.a[240:248], self.a[232:240], self.a[224:232])), - 3: self.q.eq(Cat(self.a[56:64], self.a[48:56], self.a[40:48], self.a[32:40], self.a[24:32], self.a[16:24], self.a[8:16], self.a[0:8], self.a[120:128], self.a[112:120], self.a[104:112], self.a[96:104], self.a[88:96], 
self.a[80:88], self.a[72:80], self.a[64:72], self.a[184:192], self.a[176:184], self.a[168:176], self.a[160:168], self.a[152:160], self.a[144:152], self.a[136:144], self.a[128:136], self.a[248:256], self.a[240:248], self.a[232:240], self.a[224:232], self.a[216:224], self.a[208:216], self.a[200:208], self.a[192:200])), + # SWAP64 + 0: self.q.eq(Cat(self.b[ 64:128], self.a[ 0: 64], + self.b[192:256], self.a[128:192])), + # SWAP32 + 4: self.q.eq(Cat(self.b[ 32: 64], self.a[ 0: 32], self.b[ 96:128], self.a[ 64: 96], + self.b[160:192], self.a[128:160], self.b[224:256], self.a[192:224])), + # BREV16 + 1: self.q.eq(Cat(self.a[ 8: 16], self.a[ 0: 8], self.a[ 24: 32], self.a[ 16: 24], self.a[ 40: 48], self.a[ 32: 40], self.a[ 56: 64], self.a[ 48: 56], + self.a[ 72: 80], self.a[ 64: 72], self.a[ 88: 96], self.a[ 80: 88], self.a[104:112], self.a[ 96:104], self.a[120:128], self.a[112:120], + self.a[136:144], self.a[128:136], self.a[152:160], self.a[144:152], self.a[168:176], self.a[160:168], self.a[184:192], self.a[176:184], + self.a[200:208], self.a[192:200], self.a[216:224], self.a[208:216], self.a[232:240], self.a[224:232], self.a[248:256], self.a[240:248])), + # BREV32 + 2: self.q.eq(Cat(self.a[ 24: 32], self.a[ 16: 24], self.a[ 8: 16], self.a[ 0: 8], + self.a[ 56: 64], self.a[ 48: 56], self.a[ 40: 48], self.a[ 32: 40], + self.a[ 88: 96], self.a[ 80: 88], self.a[ 72: 80], self.a[ 64: 72], + self.a[120:128], self.a[112:120], self.a[104:112], self.a[ 96:104], + self.a[152:160], self.a[144:152], self.a[136:144], self.a[128:136], + self.a[184:192], self.a[176:184], self.a[168:176], self.a[160:168], + self.a[216:224], self.a[208:216], self.a[200:208], self.a[192:200], + self.a[248:256], self.a[240:248], self.a[232:240], self.a[224:232])), + # BREV64 + 3: self.q.eq(Cat(self.a[ 56: 64], self.a[ 48: 56], self.a[ 40: 48], self.a[ 32: 40], self.a[ 24: 32], self.a[ 16: 24], self.a[ 8: 16], self.a[ 0: 8], + self.a[120:128], self.a[112:120], self.a[104:112], self.a[ 96:104], self.a[ 88: 
96], self.a[ 80: 88], self.a[ 72: 80], self.a[ 64: 72], + self.a[184:192], self.a[176:184], self.a[168:176], self.a[160:168], self.a[152:160], self.a[144:152], self.a[136:144], self.a[128:136], + self.a[248:256], self.a[240:248], self.a[232:240], self.a[224:232], self.a[216:224], self.a[208:216], self.a[200:208], self.a[192:200])), }) ) ] @@ -1543,7 +1578,7 @@ class ExecAES(ExecUnit, AutoDoc): assert(width == 256) # fixme nlane = width // 128 - aes_buf = Signal((nlane-1) * 128) ## width must be a multiple of 128... + aes_buf = Signal(nlane * 128) ## width must be a multiple of 128... lanec = Signal(log2_int(nlane, False)) assert(nlane == 2) ## fixme @@ -1560,120 +1595,278 @@ class ExecAES(ExecUnit, AutoDoc): self.submodules.seq = seq = ClockDomainsRenamer("eng_clk")(FSM(reset_state="IDLE")) seq.act("IDLE", If(self.start, + NextValue(lanec, 0), Case(self.instruction.immediate[0:2], { - 0x0: [ aes_in[0].eq(self.a[ 0: 8]), aes_in[1].eq(self.a[ 32: 40]), aes_in[2].eq(self.a[ 64: 72]), aes_in[3].eq(self.a[ 96:104]) ], - 0x1: [ aes_in[3].eq(self.a[ 8: 16]), aes_in[0].eq(self.a[ 40: 48]), aes_in[1].eq(self.a[ 72: 80]), aes_in[2].eq(self.a[104:112]) ], - 0x2: [ aes_in[2].eq(self.a[ 16: 24]), aes_in[3].eq(self.a[ 48: 56]), aes_in[0].eq(self.a[ 80: 88]), aes_in[1].eq(self.a[112:120]) ], - 0x3: [ aes_in[1].eq(self.a[ 24: 32]), aes_in[2].eq(self.a[ 56: 64]), aes_in[3].eq(self.a[ 88: 96]), aes_in[0].eq(self.a[120:128]) ], + 0x0: [ NextValue(aes_in[0], self.a[ 0: 8]), NextValue(aes_in[1], self.a[ 32: 40]), NextValue(aes_in[2], self.a[ 64: 72]), NextValue(aes_in[3], self.a[ 96:104]) ], + 0x1: [ NextValue(aes_in[3], self.a[ 8: 16]), NextValue(aes_in[0], self.a[ 40: 48]), NextValue(aes_in[1], self.a[ 72: 80]), NextValue(aes_in[2], self.a[104:112]) ], + 0x2: [ NextValue(aes_in[2], self.a[ 16: 24]), NextValue(aes_in[3], self.a[ 48: 56]), NextValue(aes_in[0], self.a[ 80: 88]), NextValue(aes_in[1], self.a[112:120]) ], + 0x3: [ NextValue(aes_in[1], self.a[ 24: 32]), NextValue(aes_in[2], 
self.a[ 56: 64]), NextValue(aes_in[3], self.a[ 88: 96]), NextValue(aes_in[0], self.a[120:128]) ], }), NextState("NEXT"))) seq.act("NEXT", + Case(self.instruction.immediate[0:2], { + 0x0: [ NextValue(aes_in[0], self.a[128:136]), NextValue(aes_in[1], self.a[160:168]), NextValue(aes_in[2], self.a[192:200]), NextValue(aes_in[3], self.a[224:232]) ], + 0x1: [ NextValue(aes_in[3], self.a[136:144]), NextValue(aes_in[0], self.a[168:176]), NextValue(aes_in[1], self.a[200:208]), NextValue(aes_in[2], self.a[232:240]) ], + 0x2: [ NextValue(aes_in[2], self.a[144:152]), NextValue(aes_in[3], self.a[176:184]), NextValue(aes_in[0], self.a[208:216]), NextValue(aes_in[1], self.a[240:248]) ], + 0x3: [ NextValue(aes_in[1], self.a[152:160]), NextValue(aes_in[2], self.a[184:192]), NextValue(aes_in[3], self.a[216:224]), NextValue(aes_in[0], self.a[248:256]) ], + }), + NextState("WRITE")) + seq.act("WRITE", Case(lanec, { - 0: [ Case(self.instruction.immediate[0:2], { - 0x0: [ aes_in[0].eq(self.a[128:136]), aes_in[1].eq(self.a[160:168]), aes_in[2].eq(self.a[192:200]), aes_in[3].eq(self.a[224:232]) ], - 0x1: [ aes_in[3].eq(self.a[136:144]), aes_in[0].eq(self.a[168:176]), aes_in[1].eq(self.a[200:208]), aes_in[2].eq(self.a[232:240]) ], - 0x2: [ aes_in[2].eq(self.a[144:152]), aes_in[3].eq(self.a[176:184]), aes_in[0].eq(self.a[208:216]), aes_in[1].eq(self.a[240:248]) ], - 0x3: [ aes_in[1].eq(self.a[152:160]), aes_in[2].eq(self.a[184:192]), aes_in[3].eq(self.a[216:224]), aes_in[0].eq(self.a[248:256]) ], - }), - Case(self.instruction.immediate[2:3], { - 0: Case(self.instruction.immediate[0:2], { - 0x0: [ NextValue(aes_buf[0:128], Cat(aes_out[0][ 0:16], aes_out[0][ 8:24], - aes_out[1][ 0:16], aes_out[1][ 8:24], - aes_out[2][ 0:16], aes_out[2][ 8:24], - aes_out[3][ 0:16], aes_out[3][ 8:24])), - ], - 0x1: [ NextValue(aes_buf[0:128], Cat(aes_out[0][16:24], aes_out[0][ 0:16], aes_out[0][ 8:16], - aes_out[1][16:24], aes_out[1][ 0:16], aes_out[1][ 8:16], - aes_out[2][16:24], aes_out[2][ 0:16], aes_out[2][ 
8:16], - aes_out[3][16:24], aes_out[3][ 0:16], aes_out[3][ 8:16])), - ], - 0x2: [ NextValue(aes_buf[0:128], Cat(aes_out[0][ 8:24], aes_out[0][ 0:16], - aes_out[1][ 8:24], aes_out[1][ 0:16], - aes_out[2][ 8:24], aes_out[2][ 0:16], - aes_out[3][ 8:24], aes_out[3][ 0:16])), - ], - 0x3: [ NextValue(aes_buf[0:128], Cat(aes_out[0][ 8:16], aes_out[0][ 8:24], aes_out[0][ 0: 8], - aes_out[1][ 8:16], aes_out[1][ 8:24], aes_out[1][ 0: 8], - aes_out[2][ 8:16], aes_out[2][ 8:24], aes_out[2][ 0: 8], - aes_out[3][ 8:16], aes_out[3][ 8:24], aes_out[3][ 0: 8])), - ], - }), - 1: Case(self.instruction.immediate[0:2], { - 0x0: [ NextValue(aes_buf[0:128], Cat(aes_out[0][ 8:16], Signal(24, reset = 0), - aes_out[1][ 8:16], Signal(24, reset = 0), - aes_out[2][ 8:16], Signal(24, reset = 0), - aes_out[3][ 8:16], Signal(24, reset = 0))), - ], - 0x1: [ NextValue(aes_buf[0:128], Cat(Signal(8, reset = 0), aes_out[0][ 8:16], Signal(16, reset = 0), - Signal(8, reset = 0), aes_out[1][ 8:16], Signal(16, reset = 0), - Signal(8, reset = 0), aes_out[2][ 8:16], Signal(16, reset = 0), - Signal(8, reset = 0), aes_out[3][ 8:16], Signal(16, reset = 0))), - ], - 0x2: [ NextValue(aes_buf[0:128], Cat(Signal(16, reset = 0), aes_out[0][ 8:16], Signal(8, reset = 0), - Signal(16, reset = 0), aes_out[1][ 8:16], Signal(8, reset = 0), - Signal(16, reset = 0), aes_out[2][ 8:16], Signal(8, reset = 0), - Signal(16, reset = 0), aes_out[3][ 8:16], Signal(8, reset = 0))), - ], - 0x3: [ NextValue(aes_buf[0:128], Cat(Signal(24, reset = 0), aes_out[0][ 8:16], - Signal(24, reset = 0), aes_out[1][ 8:16], - Signal(24, reset = 0), aes_out[2][ 8:16], - Signal(24, reset = 0), aes_out[3][ 8:16])), - ], - }), - }), - NextValue(lanec, 1), - ], - 1: [ self.q_valid.eq(1), - Case(self.instruction.immediate[2:3], { - 0: Case(self.instruction.immediate[0:2], { - 0x0: [ self.q.eq(self.b ^ Cat(aes_buf, aes_out[0][ 0:16], aes_out[0][ 8:24], - aes_out[1][ 0:16], aes_out[1][ 8:24], - aes_out[2][ 0:16], aes_out[2][ 8:24], - aes_out[3][ 0:16], 
aes_out[3][ 8:24])), - ], - 0x1: [ self.q.eq(self.b ^ Cat(aes_buf, aes_out[0][16:24], aes_out[0][ 0:16], aes_out[0][ 8:16], - aes_out[1][16:24], aes_out[1][ 0:16], aes_out[1][ 8:16], - aes_out[2][16:24], aes_out[2][ 0:16], aes_out[2][ 8:16], - aes_out[3][16:24], aes_out[3][ 0:16], aes_out[3][ 8:16])), - ], - 0x2: [ self.q.eq(self.b ^ Cat(aes_buf, aes_out[0][ 8:24], aes_out[0][ 0:16], - aes_out[1][ 8:24], aes_out[1][ 0:16], - aes_out[2][ 8:24], aes_out[2][ 0:16], - aes_out[3][ 8:24], aes_out[3][ 0:16])), - ], - 0x3: [ self.q.eq(self.b ^ Cat(aes_buf, aes_out[0][ 8:16], aes_out[0][ 8:24], aes_out[0][ 0: 8], - aes_out[1][ 8:16], aes_out[1][ 8:24], aes_out[1][ 0: 8], - aes_out[2][ 8:16], aes_out[2][ 8:24], aes_out[2][ 0: 8], - aes_out[3][ 8:16], aes_out[3][ 8:24], aes_out[3][ 0: 8])), - ], - }), - 1: Case(self.instruction.immediate[0:2], { - 0x0: [ self.q.eq(self.b ^ Cat(aes_buf, aes_out[0][ 8:16], Signal(24, reset = 0), - aes_out[1][ 8:16], Signal(24, reset = 0), - aes_out[2][ 8:16], Signal(24, reset = 0), - aes_out[3][ 8:16], Signal(24, reset = 0))), - ], - 0x1: [ self.q.eq(self.b ^ Cat(aes_buf, Signal(8, reset = 0), aes_out[0][ 8:16], Signal(16, reset = 0), - Signal(8, reset = 0), aes_out[1][ 8:16], Signal(16, reset = 0), - Signal(8, reset = 0), aes_out[2][ 8:16], Signal(16, reset = 0), - Signal(8, reset = 0), aes_out[3][ 8:16], Signal(16, reset = 0))), - ], - 0x2: [ self.q.eq(self.b ^ Cat(aes_buf, Signal(16, reset = 0), aes_out[0][ 8:16], Signal(8, reset = 0), - Signal(16, reset = 0), aes_out[1][ 8:16], Signal(8, reset = 0), - Signal(16, reset = 0), aes_out[2][ 8:16], Signal(8, reset = 0), - Signal(16, reset = 0), aes_out[3][ 8:16], Signal(8, reset = 0))), - ], - 0x3: [ self.q.eq(self.b ^ Cat(aes_buf, Signal(24, reset = 0), aes_out[0][ 8:16], - Signal(24, reset = 0), aes_out[1][ 8:16], - Signal(24, reset = 0), aes_out[2][ 8:16], - Signal(24, reset = 0), aes_out[3][ 8:16])), - ], - }), - }), - NextValue(lanec, 0), - NextState("IDLE") + 0: [ 
Case(self.instruction.immediate[2:3], { + 0: Case(self.instruction.immediate[0:2], { + 0x0: [ NextValue(aes_buf[0:128], Cat(aes_out[0][ 0:16], aes_out[0][ 8:24], + aes_out[1][ 0:16], aes_out[1][ 8:24], + aes_out[2][ 0:16], aes_out[2][ 8:24], + aes_out[3][ 0:16], aes_out[3][ 8:24])), + ], + 0x1: [ NextValue(aes_buf[0:128], Cat(aes_out[0][16:24], aes_out[0][ 0:16], aes_out[0][ 8:16], + aes_out[1][16:24], aes_out[1][ 0:16], aes_out[1][ 8:16], + aes_out[2][16:24], aes_out[2][ 0:16], aes_out[2][ 8:16], + aes_out[3][16:24], aes_out[3][ 0:16], aes_out[3][ 8:16])), + ], + 0x2: [ NextValue(aes_buf[0:128], Cat(aes_out[0][ 8:24], aes_out[0][ 0:16], + aes_out[1][ 8:24], aes_out[1][ 0:16], + aes_out[2][ 8:24], aes_out[2][ 0:16], + aes_out[3][ 8:24], aes_out[3][ 0:16])), + ], + 0x3: [ NextValue(aes_buf[0:128], Cat(aes_out[0][ 8:16], aes_out[0][ 8:24], aes_out[0][ 0: 8], + aes_out[1][ 8:16], aes_out[1][ 8:24], aes_out[1][ 0: 8], + aes_out[2][ 8:16], aes_out[2][ 8:24], aes_out[2][ 0: 8], + aes_out[3][ 8:16], aes_out[3][ 8:24], aes_out[3][ 0: 8])), + ], + }), + 1: Case(self.instruction.immediate[0:2], { + 0x0: [ NextValue(aes_buf[0:128], Cat(aes_out[0][ 8:16], Signal(24, reset = 0), + aes_out[1][ 8:16], Signal(24, reset = 0), + aes_out[2][ 8:16], Signal(24, reset = 0), + aes_out[3][ 8:16], Signal(24, reset = 0))), + ], + 0x1: [ NextValue(aes_buf[0:128], Cat(Signal(8, reset = 0), aes_out[0][ 8:16], Signal(16, reset = 0), + Signal(8, reset = 0), aes_out[1][ 8:16], Signal(16, reset = 0), + Signal(8, reset = 0), aes_out[2][ 8:16], Signal(16, reset = 0), + Signal(8, reset = 0), aes_out[3][ 8:16], Signal(16, reset = 0))), + ], + 0x2: [ NextValue(aes_buf[0:128], Cat(Signal(16, reset = 0), aes_out[0][ 8:16], Signal(8, reset = 0), + Signal(16, reset = 0), aes_out[1][ 8:16], Signal(8, reset = 0), + Signal(16, reset = 0), aes_out[2][ 8:16], Signal(8, reset = 0), + Signal(16, reset = 0), aes_out[3][ 8:16], Signal(8, reset = 0))), + ], + 0x3: [ NextValue(aes_buf[0:128], Cat(Signal(24, reset = 
0), aes_out[0][ 8:16], + Signal(24, reset = 0), aes_out[1][ 8:16], + Signal(24, reset = 0), aes_out[2][ 8:16], + Signal(24, reset = 0), aes_out[3][ 8:16])), + ], + }), + }), + NextValue(lanec, 1)], + 1: [ Case(self.instruction.immediate[2:3], { + 0: Case(self.instruction.immediate[0:2], { + 0x0: [ NextValue(aes_buf[128:256], Cat(aes_out[0][ 0:16], aes_out[0][ 8:24], + aes_out[1][ 0:16], aes_out[1][ 8:24], + aes_out[2][ 0:16], aes_out[2][ 8:24], + aes_out[3][ 0:16], aes_out[3][ 8:24])), + ], + 0x1: [ NextValue(aes_buf[128:256], Cat(aes_out[0][16:24], aes_out[0][ 0:16], aes_out[0][ 8:16], + aes_out[1][16:24], aes_out[1][ 0:16], aes_out[1][ 8:16], + aes_out[2][16:24], aes_out[2][ 0:16], aes_out[2][ 8:16], + aes_out[3][16:24], aes_out[3][ 0:16], aes_out[3][ 8:16])), + ], + 0x2: [ NextValue(aes_buf[128:256], Cat(aes_out[0][ 8:24], aes_out[0][ 0:16], + aes_out[1][ 8:24], aes_out[1][ 0:16], + aes_out[2][ 8:24], aes_out[2][ 0:16], + aes_out[3][ 8:24], aes_out[3][ 0:16])), + ], + 0x3: [ NextValue(aes_buf[128:256], Cat(aes_out[0][ 8:16], aes_out[0][ 8:24], aes_out[0][ 0: 8], + aes_out[1][ 8:16], aes_out[1][ 8:24], aes_out[1][ 0: 8], + aes_out[2][ 8:16], aes_out[2][ 8:24], aes_out[2][ 0: 8], + aes_out[3][ 8:16], aes_out[3][ 8:24], aes_out[3][ 0: 8])), + ], + }), + 1: Case(self.instruction.immediate[0:2], { + 0x0: [ NextValue(aes_buf[128:256], Cat(aes_out[0][ 8:16], Signal(24, reset = 0), + aes_out[1][ 8:16], Signal(24, reset = 0), + aes_out[2][ 8:16], Signal(24, reset = 0), + aes_out[3][ 8:16], Signal(24, reset = 0))), + ], + 0x1: [ NextValue(aes_buf[128:256], Cat(Signal(8, reset = 0), aes_out[0][ 8:16], Signal(16, reset = 0), + Signal(8, reset = 0), aes_out[1][ 8:16], Signal(16, reset = 0), + Signal(8, reset = 0), aes_out[2][ 8:16], Signal(16, reset = 0), + Signal(8, reset = 0), aes_out[3][ 8:16], Signal(16, reset = 0))), + ], + 0x2: [ NextValue(aes_buf[128:256], Cat(Signal(16, reset = 0), aes_out[0][ 8:16], Signal(8, reset = 0), + Signal(16, reset = 0), aes_out[1][ 8:16], 
Signal(8, reset = 0), + Signal(16, reset = 0), aes_out[2][ 8:16], Signal(8, reset = 0), + Signal(16, reset = 0), aes_out[3][ 8:16], Signal(8, reset = 0))), + ], + 0x3: [ NextValue(aes_buf[128:256], Cat(Signal(24, reset = 0), aes_out[0][ 8:16], + Signal(24, reset = 0), aes_out[1][ 8:16], + Signal(24, reset = 0), aes_out[2][ 8:16], + Signal(24, reset = 0), aes_out[3][ 8:16])), + ], + }), + }), + NextState("OUT") ], })) + seq.act("OUT", + self.q_valid.eq(1), + self.q.eq(self.b ^ aes_buf), + NextState("IDLE")) + +class ExecLS(ExecUnit, AutoDoc): + def __init__(self, width=256, interface=None): + ExecUnit.__init__(self, width, ["MEM"]) + + self.notes = ModuleDoc(title=f"Load/Store ExecUnit Subclass", body=f""" + """) + + self.sync.eng_clk += [ # pipeline the instruction + self.instruction_out.eq(self.instruction_in), + ] + + assert(width == 256) # fixme + assert(len(interface.sel) == 16) # 128 bits Wishbone + + start_pipe = Signal() + self.sync.mul_clk += start_pipe.eq(self.start) # break critical path of instruction decode -> SETUP_A state muxes + self.submodules.lsseq = lsseq = ClockDomainsRenamer("mul_clk")(FSM(reset_state="IDLE")) + cpar = Signal() # to keep track of the odd-ness of our cycle, so we can align 2 mul_clk cycles of output on 1 eng_clk cycle + lbuf = Signal(width) + timeout = Signal(11) + #tries = Signal() + self.has_failure = Signal(2) + self.has_timeout = Signal(2) + + self.sync.mul_clk += If(timeout > 0, timeout.eq(timeout - 1)) + + lsseq.act("IDLE", + If(start_pipe, + #NextValue(lbuf, 0xF00FF00F_0FF00FF0_F00FF00F_0FF00FF0_F00FF00F_0FF00FF0_F00FF00F_0FF00FF0), + NextValue(cpar, 0), + NextValue(self.has_timeout, 0), + NextValue(self.has_failure, 0), + NextValue(interface.cyc, 1), + NextValue(interface.stb, 1), + NextValue(interface.sel, 2**len(interface.sel)-1), + NextValue(interface.adr, self.a[4:32]), + NextValue(interface.we, self.instruction.immediate[0]), + NextValue(timeout, 2047), + If(self.instruction.immediate[0], # do we need those tests or 
could we always update dat_w/dat_r ? + NextValue(interface.dat_w, self.b[0:128])), + NextState("MEMl") # MEMl + ) + ) + lsseq.act("MEMl", + NextValue(cpar, cpar ^ 1), + If(interface.ack, + If(~self.instruction.immediate[0], + NextValue(lbuf[0:128], interface.dat_r)), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("MEMl2") + ).Elif(interface.err, + NextValue(self.has_failure[0], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + ).Elif(timeout == 0, + NextValue(self.has_timeout[0], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + )) + lsseq.act("MEMl2", + NextValue(cpar, cpar ^ 1), + If(~interface.ack, + NextValue(interface.cyc, 1), + NextValue(interface.stb, 1), + NextValue(interface.sel, 2**len(interface.sel)-1), + NextValue(interface.adr, self.a[132:160]), + NextValue(interface.we, self.instruction.immediate[0]), + NextValue(timeout, 2047), + If(self.instruction.immediate[0], + NextValue(interface.dat_w, self.b[128:256])), + NextState("MEMh") + )) + lsseq.act("MEMh", + NextValue(cpar, cpar ^ 1), + If(interface.ack, + If(~self.instruction.immediate[0], + NextValue(lbuf[128:256], interface.dat_r)), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("MEMh2") + ).Elif(interface.err, + NextValue(self.has_failure[1], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + ).Elif(timeout == 0, + NextValue(self.has_timeout[1], 1), + NextValue(interface.cyc, 0), + NextValue(interface.stb, 0), + NextState("ERR"), + )) + lsseq.act("MEMh2", + NextValue(cpar, cpar ^ 1), + If(~interface.ack, + #NextValue(tries, 0), + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + )) + lsseq.act("MEM_ODD", # clock alignement cycle + NextState("MEM_EVEN1")) + lsseq.act("MEM_EVEN1", + NextState("MEM_EVEN2")) + lsseq.act("MEM_EVEN2", + NextValue(cpar, 0), + NextValue(self.has_failure, 0), + 
NextValue(self.has_timeout, 0), + NextState("IDLE")) + lsseq.act("ERR", + #If(~tries, # second attempt + # NextValue(cpar, 0), + # NextValue(tries, 1), + # NextState("IDLE") + #).Else(NextValue(tries, 0), # no third attempt, give up + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + #) + ) + self.sync.mul_clk += [ + If(lsseq.ongoing("MEM_EVEN1") | lsseq.ongoing("MEM_EVEN2"), + self.q_valid.eq(1), + If(~self.instruction.immediate[0], + self.q.eq(lbuf), + ).Else( + # self.q.eq(Cat((self.a[0:32] + 16)[0:32], self.a[32:128], + # (self.a[128:160] + 16)[0:32], self.a[160:256])), + self.q.eq(self.a), + ), + ).Else( + self.q_valid.eq(0), + ) + ] + + self.state = Signal(32) + self.sync.mul_clk += self.state[0].eq(lsseq.ongoing("IDLE")) + self.sync.mul_clk += self.state[1].eq(lsseq.ongoing("MEMl")) + self.sync.mul_clk += self.state[2].eq(lsseq.ongoing("MEMl2")) + self.sync.mul_clk += self.state[3].eq(lsseq.ongoing("MEMh")) + self.sync.mul_clk += self.state[4].eq(lsseq.ongoing("MEMh2")) + self.sync.mul_clk += self.state[5].eq(lsseq.ongoing("MEM_ODD")) + self.sync.mul_clk += self.state[6].eq(lsseq.ongoing("MEM_EVEN1")) + self.sync.mul_clk += self.state[7].eq(lsseq.ongoing("MEM_EVEN2")) + self.sync.mul_clk += self.state[8].eq(lsseq.ongoing("ERR")) + self.sync.mul_clk += self.state[28:30].eq((self.state[28:30] & Replicate(~start_pipe, 2)) | self.has_timeout) + self.sync.mul_clk += self.state[30:32].eq((self.state[30:32] & Replicate(~start_pipe, 2)) | self.has_failure) class Engine(Module, AutoCSR, AutoDoc): @@ -1764,6 +1957,7 @@ Here are the currently implemented opcodes for The Engine: instruction = Record(instruction_layout) # current instruction to execute illegal_opcode = Signal() + abort = Signal(); ### register file rf_depth_raw = 512 @@ -1824,6 +2018,7 @@ Here are the currently implemented opcodes for The Engine: CSRField("mpc", size=log2_int(microcode_depth), description="Current location of the microcode program counter.
Mostly for debug."), CSRField("pause_gnt", size=1, description="When set, the engine execution has been paused, and the RF & microcode ROM can be read out for suspend/resume"), CSRField("sigill", size=1, description="Illegal Instruction"), + CSRField("abort", size=1, description="Abort from failure"), CSRField("finished", size=1, description="Finished"), ]) pause_gnt = Signal() @@ -1834,6 +2029,7 @@ Here are the currently implemented opcodes for The Engine: self.status.fields.pause_gnt.eq(pause_gnt), self.status.fields.mpc.eq(mpc), self.status.fields.sigill.eq(illegal_opcode), + self.status.fields.abort.eq(abort), self.status.fields.finished.eq(((~running & running_r) | self.status.fields.finished) & (~(running & ~running_r))), ] @@ -1874,6 +2070,8 @@ Here are the currently implemented opcodes for The Engine: self.instruction.status.eq(micro_runport.dat_r) ] + self.ls_status = CSRStatus(32, description="Status of the L/S unit") + ### wishbone bus interface: decode the two address spaces and dispatch accordingly self.bus = bus = wishbone.Interface() wdata = Signal(32) @@ -2120,7 +2318,10 @@ Here are the currently implemented opcodes for The Engine: NextValue(running, 0), ) ).Else( - If(mpc < mpc_stop, + If(abort, + NextState("IDLE"), + NextValue(running, 0), + ).Elif(mpc < mpc_stop, NextState("FETCH"), NextValue(mpc, mpc + 1), ).Else( @@ -2136,6 +2337,7 @@ Here are the currently implemented opcodes for The Engine: ) ) + self.busls = wishbone.Interface(data_width = 128, adr_width = 28) exec_units = { "exec_mask" : ExecMask(width=rf_width_raw), "exec_logic" : ExecLogic(width=rf_width_raw), @@ -2145,6 +2347,7 @@ Here are the currently implemented opcodes for The Engine: "exec_clmul" : ExecClmul(width=rf_width_raw), "exec_gcmshifts" : ExecGCMShifts(width=rf_width_raw), "exec_aes" : ExecAES(width=rf_width_raw), + "exec_ls" : ExecLS(width=rf_width_raw,interface=self.busls) } index = 0 for name, unit in exec_units.items(): @@ -2190,6 +2393,9 @@ Here are the currently 
implemented opcodes for The Engine: self.comb += [ rf_write.eq(done), ] + + self.sync += abort.eq((abort & ~engine_go) | (self.exec_ls.has_failure[0] | self.exec_ls.has_failure[1] | self.exec_ls.has_timeout[0] | self.exec_ls.has_timeout[1])) + self.comb += self.ls_status.status.eq(self.exec_ls.state) ##### TIMING CONSTRAINTS -- you want these. Trust me. diff --git a/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs b/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs index b563143..3f6cbd1 100644 --- a/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs +++ b/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs @@ -693,19 +693,25 @@ fn main() -> std::io::Result<()> { fin ); - let gcm_ad_code = assemble_engine25519!( + let gcm_pfx_code = assemble_engine25519!( start: - // Input: rkeys in %31-%17 (backward) + // Input: rkeys in %31-%17 (backward, LE) + // pub in %16 (0-11, 12-15 are ctr so 0, LE) + // RD_PTR in %3 + // ADLEN in %12 (in 16-byte-blocks) // Transient: // %0, %1, %2 are tmp - // init counter in %16 - // H will go in %15 - // T will go in %14 - psa %16, #0 + // Output: + // all inputs preserved + // H will go in %15 (byte-reverted) + // T will go in %14 + // accum (0) will go in %13 + gcm_brev32 %16, %16 // use %2 as a flag psa %2, #1 + psa %1, #0 genht: - xor %0, %16, %31 + xor %0, %1, %31 aesesmi %1, %0, %30, #0 aesesmi %1, %0, %1, #1 @@ -788,23 +794,479 @@ fn main() -> std::io::Result<()> { gcm_brev32 %16, %16 // clear flag & go encrypt t psa %2, #0 + psa %1, %16 brz genht, #0 afterht: // store T in %14 psa %14, %0 - // fully byte-revert H (first byte-in-dword, then dword-in-128bit) + // fully byte-revert H (first byte-in-dword, then dword-in-64bit) gcm_brev64 %15, %15 gcm_swap64 %15, %15, %15 + + psa %13, #0 - fin + // no fin; we fall directly into the AD code + //fin ); + let gcm_ad_code = assemble_engine25519!( + // Input: rkeys in %31-%17 (backward, LE) + // pub in %16 (0-11, 12-15 are ctr so 0, LE) + // RD_PTR in %3 + // ADLEN in %12 
(in 16-byte-blocks) + // H in %15 (byte-reverted) + // T in %14 + // accum in %13 + // Transient: + // %0, %1, %4, %5, %6, %7 are tmp + // Output: + // all inputs preserved except ADLEN (%12) & RD_PTR (%3) + // Updated accum is in %13 + + // if no ad, finish + brz done, %12 + // do one block, repeat + do_ad: load %0, %3 + gcm_brev64 %0, %0 + gcm_swap64 %0, %0, %0 + + xor %0, %0, %13 + add %3, %3, #12 // #12 is 16 in both 128 bits halves + // #13 is 1 in both 128 bits halves + sub %12, %12, #13 + + // // poly mult accum = ((accum^ad) * H) + // C + clmul %4, %0, %15, #0 + // E + clmul %5, %0, %15, #1 + // F + clmul %6, %0, %15, #2 + // D + clmul %7, %0, %15, #3 + // E ^ F + xor %6, %5, %6 + // put low64 of E^F in high64 + gcm_swap64 %5, %6, #0 + // put high64 of E^F in low64 + gcm_swap64 %6, #0, %6 + // D xor low + xor %7, %7, %6 + // C xor high + xor %4, %4, %5 + + // // reduction + // X1:X0 in %4 + // X3:X2 in %7 + // shift everybody by 1 to the left + // high shifting in 1 bit from low + gcm_shlmi %1, %7, %4, #1 + // low + gcm_shlmi %0, %4, #0, #1 + // post-shift + // X1:X0 in %0 + // X3:X2 in %1 + // compute D + gcm_cmpd %2, %0 + // compute E, F, G + gcm_shrmi %6, %2, #0, #1 + gcm_shrmi %4, %2, #0, #2 + gcm_shrmi %5, %2, #0, #7 + // XOR everybody + xor %2, %2, %6 + xor %4, %4, %5 + xor %2, %2, %4 + xor %13, %2, %1 + + brz done, %12 + brz do_ad, #0 + + done: + fin + ); + let gcm_aes_code = assemble_engine25519!( + // pub in %16 (0-11, 12-15 are ctr so 0, LE) + // RD_PTR in %3 + // WR_PTR in %11 + // MLEN in %12 (in *complete* 16-byte-blocks) + // H in %15 (byte-reverted) + // T in %14 + // accum in %13 + // Transient: + // %0, %1, %4, %5, %6, %7 are tmp + // Output: + // all inputs preserved except RD_PTR (%3), WR_PTR (%11), MLEN (%12) + // accum is in %13 + + // if no msg, finish + brz done, %12 + // do one block, repeat + do_msg: + // increment counter + gcm_brev32 %16, %16 + add %16, %16, #11 + gcm_brev32 %16, %16 + + xor %0, %16, %31 + + aesesmi %1, %0, %30, 
#0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %29, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %28, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %27, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %26, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %25, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %24, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %23, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %22, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %21, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %20, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %19, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %18, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesi %0, %1, %17, #0 + aesesi %0, %1, %0, #1 + aesesi %0, %1, %0, #2 + aesesi %1, %1, %0, #3 + + //gcm_brev64 %1, %0 + //gcm_swap64 %1, %1, %1 + + load %0, %3 + xor %0, %0, %1 + store %11, %11, %0 + + gcm_brev64 %0, %0 + gcm_swap64 %0, %0, %0 + + xor %0, %0, %13 + add %3, %3, #12 // #12 is 16 in both 128 bits halves + add %11, %11, #12 // #12 is 16 in both 128 bits halves + // #13 is 1 in both 128 bits halves + sub %12, %12, #13 + + // // poly mult accum = ((accum^ad) * H) + // C + clmul %4, %0, %15, #0 + // E + clmul %5, %0, %15, #1 + // F + clmul %6, %0, %15, #2 + // D + clmul %7, %0, %15, #3 + // E ^ F + xor %6, %5, %6 + // put low64 of E^F in high64 + gcm_swap64 %5, 
%6, #0 + // put high64 of E^F in low64 + gcm_swap64 %6, #0, %6 + // D xor low + xor %7, %7, %6 + // C xor high + xor %4, %4, %5 + + // // reduction + // X1:X0 in %4 + // X3:X2 in %7 + // shift everybody by 1 to the left + // high shifting in 1 bit from low + gcm_shlmi %1, %7, %4, #1 + // low + gcm_shlmi %0, %4, #0, #1 + // post-shift + // X1:X0 in %0 + // X3:X2 in %1 + // compute D + gcm_cmpd %2, %0 + // compute E, F, G + gcm_shrmi %6, %2, #0, #1 + gcm_shrmi %4, %2, #0, #2 + gcm_shrmi %5, %2, #0, #7 + // XOR everybody + xor %2, %2, %6 + xor %4, %4, %5 + xor %2, %2, %4 + xor %13, %2, %1 + + brz done, %12 + brz do_msg, #0 + done: + fin + + ); + let gcm_finish_code = assemble_engine25519!( + // pub in %16 (0-11, 12-15 are ctr so 0, LE) + // RD_PTR in %3 + // WR_PTR in %11 + // MLEN in %12 (do one *partial* 16-byte block, so 0 or non-zero) + // MMASK in %10 (could be computed from MLEN%16 but we don't have an instruction for it yet) + // finalblock in %9 (could be computed but we'd need to know the exact value of adlen) + // H in %15 (byte-reverted) + // T in %14 + // accum in %13 + // Transient: + // %0, %1, %2, %4, %5, %6, %7 are tmp + // Output: + // all inputs preserved except RD_PTR (%3), WR_PTR (%11), MLEN (%12), finalblock (%9) & the counter part of %16 + // accum (byte-reverted) is in %13 + // accum ^ T is in %8 + brz last, %12 + + finish_mlen: + // increment counter + gcm_brev32 %16, %16 + add %16, %16, #11 + gcm_brev32 %16, %16 + + xor %0, %16, %31 + + aesesmi %1, %0, %30, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %29, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %28, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %27, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %26, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %25, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, 
%1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %24, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %23, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %22, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %21, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %20, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesmi %0, %1, %19, #0 + aesesmi %0, %1, %0, #1 + aesesmi %0, %1, %0, #2 + aesesmi %0, %1, %0, #3 + + aesesmi %1, %0, %18, #0 + aesesmi %1, %0, %1, #1 + aesesmi %1, %0, %1, #2 + aesesmi %1, %0, %1, #3 + + aesesi %0, %1, %17, #0 + aesesi %0, %1, %0, #1 + aesesi %0, %1, %0, #2 + aesesi %1, %1, %0, #3 + + //gcm_brev64 %1, %0 + //gcm_swap64 %1, %1, %1 + + and %1, %1, %10 + load %0, %3 + xor %0, %0, %1 + + store %11, %11, %0 + + gcm_brev64 %0, %0 + gcm_swap64 %0, %0, %0 + + xor %0, %0, %13 + //add %3, %3, #12 // #12 is 16 in both 128 bits halves + //add %11, %11, #12 // #12 is 16 in both 128 bits halves + // #13 is 1 in both 128 bits halves + //sub %12, %12, #13 + + // // poly mult accum = ((accum^ad) * H) + // C + clmul %4, %0, %15, #0 + // E + clmul %5, %0, %15, #1 + // F + clmul %6, %0, %15, #2 + // D + clmul %7, %0, %15, #3 + // E ^ F + xor %6, %5, %6 + // put low64 of E^F in high64 + gcm_swap64 %5, %6, #0 + // put high64 of E^F in low64 + gcm_swap64 %6, #0, %6 + // D xor low + xor %7, %7, %6 + // C xor high + xor %4, %4, %5 + + // // reduction + // X1:X0 in %4 + // X3:X2 in %7 + // shift everybody by 1 to the left + // high shifting in 1 bit from low + gcm_shlmi %1, %7, %4, #1 + // low + gcm_shlmi %0, %4, #0, #1 + // post-shift + // X1:X0 in %0 + // X3:X2 in %1 + // compute D + gcm_cmpd %2, %0 + // compute E, F, G + gcm_shrmi %6, %2, #0, #1 + gcm_shrmi %4, %2, #0, #2 + gcm_shrmi %5, %2, #0, #7 + // XOR everybody + xor 
%2, %2, %6 + xor %4, %4, %5 + xor %2, %2, %4 + xor %13, %2, %1 + last: + // addmul of finalblock + + gcm_brev64 %9, %9 + gcm_swap64 %9, %9, %9 + xor %0, %9, %13 + //add %3, %3, #12 // #12 is 16 in both 128 bits halves + //add %11, %11, #12 // #12 is 16 in both 128 bits halves + // #13 is 1 in both 128 bits halves + //sub %12, %12, #13 + + // // poly mult accum = ((accum^ad) * H) + // C + clmul %4, %0, %15, #0 + // E + clmul %5, %0, %15, #1 + // F + clmul %6, %0, %15, #2 + // D + clmul %7, %0, %15, #3 + // E ^ F + xor %6, %5, %6 + // put low64 of E^F in high64 + gcm_swap64 %5, %6, #0 + // put high64 of E^F in low64 + gcm_swap64 %6, #0, %6 + // D xor low + xor %7, %7, %6 + // C xor high + xor %4, %4, %5 + + // // reduction + // X1:X0 in %4 + // X3:X2 in %7 + // shift everybody by 1 to the left + // high shifting in 1 bit from low + gcm_shlmi %1, %7, %4, #1 + // low + gcm_shlmi %0, %4, #0, #1 + // post-shift + // X1:X0 in %0 + // X3:X2 in %1 + // compute D + gcm_cmpd %2, %0 + // compute E, F, G + gcm_shrmi %6, %2, #0, #1 + gcm_shrmi %4, %2, #0, #2 + gcm_shrmi %5, %2, #0, #7 + // XOR everybody + xor %2, %2, %6 + xor %4, %4, %5 + xor %2, %2, %4 + xor %13, %2, %1 + + gcm_brev64 %13, %13 + gcm_swap64 %13, %13, %13 + + xor %8, %13, %14 + + fin + ); let mut pos = 0; - while pos < gcm_ad_code.len() { - println!("0x{:08x},", gcm_ad_code[pos]); + while pos < gcm_finish_code.len() { + println!("0x{:08x},", gcm_finish_code[pos]); pos = pos + 1; } Ok(()) diff --git a/sbus-to-ztex-gateware-migen/netbsd_csr.h b/sbus-to-ztex-gateware-migen/netbsd_csr.h index 277fb33..01b3798 100644 --- a/sbus-to-ztex-gateware-migen/netbsd_csr.h +++ b/sbus-to-ztex-gateware-migen/netbsd_csr.h @@ -1,5 +1,5 @@ //-------------------------------------------------------------------------------- -// Auto-generated by Migen (3ffd64c) & LiteX (8a644c90) on 2021-08-22 07:40:46 +// Auto-generated by Migen (3ffd64c) & LiteX (8a644c90) on 2021-09-03 09:40:05 
//-------------------------------------------------------------------------------- #ifndef __GENERATED_CSR_H #define __GENERATED_CSR_H @@ -237,11 +237,21 @@ static inline uint32_t curve25519engine_status_sigill_read(struct sbusfpga_curve uint32_t word = curve25519engine_status_read(sc); return curve25519engine_status_sigill_extract(sc, word); } -#define CSR_CURVE25519ENGINE_STATUS_FINISHED_OFFSET 13 +#define CSR_CURVE25519ENGINE_STATUS_ABORT_OFFSET 13 +#define CSR_CURVE25519ENGINE_STATUS_ABORT_SIZE 1 +static inline uint32_t curve25519engine_status_abort_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { + uint32_t mask = ((1 << 1)-1); + return ( (oldword >> 13) & mask ); +} +static inline uint32_t curve25519engine_status_abort_read(struct sbusfpga_curve25519engine_softc *sc) { + uint32_t word = curve25519engine_status_read(sc); + return curve25519engine_status_abort_extract(sc, word); +} +#define CSR_CURVE25519ENGINE_STATUS_FINISHED_OFFSET 14 #define CSR_CURVE25519ENGINE_STATUS_FINISHED_SIZE 1 static inline uint32_t curve25519engine_status_finished_extract(struct sbusfpga_curve25519engine_softc *sc, uint32_t oldword) { uint32_t mask = ((1 << 1)-1); - return ( (oldword >> 13) & mask ); + return ( (oldword >> 14) & mask ); } static inline uint32_t curve25519engine_status_finished_read(struct sbusfpga_curve25519engine_softc *sc) { uint32_t word = curve25519engine_status_read(sc); @@ -439,6 +449,11 @@ static inline uint32_t curve25519engine_instruction_immediate_read(struct sbusfp uint32_t word = curve25519engine_instruction_read(sc); return curve25519engine_instruction_immediate_extract(sc, word); } +#define CSR_CURVE25519ENGINE_LS_STATUS_ADDR (CSR_CURVE25519ENGINE_BASE + 0x2cL) +#define CSR_CURVE25519ENGINE_LS_STATUS_SIZE 1 +static inline uint32_t curve25519engine_ls_status_read(struct sbusfpga_curve25519engine_softc *sc) { + return bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_curve25519engine, 0x2cL); +} #endif // CSR_CURVE25519ENGINE_BASE /* 
ddrphy */ diff --git a/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py index ebe6ba2..b574d74 100644 --- a/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py +++ b/sbus-to-ztex-gateware-migen/sbus_to_fpga_soc.py @@ -139,9 +139,9 @@ class _CRG(Module): self.submodules.idelayctrl = S7IDELAYCTRL(self.cd_idelay) class SBusFPGA(SoCCore): - def __init__(self, **kwargs): - self.version = "V1.0"; - + def __init__(self, version, **kwargs): + print(f"Building SBusFPGA for board version {version}") + kwargs["cpu_type"] = "None" kwargs["integrated_sram_size"] = 0 kwargs["with_uart"] = False @@ -149,9 +149,9 @@ class SBusFPGA(SoCCore): self.sys_clk_freq = sys_clk_freq = 100e6 ## 25e6 - self.platform = platform = ztex213_sbus.Platform(variant="ztex2.13a", version = self.version) + self.platform = platform = ztex213_sbus.Platform(variant="ztex2.13a", version = version) - if (self.version == "V1.0"): + if (version == "V1.0"): self.platform.add_extension(ztex213_sbus._usb_io_v1_0) SoCCore.__init__(self, @@ -183,7 +183,7 @@ class SBusFPGA(SoCCore): self.submodules.crg = _CRG(platform=platform, sys_clk_freq=sys_clk_freq) self.platform.add_period_constraint(self.platform.lookup_request("SBUS_3V3_CLK", loose=True), 1e9/25e6) # SBus max - if (self.version == "V1.0"): + if (version == "V1.0"): self.submodules.leds = LedChaser( pads = platform.request("SBUS_DATA_OE_LED_2"), #platform.request("user_led", 7), sys_clk_freq = sys_clk_freq) @@ -299,6 +299,7 @@ class SBusFPGA(SoCCore): #self.submodules.curve25519engine_wishbone_cdc = wishbone.WishboneDomainCrossingMaster(platform=self.platform, slave=self.curve25519engine.bus, cd_master="sys", cd_slave="clk100") #self.bus.add_slave("curve25519engine", self.curve25519engine_wishbone_cdc, SoCRegion(origin=self.mem_map.get("curve25519engine", None), size=0x20000, cached=False)) self.bus.add_slave("curve25519engine", self.curve25519engine.bus, SoCRegion(origin=self.mem_map.get("curve25519engine", 
None), size=0x20000, cached=False)) + self.bus.add_master(name="curve25519engineLS", master=self.curve25519engine.busls) #self.submodules.curve25519_on_sync = BusSynchronizer(width = 1, idomain = "clk100", odomain = "sys") #self.comb += self.curve25519_on_sync.i.eq(self.curve25519engine.power.fields.on) #self.comb += self.crg.curve25519_on.eq(self.curve25519_on_sync.o) @@ -307,17 +308,20 @@ class SBusFPGA(SoCCore): def main(): parser = argparse.ArgumentParser(description="SbusFPGA") parser.add_argument("--build", action="store_true", help="Build bitstream") + parser.add_argument("--version", default="V1.0", help="SBusFPGA board version (default V1.0)") builder_args(parser) vivado_build_args(parser) args = parser.parse_args() - soc = SBusFPGA(**soc_core_argdict(args)) + soc = SBusFPGA(**soc_core_argdict(args), + version=args.version) #soc.add_uart(name="uart", baudrate=115200, fifo_depth=16) builder = Builder(soc, **builder_argdict(args)) builder.build(**vivado_build_argdict(args), run=args.build) # Generate modified CSR registers definitions/access functions to netbsd_csr.h. + # should be split per-device (and without base) to still work if we have identical devices in different configurations on multiple boards csr_contents = sbus_to_fpga_export.get_csr_header( regions = soc.csr_regions, constants = soc.constants, @@ -325,6 +329,9 @@ def main(): write_to_file(os.path.join("netbsd_csr.h"), csr_contents) # tells the prom where to find what + # just one, as that is board-specific + # BEWARE! then need to run 'forth_to_migen_rom.sh' *and* regenerate the bitstream with the proper PROM built-in! + # (there's surely a better way...) csr_forth_contents = sbus_to_fpga_export.get_csr_forth_header( csr_regions = soc.csr_regions, mem_regions = soc.mem_regions,