diff --git a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c index acdeaef..1f623d6 100644 --- a/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c +++ b/NetBSD/9.0/usr/src/sys/dev/sbus/sbusfpga_curve25519engine.c @@ -173,9 +173,9 @@ static const uint32_t program_aes[16] = {0x0001f003,0x0005e012,0x0001d052,0x0005 static const uint32_t program_gcm_pfx[30] = {0x01400411,0x00080840,0x00040800,0x0001f043,0x0005e012,0x0001d052,0x0005c012,0x0001b052,0x0005a012,0x00019052,0x00058012,0x00017052,0x00056012,0x00015052,0x00054012,0x00013052,0x00052012,0x00811052,0x03800089,0x003c0000,0x01400411,0x0042b405,0x01400411,0x00080800,0x00040400,0xf4800809,0x00380000,0x01bc03d1,0x003cf3d1,0x00340800 }; -static const uint32_t program_gcm_ad[29] = {0x0d800309,0x000000d3,0x01800011,0x00000011,0x0000d003,0x000ec0c5,0x0032d306,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x00800309,0xf2800809,0x0000000a }; +static const uint32_t program_gcm_ad[29] = {0x0d800309,0x000000d3,0x01800011,0x00000011,0x0000d003,0x000f00c5,0x00321306,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x00800309,0xf2800809,0x0000000a }; -static const uint32_t program_gcm_aes[50] = {0x18000309,0x01400411,0x0042b405,0x01400411,0x0001f403,0x0005e012,0x0001d052,0x0005c012,0x0001b052,0x0005a012,0x00019052,0x00058012,0x00017052,0x00056012,0x00015052,0x00054012,0x00013052,0x00052012,0x00851052,0x000000d3,0x00001003,0x00ac02d3,0x01800011,0x00000011,0x0000d003,0x000ec0c5,0x002ec2c5,0x0032d306,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x00800309,0xe8000809,0x0000000a }; +static const uint32_t program_gcm_aes[50] = {0x18000309,0x01400411,0x0042b405,0x01400411,0x0001f403,0x0005e012,0x0001d052,0x0005c012,0x0001b052,0x0005a012,0x00019052,0x00058012,0x00017052,0x00056012,0x00015052,0x00054012,0x00013052,0x00052012,0x00851052,0x000000d3,0x00001003,0x00ac02d3,0x01800011,0x00000011,0x0000d003,0x000f00c5,0x002f02c5,0x00321306,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x00800309,0xe8000809,0x0000000a }; static const uint32_t program_gcm_finish[71] = {0x16000309,0x01400411,0x0042b405,0x01400411,0x0001f403,0x0005e012,0x0001d052,0x0005c012,0x0001b052,0x0005a012,0x00019052,0x00058012,0x00017052,0x00056012,0x00015052,0x00054012,0x00013052,0x00052012,0x00851052,0x0004a054,0x000000d3,0x00001003,0x00ac02d3,0x01800011,0x00000011,0x0000d003,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x01a40251,0x00249251,0x0000d243,0x0010f00d,0x0094f00d,0x0118f00d,0x019cf00d,0x00186143,0x00160191,0x00186811,0x001c61c3,0x00105103,0x008441ce,0x0082010e,0x00080010,0x009a008f,0x0112008f,0x0396008f,0x00086083,0x00105103,0x00084083,0x00341083,0x01b40351,0x0034d351,0x0020e343,0x0000000a }; @@ -532,25 +532,20 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st curve25519engine_window_write(sc, unit); /* to each session its own register file */ /* read_addr */ - for (i = 0 ; i < 8 ; i ++) { - /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); */ - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), (i == 0) ? ((uint32_t)rd_ptr) : 0); } - /* write_addr */ - /* for (i = 0 ; i < 8 ; i ++) { */ - /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(4,i), ((i & 3) == 0) ? ((uint32_t)wr_ptr) : 0); */ - /* } */ /* write_len */ - for (i = 0 ; i < 8 ; i ++) { - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), ((i&3) == 0) ? ((uint32_t)job->len) : 0); + for (i = 0 ; i < 8 ; i ++) { // all the way to 8 to make sure we have zero in every bit checked by BRZ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), (i == 0) ? ((uint32_t)job->len) : 0); } /* data */ - for (i = 0 ; i < 8 ; i ++) { + for (i = 0 ; i < 4 ; i ++) { bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(16,i), job->data[i]); } for (reg = 31 ; reg > 16 ; reg--) { - for (i = 0 ; i < 8 ; i ++) { - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i), job->keys[(i&3)+4*(31-reg)]); + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(reg,i), job->keys[i+4*(31-reg)]); } } @@ -597,13 +592,12 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st curve25519engine_window_write(sc, unit); /* to each session its own register file */ /* read_addr */ - for (i = 0 ; i < 8 ; i ++) { - /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); */ - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), (i == 0) ? ((uint32_t)rd_ptr) : 0); } /* write_len */ - for (i = 0 ; i < 8 ; i ++) { - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), ((i & 3) == 0) ? ((uint32_t)job->len) : 0); + for (i = 0 ; i < 8 ; i ++) { // all the way to 8 to make sure we have zero in every bit checked by BRZ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), (i == 0) ? ((uint32_t)job->len) : 0); } err = start_job(sc); @@ -650,17 +644,16 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st curve25519engine_window_write(sc, unit); /* to each session its own register file */ /* read_addr */ - for (i = 0 ; i < 8 ; i ++) { - /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); */ - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), (i == 0) ? ((uint32_t)rd_ptr) : 0); } /* write_addr */ - for (i = 0 ; i < 8 ; i ++) { - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(11,i), ((i & 3) == 0) ? ((uint32_t)wr_ptr) : 0); + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(11,i), (i == 0) ? ((uint32_t)wr_ptr) : 0); } /* write_len */ - for (i = 0 ; i < 8 ; i ++) { - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), ((i & 3) == 0) ? ((uint32_t)job->len) : 0); + for (i = 0 ; i < 8 ; i ++) { // all the way to 8 to make sure we have zero in every bit checked by BRZ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), (i == 0) ? ((uint32_t)job->len) : 0); } err = start_job(sc); @@ -707,22 +700,20 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st curve25519engine_window_write(sc, unit); /* to each session its own register file */ /* read_addr */ - for (i = 0 ; i < 8 ; i ++) { - /* bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); */ - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), ((i & 3) == 0) ? ((uint32_t)rd_ptr) : 0); + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(3,i), (i == 0) ? ((uint32_t)rd_ptr) : 0); } /* write_addr */ - for (i = 0 ; i < 8 ; i ++) { - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(11,i), ((i & 3) == 0) ? ((uint32_t)wr_ptr) : 0); + for (i = 0 ; i < 4 ; i ++) { + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(11,i), (i == 0) ? ((uint32_t)wr_ptr) : 0); } /* write_len */ - for (i = 0 ; i < 8 ; i ++) { - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), ((i & 3) == 0) ? ((uint32_t)job->len) : 0); + for (i = 0 ; i < 8 ; i ++) { // all the way to 8 to make sure we have zero in every bit checked by BRZ + bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(12,i), (i == 0) ? ((uint32_t)job->len) : 0); } /* final block */ for (i = 0 ; i < 4 ; i ++) { bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(9,i), job->data[i]); - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(9,i+4), job->data[i]); } /* create and generate MMASK */ for (i = 0 ; i < 4 ; i ++) { @@ -736,7 +727,6 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st mask = 0xFFFFFFFF >> (8*(4-(job->len%4))); } bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(10,i), mask); - bus_space_write_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(10,(i+4)), mask); } @@ -749,7 +739,7 @@ sbusfpga_curve25519engine_ioctl (dev_t dev, u_long cmd, void *data, int flag, st return err; /* final accum */ - for (i = 0 ; i < 8 ; i ++) { + for (i = 0 ; i < 4 ; i ++) { job->data[i] = bus_space_read_4(sc->sc_bustag, sc->sc_bhregs_regfile,SUBREG_ADDR(8,i)); } diff --git a/sbus-to-ztex-gateware-migen/engine.py b/sbus-to-ztex-gateware-migen/engine.py index cacf51c..d18d232 100644 --- a/sbus-to-ztex-gateware-migen/engine.py +++ b/sbus-to-ztex-gateware-migen/engine.py @@ -25,14 +25,17 @@ opcodes = { # mnemonic : [bit coding, docstring] "FIN" : [10, "halt execution and assert interrupt to host CPU that microcode execution is done"], "SHL" : [11, "Wd $\gets$ Ra << 1 // shift Ra left by one and store in Wd"], "XBT" : [12, "Wd[0] $\gets$ Ra[254] // extract the 255th bit of Ra and put it into the 0th bit of Wd"], + "AND" : [20, "Wd $\gets$ Ra & Rb // bitwise AND"], + # for CLMUL, bit #31 indicates both lanes are needed; currently same speed "CLMUL": [13, "carry-less multiplication; reg-reg only; per 128-bits block"], # basically 256-bits form of vpclmulqdq "GCM_SHLMI": [14, "Shift A left by imm, insert B MSB as dest LSB; reg-reg or reg-imm; per 128-bits block"], # make SHL redundant: SHL %rd, %ra == GCM_SHLMI %rd, %ra, #0, #1 "GCM_SHRMI": [15, "Shift A right by imm, insert B LSB as dest MSB; reg-reg or reg-imm; per 128-bits block"], # "GCM_CMPD": [16, "Compute D:X0 from X1:X0; reg ; per 128-bits block"], # specific "GCM_SWAP64": [17, "Swap doubleword (64 bits) ; reg-reg or imm-reg or reg-imm; per 128-bits block ; imm != 0 -> BYTEREV*"], # + # for AESESMI, bit #31 indicates both lanes are needed; currently same speed "AESESMI" : [18, "AES ; reg-reg ; per 128-bits block; imm[0] is 1 for aesesi (shared opcode)" ], + # for MEM, bit #31 indicates both lanes are needed; b[31] == 0 faster as the second access is not done "MEM" : [19, "MEM ; imm[0] == 0 for LOAD, imm[0] == 1 for STORE (beware, store copy the address in the output reg)" ], - "AND" : [20, "Wd $\gets$ Ra & Rb // bitwise AND"], "MAX" : [21, "Maximum opcode number (for bounds checking)"], } @@ -239,7 +242,10 @@ class Curve25519Const(Module, AutoDoc): 10: [254, "two hundred fifty four", "The number 254 (iteration count)"], 11: [0x00000001_00000000_00000000_00000000_00000001_00000000_00000000_00000000, "increment for GCM counter (LE)", "increment for GCM counter (LE)"], 12: [0x00000000_00000000_00000000_00000010_00000000_00000000_00000000_00000010, "sixteen (twice)", "The number 16 (for block-size address increment)"], - 13: [0x00000000_00000000_00000000_00000001_00000000_00000000_00000000_00000001, "decrement for GCM dual-loops (LE)", "decrement for GCM dual-loops"] + 13: [0x00000000_00000000_00000000_00000001_00000000_00000000_00000000_00000001, "decrement for GCM dual-loops (LE)", "decrement for GCM dual-loops"], + # 14 + # 15 + 16: [16, "sixteen", "The number 16"], } self.adr = Signal(5) self.const = Signal(256) @@ -1462,7 +1468,6 @@ class ExecClmul(ExecUnit, AutoDoc): #self.q_valid.eq(self.start), self.instruction_out.eq(self.instruction_in), ] - self.submodules.seq = seq = ClockDomainsRenamer("eng_clk")(FSM(reset_state="IDLE")) seq.act("IDLE", @@ -1494,9 +1499,12 @@ class ExecClmul(ExecUnit, AutoDoc): })) seq.act("OUT", self.q_valid.eq(1), - self.q.eq(clmul_buf), - NextState("IDLE"), - ); + If(self.instruction.immediate[8:9], + self.q.eq(clmul_buf), + ).Else( + self.q.eq(Cat(clmul_buf[0:128], Signal(128, reset = 0))) + ), + NextState("IDLE")); class ExecGCMShifts(ExecUnit, AutoDoc): @@ -1753,7 +1761,11 @@ class ExecAES(ExecUnit, AutoDoc): self.sync.mul_clk += [ If(seq.ongoing("AES_EVEN1") | seq.ongoing("AES_EVEN2"), self.q_valid.eq(1), - self.q.eq(aes_buf), + If(self.instruction.immediate[8:9], + self.q.eq(aes_buf), + ).Else( + self.q.eq(Cat(aes_buf[0:128], Signal(128, reset = 0))), + ) ).Else( self.q_valid.eq(0), ) @@ -1824,15 +1836,24 @@ class ExecLS(ExecUnit, AutoDoc): lsseq.act("MEMl2", NextValue(cpar, cpar ^ 1), If(~interface.ack, - NextValue(interface.cyc, 1), - NextValue(interface.stb, 1), - NextValue(interface.sel, 2**len(interface.sel)-1), - NextValue(interface.adr, self.a[132:160]), - NextValue(interface.we, self.instruction.immediate[0]), - NextValue(timeout, 2047), - If(self.instruction.immediate[0], - NextValue(interface.dat_w, self.b[128:256])), - NextState("MEMh") + If(self.instruction.immediate[8:9], + NextValue(interface.cyc, 1), + NextValue(interface.stb, 1), + NextValue(interface.sel, 2**len(interface.sel)-1), + NextValue(interface.adr, self.a[132:160]), + NextValue(interface.we, self.instruction.immediate[0]), + NextValue(timeout, 2047), + If(self.instruction.immediate[0], + NextValue(interface.dat_w, self.b[128:256])), + NextState("MEMh") + ).Else( + NextValue(lbuf[128:256], 0), + If(cpar, ## checkme + NextState("MEM_ODD") + ).Else( + NextState("MEM_EVEN1") + ) + ) )) lsseq.act("MEMh", NextValue(cpar, cpar ^ 1), diff --git a/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs b/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs index 79c9da0..42974be 100644 --- a/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs +++ b/sbus-to-ztex-gateware-migen/engine_code/engine_code.rs @@ -748,9 +748,8 @@ fn main() -> std::io::Result<()> { gcm_swap64 %0, %0, %0 xor %0, %0, %13 - add %3, %3, #12 // #12 is 16 in both 128 bits halves - // #13 is 1 in both 128 bits halves - sub %12, %12, #13 + add %3, %3, #16 + sub %12, %12, #1 // // poly mult accum = ((accum^ad) * H) // C @@ -865,10 +864,10 @@ fn main() -> std::io::Result<()> { gcm_swap64 %0, %0, %0 xor %0, %0, %13 - add %3, %3, #12 // #12 is 16 in both 128 bits halves - add %11, %11, #12 // #12 is 16 in both 128 bits halves - // #13 is 1 in both 128 bits halves - sub %12, %12, #13 + add %3, %3, #16 + add %11, %11, #16 + + sub %12, %12, #1 // // poly mult accum = ((accum^ad) * H) // C @@ -986,10 +985,10 @@ fn main() -> std::io::Result<()> { gcm_swap64 %0, %0, %0 xor %0, %0, %13 - //add %3, %3, #12 // #12 is 16 in both 128 bits halves - //add %11, %11, #12 // #12 is 16 in both 128 bits halves - // #13 is 1 in both 128 bits halves - //sub %12, %12, #13 + //add %3, %3, #16 + //add %11, %11, #16 + + //sub %12, %12, #1 // // poly mult accum = ((accum^ad) * H) // C @@ -1039,10 +1038,9 @@ fn main() -> std::io::Result<()> { gcm_brev64 %9, %9 gcm_swap64 %9, %9, %9 xor %0, %9, %13 - //add %3, %3, #12 // #12 is 16 in both 128 bits halves - //add %11, %11, #12 // #12 is 16 in both 128 bits halves - // #13 is 1 in both 128 bits halves - //sub %12, %12, #13 + //add %3, %3, #16 + //add %11, %11, #16 + //sub %12, %12, #1 // // poly mult accum = ((accum^ad) * H) // C