push masking from registers to stores
This commit is contained in:
@@ -160,40 +160,21 @@ static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, i
|
||||
static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop);
|
||||
static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
|
||||
static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop);
|
||||
|
||||
static const uint32_t program_fill[35] = { 0x10000089,0x0f8000c9,0x01bc0014,0x0780000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,0xc03c1033,0x00184185,0x00161146,0xfd800148,0x00226007,0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x07000089,0x001a6087,0x013c6814,0x403c0012,0x00146086,0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0080018d,0xc03c1033,0x000e10c6,0x010000c9,0x00004005,0xfa000809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_fillrop[42] = { 0x13800089,0x130000c9,0x01bc0014,0x003c014c,0x0880000d,0x013c2014,0x002000c0,0x00180000,0x403c0192,0x801c0013,0x001c11e2,0xc03c7013,0x00184185,0x00221206,0xfc800208,0x00226007,0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x09000089,0x001a6087,0x013c6814,0x403c0012,0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_copy[43] = { 0x14000089,0x138000c9,0x01bc0014,0x013c2014,0x00bf0054,0x0900000d,0x002400c0,0x00180000,0x403c0192,0x80a00013,0x403c8033,0x00184185,0x00261246,0xfd000248,0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0a000089,0x013f0814,0x00049045,0x00bf0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02000249,0x603c8033,0x00270246,0x20a08015,0xfe000248,0x0180018d,0x013c6814,0x403c8033,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf8000809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_copyrev[61] = { 0x1d000089,0x1c8000c9,0x01bc0014,0x00280000,0x002c0040,0x00340080,0x003af007,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x11000349,0x013f0814,0x00bf02d4,0x001af347,0x003c6346,0x003c03d0,0x0028f285,0x002cf2c5,0x02000188,0x003c0c00,0x003c03d0,0x0028f286,0x002cf2c6,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0200018d,0x013c6814,0x10a08016,0x503c8033,0x013f0814,0x02000249,0x10a08016,0x503c8033,0x00270246,0xfd800248,0x00321306,0x01000309,0x00284285,0xf7800809,0x04800389,0x013c2014,0x00bf0054,0x002400c0,0x403c0012,0x80a00013,0x403c8033,0x00004005,0x00261246,0xfd000248,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_scroll128[12] = { 0x407c0012,0x00140080,0x201c0013,0x60fc7013,0x00170146,0xfe000148,0x000e10c6,0x010000c9,
|
||||
0x00004005,0xfb000809,0x0000000a,0x0000000a };
|
||||
static const uint32_t program_fill128[11] = { 0x407c0012,0x00140080,0x607c1013,0x00170146,0xfe800148,0x000e10c6,0x010000c9,0x00004005,
|
||||
0xfb800809,0x0000000a,0x0000000a };
|
||||
static const uint32_t program_fill256[14] = { 0x01bc0014,0x001a6087,0x013c6814,0x403c0012,0x00146086,0xe03c1013,0x00165146,0xfe800148,
|
||||
0x000e10c6,0x010000c9,0x00004005,0xfb800809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_fill[39] = { 0x12000089,0x118000c9,0x01bc0014,0x0880000d,0x013c2014,0x001400c0,0x00180000,0x403c0192,
|
||||
0x801c0013,0x001c0060,0xc03c7013,0x00184185,0x00161146,0xfc800148,0x00226007,0x00208946,
|
||||
0x0020220f,0x00008005,0x00088086,0x01048050,0x08000089,0x001a6087,0x013c6814,0x403c0012,
|
||||
0x00146086,0x01800149,0xe03c1013,0x00165146,0xfe800148,0x0180018d,0x801c0013,0x001c0060,
|
||||
0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf9000809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_fillrop[42] = { 0x13800089,0x130000c9,0x01bc0014,0x003c014c,0x0880000d,0x013c2014,0x002000c0,0x00180000,
|
||||
0x403c0192,0x801c0013,0x001c11e2,0xc03c7013,0x00184185,0x00221206,0xfc800208,0x00226007,
|
||||
0x00208946,0x0020220f,0x00008005,0x00088086,0x01048050,0x09000089,0x001a6087,0x013c6814,
|
||||
0x403c0012,0x00206086,0x02800209,0x801c0013,0x001c11c2,0xe03c7013,0x00225206,0xfd800208,
|
||||
0x0180018d,0x801c0013,0x001c11e2,0xc03c7013,0x000e10c6,0x010000c9,0x00004005,0xf8000809,
|
||||
0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_copy[48] = { 0x16800089,0x160000c9,0x01bc0014,0x013c2014,0x003f0054,0x0a00000d,0x002400c0,0x00180000,
|
||||
0x403c0192,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00184185,0x00261246,0xfc000248,
|
||||
0x0026f007,0x00249c06,0x0024224f,0x00240250,0x00009005,0x00089086,0x0b800089,0x013f0814,
|
||||
0x00049045,0x003f0054,0x001af087,0x403c0012,0x00246086,0xa0a00013,0x02800249,0x001c0220,
|
||||
0x603c7013,0x00270246,0x20a08015,0xfd800248,0x0280018d,0x013c6814,0x001c0013,0x001c0220,
|
||||
0x403c7013,0x013f0814,0x000e10c6,0x010000c9,0x00004005,0xf6800809,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t program_copyrev[66] = { 0x1f800089,0x1f0000c9,0x01bc0014,0x00280000,0x002c0040,0x00340080,0x003af007,0x03800389,0x0038ec06,0x0038238f,0x00380390,0x0028e285,0x002ce2c5,0x0034e086,0x12800349,0x013f0814,0x003f02d4,0x001af347,0x003c6346,0x003c03d0,0x0028f285,0x002cf2c5,0x02000188,0x003c0c00,0x003c03d0,0x0028f286,0x002cf2c6,0x002f02c5,0x003c0c00,0x00bc03d0,0x0028f285,0x003000c0,0x403c0292,0x00246346,0x10a00013,0x0300018d,0x013c6814,0x10a08016,0x001c0013,0x001c0220,0x503c7013,0x013f0814,0x02800249,0x10a08016,0x001c0220,0x503c7013,0x00270246,0xfd000248,0x00321306,0x01000309,0x00284285,0xf6000809,0x05800389,0x013c2014,0x003f0054,0x002400c0,0x403c0012,0x80a00013,0x001c0013,0x001c0220,0x403c7013,0x00004005,0x00261246,0xfc000248,0x0000000a,0x0000000a };
|
||||
|
||||
static const uint32_t* programs[8] = { program_scroll128, program_fill128, program_fill256, program_fill,
|
||||
program_fillrop, program_copy, program_copyrev, NULL };
|
||||
static const uint32_t program_len[8] = { 12, 11, 14, 39,
|
||||
42, 48, 66, 0 };
|
||||
static uint32_t program_offset[8];
|
||||
static const uint32_t* programs[6] = { program_fill, program_fillrop, program_copy, program_copyrev, program_scroll128, NULL };
|
||||
static const uint32_t program_len[6] = { 35, 42, 43, 61, 12, 0 };
|
||||
static uint32_t program_offset[6];
|
||||
|
||||
static void goblin_set_depth(struct goblin_softc *, int);
|
||||
|
||||
@@ -468,10 +449,10 @@ goblinioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
|
||||
return ENXIO;
|
||||
}
|
||||
switch (fn->off) {
|
||||
case JARETH_FN_NUM_FILL: pidx = 3; break;
|
||||
case JARETH_FN_NUM_FILLROP: pidx = 4; break;
|
||||
case JARETH_FN_NUM_COPY: pidx = 5; break;
|
||||
case JARETH_FN_NUM_COPYREV: pidx = 6; break;
|
||||
case JARETH_FN_NUM_FILL: pidx = 0; break;
|
||||
case JARETH_FN_NUM_FILLROP: pidx = 1; break;
|
||||
case JARETH_FN_NUM_COPY: pidx = 2; break;
|
||||
case JARETH_FN_NUM_COPYREV: pidx = 3; break;
|
||||
}
|
||||
if (pidx != -1) {
|
||||
fn->off = program_offset[pidx];
|
||||
@@ -864,7 +845,7 @@ static int wait_job(struct goblin_softc *sc, uint32_t param, enum jareth_verbosi
|
||||
|
||||
static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n) {
|
||||
const uint32_t base = 0;
|
||||
const int pidx = 0;
|
||||
const int pidx = 4;
|
||||
/* int i; */
|
||||
|
||||
power_on(sc);
|
||||
@@ -901,7 +882,7 @@ static int jareth_scroll(struct goblin_softc *sc, enum jareth_verbosity verbose,
|
||||
|
||||
static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n) {
|
||||
const uint32_t base = 0;
|
||||
const int pidx = 3; // fill
|
||||
const int pidx = 0; // fill
|
||||
int i;
|
||||
|
||||
power_on(sc);
|
||||
@@ -931,7 +912,7 @@ static int jareth_fill(struct goblin_softc *sc, enum jareth_verbosity verbose, i
|
||||
|
||||
static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int pat, int x0, int w, int n, int pm, int rop) {
|
||||
const uint32_t base = 0;
|
||||
const int pidx = 4; // fillrop
|
||||
const int pidx = 1; // fillrop
|
||||
int i;
|
||||
|
||||
power_on(sc);
|
||||
@@ -963,7 +944,7 @@ static int jareth_fillrop(struct goblin_softc *sc, enum jareth_verbosity verbose
|
||||
|
||||
static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) {
|
||||
const uint32_t base = 0;
|
||||
const int pidx = 5; // copy
|
||||
const int pidx = 2; // copy
|
||||
/* int i; */
|
||||
|
||||
/* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */
|
||||
@@ -1010,7 +991,7 @@ static int jareth_copy(struct goblin_softc *sc, enum jareth_verbosity verbose, i
|
||||
|
||||
static int jareth_copyrev(struct goblin_softc *sc, enum jareth_verbosity verbose, int y0, int y1, int x0, int w, int n, int x1, int rop) {
|
||||
const uint32_t base = 0;
|
||||
const int pidx = 6; // copyrev
|
||||
const int pidx = 3; // copyrev
|
||||
/* int i; */
|
||||
|
||||
/* device_printf(sc->sc_dev, "%s : %d %d %d %d %d %d\n", __PRETTY_FUNCTION__, y0, y1, x0, w, n, x1); */
|
||||
|
||||
@@ -551,11 +551,17 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
If(wishbone,
|
||||
NextValue(interface.cyc, 1),
|
||||
NextValue(interface.stb, 1),
|
||||
NextValue(interface.sel, 2**len(interface.sel)-1),
|
||||
NextValue(interface.adr, address),
|
||||
NextValue(interface.we, self.instruction.immediate[7]),
|
||||
NextValue(interface.sel, 2**len(interface.sel)-1),
|
||||
If(self.instruction.immediate[7], # do we need those tests or could we always update dat_w/dat_r ?
|
||||
NextValue(interface.dat_w, self.b[0:128])),
|
||||
If(self.instruction.shift,
|
||||
NextValue(interface.dat_w, (self.b << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2])))[0:128]),
|
||||
NextValue(interface.sel, r_dat_m[2][0:16]),
|
||||
).Else(
|
||||
NextValue(interface.dat_w, self.b[0:128]),
|
||||
),
|
||||
),
|
||||
NextState("MEMl") # MEMl
|
||||
).Else(
|
||||
memoryport.cmd.we.eq(self.instruction.immediate[7]),
|
||||
@@ -660,8 +666,13 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
NextValue(lbuf[0:128], memoryport.rdata.data),
|
||||
NextState("MEMl2"),
|
||||
).Elif(~wishbone & self.instruction.immediate[7],
|
||||
memoryport.wdata.data.eq(self.b[0:128]),
|
||||
memoryport.wdata.valid.eq(1),
|
||||
If(self.instruction.shift,
|
||||
memoryport.wdata.data.eq((self.b << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2])))[0:128]),
|
||||
memoryport.wdata.we.eq(r_dat_m[2][0:16]),
|
||||
).Else(
|
||||
memoryport.wdata.data.eq(self.b[0:128]),
|
||||
),
|
||||
If(memoryport.wdata.ready,
|
||||
NextState("MEMl2"),
|
||||
),
|
||||
@@ -682,12 +693,18 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
If(self.instruction.immediate[8],
|
||||
NextValue(interface.cyc, 1),
|
||||
NextValue(interface.stb, 1),
|
||||
NextValue(interface.sel, 2**len(interface.sel)-1),
|
||||
NextValue(interface.adr, address + 1),
|
||||
NextValue(interface.we, self.instruction.immediate[7]),
|
||||
NextValue(interface.sel, 2**len(interface.sel)-1),
|
||||
NextValue(timeout, 2047),
|
||||
If(self.instruction.immediate[7],
|
||||
NextValue(interface.dat_w, self.b[128:256])),
|
||||
If(self.instruction.shift,
|
||||
NextValue(interface.dat_w, (self.b << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2])))[128:256]),
|
||||
NextValue(interface.sel, r_dat_m[2][16:32]),
|
||||
).Else(
|
||||
NextValue(interface.dat_w, self.b[128:256]),
|
||||
),
|
||||
),
|
||||
NextState("MEMh")
|
||||
).Else(
|
||||
If(self.instruction.opcode == opcodes["MEM"][0],
|
||||
@@ -746,8 +763,13 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
NextValue(lbuf[128:256], memoryport.rdata.data),
|
||||
NextState("MEMh2"),
|
||||
).Elif(~wishbone & self.instruction.immediate[7],
|
||||
memoryport.wdata.data.eq(self.b[128:256]),
|
||||
memoryport.wdata.valid.eq(1),
|
||||
If(self.instruction.shift,
|
||||
memoryport.wdata.data.eq((self.b << (Cat(Signal(granule_bits, reset = 0), r_dat_f[2])))[128:256]),
|
||||
memoryport.wdata.we.eq(r_dat_m[2][16:32]),
|
||||
).Else(
|
||||
memoryport.wdata.data.eq(self.b[128:256]),
|
||||
),
|
||||
If(memoryport.wdata.ready,
|
||||
NextState("MEMh2"),
|
||||
),
|
||||
@@ -849,6 +871,7 @@ class ExecLS(ExecUnit, AutoDoc):
|
||||
self.sync.mul_clk += self.state[6].eq(lsseq.ongoing("MEM_EVEN1"))
|
||||
self.sync.mul_clk += self.state[7].eq(lsseq.ongoing("MEM_EVEN2"))
|
||||
self.sync.mul_clk += self.state[8].eq(lsseq.ongoing("MEM_ERR"))
|
||||
self.sync.mul_clk += self.state[9].eq(lsseq.ongoing("GENMASK_R0"))
|
||||
self.sync.mul_clk += self.state[28:30].eq((self.state[28:30] & Replicate(~start_pipe, 2)) | self.has_timeout)
|
||||
self.sync.mul_clk += self.state[30:32].eq((self.state[30:32] & Replicate(~start_pipe, 2)) | self.has_failure)
|
||||
|
||||
@@ -1433,7 +1456,7 @@ Here are the currently implemented opcodes for The Engine:
|
||||
"exec_logic": True,
|
||||
"exec_addsub": False,
|
||||
"exec_rop": True,
|
||||
"exec_ls": False,
|
||||
"exec_ls": True,
|
||||
}
|
||||
exec_unit_shift_num = { }
|
||||
index = 0
|
||||
|
||||
@@ -255,12 +255,8 @@ fn main() -> std::io::Result<()> {
|
||||
loopX_y:
|
||||
// setadr
|
||||
setadr %15, %6
|
||||
// load old data
|
||||
load256 %7, ^0
|
||||
// insert pattern
|
||||
psa* %7, %1
|
||||
// rewrite data
|
||||
store256 %15, ^0, %7
|
||||
// write partial data
|
||||
store256* %15, ^0, %1
|
||||
// increment copied $DST by stride
|
||||
add32v %6, %6, %4
|
||||
// decrement copied Y count
|
||||
@@ -308,13 +304,8 @@ fn main() -> std::io::Result<()> {
|
||||
// check for line leftovers
|
||||
loop256_x_end:
|
||||
brz4 done256_x, %6
|
||||
|
||||
// load old data
|
||||
load256 %7, ^0
|
||||
// insert pattern
|
||||
psa* %7, %1
|
||||
// rewrite data
|
||||
store256 %15, ^0, %7
|
||||
// write partial data
|
||||
store256* %15, ^0, %1
|
||||
|
||||
done256_x:
|
||||
// decrement Y count
|
||||
@@ -474,13 +465,13 @@ fn main() -> std::io::Result<()> {
|
||||
// leftover X in %6
|
||||
// // live Y count in %3
|
||||
// data in %7
|
||||
// masked data in %7
|
||||
// // masked data in %7
|
||||
// 0/scrap in %15
|
||||
// -----
|
||||
// header loop:
|
||||
// live Y count in %9
|
||||
// $SRC / $DST in %6
|
||||
// dst data in %7
|
||||
// // dst data in %7
|
||||
// src data in %8
|
||||
// 0/scrap in %15
|
||||
|
||||
@@ -493,7 +484,8 @@ fn main() -> std::io::Result<()> {
|
||||
resm %15
|
||||
// set alignment; we shift by the addr offset
|
||||
setmq %15, %0, %2
|
||||
setma %15, %1, #16
|
||||
// we use b as that's the data input for Stores
|
||||
setmb %15, %1, #16
|
||||
// if $DST is aligned on 128 bits, jump to aligned loop
|
||||
brz4 start128, %0
|
||||
|
||||
@@ -508,12 +500,8 @@ fn main() -> std::io::Result<()> {
|
||||
setadr %15, %6
|
||||
// load src
|
||||
load256 %8, ^1
|
||||
// load old data
|
||||
load128 %7, ^0
|
||||
// insert data
|
||||
psa* %7, %8
|
||||
// rewrite data
|
||||
store128 %15, ^0, %7
|
||||
// write partial data
|
||||
store128* %15, ^0, %8
|
||||
// increment copied $SRC / $DST by stride
|
||||
add32v %6, %6, %4
|
||||
// decrement copied Y count
|
||||
@@ -542,7 +530,7 @@ fn main() -> std::io::Result<()> {
|
||||
// add the count to the addresses, ^1 will have the proper shift for masking
|
||||
add32v %1, %1, %9
|
||||
// reset a mask to the proper shifting
|
||||
setma %15, %1, #16
|
||||
setmb %15, %1, #16
|
||||
|
||||
start128:
|
||||
// compute X leftovers (modulo 16 -> #15 is 15)
|
||||
@@ -559,10 +547,8 @@ fn main() -> std::io::Result<()> {
|
||||
brz32 loop128_x_end, %9
|
||||
|
||||
loop128_x:
|
||||
// merge data from input
|
||||
psa* %7, %8
|
||||
// store to DST w/ post-increment
|
||||
store128inc %15, ^0, %7
|
||||
store128inc* %15, ^0, %8
|
||||
// sub 16 (#16 is 16) from live rounded X count
|
||||
sub32v %9, %9, #16
|
||||
// prefetch data
|
||||
@@ -576,12 +562,8 @@ fn main() -> std::io::Result<()> {
|
||||
// set the leftovers mask (offset is 0 as we are aligned)
|
||||
// IMPROVE ME
|
||||
setmq %15, #0, %6
|
||||
// load old data
|
||||
load128 %7, ^0
|
||||
// insert pattern
|
||||
psa* %7, %8
|
||||
// rewrite data
|
||||
store128 %15, ^0, %7
|
||||
store128* %15, ^0, %8
|
||||
// reset the Q mask
|
||||
// IMPROVE ME
|
||||
setmq %15, #0, #16
|
||||
@@ -613,7 +595,7 @@ fn main() -> std::io::Result<()> {
|
||||
// main loop:
|
||||
// leftover X in %6
|
||||
// data in %7
|
||||
// masked data in %7
|
||||
// // masked data in %7
|
||||
// src data in %8
|
||||
// live X count in %9
|
||||
// $SRC / $DST in %10
|
||||
@@ -625,7 +607,7 @@ fn main() -> std::io::Result<()> {
|
||||
// -----
|
||||
// tail loop:
|
||||
// $SRC / $DST in %0
|
||||
// dst data in %7
|
||||
// // dst data in %7
|
||||
// src data in %8
|
||||
// live Y count in %9
|
||||
// 0/scrap in %15
|
||||
@@ -665,8 +647,8 @@ fn main() -> std::io::Result<()> {
|
||||
skip:
|
||||
// reset q mask (we will be aligned from now on)
|
||||
setmq %15, #0, #16
|
||||
// set a mask to the proper shifting
|
||||
setma %15, %11, #16
|
||||
// set b mask to the proper shifting for Stores
|
||||
setmb %15, %11, #16
|
||||
|
||||
// now we need to figure out where we start to go backward
|
||||
// currently we have the number of 'tail' (first column...) elements in %14 (0 for aligned), number of 'loop' elements in %13,
|
||||
@@ -726,12 +708,8 @@ fn main() -> std::io::Result<()> {
|
||||
setmq %15, #0, %6
|
||||
// prefetch data
|
||||
loadl128dec %8, ^1, %8
|
||||
// load old data
|
||||
load128 %7, ^0
|
||||
// insert data
|
||||
psa* %7, %8
|
||||
// rewrite data
|
||||
store128dec %15, ^0, %7
|
||||
// write partial data
|
||||
store128dec* %15, ^0, %8
|
||||
// reset the Q mask
|
||||
// IMPROVE ME
|
||||
setmq %15, #0, #16
|
||||
@@ -741,10 +719,8 @@ fn main() -> std::io::Result<()> {
|
||||
brz32 loop128_x_end, %9
|
||||
// prefetch data
|
||||
loadl128dec %8, ^1, %8
|
||||
// insert data
|
||||
psa* %7, %8
|
||||
// write data
|
||||
store128dec %15, ^0, %7
|
||||
store128dec* %15, ^0, %8
|
||||
// sub 16 (#16 is 16) from live rounded X count
|
||||
sub32v %9, %9, #16
|
||||
// if X count is not 0, keep looping
|
||||
@@ -766,7 +742,7 @@ fn main() -> std::io::Result<()> {
|
||||
brz32 done128, %14
|
||||
// set alignment; we shift by the addr offset
|
||||
setmq %15, %0, %2
|
||||
setma %15, %1, #16
|
||||
setmb %15, %1, #16
|
||||
// copy Y
|
||||
psa %9, %3
|
||||
loopX_y:
|
||||
@@ -774,12 +750,8 @@ fn main() -> std::io::Result<()> {
|
||||
setadr %15, %0
|
||||
// load src
|
||||
load256 %8, ^1
|
||||
// load old data
|
||||
load128 %7, ^0
|
||||
// insert data
|
||||
psa* %7, %8
|
||||
// rewrite data
|
||||
store128 %15, ^0, %7
|
||||
// write partial data
|
||||
store128* %15, ^0, %8
|
||||
// increment $SRC / $DST by stride
|
||||
add32v %0, %0, %4
|
||||
// decrement copied Y count
|
||||
|
||||
Reference in New Issue
Block a user