854 lines
21 KiB
Rust
854 lines
21 KiB
Rust
#![recursion_limit="768"]
|
|
|
|
extern crate jareth_as;
|
|
use jareth_as::*;
|
|
|
|
fn main() -> std::io::Result<()> {
|
|
let mcode = assemble_jareth!(
|
|
// 0..0 $DST / $DST / $SRC in %0
|
|
// 0..0 $DST / $SRC / $DST in %1
|
|
// size in %2
|
|
// pattern in %3
|
|
// -----
|
|
// size & 7 in %5
|
|
// size rounded down in %6
|
|
// input in %7
|
|
// output in %8
|
|
// 0 in %15
|
|
start:
|
|
resm %15
|
|
setadr %15, %0
|
|
load256inc %7, ^0
|
|
load256inc %8, ^1
|
|
// slow
|
|
setma %15, %0, #16
|
|
// slow
|
|
setmq %15, %1, #16
|
|
and %5, %2, #15
|
|
sub32v %6, %2, %5
|
|
brz32 done, %6
|
|
loop:
|
|
psa %18, %7
|
|
psa %19, %8
|
|
psa* %8, %7
|
|
psa %20, %8
|
|
store128inc %15, ^2, %8
|
|
sub32v %6, %6, #16
|
|
brz32 last, %6
|
|
loadh128inc %7, ^0, %7
|
|
loadh128inc %8, ^1, %8
|
|
brz32 loop, #0
|
|
last:
|
|
// FIXME: not if Q is aligned
|
|
loadh128inc %8, ^1, %8
|
|
store128inc %15, ^2, %8
|
|
done:
|
|
getadr %3
|
|
getm %2
|
|
fin
|
|
fin
|
|
);
|
|
|
|
let mcode_scroll256 = assemble_jareth!(
|
|
// x..x / $DST / $SRC in %0, aligned on 128 bits ; $DST < $SRC
|
|
// x..x / X size in %2, multiple of 256 bits (32 bytes)
|
|
// x..x / Y size in %3, arbitrary
|
|
// x..x / dst_stride / src_stride in %4 (screen width)
|
|
// -----
|
|
// live X count in %5
|
|
// // live Y count in %3
|
|
// data in %7
|
|
// 0/scrap in %15
|
|
start:
|
|
// reset masks (probably not necessary with the starred-instruction)
|
|
// resm %15
|
|
loop_y:
|
|
// set source and destination addresses for current Y, X=first
|
|
setadr %15, %0
|
|
psa %5, %2
|
|
loop_x:
|
|
// load from SRC w/ post-increment
|
|
load256inc %7, ^0
|
|
// store to DST w/ post-increment
|
|
store256inc %15, ^1, %7
|
|
// sub 32 (#5 is 32...) from live X count
|
|
sub32v %5, %5, #5
|
|
// if X count is not 0, keep looping
|
|
brnz32 loop_x, %5
|
|
|
|
// decrement Y count
|
|
sub32v %3, %3, #1
|
|
// if 0, finished
|
|
brz32 done, %3
|
|
// add strides to initial addresses
|
|
add32v %0, %0, %4
|
|
// loop to do next line
|
|
brz32 loop_y, #0
|
|
done:
|
|
fin
|
|
fin
|
|
);
|
|
|
|
let mcode_scroll128 = assemble_jareth!(
|
|
// x..x / $DST / $SRC in %0, aligned on 128 bits ; $DST < $SRC
|
|
// x..x / X size in %2, multiple of 128 bits (16 bytes)
|
|
// x..x / Y size in %3, arbitrary
|
|
// x..x / dst_stride / src_stride in %4 (screen width)
|
|
// -----
|
|
// live X count in %5
|
|
// // live Y count in %3
|
|
// data in %7
|
|
// 0/scrap in %15
|
|
start:
|
|
// reset masks (probably not necessary with the starred-instruction)
|
|
// resm %15
|
|
loop_y:
|
|
// set source and destination addresses for current Y, X=first
|
|
setadr %15, %0
|
|
psa %5, %2
|
|
loop_x:
|
|
// load from SRC w/ post-increment
|
|
load128inc %7, ^0
|
|
// store to DST w/ post-increment
|
|
store128inc %15, ^1, %7
|
|
// sub 16 (#16 is 16) from live X count
|
|
sub32v %5, %5, #16
|
|
// if X count is not 0, keep looping
|
|
brnz32 loop_x, %5
|
|
|
|
// decrement Y count
|
|
sub32v %3, %3, #1
|
|
// if 0, finished
|
|
brz32 done, %3
|
|
// add strides to initial addresses
|
|
add32v %0, %0, %4
|
|
// loop to do next line
|
|
brz32 loop_y, #0
|
|
done:
|
|
fin
|
|
fin
|
|
);
|
|
|
|
let mcode_fill128 = assemble_jareth!(
|
|
// x..x / $DST in %0, aligned on 128 bits
|
|
// 128-bits pattern in %1
|
|
// x..x / X size in %2, multiple of 128 bits (16 bytes)
|
|
// x..x / Y size in %3, arbitrary
|
|
// x..x / dst_stride in %4 (screen width)
|
|
// -----
|
|
// live X count in %5
|
|
// // live Y count in %3
|
|
// data in %7
|
|
// 0/scrap in %15
|
|
start:
|
|
// reset masks (probably not necessary with the starred-instruction)
|
|
// resm %15
|
|
loop_y:
|
|
// set source and destination addresses for current Y, X=first
|
|
setadr %15, %0
|
|
psa %5, %2
|
|
loop_x:
|
|
// store to DST w/ post-increment
|
|
store128inc %15, ^0, %1
|
|
// sub 16 (#16 is 16) from live X count
|
|
sub32v %5, %5, #16
|
|
// if X count is not 0, keep looping
|
|
brnz32 loop_x, %5
|
|
|
|
// decrement Y count
|
|
sub32v %3, %3, #1
|
|
// if 0, finished
|
|
brz32 done, %3
|
|
// add strides to initial addresses
|
|
add32v %0, %0, %4
|
|
// loop to do next line
|
|
brz32 loop_y, #0
|
|
done:
|
|
fin
|
|
fin
|
|
);
|
|
|
|
let mcode_fill256 = assemble_jareth!(
|
|
// x..x / $DST in %0, aligned on 128 bits
|
|
// 128-bits pattern in %1
|
|
// x..x / X size in %2, multiple of 128 bits (16 bytes)
|
|
// x..x / Y size in %3, arbitrary
|
|
// x..x / dst_stride in %4 (screen width)
|
|
// -----
|
|
// live X count in %5
|
|
// // live Y count in %3
|
|
// data in %7
|
|
// 0/scrap in %15
|
|
start:
|
|
// reset masks (probably not necessary with the starred-instruction)
|
|
resm %15
|
|
// compute X leftovers (modulo 32 -> #6 is 31)
|
|
and %6, %2, #6
|
|
// set the leftovers mask (offset is 0 as we are aligned)
|
|
setmq %15, #0, %6
|
|
loop_y:
|
|
// set source and destination addresses for current Y, X=first
|
|
setadr %15, %0
|
|
// then the rounded value in X
|
|
sub32v %5, %2, %6
|
|
loop_x:
|
|
// store to DST w/ post-increment
|
|
store256inc %15, ^0, %1
|
|
// sub 16 (#5 is 32) from live X count
|
|
sub32v %5, %5, #5
|
|
// if X count is not 0, keep looping
|
|
brnz32 loop_x, %5
|
|
|
|
// decrement Y count
|
|
sub32v %3, %3, #1
|
|
// if 0, finished
|
|
brz32 done, %3
|
|
// add strides to initial addresses
|
|
add32v %0, %0, %4
|
|
// loop to do next line
|
|
brz32 loop_y, #0
|
|
done:
|
|
fin
|
|
fin
|
|
);
|
|
|
|
// FILL ********************************************************************************************************
|
|
let mcode_fill = assemble_jareth!(
|
|
// x..x / $DST in %0
|
|
// 128-bits pattern in %1 [assumed to be alignement-homogneous]
|
|
// x..x / X size in %2
|
|
// x..x / Y size in %3,
|
|
// x..x / dst_stride in %4 (screen width?)
|
|
// -----
|
|
// main loop:
|
|
// live X count in %5
|
|
// leftover X in %6
|
|
// // live Y count in %3
|
|
// data in %7
|
|
// masked data in %7
|
|
// 0/scrap in %15
|
|
// -----
|
|
// header loop:
|
|
// live Y count in %5
|
|
// $DST in %6
|
|
// data in %7
|
|
// 0/scrap in %15
|
|
|
|
|
|
start:
|
|
// if number of line or element in line is 0, exit early
|
|
brz32 done256, %2
|
|
brz32 done256, %3
|
|
// reset masks
|
|
resm %15
|
|
// if $DST is aligned on 128 bits, jump to aligned loop
|
|
brz4 start256, %0
|
|
|
|
// do the first column
|
|
startX:
|
|
// set alignement; we shift by the addr offset, and we mask whatever data is needed in the first 32 bytes
|
|
setmq %15, %0, %2
|
|
// copy Y
|
|
psa %5, %3
|
|
// copy $DST
|
|
psa %6, %0
|
|
loopX_y:
|
|
// setadr
|
|
setadr %15, %6
|
|
// write partial data
|
|
store256* %15, ^0, %1
|
|
// increment copied $DST by stride
|
|
add32v %6, %6, %4
|
|
// decrement copied Y count
|
|
sub32v %5, %5, #1
|
|
// if not zero, continue
|
|
brnz32 loopX_y, %5
|
|
|
|
loopX_done:
|
|
// how much did we do (#6 is 31, #5 is 32)
|
|
and %8, %0, #6
|
|
// compute 32-(x&31)
|
|
sub32v %8, #5, %8
|
|
// compute the proper value
|
|
min32v %8, %8, %2
|
|
// add that to the address, which will now be aligned
|
|
add32v %0, %0, %8
|
|
// remove from X, as we have done it
|
|
sub32v %2, %2, %8
|
|
// rotate the pattern to match
|
|
rotr32v %1, %1, %8
|
|
// fall through the aligned loop if not 0
|
|
brz32 done256, %2
|
|
|
|
start256:
|
|
// compute X leftovers (modulo 32 -> #6 is 31)
|
|
and %6, %2, #6
|
|
// set the leftovers mask (offset is 0 as we are aligned)
|
|
setmq %15, #0, %6
|
|
|
|
loop256_y:
|
|
// set source and destination addresses for current Y
|
|
setadr %15, %0
|
|
// then the rounded value in X
|
|
sub32v %5, %2, %6
|
|
// already 0, bypass aligned stuff
|
|
brz32 loop256_x_end, %5
|
|
|
|
loop256_x:
|
|
// store to DST w/ post-increment
|
|
store256inc %15, ^0, %1
|
|
// sub 32 (#5 is 32) from live rounded X count
|
|
sub32v %5, %5, #5
|
|
// if X count is not 0, keep looping
|
|
brnz32 loop256_x, %5
|
|
// check for line leftovers
|
|
loop256_x_end:
|
|
brz5 done256_x, %6
|
|
// write partial data
|
|
store256* %15, ^0, %1
|
|
|
|
done256_x:
|
|
// decrement Y count
|
|
sub32v %3, %3, #1
|
|
// if 0, finished
|
|
brz32 done256, %3
|
|
|
|
// add strides to initial addresses
|
|
add32v %0, %0, %4
|
|
// loop256 to do next line
|
|
brz32 loop256_y, #0
|
|
|
|
done256:
|
|
fin
|
|
fin
|
|
);
|
|
|
|
// FILL ROP ********************************************************************************************************
|
|
let mcode_fillrop = assemble_jareth!(
|
|
// x..x / $DST in %0
|
|
// 128-bits pattern in %1 [assumed to be alignement-homogeneous]
|
|
// x..x / X size in %2
|
|
// x..x / Y size in %3,
|
|
// x..x / dst_stride in %4 (screen width?)
|
|
// x..x / rop / planemask in %5 [assumed to be alignement-homogenous]
|
|
// -----
|
|
// main loop:
|
|
// live X count in %8
|
|
// leftover X in %6
|
|
// // live Y count in %3
|
|
// data in %7
|
|
// masked data in %7
|
|
// 0/scrap in %15
|
|
// -----
|
|
// header loop:
|
|
// live Y count in %8
|
|
// $DST in %6
|
|
// data in %7
|
|
// 0/scrap in %15
|
|
|
|
|
|
start:
|
|
// if number of line or element in line is 0, exit early
|
|
brz32 done256, %2
|
|
brz32 done256, %3
|
|
// reset masks
|
|
resm %15
|
|
// set planemask / rop
|
|
srop %15, %5
|
|
// if $DST is aligned on 128 bits, jump to aligned loop
|
|
brz4 start256, %0
|
|
|
|
// do the first column(s)
|
|
startX:
|
|
// set alignement; we shift by the addr offset, and we mask whatever data is needed in the first 32 bytes
|
|
setmq %15, %0, %2
|
|
// copy Y
|
|
psa %8, %3
|
|
// copy $DST
|
|
psa %6, %0
|
|
loopX_y:
|
|
// setadr
|
|
setadr %15, %6
|
|
// load old data
|
|
load256 %7, ^0
|
|
// rop & insert
|
|
rop32v* %7, %7, %1
|
|
// rewrite data
|
|
store256 %15, ^0, %7
|
|
// increment copied $DST by stride
|
|
add32v %6, %6, %4
|
|
// decrement copied Y count
|
|
sub32v %8, %8, #1
|
|
// if not zero, continue
|
|
brnz32 loopX_y, %8
|
|
|
|
loopX_done:
|
|
// how much did we do (#6 is 31, #5 is 32)
|
|
and %8, %0, #6
|
|
// compute 32-(x&31) - upper bound
|
|
sub32v %8, #5, %8
|
|
// compute the proper value
|
|
min32v %8, %8, %2
|
|
// add that to the address, which will now be aligned if there's stuff left to do
|
|
add32v %0, %0, %8
|
|
// remove from X, as we have done it
|
|
sub32v %2, %2, %8
|
|
// rotate the pattern to match
|
|
rotr32v %1, %1, %8
|
|
// fall through the aligned loop if not 0, otherwise done
|
|
brz32 done256, %2
|
|
|
|
start256:
|
|
// compute X leftovers (modulo 32 -> #6 is 31)
|
|
and %6, %2, #6
|
|
// set the leftovers mask (offset is 0 as we are aligned)
|
|
setmq %15, #0, %6
|
|
|
|
loop256_y:
|
|
// set source and destination addresses for current Y
|
|
setadr %15, %0
|
|
// then the rounded value in X
|
|
sub32v %8, %2, %6
|
|
// already 0, bypass aligned stuff
|
|
brz32 loop256_x_end, %8
|
|
|
|
loop256_x:
|
|
// load data
|
|
load256 %7, ^0
|
|
// rop
|
|
rop32v %7, %7, %1
|
|
// store to DST w/ post-increment
|
|
store256inc %15, ^0, %7
|
|
// sub 32 (#5 is 32) from live rounded X count
|
|
sub32v %8, %8, #5
|
|
// if X count is not 0, keep looping
|
|
brnz32 loop256_x, %8
|
|
// check for line leftovers
|
|
loop256_x_end:
|
|
brz5 done256_x, %6
|
|
|
|
// load old data
|
|
load256 %7, ^0
|
|
// insert pattern
|
|
rop32v* %7, %7, %1
|
|
// rewrite data
|
|
store256 %15, ^0, %7
|
|
|
|
done256_x:
|
|
// decrement Y count
|
|
sub32v %3, %3, #1
|
|
// if 0, finished
|
|
brz32 done256, %3
|
|
|
|
// add strides to initial addresses
|
|
add32v %0, %0, %4
|
|
// loop256 to do next line
|
|
brz32 loop256_y, #0
|
|
|
|
done256:
|
|
fin
|
|
fin
|
|
);
|
|
|
|
|
|
|
|
// COPY ********************************************************************************************************
|
|
let mcode_copy = assemble_jareth!(
|
|
// x..x / $SRC / $DST in %0
|
|
// x..x / $DST / $SRC in %1
|
|
// x..x / X size in %2
|
|
// x..x / Y size in %3,
|
|
// x..x src_stride / dst_stride in %4 (screen width?)
|
|
// -----
|
|
// main loop:
|
|
// live X count in %9
|
|
// leftover X in %6
|
|
// // live Y count in %3
|
|
// data in %7
|
|
// // masked data in %7
|
|
// 0/scrap in %15
|
|
// -----
|
|
// header loop:
|
|
// live Y count in %9
|
|
// $SRC / $DST in %6
|
|
// // dst data in %7
|
|
// src data in %8
|
|
// 0/scrap in %15
|
|
|
|
|
|
start:
|
|
// if number of line or element in line is 0, exit early
|
|
brz32 done128, %2
|
|
brz32 done128, %3
|
|
// reset masks
|
|
resm %15
|
|
// set alignement; we shift by the addr offset
|
|
setmq %15, %0, %2
|
|
// we use b as that's the data input for Stores
|
|
setmb %15, %1, #16
|
|
// if $DST is aligned on 128 bits, jump to aligned loop
|
|
brz4 start128, %0
|
|
|
|
// do the first column to align $DST
|
|
startX:
|
|
// copy Y
|
|
psa %9, %3
|
|
// copy $SRC / $DST
|
|
psa %6, %0
|
|
loopX_y:
|
|
// setadr
|
|
setadr %15, %6
|
|
// load src
|
|
load256 %8, ^1
|
|
// write partial data
|
|
store128* %15, ^0, %8
|
|
// increment copied $SRC / $DST by stride
|
|
add32v %6, %6, %4
|
|
// decrement copied Y count
|
|
sub32v %9, %9, #1
|
|
// if not zero, continue
|
|
brnz32 loopX_y, %9
|
|
|
|
loopX_done:
|
|
// how much did we do (#15 is 15, #16 is 16)
|
|
and %9, %0, #15
|
|
// compute 16-(x&15)
|
|
sub32v %9, #16, %9
|
|
// compute the proper value
|
|
min32v %9, %9, %2
|
|
// more than one address to increment
|
|
bcast32 %9, %9
|
|
// add the count to the addresses, ^0 will now be aligned
|
|
add32v %0, %0, %9
|
|
// remove from X, as we have done it
|
|
sub32v %2, %2, %9
|
|
// fall through to the aligned loop if not 0
|
|
brz32 done128, %2
|
|
|
|
// reset q mask (we will be aligned from now on)
|
|
setmq %15, #0, #16
|
|
// add the count to the addresses, ^1 will have the proper shift for masking
|
|
add32v %1, %1, %9
|
|
// reset a mask to the proper shifting
|
|
setmb %15, %1, #16
|
|
|
|
start128:
|
|
// compute X leftovers (modulo 16 -> #15 is 15)
|
|
and %6, %2, #15
|
|
|
|
loop128_y:
|
|
// set source and destination addresses for current Y
|
|
setadr %15, %0
|
|
// then the rounded value in X
|
|
sub32v %9, %2, %6
|
|
// prefetch data
|
|
load256inc %8, ^1
|
|
// already 0, bypass aligned stuff
|
|
brz32 loop128_x_end, %9
|
|
|
|
loop128_x:
|
|
// store to DST w/ post-increment
|
|
store128inc* %15, ^0, %8
|
|
// sub 16 (#16 is 16) from live rounded X count
|
|
sub32v %9, %9, #16
|
|
// prefetch data
|
|
loadh128inc %8, ^1, %8
|
|
// if X count is not 0, keep looping
|
|
brnz32 loop128_x, %9
|
|
// check for line leftovers
|
|
loop128_x_end:
|
|
brz4 done128_x, %6
|
|
|
|
// set the leftovers mask (offset is 0 as we are aligned)
|
|
// IMPROVE ME
|
|
setmq %15, #0, %6
|
|
// rewrite data
|
|
store128* %15, ^0, %8
|
|
// reset the Q mask
|
|
// IMPROVE ME
|
|
setmq %15, #0, #16
|
|
|
|
done128_x:
|
|
// decrement Y count
|
|
sub32v %3, %3, #1
|
|
// if 0, finished
|
|
brz32 done128, %3
|
|
|
|
// add strides to initial addresses
|
|
add32v %0, %0, %4
|
|
// loop128 to do next line
|
|
brz32 loop128_y, #0
|
|
|
|
done128:
|
|
fin
|
|
fin
|
|
);
|
|
|
|
// COPYREV ********************************************************************************************************
|
|
let mcode_copyrev = assemble_jareth!(
|
|
// x..x / $SRC / $DST in %0
|
|
// x..x / $DST / $SRC in %1
|
|
// x..x / X size in %2
|
|
// x..x / Y size in %3,
|
|
// x..x src_stride / dst_stride in %4 (screen width?)
|
|
// -----
|
|
// main loop:
|
|
// leftover X in %6
|
|
// data in %7
|
|
// // masked data in %7
|
|
// src data in %8
|
|
// live X count in %9
|
|
// $SRC / $DST in %10
|
|
// $DST / $SRC in %11
|
|
// live Y count in %12, also scratch in header
|
|
// todo X count in %13
|
|
// amount of work in tail in %14
|
|
// 0/scrap in %15
|
|
// -----
|
|
// tail loop:
|
|
// $SRC / $DST in %0
|
|
// // dst data in %7
|
|
// src data in %8
|
|
// live Y count in %9
|
|
// 0/scrap in %15
|
|
|
|
|
|
start:
|
|
// if number of line or element in line is 0, exit early
|
|
brz32 done128, %2
|
|
brz32 done128, %3
|
|
// reset masks
|
|
resm %15
|
|
// copy addresses
|
|
psa %10, %0
|
|
psa %11, %1
|
|
// set todo X
|
|
psa %13, %2
|
|
// compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16), first the offset
|
|
and %14, %0, #15
|
|
// if 0, then we don't need a tail loop, so skip extra computation (that would wrongly give 16)
|
|
brz32 skip, %14
|
|
|
|
// it is at most 16-($DST & 15)
|
|
sub32v %14, #16, %14
|
|
// compute the proper value by bounding to Xsize
|
|
min32v %14, %14, %2
|
|
// more than one address to increment
|
|
bcast32 %14, %14
|
|
// add the count to the addresses, DST will now be aligned
|
|
add32v %10, %10, %14
|
|
// add the count to the addresses, SRC will have the proper alignment to shift input in the aligned loop
|
|
add32v %11, %11, %14
|
|
// so, do we do everything there ?
|
|
sub32v %13, %2, %14
|
|
// if 0, we do everything in the tail skip the aligned loop
|
|
brz32 startX, %13
|
|
|
|
skip:
|
|
// reset q mask (we will be aligned from now on)
|
|
setmq %15, #0, #16
|
|
// set b mask to the proper shifting for Stores
|
|
setmb %15, %11, #16
|
|
|
|
// now we need to figure out where we start to go backward
|
|
// currently we have the number of 'tail' (first column...) elements in %14 (0 for aligned), number of 'loop' elements in %13,
|
|
// and $SRC+%14 & $DST+%14 in $10/$11 with $SRC+%14 aligned.
|
|
// compute X leftovers (%13 modulo 16 -> #15 is 15) in %6, we will have to start with those
|
|
and %6, %13, #15
|
|
// compute the 'aligned' number of elements
|
|
sub32v %15, %13, %6
|
|
bcast32 %15, %15
|
|
|
|
// add the aligned number of element to $SRC+%14 & $DST+%14
|
|
add32v %10, %10, %15
|
|
add32v %11, %11, %15
|
|
|
|
// if %6 is 0 (no leftovers), then $DST is pointing after the last element so need to remove 16 from $DST and $SRC
|
|
brnz32 skip2, %6
|
|
psa %15, #16
|
|
bcast32 %15, %15
|
|
sub32v %10, %10, %15
|
|
sub32v %11, %11, %15
|
|
|
|
skip2: // // if $SRC+%13 is not aligned, we also need to add 16 (for prefetch)
|
|
// add32v %15, %11, %6
|
|
// and %15, %15, #15
|
|
// brz32 skip3, %15
|
|
|
|
add32v %11, %11, #16
|
|
psa %15, #16
|
|
swap32 %15, %15
|
|
add32v %10, %10, %15
|
|
|
|
// add32v %15, %6, #16
|
|
// add32v %11, %11, %15
|
|
// swap32 %15, %15
|
|
// add32v %10, %10, %15
|
|
|
|
skip3:
|
|
// copy Y count
|
|
psa %12, %3
|
|
|
|
loop128_y:
|
|
// set source and destination addresses for current Y
|
|
setadr %15, %10
|
|
// then the rounded value in X
|
|
sub32v %9, %13, %6
|
|
// prefetch data
|
|
|
|
// prefetch data
|
|
load128dec %8, ^1
|
|
|
|
// check for line leftovers
|
|
loop128_x_begin:
|
|
brz4 loop128_x, %6
|
|
|
|
// set the leftovers mask (offset is 0 as we are aligned)
|
|
// IMPROVE ME
|
|
setmq %15, #0, %6
|
|
// prefetch data
|
|
loadl128dec %8, ^1, %8
|
|
// write partial data
|
|
store128dec* %15, ^0, %8
|
|
// reset the Q mask
|
|
// IMPROVE ME
|
|
setmq %15, #0, #16
|
|
|
|
loop128_x:
|
|
// already 0, bypass aligned stuff
|
|
brz32 loop128_x_end, %9
|
|
// prefetch data
|
|
loadl128dec %8, ^1, %8
|
|
// write data
|
|
store128dec* %15, ^0, %8
|
|
// sub 16 (#16 is 16) from live rounded X count
|
|
sub32v %9, %9, #16
|
|
// if X count is not 0, keep looping
|
|
brnz32 loop128_x, %9
|
|
|
|
loop128_x_end:
|
|
// decrement Y count
|
|
sub32v %12, %12, #1
|
|
// if 0, finished
|
|
brz32 startX, %12
|
|
|
|
// add strides to initial addresses
|
|
add32v %10, %10, %4
|
|
// loop128 to do next line
|
|
brz32 loop128_y, #0
|
|
|
|
startX:
|
|
// do the first column if we need to
|
|
brz32 done128, %14
|
|
// set alignement; we shift by the addr offset
|
|
setmq %15, %0, %2
|
|
setmb %15, %1, #16
|
|
// copy Y
|
|
psa %9, %3
|
|
loopX_y:
|
|
// setadr from the start
|
|
setadr %15, %0
|
|
// load src
|
|
load256 %8, ^1
|
|
// write partial data
|
|
store128* %15, ^0, %8
|
|
// increment $SRC / $DST by stride
|
|
add32v %0, %0, %4
|
|
// decrement copied Y count
|
|
sub32v %9, %9, #1
|
|
// if not zero, continue
|
|
brnz32 loopX_y, %9
|
|
|
|
done128:
|
|
fin
|
|
fin
|
|
);
|
|
|
|
// ****** ********************************************************************************************************
|
|
|
|
let mut pos;
|
|
|
|
pos = 0;
|
|
println!("test code:");
|
|
while pos < mcode.len() {
|
|
print!("0x{:08x},", mcode[pos]);
|
|
pos = pos + 1;
|
|
}
|
|
println!("");
|
|
println!("-> {}", mcode.len());
|
|
|
|
pos = 0;
|
|
println!("scroll256:");
|
|
while pos < mcode_scroll256.len() {
|
|
print!("0x{:08x},", mcode_scroll256[pos]);
|
|
pos = pos + 1;
|
|
}
|
|
println!("");
|
|
println!("-> {}", mcode_scroll256.len());
|
|
|
|
pos = 0;
|
|
println!("scroll128:");
|
|
while pos < mcode_scroll128.len() {
|
|
print!("0x{:08x},", mcode_scroll128[pos]);
|
|
pos = pos + 1;
|
|
}
|
|
println!("");
|
|
println!("-> {}", mcode_scroll128.len());
|
|
|
|
pos = 0;
|
|
println!("fill128:");
|
|
while pos < mcode_fill128.len() {
|
|
print!("0x{:08x},", mcode_fill128[pos]);
|
|
pos = pos + 1;
|
|
}
|
|
println!("");
|
|
println!("-> {}", mcode_fill128.len());
|
|
|
|
pos = 0;
|
|
println!("fill256:");
|
|
while pos < mcode_fill256.len() {
|
|
print!("0x{:08x},", mcode_fill256[pos]);
|
|
pos = pos + 1;
|
|
}
|
|
println!("");
|
|
println!("-> {}", mcode_fill256.len());
|
|
|
|
pos = 0;
|
|
println!("fill:");
|
|
while pos < mcode_fill.len() {
|
|
print!("0x{:08x},", mcode_fill[pos]);
|
|
pos = pos + 1;
|
|
}
|
|
println!("");
|
|
println!("-> {}", mcode_fill.len());
|
|
|
|
pos = 0;
|
|
println!("fillrop:");
|
|
while pos < mcode_fillrop.len() {
|
|
print!("0x{:08x},", mcode_fillrop[pos]);
|
|
pos = pos + 1;
|
|
}
|
|
println!("");
|
|
println!("-> {}", mcode_fillrop.len());
|
|
|
|
pos = 0;
|
|
println!("copy:");
|
|
while pos < mcode_copy.len() {
|
|
print!("0x{:08x},", mcode_copy[pos]);
|
|
pos = pos + 1;
|
|
}
|
|
println!("");
|
|
println!("-> {}", mcode_copy.len());
|
|
|
|
pos = 0;
|
|
println!("copyrev:");
|
|
while pos < mcode_copyrev.len() {
|
|
print!("0x{:08x},", mcode_copyrev[pos]);
|
|
pos = pos + 1;
|
|
}
|
|
println!("");
|
|
println!("-> {}", mcode_copyrev.len());
|
|
|
|
Ok(())
|
|
}
|