#![recursion_limit = "768"]

extern crate jareth_as;
use jareth_as::*;

fn main() -> std::io::Result<()> {
    let mcode = assemble_jareth!(
        // 0..0 $DST / $DST / $SRC in %0
        // 0..0 $DST / $SRC / $DST in %1
        // size in %2
        // pattern in %3
        // -----
        // size & 15 in %5
        // size rounded down in %6
        // input in %7
        // output in %8
        // 0 in %15
    start:
        resm %15
        setadr %15, %0
        load256inc %7, ^0
        load256inc %8, ^1
        // slow
        setma %15, %0, #16
        // slow
        setmq %15, %1, #16
        and %5, %2, #15
        sub32v %6, %2, %5
        brz32 done, %6
    loop:
        psa %18, %7
        psa %19, %8
        psa* %8, %7
        psa %20, %8
        store128inc %15, ^2, %8
        sub32v %6, %6, #16
        brz32 last, %6
        loadh128inc %7, ^0, %7
        loadh128inc %8, ^1, %8
        brz32 loop, #0
    last:
        // FIXME: not if Q is aligned
        loadh128inc %8, ^1, %8
        store128inc %15, ^2, %8
    done:
        getadr %3
        getm %2
        fin
        fin
    );

    let mcode_scroll256 = assemble_jareth!(
        // x..x / $DST / $SRC in %0, aligned on 128 bits ; $DST < $SRC
        // x..x / X size in %2, multiple of 256 bits (32 bytes)
        // x..x / Y size in %3, arbitrary
        // x..x / dst_stride / src_stride in %4 (screen width)
        // -----
        // live X count in %5
        //
        // live Y count in %3
        // data in %7
        // 0/scrap in %15
    start:
        // reset masks (probably not necessary with the starred instructions)
        // resm %15
    loop_y:
        // set source and destination addresses for current Y, X=first
        setadr %15, %0
        psa %5, %2
    loop_x:
        // load from SRC w/ post-increment
        load256inc %7, ^0
        // store to DST w/ post-increment
        store256inc %15, ^1, %7
        // sub 32 (#5 is 32) from live X count
        sub32v %5, %5, #5
        // if X count is not 0, keep looping
        brnz32 loop_x, %5
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop to do next line
        brz32 loop_y, #0
    done:
        fin
        fin
    );

    let mcode_scroll128 = assemble_jareth!(
        // x..x / $DST / $SRC in %0, aligned on 128 bits ; $DST < $SRC
        // x..x / X size in %2, multiple of 128 bits (16 bytes)
        // x..x / Y size in %3, arbitrary
        // x..x / dst_stride / src_stride in %4 (screen width)
        // -----
        // live X count in %5
        //
        // live Y count in %3
        // data in %7
        // 0/scrap in %15
    start:
        // reset masks (probably not necessary with the starred instructions)
        // resm %15
    loop_y:
        // set source and destination addresses for current Y, X=first
        setadr %15, %0
        psa %5, %2
    loop_x:
        // load from SRC w/ post-increment
        load128inc %7, ^0
        // store to DST w/ post-increment
        store128inc %15, ^1, %7
        // sub 16 (#16 is 16) from live X count
        sub32v %5, %5, #16
        // if X count is not 0, keep looping
        brnz32 loop_x, %5
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop to do next line
        brz32 loop_y, #0
    done:
        fin
        fin
    );

    let mcode_fill128 = assemble_jareth!(
        // x..x / $DST in %0, aligned on 128 bits
        // 128-bit pattern in %1
        // x..x / X size in %2, multiple of 128 bits (16 bytes)
        // x..x / Y size in %3, arbitrary
        // x..x / dst_stride in %4 (screen width)
        // -----
        // live X count in %5
        //
        // live Y count in %3
        // data in %7
        // 0/scrap in %15
    start:
        // reset masks (probably not necessary with the starred instructions)
        // resm %15
    loop_y:
        // set destination address for current Y, X=first
        setadr %15, %0
        psa %5, %2
    loop_x:
        // store to DST w/ post-increment
        store128inc %15, ^0, %1
        // sub 16 (#16 is 16) from live X count
        sub32v %5, %5, #16
        // if X count is not 0, keep looping
        brnz32 loop_x, %5
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop to do next line
        brz32 loop_y, #0
    done:
        fin
        fin
    );
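    // For reference, a host-side model of what fill128 above computes, written
    // against a plain u32 buffer. A sketch only: it assumes X size and stride are
    // byte counts and that the pattern is a 16-byte block repeated verbatim, which
    // the microcode comments suggest but do not pin down.
    #[allow(dead_code)]
    fn fill128_model(fb: &mut [u32], dst: usize, pattern: [u32; 4], x: usize, y: usize, stride: usize) {
        let mut row = dst / 4; // word index of the first destination
        for _ in 0..y {
            let mut w = row;
            // X is a multiple of 16 bytes, i.e. one 4-word pattern per store128inc
            for _ in 0..x / 16 {
                fb[w..w + 4].copy_from_slice(&pattern);
                w += 4;
            }
            row += stride / 4; // advance one line by the destination stride
        }
    }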
    let mcode_fill256 = assemble_jareth!(
        // x..x / $DST in %0, aligned on 128 bits
        // 128-bit pattern in %1
        // x..x / X size in %2, multiple of 128 bits (16 bytes)
        // x..x / Y size in %3, arbitrary
        // x..x / dst_stride in %4 (screen width)
        // -----
        // live X count in %5
        //
        // live Y count in %3
        // data in %7
        // 0/scrap in %15
    start:
        // reset masks (probably not necessary with the starred instructions)
        resm %15
        // compute X leftovers (modulo 32 -> #6 is 31)
        and %6, %2, #6
        // set the leftovers mask (offset is 0 as we are aligned)
        setmq %15, #0, %6
    loop_y:
        // set destination address for current Y, X=first
        setadr %15, %0
        // then the rounded value in X
        sub32v %5, %2, %6
    loop_x:
        // store to DST w/ post-increment
        store256inc %15, ^0, %1
        // sub 32 (#5 is 32) from live X count
        sub32v %5, %5, #5
        // if X count is not 0, keep looping
        brnz32 loop_x, %5
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop to do next line
        brz32 loop_y, #0
    done:
        fin
        fin
    );

    // FILL ********************************************************************************************************
    let mcode_fill = assemble_jareth!(
        // x..x / $DST in %0
        // 128-bit pattern in %1 [assumed to be alignment-homogeneous]
        // x..x / X size in %2
        // x..x / Y size in %3
        // x..x / dst_stride in %4 (screen width?)
        // -----
        // main loop:
        // live X count in %5
        // leftover X in %6
        //
        // live Y count in %3
        // data in %7
        // masked data in %7
        // 0/scrap in %15
        // -----
        // header loop:
        // live Y count in %5
        // $DST in %6
        // data in %7
        // 0/scrap in %15
    start:
        // if the number of lines or of elements per line is 0, exit early
        brz32 done256, %2
        brz32 done256, %3
        // reset masks
        resm %15
        // if $DST is aligned on 128 bits, jump to the aligned loop
        brz4 start256, %0
        // do the first column
    startX:
        // set alignment; we shift by the addr offset, and we mask whatever data is needed in the first 32 bytes
        setmq %15, %0, %2
        // copy Y
        psa %5, %3
        // copy $DST
        psa %6, %0
    loopX_y:
        // setadr
        setadr %15, %6
        // write partial data
        store256* %15, ^0, %1
        // increment copied $DST by stride
        add32v %6, %6, %4
        // decrement copied Y count
        sub32v %5, %5, #1
        // if not zero, continue
        brnz32 loopX_y, %5
    loopX_done:
        // how much did we do (#6 is 31, #5 is 32)
        and %8, %0, #6
        // compute 32-(x&31)
        sub32v %8, #5, %8
        // compute the proper value
        min32v %8, %8, %2
        // add that to the address, which will now be aligned
        add32v %0, %0, %8
        // remove from X, as we have done it
        sub32v %2, %2, %8
        // rotate the pattern to match
        rotr32v %1, %1, %8
        // fall through to the aligned loop if not 0
        brz32 done256, %2
    start256:
        // compute X leftovers (modulo 32 -> #6 is 31)
        and %6, %2, #6
        // set the leftovers mask (offset is 0 as we are aligned)
        setmq %15, #0, %6
    loop256_y:
        // set the destination address for current Y
        setadr %15, %0
        // then the rounded value in X
        sub32v %5, %2, %6
        // already 0, bypass aligned stuff
        brz32 loop256_x_end, %5
    loop256_x:
        // store to DST w/ post-increment
        store256inc %15, ^0, %1
        // sub 32 (#5 is 32) from live rounded X count
        sub32v %5, %5, #5
        // if X count is not 0, keep looping
        brnz32 loop256_x, %5
        // check for line leftovers
    loop256_x_end:
        brz5 done256_x, %6
        // write partial data
        store256* %15, ^0, %1
    done256_x:
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done256, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop256 to do next line
        brz32 loop256_y, #0
    done256:
        fin
        fin
    );
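    // The alignment arithmetic in fill's header (loopX_done above) is the fiddly
    // part, so here is the same head/body/tail split in plain Rust; the names and
    // the byte-based view are illustrative only. head mirrors min(32 - (dst & 31), x),
    // tail mirrors the x & 31 leftovers handled by the masked store256*.
    #[allow(dead_code)]
    fn fill_split(dst: usize, x: usize) -> (usize, usize, usize) {
        let off = dst & 31;
        // head column, written under a mask; empty when already aligned
        let head = if off == 0 { 0 } else { (32 - off).min(x) };
        let rest = x - head;
        let tail = rest & 31;   // per-line leftovers, also written under a mask
        let body = rest - tail; // full 32-byte stores in loop256_x
        (head, body, tail)
    }
    // e.g. fill_split(5, 100) == (27, 64, 9)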
    // FILL ROP ********************************************************************************************************
    let mcode_fillrop = assemble_jareth!(
        // x..x / $DST in %0
        // 128-bit pattern in %1 [assumed to be alignment-homogeneous]
        // x..x / X size in %2
        // x..x / Y size in %3
        // x..x / dst_stride in %4 (screen width?)
        // x..x / rop / planemask in %5 [assumed to be alignment-homogeneous]
        // -----
        // main loop:
        // live X count in %8
        // leftover X in %6
        //
        // live Y count in %3
        // data in %7
        // masked data in %7
        // 0/scrap in %15
        // -----
        // header loop:
        // live Y count in %8
        // $DST in %6
        // data in %7
        // 0/scrap in %15
    start:
        // if the number of lines or of elements per line is 0, exit early
        brz32 done256, %2
        brz32 done256, %3
        // reset masks
        resm %15
        // set planemask / rop
        srop %15, %5
        // if $DST is aligned on 128 bits, jump to the aligned loop
        brz4 start256, %0
        // do the first column(s)
    startX:
        // set alignment; we shift by the addr offset, and we mask whatever data is needed in the first 32 bytes
        setmq %15, %0, %2
        // copy Y
        psa %8, %3
        // copy $DST
        psa %6, %0
    loopX_y:
        // setadr
        setadr %15, %6
        // load old data
        load256 %7, ^0
        // rop & insert
        rop32v* %7, %7, %1
        // rewrite data
        store256 %15, ^0, %7
        // increment copied $DST by stride
        add32v %6, %6, %4
        // decrement copied Y count
        sub32v %8, %8, #1
        // if not zero, continue
        brnz32 loopX_y, %8
    loopX_done:
        // how much did we do (#6 is 31, #5 is 32)
        and %8, %0, #6
        // compute 32-(x&31) - upper bound
        sub32v %8, #5, %8
        // compute the proper value
        min32v %8, %8, %2
        // add that to the address, which will now be aligned if there's stuff left to do
        add32v %0, %0, %8
        // remove from X, as we have done it
        sub32v %2, %2, %8
        // rotate the pattern to match
        rotr32v %1, %1, %8
        // fall through to the aligned loop if not 0, otherwise done
        brz32 done256, %2
    start256:
        // compute X leftovers (modulo 32 -> #6 is 31)
        and %6, %2, #6
        // set the leftovers mask (offset is 0 as we are aligned)
        setmq %15, #0, %6
    loop256_y:
        // set the destination address for current Y
        setadr %15, %0
        // then the rounded value in X
        sub32v %8, %2, %6
        // already 0, bypass aligned stuff
        brz32 loop256_x_end, %8
    loop256_x:
        // load data
        load256 %7, ^0
        // rop
        rop32v %7, %7, %1
        // store to DST w/ post-increment
        store256inc %15, ^0, %7
        // sub 32 (#5 is 32) from live rounded X count
        sub32v %8, %8, #5
        // if X count is not 0, keep looping
        brnz32 loop256_x, %8
        // check for line leftovers
    loop256_x_end:
        brz5 done256_x, %6
        // load old data
        load256 %7, ^0
        // insert pattern
        rop32v* %7, %7, %1
        // rewrite data
        store256 %15, ^0, %7
    done256_x:
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done256, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop256 to do next line
        brz32 loop256_y, #0
    done256:
        fin
        fin
    );

    // COPY ********************************************************************************************************
    let mcode_copy = assemble_jareth!(
        // x..x / $SRC / $DST in %0
        // x..x / $DST / $SRC in %1
        // x..x / X size in %2
        // x..x / Y size in %3
        // x..x / src_stride / dst_stride in %4 (screen width?)
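        // structure: as in fill, a header loop first writes the unaligned head
        // column down every Y line under a mask, then an aligned loop moves 16
        // bytes per iteration with data prefetched one load ahead, and the x & 15
        // leftovers of each line are written under the Q mask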
        // -----
        // main loop:
        // live X count in %9
        // leftover X in %6
        //
        // live Y count in %3
        // data in %7
        //
        // masked data in %7
        // 0/scrap in %15
        // -----
        // header loop:
        // live Y count in %9
        // $SRC / $DST in %6
        //
        // dst data in %7
        // src data in %8
        // 0/scrap in %15
    start:
        // if the number of lines or of elements per line is 0, exit early
        brz32 done128, %2
        brz32 done128, %3
        // reset masks
        resm %15
        // set alignment; we shift by the addr offset
        setmq %15, %0, %2
        // we use b as that's the data input for Stores
        setmb %15, %1, #16
        // if $DST is aligned on 128 bits, jump to the aligned loop
        brz4 start128, %0
        // do the first column to align $DST
    startX:
        // copy Y
        psa %9, %3
        // copy $SRC / $DST
        psa %6, %0
    loopX_y:
        // setadr
        setadr %15, %6
        // load src
        load256 %8, ^1
        // write partial data
        store128* %15, ^0, %8
        // increment copied $SRC / $DST by stride
        add32v %6, %6, %4
        // decrement copied Y count
        sub32v %9, %9, #1
        // if not zero, continue
        brnz32 loopX_y, %9
    loopX_done:
        // how much did we do (#15 is 15, #16 is 16)
        and %9, %0, #15
        // compute 16-(x&15)
        sub32v %9, #16, %9
        // compute the proper value
        min32v %9, %9, %2
        // more than one address to increment
        bcast32 %9, %9
        // add the count to the addresses, ^0 will now be aligned
        add32v %0, %0, %9
        // remove from X, as we have done it
        sub32v %2, %2, %9
        // fall through to the aligned loop if not 0
        brz32 done128, %2
        // reset q mask (we will be aligned from now on)
        setmq %15, #0, #16
        // add the count to the addresses, ^1 will have the proper shift for masking
        add32v %1, %1, %9
        // reset b mask to the proper shifting
        setmb %15, %1, #16
    start128:
        // compute X leftovers (modulo 16 -> #15 is 15)
        and %6, %2, #15
    loop128_y:
        // set source and destination addresses for current Y
        setadr %15, %0
        // then the rounded value in X
        sub32v %9, %2, %6
        // prefetch data
        load256inc %8, ^1
        // already 0, bypass aligned stuff
        brz32 loop128_x_end, %9
    loop128_x:
        // store to DST w/ post-increment
        store128inc* %15, ^0, %8
        // sub 16 (#16 is 16) from live rounded X count
        sub32v %9, %9, #16
        // prefetch data
        loadh128inc %8, ^1, %8
        // if X count is not 0, keep looping
        brnz32 loop128_x, %9
        // check for line leftovers
    loop128_x_end:
        brz4 done128_x, %6
        // set the leftovers mask (offset is 0 as we are aligned)
        // IMPROVE ME
        setmq %15, #0, %6
        // rewrite data
        store128* %15, ^0, %8
        // reset the Q mask
        // IMPROVE ME
        setmq %15, #0, #16
    done128_x:
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done128, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop128 to do next line
        brz32 loop128_y, #0
    done128:
        fin
        fin
    );

    // COPYREV ********************************************************************************************************
    let mcode_copyrev = assemble_jareth!(
        // x..x / $SRC / $DST in %0
        // x..x / $DST / $SRC in %1
        // x..x / X size in %2
        // x..x / Y size in %3
        // x..x / src_stride / dst_stride in %4 (screen width?)
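        // note: the *dec loads/stores below walk each line right-to-left, so this
        // copy stays correct when the two regions overlap with $DST above $SRC
        // (the case the forward copy would clobber); the unaligned first column is
        // handled last, by the tail loop at startX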
        // -----
        // main loop:
        // leftover X in %6
        // data in %7
        //
        // masked data in %7
        // src data in %8
        // live X count in %9
        // $SRC / $DST in %10
        // $DST / $SRC in %11
        // live Y count in %12, also scratch in header
        // todo X count in %13
        // amount of work in tail in %14
        // 0/scrap in %15
        // -----
        // tail loop:
        // $SRC / $DST in %0
        //
        // dst data in %7
        // src data in %8
        // live Y count in %9
        // 0/scrap in %15
    start:
        // if the number of lines or of elements per line is 0, exit early
        brz32 done128, %2
        brz32 done128, %3
        // reset masks
        resm %15
        // copy addresses
        psa %10, %0
        psa %11, %1
        // set todo X
        psa %13, %2
        // compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16), first the offset
        and %14, %0, #15
        // if 0, then we don't need a tail loop, so skip extra computation (that would wrongly give 16)
        brz32 skip, %14
        // it is at most 16-($DST & 15)
        sub32v %14, #16, %14
        // compute the proper value by bounding to X size
        min32v %14, %14, %2
        // more than one address to increment
        bcast32 %14, %14
        // add the count to the addresses, DST will now be aligned
        add32v %10, %10, %14
        // add the count to the addresses, SRC will have the proper alignment to shift input in the aligned loop
        add32v %11, %11, %14
        // so, do we do everything there?
        sub32v %13, %2, %14
        // if 0, we do everything in the tail; skip the aligned loop
        brz32 startX, %13
    skip:
        // reset q mask (we will be aligned from now on)
        setmq %15, #0, #16
        // set b mask to the proper shifting for Stores
        setmb %15, %11, #16
        // now we need to figure out where we start to go backward
        // currently we have the number of 'tail' (first column...) elements in %14 (0 for aligned),
        // the number of 'loop' elements in %13,
        // and $SRC+%14 & $DST+%14 in %10/%11 with $SRC+%14 aligned.
        // compute X leftovers (%13 modulo 16 -> #15 is 15) in %6, we will have to start with those
        and %6, %13, #15
        // compute the 'aligned' number of elements
        sub32v %15, %13, %6
        bcast32 %15, %15
        // add the aligned number of elements to $SRC+%14 & $DST+%14
        add32v %10, %10, %15
        add32v %11, %11, %15
        // if %6 is 0 (no leftovers), then $DST is pointing after the last element,
        // so we need to remove 16 from $DST and $SRC
        brnz32 skip2, %6
        psa %15, #16
        bcast32 %15, %15
        sub32v %10, %10, %15
        sub32v %11, %11, %15
    skip2:
        // // if $SRC+%13 is not aligned, we also need to add 16 (for prefetch)
        // add32v %15, %11, %6
        // and %15, %15, #15
        // brz32 skip3, %15
        add32v %11, %11, #16
        psa %15, #16
        swap32 %15, %15
        add32v %10, %10, %15
        // add32v %15, %6, #16
        // add32v %11, %11, %15
        // swap32 %15, %15
        // add32v %10, %10, %15
    skip3:
        // copy Y count
        psa %12, %3
    loop128_y:
        // set source and destination addresses for current Y
        setadr %15, %10
        // then the rounded value in X
        sub32v %9, %13, %6
        // prefetch data
        load128dec %8, ^1
        // check for line leftovers
    loop128_x_begin:
        brz4 loop128_x, %6
        // set the leftovers mask (offset is 0 as we are aligned)
        // IMPROVE ME
        setmq %15, #0, %6
        // prefetch data
        loadl128dec %8, ^1, %8
        // write partial data
        store128dec* %15, ^0, %8
        // reset the Q mask
        // IMPROVE ME
        setmq %15, #0, #16
    loop128_x:
        // already 0, bypass aligned stuff
        brz32 loop128_x_end, %9
        // prefetch data
        loadl128dec %8, ^1, %8
        // write data
        store128dec* %15, ^0, %8
        // sub 16 (#16 is 16) from live rounded X count
        sub32v %9, %9, #16
        // if X count is not 0, keep looping
        brnz32 loop128_x, %9
    loop128_x_end:
        // decrement Y count
        sub32v %12, %12, #1
        // if 0, finished
        brz32 startX, %12
        // add strides to initial addresses
        add32v %10, %10, %4
        // loop128 to do next line
        brz32 loop128_y, #0
    startX:
        // do the first column if we need to
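        // %14 still holds the first-column count computed in the header (the
        // 'tail' work); it is 0 when $DST was already aligned, in which case the
        // branch below skips straight to done128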
        brz32 done128, %14
        // set alignment; we shift by the addr offset
        setmq %15, %0, %2
        setmb %15, %1, #16
        // copy Y
        psa %9, %3
    loopX_y:
        // setadr from the start
        setadr %15, %0
        // load src
        load256 %8, ^1
        // write partial data
        store128* %15, ^0, %8
        // increment $SRC / $DST by stride
        add32v %0, %0, %4
        // decrement copied Y count
        sub32v %9, %9, #1
        // if not zero, continue
        brnz32 loopX_y, %9
    done128:
        fin
        fin
    );

    // ****** ********************************************************************************************************

    let mut pos = 0;
    println!("test code:");
    while pos < mcode.len() {
        print!("0x{:08x},", mcode[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode.len());

    pos = 0;
    println!("scroll256:");
    while pos < mcode_scroll256.len() {
        print!("0x{:08x},", mcode_scroll256[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_scroll256.len());

    pos = 0;
    println!("scroll128:");
    while pos < mcode_scroll128.len() {
        print!("0x{:08x},", mcode_scroll128[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_scroll128.len());

    pos = 0;
    println!("fill128:");
    while pos < mcode_fill128.len() {
        print!("0x{:08x},", mcode_fill128[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_fill128.len());

    pos = 0;
    println!("fill256:");
    while pos < mcode_fill256.len() {
        print!("0x{:08x},", mcode_fill256[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_fill256.len());

    pos = 0;
    println!("fill:");
    while pos < mcode_fill.len() {
        print!("0x{:08x},", mcode_fill[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_fill.len());

    pos = 0;
    println!("fillrop:");
    while pos < mcode_fillrop.len() {
        print!("0x{:08x},", mcode_fillrop[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_fillrop.len());

    pos = 0;
    println!("copy:");
    while pos < mcode_copy.len() {
        print!("0x{:08x},", mcode_copy[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_copy.len());

    pos = 0;
    println!("copyrev:");
    while pos < mcode_copyrev.len() {
        print!("0x{:08x},", mcode_copyrev[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_copyrev.len());

    Ok(())
}
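// The nine dump loops in main() are identical up to the buffer they print; they
// could be collapsed into a helper along these lines, assuming the buffers
// produced by assemble_jareth! can be borrowed as &[u32] (which this file does
// not establish):
#[allow(dead_code)]
fn dump(name: &str, words: &[u32]) {
    println!("{}:", name);
    for w in words {
        print!("0x{:08x},", w);
    }
    println!();
    println!("-> {}", words.len());
}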