#![recursion_limit = "768"]

extern crate jareth_as;
use jareth_as::*;

fn main() -> std::io::Result<()> {
    let mcode = assemble_jareth!(
        // 0..0 $DST / $DST / $SRC in %0
        // 0..0 $DST / $SRC / $DST in %1
        // size in %2
        // pattern in %3
        // -----
        // size & 15 in %5
        // size rounded down in %6
        // input in %7
        // output in %8
        // 0 in %15
    start:
        resm %15
        setadr %15, %0
        load256inc %7, ^0
        load256inc %8, ^1
        // slow
        setma %15, %0, #16
        // slow
        setmq %15, %1, #16
        and %5, %2, #15
        sub32v %6, %2, %5
        brz32 done, %6
    loop:
        psa %18, %7
        psa %19, %8
        psa* %8, %7
        psa %20, %8
        store128inc %15, ^2, %8
        sub32v %6, %6, #16
        brz32 last, %6
        loadh128inc %7, ^0, %7
        loadh128inc %8, ^1, %8
        brz32 loop, #0
    last:
        // FIXME: not if Q is aligned
        loadh128inc %8, ^1, %8
        store128inc %15, ^2, %8
    done:
        getadr %3
        getm %2
        fin
        fin
    );

    let mcode_scroll256 = assemble_jareth!(
        // x..x / $DST / $SRC in %0, aligned on 128 bits ; $DST < $SRC
        // x..x / X size in %2, multiple of 256 bits (32 bytes)
        // x..x / Y size in %3, arbitrary
        // x..x / dst_stride / src_stride in %4 (screen width)
        // -----
        // live X count in %5
        //
        // live Y count in %3
        // data in %7
        // 0/scrap in %15
    start:
        // reset masks (probably not necessary with the starred instructions)
        // resm %15
    loop_y:
        // set source and destination addresses for current Y, X=first
        setadr %15, %0
        psa %5, %2
    loop_x:
        // load from SRC w/ post-increment
        load256inc %7, ^0
        // store to DST w/ post-increment
        store256inc %15, ^1, %7
        // sub 32 (#5 is 32) from live X count
        sub32v %5, %5, #5
        // if X count is not 0, keep looping
        brnz32 loop_x, %5
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop to do next line
        brz32 loop_y, #0
    done:
        fin
        fin
    );

    let mcode_scroll128 = assemble_jareth!(
        // x..x / $DST / $SRC in %0, aligned on 128 bits ; $DST < $SRC
        // x..x / X size in %2, multiple of 128 bits (16 bytes)
        // x..x / Y size in %3, arbitrary
        // x..x / dst_stride / src_stride in %4 (screen width)
        // -----
        // live X count in %5
        //
        // live Y count in %3
        // data in %7
        // 0/scrap in %15
    start:
        // reset masks (probably not necessary with the starred instructions)
        // resm %15
    loop_y:
        // set source and destination addresses for current Y, X=first
        setadr %15, %0
        psa %5, %2
    loop_x:
        // load from SRC w/ post-increment
        load128inc %7, ^0
        // store to DST w/ post-increment
        store128inc %15, ^1, %7
        // sub 16 (#16 is 16) from live X count
        sub32v %5, %5, #16
        // if X count is not 0, keep looping
        brnz32 loop_x, %5
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop to do next line
        brz32 loop_y, #0
    done:
        fin
        fin
    );

    let mcode_fill128 = assemble_jareth!(
        // x..x / $DST in %0, aligned on 128 bits
        // 128-bit pattern in %1
        // x..x / X size in %2, multiple of 128 bits (16 bytes)
        // x..x / Y size in %3, arbitrary
        // x..x / dst_stride in %4 (screen width)
        // -----
        // live X count in %5
        //
        // live Y count in %3
        // data in %7
        // 0/scrap in %15
    start:
        // reset masks (probably not necessary with the starred instructions)
        // resm %15
    loop_y:
        // set destination address for current Y, X=first
        setadr %15, %0
        psa %5, %2
    loop_x:
        // store to DST w/ post-increment
        store128inc %15, ^0, %1
        // sub 16 (#16 is 16) from live X count
        sub32v %5, %5, #16
        // if X count is not 0, keep looping
        brnz32 loop_x, %5
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop to do next line
        brz32 loop_y, #0
    done:
        fin
        fin
    );
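    // For reference, a host-side model of what fill128 above computes, written
    // against a plain u32 buffer. A sketch only: it assumes X size and stride are
    // byte counts and that the pattern is a 16-byte block repeated verbatim, which
    // the microcode comments suggest but do not pin down.
    #[allow(dead_code)]
    fn fill128_model(fb: &mut [u32], dst: usize, pattern: [u32; 4], x: usize, y: usize, stride: usize) {
        let mut row = dst / 4; // word index of the first destination
        for _ in 0..y {
            let mut w = row;
            // X is a multiple of 16 bytes, i.e. one 4-word pattern per store128inc
            for _ in 0..x / 16 {
                fb[w..w + 4].copy_from_slice(&pattern);
                w += 4;
            }
            row += stride / 4; // advance one line by the destination stride
        }
    }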
    let mcode_fill256 = assemble_jareth!(
        // x..x / $DST in %0, aligned on 128 bits
        // 128-bit pattern in %1
        // x..x / X size in %2, multiple of 128 bits (16 bytes)
        // x..x / Y size in %3, arbitrary
        // x..x / dst_stride in %4 (screen width)
        // -----
        // live X count in %5
        //
        // live Y count in %3
        // data in %7
        // 0/scrap in %15
    start:
        // reset masks (probably not necessary with the starred instructions)
        resm %15
        // compute X leftovers (modulo 32 -> #6 is 31)
        and %6, %2, #6
        // set the leftovers mask (offset is 0 as we are aligned)
        setmq %15, #0, %6
    loop_y:
        // set destination address for current Y, X=first
        setadr %15, %0
        // then the rounded value in X
        sub32v %5, %2, %6
    loop_x:
        // store to DST w/ post-increment
        store256inc %15, ^0, %1
        // sub 32 (#5 is 32) from live X count
        sub32v %5, %5, #5
        // if X count is not 0, keep looping
        brnz32 loop_x, %5
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop to do next line
        brz32 loop_y, #0
    done:
        fin
        fin
    );

    // FILL ********************************************************************************************************
    let mcode_fill = assemble_jareth!(
        // x..x / $DST in %0
        // 128-bit pattern in %1 [assumed to be alignment-homogeneous]
        // x..x / X size in %2
        // x..x / Y size in %3
        // x..x / dst_stride in %4 (screen width?)
        // -----
        // main loop:
        // live X count in %5
        // leftover X in %6
        //
        // live Y count in %3
        // data in %7
        // masked data in %7
        // 0/scrap in %15
        // -----
        // header loop:
        // live Y count in %5
        // $DST in %6
        // data in %7
        // 0/scrap in %15
    start:
        // if the number of lines or of elements per line is 0, exit early
        brz32 done256, %2
        brz32 done256, %3
        // reset masks
        resm %15
        // if $DST is aligned on 128 bits, jump to the aligned loop
        brz4 start256, %0
        // do the first column
    startX:
        // set alignment; we shift by the addr offset, and we mask whatever data is needed in the first 32 bytes
        setmq %15, %0, %2
        // copy Y
        psa %5, %3
        // copy $DST
        psa %6, %0
    loopX_y:
        // setadr
        setadr %15, %6
        // write partial data
        store256* %15, ^0, %1
        // increment copied $DST by stride
        add32v %6, %6, %4
        // decrement copied Y count
        sub32v %5, %5, #1
        // if not zero, continue
        brnz32 loopX_y, %5
    loopX_done:
        // how much did we do (#6 is 31, #5 is 32)
        and %8, %0, #6
        // compute 32-(x&31)
        sub32v %8, #5, %8
        // compute the proper value
        min32v %8, %8, %2
        // add that to the address, which will now be aligned
        add32v %0, %0, %8
        // remove from X, as we have done it
        sub32v %2, %2, %8
        // rotate the pattern to match
        rotr32v %1, %1, %8
        // fall through to the aligned loop if not 0
        brz32 done256, %2
    start256:
        // compute X leftovers (modulo 32 -> #6 is 31)
        and %6, %2, #6
        // set the leftovers mask (offset is 0 as we are aligned)
        setmq %15, #0, %6
    loop256_y:
        // set the destination address for current Y
        setadr %15, %0
        // then the rounded value in X
        sub32v %5, %2, %6
        // already 0, bypass aligned stuff
        brz32 loop256_x_end, %5
    loop256_x:
        // store to DST w/ post-increment
        store256inc %15, ^0, %1
        // sub 32 (#5 is 32) from live rounded X count
        sub32v %5, %5, #5
        // if X count is not 0, keep looping
        brnz32 loop256_x, %5
        // check for line leftovers
    loop256_x_end:
        brz5 done256_x, %6
        // write partial data
        store256* %15, ^0, %1
    done256_x:
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done256, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop256 to do next line
        brz32 loop256_y, #0
    done256:
        fin
        fin
    );
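    // The alignment arithmetic in fill's header (loopX_done above) is the fiddly
    // part, so here is the same head/body/tail split in plain Rust; the names and
    // the byte-based view are illustrative only. head mirrors min(32 - (dst & 31), x),
    // tail mirrors the x & 31 leftovers handled by the masked store256*.
    #[allow(dead_code)]
    fn fill_split(dst: usize, x: usize) -> (usize, usize, usize) {
        let off = dst & 31;
        // head column, written under a mask; empty when already aligned
        let head = if off == 0 { 0 } else { (32 - off).min(x) };
        let rest = x - head;
        let tail = rest & 31;   // per-line leftovers, also written under a mask
        let body = rest - tail; // full 32-byte stores in loop256_x
        (head, body, tail)
    }
    // e.g. fill_split(5, 100) == (27, 64, 9)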
    // FILL ROP ********************************************************************************************************
    let mcode_fillrop = assemble_jareth!(
        // x..x / $DST in %0
        // 128-bit pattern in %1 [assumed to be alignment-homogeneous]
        // x..x / X size in %2
        // x..x / Y size in %3
        // x..x / dst_stride in %4 (screen width?)
        // x..x / rop / planemask in %5 [assumed to be alignment-homogeneous]
        // -----
        // main loop:
        // live X count in %8
        // leftover X in %6
        //
        // live Y count in %3
        // data in %7
        // masked data in %7
        // 0/scrap in %15
        // -----
        // header loop:
        // live Y count in %8
        // $DST in %6
        // data in %7
        // 0/scrap in %15
    start:
        // if the number of lines or of elements per line is 0, exit early
        brz32 done256, %2
        brz32 done256, %3
        // reset masks
        resm %15
        // set planemask / rop
        srop %15, %5
        // if $DST is aligned on 128 bits, jump to the aligned loop
        brz4 start256, %0
        // do the first column(s)
    startX:
        // set alignment; we shift by the addr offset, and we mask whatever data is needed in the first 32 bytes
        setmq %15, %0, %2
        // copy Y
        psa %8, %3
        // copy $DST
        psa %6, %0
    loopX_y:
        // setadr
        setadr %15, %6
        // load old data
        load256 %7, ^0
        // rop & insert
        rop32v* %7, %7, %1
        // rewrite data
        store256 %15, ^0, %7
        // increment copied $DST by stride
        add32v %6, %6, %4
        // decrement copied Y count
        sub32v %8, %8, #1
        // if not zero, continue
        brnz32 loopX_y, %8
    loopX_done:
        // how much did we do (#6 is 31, #5 is 32)
        and %8, %0, #6
        // compute 32-(x&31) - upper bound
        sub32v %8, #5, %8
        // compute the proper value
        min32v %8, %8, %2
        // add that to the address, which will now be aligned if there's stuff left to do
        add32v %0, %0, %8
        // remove from X, as we have done it
        sub32v %2, %2, %8
        // rotate the pattern to match
        rotr32v %1, %1, %8
        // fall through to the aligned loop if not 0, otherwise done
        brz32 done256, %2
    start256:
        // compute X leftovers (modulo 32 -> #6 is 31)
        and %6, %2, #6
        // set the leftovers mask (offset is 0 as we are aligned)
        setmq %15, #0, %6
    loop256_y:
        // set the destination address for current Y
        setadr %15, %0
        // then the rounded value in X
        sub32v %8, %2, %6
        // already 0, bypass aligned stuff
        brz32 loop256_x_end, %8
    loop256_x:
        // load data
        load256 %7, ^0
        // rop
        rop32v %7, %7, %1
        // store to DST w/ post-increment
        store256inc %15, ^0, %7
        // sub 32 (#5 is 32) from live rounded X count
        sub32v %8, %8, #5
        // if X count is not 0, keep looping
        brnz32 loop256_x, %8
        // check for line leftovers
    loop256_x_end:
        brz5 done256_x, %6
        // load old data
        load256 %7, ^0
        // insert pattern
        rop32v* %7, %7, %1
        // rewrite data
        store256 %15, ^0, %7
    done256_x:
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done256, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop256 to do next line
        brz32 loop256_y, #0
    done256:
        fin
        fin
    );

    // COPY ********************************************************************************************************
    let mcode_copy = assemble_jareth!(
        // x..x / $SRC / $DST in %0
        // x..x / $DST / $SRC in %1
        // x..x / X size in %2
        // x..x / Y size in %3
        // x..x / src_stride / dst_stride in %4 (screen width?)
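        // structure: as in fill, a header loop first writes the unaligned head
        // column down every Y line under a mask, then an aligned loop moves 16
        // bytes per iteration with data prefetched one load ahead, and the x & 15
        // leftovers of each line are written under the Q mask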
        // -----
        // main loop:
        // live X count in %9
        // leftover X in %6
        //
        // live Y count in %3
        // data in %7
        //
        // masked data in %7
        // 0/scrap in %15
        // -----
        // header loop:
        // live Y count in %9
        // $SRC / $DST in %6
        //
        // dst data in %7
        // src data in %8
        // 0/scrap in %15
    start:
        // if the number of lines or of elements per line is 0, exit early
        brz32 done128, %2
        brz32 done128, %3
        // reset masks
        resm %15
        // set alignment; we shift by the addr offset
        setmq %15, %0, %2
        // we use b as that's the data input for Stores
        setmb %15, %1, #16
        // if $DST is aligned on 128 bits, jump to the aligned loop
        brz4 start128, %0
        // do the first column to align $DST
    startX:
        // copy Y
        psa %9, %3
        // copy $SRC / $DST
        psa %6, %0
    loopX_y:
        // setadr
        setadr %15, %6
        // load src
        load256 %8, ^1
        // write partial data
        store128* %15, ^0, %8
        // increment copied $SRC / $DST by stride
        add32v %6, %6, %4
        // decrement copied Y count
        sub32v %9, %9, #1
        // if not zero, continue
        brnz32 loopX_y, %9
    loopX_done:
        // how much did we do (#15 is 15, #16 is 16)
        and %9, %0, #15
        // compute 16-(x&15)
        sub32v %9, #16, %9
        // compute the proper value
        min32v %9, %9, %2
        // more than one address to increment
        bcast32 %9, %9
        // add the count to the addresses, ^0 will now be aligned
        add32v %0, %0, %9
        // remove from X, as we have done it
        sub32v %2, %2, %9
        // fall through to the aligned loop if not 0
        brz32 done128, %2
        // reset q mask (we will be aligned from now on)
        setmq %15, #0, #16
        // add the count to the addresses, ^1 will have the proper shift for masking
        add32v %1, %1, %9
        // reset b mask to the proper shifting
        setmb %15, %1, #16
    start128:
        // compute X leftovers (modulo 16 -> #15 is 15)
        and %6, %2, #15
    loop128_y:
        // set source and destination addresses for current Y
        setadr %15, %0
        // then the rounded value in X
        sub32v %9, %2, %6
        // prefetch data
        load256inc %8, ^1
        // already 0, bypass aligned stuff
        brz32 loop128_x_end, %9
    loop128_x:
        // store to DST w/ post-increment
        store128inc* %15, ^0, %8
        // sub 16 (#16 is 16) from live rounded X count
        sub32v %9, %9, #16
        // prefetch data
        loadh128inc %8, ^1, %8
        // if X count is not 0, keep looping
        brnz32 loop128_x, %9
        // check for line leftovers
    loop128_x_end:
        brz4 done128_x, %6
        // set the leftovers mask (offset is 0 as we are aligned)
        // IMPROVE ME
        setmq %15, #0, %6
        // rewrite data
        store128* %15, ^0, %8
        // reset the Q mask
        // IMPROVE ME
        setmq %15, #0, #16
    done128_x:
        // decrement Y count
        sub32v %3, %3, #1
        // if 0, finished
        brz32 done128, %3
        // add strides to initial addresses
        add32v %0, %0, %4
        // loop128 to do next line
        brz32 loop128_y, #0
    done128:
        fin
        fin
    );

    // COPYREV ********************************************************************************************************
    let mcode_copyrev = assemble_jareth!(
        // x..x / $SRC / $DST in %0
        // x..x / $DST / $SRC in %1
        // x..x / X size in %2
        // x..x / Y size in %3
        // x..x / src_stride / dst_stride in %4 (screen width?)
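        // note: the *dec loads/stores below walk each line right-to-left, so this
        // copy stays correct when the two regions overlap with $DST above $SRC
        // (the case the forward copy would clobber); the unaligned first column is
        // handled last, by the tail loop at startX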
        // -----
        // main loop:
        // leftover X in %6
        // data in %7
        //
        // masked data in %7
        // src data in %8
        // live X count in %9
        // $SRC / $DST in %10
        // $DST / $SRC in %11
        // live Y count in %12, also scratch in header
        // todo X count in %13
        // amount of work in tail in %14
        // 0/scrap in %15
        // -----
        // tail loop:
        // $SRC / $DST in %0
        //
        // dst data in %7
        // src data in %8
        // live Y count in %9
        // 0/scrap in %15
    start:
        // if the number of lines or of elements per line is 0, exit early
        brz32 done128, %2
        brz32 done128, %3
        // reset masks
        resm %15
        // copy addresses
        psa %10, %0
        psa %11, %1
        // set todo X
        psa %13, %2
        // compute how much the tail loop will handle (first column) (#15 is 15, #16 is 16), first the offset
        and %14, %0, #15
        // if 0, then we don't need a tail loop, so skip extra computation (that would wrongly give 16)
        brz32 skip, %14
        // it is at most 16-($DST & 15)
        sub32v %14, #16, %14
        // compute the proper value by bounding to X size
        min32v %14, %14, %2
        // more than one address to increment
        bcast32 %14, %14
        // add the count to the addresses, DST will now be aligned
        add32v %10, %10, %14
        // add the count to the addresses, SRC will have the proper alignment to shift input in the aligned loop
        add32v %11, %11, %14
        // so, do we do everything there?
        sub32v %13, %2, %14
        // if 0, we do everything in the tail; skip the aligned loop
        brz32 startX, %13
    skip:
        // reset q mask (we will be aligned from now on)
        setmq %15, #0, #16
        // set b mask to the proper shifting for Stores
        setmb %15, %11, #16
        // now we need to figure out where we start to go backward
        // currently we have the number of 'tail' (first column...) elements in %14 (0 for aligned),
        // the number of 'loop' elements in %13,
        // and $SRC+%14 & $DST+%14 in %10/%11 with $SRC+%14 aligned.
        // compute X leftovers (%13 modulo 16 -> #15 is 15) in %6, we will have to start with those
        and %6, %13, #15
        // compute the 'aligned' number of elements
        sub32v %15, %13, %6
        bcast32 %15, %15
        // add the aligned number of elements to $SRC+%14 & $DST+%14
        add32v %10, %10, %15
        add32v %11, %11, %15
        // if %6 is 0 (no leftovers), then $DST is pointing after the last element,
        // so we need to remove 16 from $DST and $SRC
        brnz32 skip2, %6
        psa %15, #16
        bcast32 %15, %15
        sub32v %10, %10, %15
        sub32v %11, %11, %15
    skip2:
        // // if $SRC+%13 is not aligned, we also need to add 16 (for prefetch)
        // add32v %15, %11, %6
        // and %15, %15, #15
        // brz32 skip3, %15
        add32v %11, %11, #16
        psa %15, #16
        swap32 %15, %15
        add32v %10, %10, %15
        // add32v %15, %6, #16
        // add32v %11, %11, %15
        // swap32 %15, %15
        // add32v %10, %10, %15
    skip3:
        // copy Y count
        psa %12, %3
    loop128_y:
        // set source and destination addresses for current Y
        setadr %15, %10
        // then the rounded value in X
        sub32v %9, %13, %6
        // prefetch data
        load128dec %8, ^1
        // check for line leftovers
    loop128_x_begin:
        brz4 loop128_x, %6
        // set the leftovers mask (offset is 0 as we are aligned)
        // IMPROVE ME
        setmq %15, #0, %6
        // prefetch data
        loadl128dec %8, ^1, %8
        // write partial data
        store128dec* %15, ^0, %8
        // reset the Q mask
        // IMPROVE ME
        setmq %15, #0, #16
    loop128_x:
        // already 0, bypass aligned stuff
        brz32 loop128_x_end, %9
        // prefetch data
        loadl128dec %8, ^1, %8
        // write data
        store128dec* %15, ^0, %8
        // sub 16 (#16 is 16) from live rounded X count
        sub32v %9, %9, #16
        // if X count is not 0, keep looping
        brnz32 loop128_x, %9
    loop128_x_end:
        // decrement Y count
        sub32v %12, %12, #1
        // if 0, finished
        brz32 startX, %12
        // add strides to initial addresses
        add32v %10, %10, %4
        // loop128 to do next line
        brz32 loop128_y, #0
    startX:
        // do the first column if we need to
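        // %14 still holds the first-column count computed in the header (the
        // 'tail' work); it is 0 when $DST was already aligned, in which case the
        // branch below skips straight to done128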
        brz32 done128, %14
        // set alignment; we shift by the addr offset
        setmq %15, %0, %2
        setmb %15, %1, #16
        // copy Y
        psa %9, %3
    loopX_y:
        // setadr from the start
        setadr %15, %0
        // load src
        load256 %8, ^1
        // write partial data
        store128* %15, ^0, %8
        // increment $SRC / $DST by stride
        add32v %0, %0, %4
        // decrement copied Y count
        sub32v %9, %9, #1
        // if not zero, continue
        brnz32 loopX_y, %9
    done128:
        fin
        fin
    );

    // ****** ********************************************************************************************************

    let mut pos = 0;
    println!("test code:");
    while pos < mcode.len() {
        print!("0x{:08x},", mcode[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode.len());

    pos = 0;
    println!("scroll256:");
    while pos < mcode_scroll256.len() {
        print!("0x{:08x},", mcode_scroll256[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_scroll256.len());

    pos = 0;
    println!("scroll128:");
    while pos < mcode_scroll128.len() {
        print!("0x{:08x},", mcode_scroll128[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_scroll128.len());

    pos = 0;
    println!("fill128:");
    while pos < mcode_fill128.len() {
        print!("0x{:08x},", mcode_fill128[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_fill128.len());

    pos = 0;
    println!("fill256:");
    while pos < mcode_fill256.len() {
        print!("0x{:08x},", mcode_fill256[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_fill256.len());

    pos = 0;
    println!("fill:");
    while pos < mcode_fill.len() {
        print!("0x{:08x},", mcode_fill[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_fill.len());

    pos = 0;
    println!("fillrop:");
    while pos < mcode_fillrop.len() {
        print!("0x{:08x},", mcode_fillrop[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_fillrop.len());

    pos = 0;
    println!("copy:");
    while pos < mcode_copy.len() {
        print!("0x{:08x},", mcode_copy[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_copy.len());

    pos = 0;
    println!("copyrev:");
    while pos < mcode_copyrev.len() {
        print!("0x{:08x},", mcode_copyrev[pos]);
        pos += 1;
    }
    println!();
    println!("-> {}", mcode_copyrev.len());

    Ok(())
}
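// The nine dump loops in main() are identical up to the buffer they print; they
// could be collapsed into a helper along these lines, assuming the buffers
// produced by assemble_jareth! can be borrowed as &[u32] (which this file does
// not establish):
#[allow(dead_code)]
fn dump(name: &str, words: &[u32]) {
    println!("{}:", name);
    for w in words {
        print!("0x{:08x},", w);
    }
    println!();
    println!("-> {}", words.len());
}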