Arquivotheca.SunOS-4.1.3/sys/sun4/crt.s

!	@(#)crt.s 1.1 92/07/30 SMI
!	Copyright (c) 1988 by Sun Microsystems, Inc.

	.seg	"text"
	.align	4

#include <machine/asm_linkage.h>
#include <machine/trap.h>

/*
 * C run time subroutines.
 */

/*
 * procedure to perform a 32 by 32 multiply.
 * pass the multiplier into %o0, and the multiplicand into %o1
 * the least significant 32 bits of the result will be returned in %o0,
 * and the most significant in %o1
 *
 * This code has an optimization built in for short (less than 13 bit)
 * multipliers. Short multipliers require 26 or 27 instruction cycles, and
 * long ones require 47 to 51 instruction cycles.  For two positive numbers,
 * the most common case, a long multiply takes 47 instruction cycles.
 *
 * This code indicates that overflow has occurred, by leaving the Z condition
 * code clear. The following call sequence would be used if you wish to
 * deal with overflow:
 *
 *		call	.mul
 *		nop		( or set up last parameter here )
 *		bnz	overflow_code	(or tnz to overflow handler)
 */
	RTENTRY(.mul)
	/*
	 * Register roles, as used by the code below:
	 *	%o0	in: multiplier		out: low 32 bits of product
	 *	%o1	in: multiplicand	out: high 32 bits of product
	 *	%o4	partial product accumulated by the mulscc steps
	 *	%o5	scratch (short path only)
	 *	%y	loaded with the multiplier; read back for the low bits
	 * The short path is taken when %o0 has no bit set above bit 11.
	 * The andcc in the delay slot of the "be" runs on both paths, so
	 * %o4 is zero at the first mulscc either way.
	 */
	mov	%o0, %y			! multiplier to Y register
	andncc	%o0, 0xfff, %g0		! mask out lower 12 bits
	be	mul_shortway		! can do it the short way
	andcc	%g0, %g0, %o4		! zero the partial product
					! and clear N and V conditions
	!
	! long multiply
	!
	mulscc	%o4, %o1, %o4		! first iteration of 33
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 32nd iteration
	mulscc	%o4, %g0, %o4		! last iteration only shifts
	!
	! if %o0 (multiplier) was negative, the result is
	!	(%o0 * %o1) + %o1 * (2**32)
	! we fix that here
	!
	/*
	 * Ordering matters here: the first tst sets N from the original
	 * multiplier for the bge; the rd then replaces %o0 with the low
	 * word of the product; the tst in the delay slot re-sets the
	 * condition codes from that low word for the test at label 1.
	 * The sub below does not touch the condition codes.
	 */
	tst	%o0
	rd	%y, %o0
	bge	1f
	tst	%o0			! for when we check for overflow

	sub	%o4, %o1, %o4		! bit 33 and up of the product are in
					! %o4, so we don't have to shift %o1
	!
	! We haven't overflowed if:
	!	low-order bits are positive and high-order bits are 0
	!	low-order bits are negative and high-order bits are -1
	!
	! if you are not interested in detecting overflow,
	! replace the following code with:
	!
	!	1:	jmp	%o7+8
	!		mov	%o4, %o1
	!
1:
	/*
	 * The addcc sits in a non-annulled delay slot, so %o1 receives the
	 * high word whether or not the branch is taken.
	 */
	bge	2f			! if low-order bits were positive.
	addcc	%o4, %g0, %o1		! return most sig. bits of prod and set
					! Z appropriately (for positive product)
	jmp	%o7+8
	subcc	%o4, -1, %g0		! set Z if high order bits are -1 (for
					! negative product)
2:
	jmp	%o7+8			! leaf routine return
	nop
	!
	! short multiply
	!
mul_shortway:
	mulscc	%o4, %o1, %o4		! first iteration of 13
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 12th iteration
	mulscc	%o4, %g0, %o4		! last iteration only shifts

	/*
	 * After 13 steps the product is split between %o4 and the top of
	 * %y; the shifts by 12 and 20 below reassemble the low word in %o0.
	 */
	rd	%y, %o5
	sll	%o4, 12, %o0		! left shift middle bits by 12 bits
	srl	%o5, 20, %o5		! right shift low bits by 20 bits
	!
	! We haven't overflowed if:
	!	low-order bits are positive and high-order bits are 0
	!	low-order bits are negative and high-order bits are -1
	!
	! if you are not interested in detecting overflow,
	! replace the following code with:
	!
	!		or	%o5, %o4, %o0
	!		jmp	%o7+8
	!		mov	%o4, %o1
	!
	/*
	 * The sra in the delay slot of the bge runs on both paths, so %o1
	 * holds the high word before either return sequence tests it.
	 */
	orcc	%o5, %o0, %o0		! merge for true product
	bge	3f			! if low-order bits were positive.
	sra	%o4, 20, %o1		! right shift high bits by 20 bits
					! and put into  %o1
	jmp	%o7+8
	subcc	%o1, -1, %g0		! set Z if high order bits are -1 (for
					! negative product)
3:
	jmp	%o7+8			! leaf routine return
	addcc	%o1, %g0, %g0		! set Z if high order bits are 0

/*
 * procedure to perform a 32 by 32 unsigned multiply.
 * pass the multiplier into %o0, and the multiplicand into %o1
 * the least significant 32 bits of the result will be returned in %o0,
 * and the most significant in %o1
 *
 * This code has an optimization built in for short (less than 13 bit)
 * multiplies. Short multiplies require 25 instruction cycles, and long ones
 * require 46 or 48 instruction cycles.
 *
 * This code indicates that overflow has occurred, by leaving the Z condition
 * code clear. The following call sequence would be used if you wish to
 * deal with overflow:
 *
 *		call	.umul
 *		nop		( or set up last parameter here )
 *		bnz	overflow_code	(or tnz to overflow handler)
 */
	RTENTRY(.umul)
	/*
	 * Register roles, as used by the code below:
	 *	%o0	in: multiplier		out: low 32 bits of product
	 *	%o1	in: multiplicand	out: high 32 bits of product
	 *	%o4	first the OR of both operands, then the partial product
	 *	%o5	scratch
	 *	%y	loaded with the multiplier; read back for the low bits
	 * The short path is taken only when neither operand has a bit set
	 * above bit 11 (the test is made on %o0 | %o1).
	 */
	or	%o0, %o1, %o4		! logical or of multiplier
					! and multiplcand
	mov	%o0, %y			! multiplier to Y register
	andncc	%o4, 0xfff, %o5		! mask out lower 12 bits
	be	umul_shortway		! can do it the short way
	andcc	%g0, %g0, %o4		! zero the partial product
					! and clear N and V conditions
	!
	! long multiply
	!
	mulscc	%o4, %o1, %o4		! first iteration of 33
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 32nd iteration
	mulscc	%o4, %g0, %o4		! last iteration only shifts
	!
	! Normally, with the shifty-add approach, if both numbers are positive,
	! you get the correct result.  With 32-bit twos-complement numbers,
	! -x can be represented as ((2 - (x/(2**32)) mod 2) * 2**32.  To avoid
	! a lot of 2**32's, we can just move the radix point up to be just
	! to the left of the sign bit.  So:
	!
	!	 x *  y	= (xy) mod 2
	!	-x *  y	= (2 - x) mod 2 * y = (2y - xy) mod 2
	!	 x * -y	= x * (2 - y) mod 2 = (2x - xy) mod 2
	!	-x * -y = (2 - x) * (2 - y) = (4 - 2x - 2y + xy) mod 2
	!
	! For signed multiplies, we subtract (2**32) * x from the partial
	! product to fix this problem for negative multipliers (see multiply.s)
	! Because of the way the shift into the partial product is calculated
	! (N xor V), this term is automatically removed for the multiplicand,
	! so we don't have to adjust.
	!
	! But for unsigned multiplies, the high order bit wasn't a sign bit,
	! and the correction is wrong.  So for unsigned multiplies where the
	! high order bit is one, we end up with xy - (2**32) * y.  To fix it
	! we add y * (2**32).
	!
	/*
	 * In register terms: when bit 31 of the multiplicand (%o1) is set,
	 * the multiplier, which is still in %o0 at this point, is added to
	 * the high word in %o4.  %o0 is only overwritten by the rd at
	 * label 1.
	 */
	tst	%o1
	bge	1f
	nop
	add	%o4, %o0, %o4
1:
	rd	%y, %o0			! return least sig. bits of prod
	jmp	%o7+8			! leaf routine return
	addcc	%o4, %g0, %o1		! delay slot; return high bits and set
					! zero bit appropriately
	!
	! short multiply
	!
umul_shortway:
	mulscc	%o4, %o1, %o4		! first iteration of 13
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4		! 12th iteration
	mulscc	%o4, %g0, %o4		! last iteration only shifts

	/*
	 * Reassemble the low word from %o4 (shifted up 12) and the top of
	 * %y (shifted down 20).
	 */
	rd	%y, %o5
	sll	%o4, 12, %o4		! left shift partial product
					! by 12 bits
	srl	%o5, 20, %o5		! right shift product 20 bits
	or	%o5, %o4, %o0		! merge for true product
	!
	! The delay instruction moves zero into %o1,
	! sets the zero condition code, and clears the other conditions.
	! This is the equivalent result to a long umultiply which doesn't
	! overflow
	!
	jmp	%o7+8			! leaf routine return
	addcc	%g0, %g0, %o1

/*
 * division/remainder
 *
 * Input is:
 *	dividend -- the thing being divided
 *	divisor  -- how many ways to divide
 * Important parameters:
 *	N -- how many bits per iteration we try to get
 *		as our current guess: 4
 *	WORDSIZE -- how many bits altogether we're talking about:
 *		obviously: 32
 * A derived constant:
 *	TOPBITS -- how many bits are in the top "decade" of a number:
 *		4 for the values above
 *
 * Important variables are:
 *	Q -- the partial quotient under development -- initially 0
 *	R -- the remainder so far -- initially == the dividend
 *	ITER -- number of iterations of the main division loop that will
 *		be required. Equal to CEIL( lg2(quotient)/4 )
 *		Note that this is log_base_(2^4) of the quotient.
 *	V -- the current comparand -- initially divisor*2^(ITER*4-1)
 * Cost:
 *	current estimate for non-large dividend is
 *		CEIL( lg2(quotient) / 4 ) x ( 10 + 74/2 ) + C
 *	a large dividend is one greater than 2^(31-4 ) and takes a
 *	different path, as the upper bits of the quotient must be developed
 *	one bit at a time.
 */
	RTENTRY(.udiv)		! UNSIGNED DIVIDE
	/*
	 * .udiv and .div share the code at "divide".
	 * Register roles, as used below:
	 *	%o0	in: dividend		out: quotient
	 *	%o1	in: divisor (negated by .div when negative)
	 *	%o2	Q, the quotient being built
	 *	%o3	R, the running remainder (starts as the dividend)
	 *	%o4	ITER, number of 4-bit passes through divloop
	 *	%o5	V, the scaled divisor
	 *	%g1	result is negated at got_result when bit 31 is set;
	 *		0 for .udiv, %o1 xor %o0 for .div
	 *	%g2	the constant 1<<(32-4-1)
	 *	%g3	number of single-bit steps on the large-dividend path
	 * %o3 is not cleaned up before return; long_udiv/long_div later in
	 * this file read it after calling .udiv.
	 */
	b	divide
	mov	0,%g1		! result always positive

	RTENTRY(.div)		! SIGNED DIVIDE
	/*
	 * Make both operands non-negative before falling into "divide".
	 * The "neg %o1" sits in the delay slot of "bge divide", so it runs
	 * whenever %o1 was negative, whichever way that branch goes.
	 */
	orcc	%o1,%o0,%g0 ! are either %o0 or %o1 negative
	bge	divide		! if not, skip this junk
	xor	%o1,%o0,%g1 ! record sign of result in sign of %g1
		tst	%o1
		bge	2f
		tst	%o0
	!	%o1 < 0
		bge	divide
		neg	%o1
	2:
	!	%o0 < 0
		neg	%o0
	!	FALL THROUGH

divide:
!	compute size of quotient, scale comparand
	/*
	 * "te" traps with software trap number ST_DIV0 when the orcc found
	 * the divisor to be zero.  ST_DIV0 is not defined in this file;
	 * presumably it comes from <machine/trap.h>.
	 */
	orcc	%o1,%g0,%o5	! movcc	%o1,%o5
	te	ST_DIV0		! if %o1 = 0
	mov	%o0,%o3
	cmp	%o3,%o5
	blu	got_result ! if %o3<%o5 already, there's no point in continuing
	mov	0,%o2
	sethi	%hi(1<<(32-4 -1)),%g2
	cmp	%o3,%g2
	blu	not_really_big
	mov	0,%o4
	!
	! here, the %o0 is >= 2^(31-4) or so. We must be careful here, as
	! our usual 4-at-a-shot divide step will cause overflow and havoc. The
	! total number of bits in the result here is 4*%o4+%g3, where %g3 <= 4.
	! compute %o4, in an unorthodox manner: know we need to Shift %o5 into
	!	the top decade: so don't even bother to compare to %o3.
	1:
		cmp	%o5,%g2
		bgeu	3f
		mov	1,%g3
		sll	%o5,4,%o5
		b	1b
		inc	%o4
	! now compute %g3
	2:	addcc	%o5,%o5,%o5
		bcc	not_too_big ! bcc	not_too_big
		add	%g3,1,%g3
			!
			! here if the %o1 overflowed when Shifting
			! this means that %o3 has the high-order bit set
			! restore %o5 and subtract from %o3
			sll	%g2,4 ,%g2 ! high order bit
			srl	%o5,1,%o5 ! rest of %o5
			add	%o5,%g2,%o5
			b	do_single_div
			sub	%g3,1,%g3
	not_too_big:
	3:	cmp	%o5,%o3
		blu	2b
		nop
		be	do_single_div
		nop
	! %o5 > %o3: went too far: back up 1 step
	!	srl	%o5,1,%o5
	!	dec	%g3
	! do single-bit divide steps
	!
	! we have to be careful here. We know that %o3 >= %o5, so we can do the
	! first divide step without thinking. BUT, the others are conditional,
	! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
	! order bit set in the first step, just falling into the regular
	! division loop will mess up the first time around.
	! So we unroll slightly...
	do_single_div:
		deccc	%g3
		bl	end_regular_divide
		nop
		sub	%o3,%o5,%o3
		mov	1,%o2
		b,a	end_single_divloop
	/*
	 * One quotient bit per pass: the sign of %o3 (tested in the delay
	 * slot of the bge at the bottom) selects subtract-and-add-1 or
	 * add-and-subtract-1.
	 */
	single_divloop:
		sll	%o2,1,%o2
		bl	1f
		srl	%o5,1,%o5
		! %o3 >= 0
		sub	%o3,%o5,%o3
		b	2f
		inc	%o2
	1:	! %o3 < 0
		add	%o3,%o5,%o3
		dec	%o2
	2:
	end_single_divloop:
		deccc	%g3
		bge	single_divloop
		tst	%o3
		b,a	end_regular_divide

	/*
	 * Scale V up four bits at a time until it exceeds R, counting the
	 * passes in %o4.  The inccc is in the delay slot of the bleu, so it
	 * runs once per pass; the be that follows tests the condition codes
	 * of the last inccc.
	 */
not_really_big:
1:
	sll	%o5,4,%o5
	cmp	%o5,%o3
	bleu	1b
	inccc	%o4
	be	got_result
	dec	%o4
do_regular_divide:

!	do the main division iteration
	tst	%o3
!	fall through into divide loop
	/*
	 * Each pass shifts Q left four bits and walks a four-level decision
	 * tree keyed on the sign of R.  The leaf reached adds an odd value
	 * in the range -15..+15 to Q.  Labels are named L.<depth>.<n>.
	 */
divloop:
	sll	%o2,4,%o2
		!depth 1, accumulated bits 0
	bl	L.1.16
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 2, accumulated bits 1
	bl	L.2.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 3, accumulated bits 3
	bl	L.3.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 4, accumulated bits 7
	bl	L.4.23
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (7*2+1), %o2

L.4.23:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (7*2-1), %o2

L.3.19:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 4, accumulated bits 5
	bl	L.4.21
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (5*2+1), %o2

L.4.21:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (5*2-1), %o2

L.2.17:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 3, accumulated bits 1
	bl	L.3.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 4, accumulated bits 3
	bl	L.4.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (3*2+1), %o2

L.4.19:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (3*2-1), %o2

L.3.17:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 4, accumulated bits 1
	bl	L.4.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (1*2+1), %o2

L.4.17:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (1*2-1), %o2

L.1.16:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 2, accumulated bits -1
	bl	L.2.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 3, accumulated bits -1
	bl	L.3.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 4, accumulated bits -1
	bl	L.4.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-1*2+1), %o2

L.4.15:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-1*2-1), %o2

L.3.15:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 4, accumulated bits -3
	bl	L.4.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-3*2+1), %o2

L.4.13:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-3*2-1), %o2

L.2.15:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 3, accumulated bits -3
	bl	L.3.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 4, accumulated bits -5
	bl	L.4.11
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-5*2+1), %o2

L.4.11:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-5*2-1), %o2

L.3.13:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 4, accumulated bits -7
	bl	L.4.9
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-7*2+1), %o2

L.4.9:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-7*2-1), %o2
	9:

	/*
	 * Loop while passes remain.  The tst in the delay slot of the bge
	 * feeds both the next pass and the final check: if R ended up
	 * negative, the annulled-delay dec backs Q off by one.  %o3 itself
	 * is left uncorrected.
	 */
end_regular_divide:
	deccc	%o4
	bge	divloop
	tst	%o3
	bl,a	got_result
	dec	%o2

	/*
	 * Apply the recorded sign (the neg only runs when %g1 is negative)
	 * and return Q in %o0.
	 */
got_result:
	tst	%g1
	bl,a	1f
	neg	%o2	! quotient  <- -%o2
1:
	retl
	mov	%o2,%o0	! quotient  <-  %o2


	RTENTRY(.urem)		! UNSIGNED REMAINDER
	/*
	 * .urem and .rem share the code at "rem", which runs the same
	 * algorithm as "divide" but returns R instead of Q.
	 * Register roles, as used below:
	 *	%o0	in: dividend		out: remainder
	 *	%o1	in: divisor (negated by .rem when negative)
	 *	%o2	Q, the quotient being built (not returned)
	 *	%o3	R, the running remainder (starts as the dividend)
	 *	%o4	ITER, number of 4-bit passes through remloop
	 *	%o5	V, the scaled divisor
	 *	%g1	result is negated at rgot_result when bit 31 is set;
	 *		0 for .urem, the original dividend for .rem
	 *	%g2	the constant 1<<(32-4-1)
	 *	%g3	number of single-bit steps on the large-dividend path
	 */
	b	rem
	mov	0,%g1		! result always positive

	RTENTRY(.rem)		! SIGNED REMAINDER
	/*
	 * Unlike .div, the sign recorded in %g1 is the dividend alone, so
	 * the remainder takes the sign of the dividend.  The "neg %o1" is
	 * in the delay slot of "bge rem" and runs whenever %o1 was negative.
	 */
	orcc	%o1,%o0,%g0 ! are either %o0 or %o1 negative
	bge	rem		! if not, skip this junk
	mov	%o0,%g1	! record sign of result in sign of %g1
		tst	%o1
		bge	2f
		tst	%o0
	!	%o1 < 0
		bge	rem
		neg	%o1
	2:
	!	%o0 < 0
		neg	%o0
	!	FALL THROUGH

rem:
!	compute size of quotient, scale comparand
	/*
	 * "te" traps with software trap number ST_DIV0 when the orcc found
	 * the divisor to be zero.  ST_DIV0 is not defined in this file;
	 * presumably it comes from <machine/trap.h>.
	 */
	orcc	%o1,%g0,%o5	! movcc	%o1,%o5
	te	ST_DIV0		! if %o1 = 0
	mov	%o0,%o3
	cmp	%o3,%o5
	blu	rgot_result ! if %o3<%o5 already, there's no point in continuing
	mov	0,%o2
	sethi	%hi(1<<(32-4 -1)),%g2
	cmp	%o3,%g2
	blu	rnot_really_big
	mov	0,%o4
	!
	! here, the %o0 is >= 2^(31-4) or so. We must be careful here, as
	! our usual 4-at-a-shot divide step will cause overflow and havoc. The
	! total number of bits in the result here is 4*%o4+%g3, where %g3 <= 4.
	! compute %o4, in an unorthodox manner: know we need to Shift %o5 into
	!	the top decade: so don't even bother to compare to %o3.
	1:
		cmp	%o5,%g2
		bgeu	3f
		mov	1,%g3
		sll	%o5,4,%o5
		b	1b
		inc	%o4
	! now compute %g3
	2:	addcc	%o5,%o5,%o5
		bcc	rnot_too_big ! bcc	rnot_too_big
		add	%g3,1,%g3
			!
			! here if the %o1 overflowed when Shifting
			! this means that %o3 has the high-order bit set
			! restore %o5 and subtract from %o3
			sll	%g2,4 ,%g2 ! high order bit
			srl	%o5,1,%o5 ! rest of %o5
			add	%o5,%g2,%o5
			b	do_single_rem
			sub	%g3,1,%g3
	rnot_too_big:
	3:	cmp	%o5,%o3
		blu	2b
		nop
		be	do_single_rem
		nop
	! %o5 > %o3: went too far: back up 1 step
	!	srl	%o5,1,%o5
	!	dec	%g3
	! do single-bit divide steps
	!
	! we have to be careful here. We know that %o3 >= %o5, so we can do the
	! first divide step without thinking. BUT, the others are conditional,
	! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
	! order bit set in the first step, just falling into the regular
	! division loop will mess up the first time around.
	! So we unroll slightly...
	do_single_rem:
		deccc	%g3
		bl	end_regular_remainder
		nop
		sub	%o3,%o5,%o3
		mov	1,%o2
		b,a	end_single_remloop
	/*
	 * One quotient bit per pass: the sign of %o3 (tested in the delay
	 * slot of the bge at the bottom) selects subtract or add of V.
	 */
	single_remloop:
		sll	%o2,1,%o2
		bl	1f
		srl	%o5,1,%o5
		! %o3 >= 0
		sub	%o3,%o5,%o3
		b	2f
		inc	%o2
	1:	! %o3 < 0
		add	%o3,%o5,%o3
		dec	%o2
	2:
	end_single_remloop:
		deccc	%g3
		bge	single_remloop
		tst	%o3
		b,a	end_regular_remainder

	/*
	 * Scale V up four bits at a time until it exceeds R, counting the
	 * passes in %o4.  The inccc is in the delay slot of the bleu, so it
	 * runs once per pass; the be that follows tests the condition codes
	 * of the last inccc.
	 */
rnot_really_big:
1:
	sll	%o5,4,%o5
	cmp	%o5,%o3
	bleu	1b
	inccc	%o4
	be	rgot_result
	dec	%o4
do_regular_remainder:

!	do the main division iteration
	tst	%o3
!	fall through into divide loop
	/*
	 * Each pass shifts Q left four bits and walks a four-level decision
	 * tree keyed on the sign of R.  Labels are named Lr.<depth>.<n>,
	 * mirroring the L.<depth>.<n> labels of divloop.
	 */
remloop:
	sll	%o2,4,%o2
		!depth 1, accumulated bits 0
	bl	Lr.1.16
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 2, accumulated bits 1
	bl	Lr.2.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 3, accumulated bits 3
	bl	Lr.3.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 4, accumulated bits 7
	bl	Lr.4.23
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (7*2+1), %o2

Lr.4.23:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (7*2-1), %o2

Lr.3.19:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 4, accumulated bits 5
	bl	Lr.4.21
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (5*2+1), %o2

Lr.4.21:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (5*2-1), %o2

Lr.2.17:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 3, accumulated bits 1
	bl	Lr.3.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 4, accumulated bits 3
	bl	Lr.4.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (3*2+1), %o2

Lr.4.19:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (3*2-1), %o2

Lr.3.17:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 4, accumulated bits 1
	bl	Lr.4.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (1*2+1), %o2

Lr.4.17:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (1*2-1), %o2

Lr.1.16:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 2, accumulated bits -1
	bl	Lr.2.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 3, accumulated bits -1
	bl	Lr.3.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 4, accumulated bits -1
	bl	Lr.4.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-1*2+1), %o2

Lr.4.15:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-1*2-1), %o2

Lr.3.15:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 4, accumulated bits -3
	bl	Lr.4.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-3*2+1), %o2

Lr.4.13:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-3*2-1), %o2

Lr.2.15:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 3, accumulated bits -3
	bl	Lr.3.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			!depth 4, accumulated bits -5
	bl	Lr.4.11
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-5*2+1), %o2

Lr.4.11:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-5*2-1), %o2

Lr.3.13:	! remainder is negative
	addcc	%o3,%o5,%o3
			!depth 4, accumulated bits -7
	bl	Lr.4.9
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-7*2+1), %o2

Lr.4.9:	! remainder is negative
	addcc	%o3,%o5,%o3
		b	9f
		add	%o2, (-7*2-1), %o2
	9:

	/*
	 * Loop while passes remain.  If R ended up negative, the
	 * annulled-delay add puts the (unscaled, non-negative) divisor in
	 * %o1 back into it once.
	 */
end_regular_remainder:
	deccc	%o4
	bge	remloop
	tst	%o3
	bl,a	rgot_result
	add	%o3,%o1,%o3

	/*
	 * Apply the recorded sign (the neg only runs when %g1 is negative)
	 * and return R in %o0.
	 */
rgot_result:
	tst	%g1
	bl,a	1f
	neg	%o3	! remainder <- -%o3
1:
	retl
	mov	%o3,%o0	! remainder <-  %o3

/*
 * Structure return
 */
/* Upper bits OR-ed into the key that is compared with the word at [%i7 + 8]. */
#define UNIMP		0
/* Only the low 12 bits of the size in %o1 take part in the key comparison. */
#define MASK		0x00000fff
/* Offset from %fp of the word loaded into %i0 as the copy destination. */
#define STRUCT_VAL_OFF	(16*4)

	RTENTRY(.stret4)
	RTENTRY(.stret8)
	/*
	 * Both entry points label the same code.  As used below:
	 *	%o0	address of the structure to copy from
	 *	%o1	size in bytes; counted down by 4 as the copy offset
	 *	%o3	word loaded from [%i7 + 8]
	 *	%o4, %o5 scratch
	 *	%i0	destination address, loaded from [%fp + STRUCT_VAL_OFF]
	 * The routine ends with ret/restore, so it performs the return on
	 * behalf of the function that jumped here.
	 */
	!
	! see if key matches: if not, structure value not expected,
	! so just return
	!
	ld	[%i7 + 8], %o3
	and	%o1, MASK, %o4
	sethi	%hi(UNIMP), %o5
	or	%o4, %o5, %o5
	cmp	%o5, %o3
	/*
	 * The load of %i0 is in an annulled delay slot: it only happens
	 * when the key matched and the copy is about to run.
	 */
	be,a	0f
	ld	[%fp + STRUCT_VAL_OFF], %i0	! set expected return value
	ret
	restore
	/*
	 * Word copy from the highest offset down to 0.  The store is in the
	 * delay slot of the bg, so it runs on every pass including the last.
	 * NOTE(review): with %o1 == 0 on entry the first offset would be -4;
	 * presumably callers never pass a zero size -- confirm.
	 */
0:						! copy the struct
	subcc	%o1, 4, %o1
	ld	[%o0 + %o1], %o4
	bg	0b
	st	%o4, [%i0 + %o1]		! delay slot
	add	%i7, 0x4, %i7			! bump return address
	ret
	restore

	RTENTRY(.stret2)
	/*
	 * Same sequence as .stret4/.stret8, but the copy moves halfwords
	 * (lduh/sth) and steps the offset by 2.  As used below:
	 *	%o0	address of the structure to copy from
	 *	%o1	size in bytes; counted down by 2 as the copy offset
	 *	%o3	word loaded from [%i7 + 8]
	 *	%o4, %o5 scratch
	 *	%i0	destination address, loaded from [%fp + STRUCT_VAL_OFF]
	 */
	!
	! see if key matches: if not, structure value not expected,
	! so just return
	!
	ld	[%i7 + 8], %o3
	and	%o1, MASK, %o4
	sethi	%hi(UNIMP), %o5
	or	%o4, %o5, %o5
	cmp	%o5, %o3
	/*
	 * The load of %i0 is in an annulled delay slot: it only happens
	 * when the key matched and the copy is about to run.
	 */
	be,a	0f
	ld	[%fp + STRUCT_VAL_OFF], %i0	! set expected return value
	ret
	restore
	/*
	 * Halfword copy from the highest offset down to 0.  The store is in
	 * the delay slot of the bg, so it runs on every pass including the
	 * last.
	 */
0:						! copy the struct
	subcc	%o1, 2, %o1
	lduh	[%o0 + %o1], %o4
	bg	0b
	sth	%o4, [%i0 + %o1]		! delay slot
	add	%i7, 0x4, %i7			! bump return address
	ret
	restore

/*
 * integer multiply  __ip_umul, __ip_mul, __ip_umulcc, __ip_mulcc
 * input: 	%i0 = rs1
 *		%i1 = rs2 or simm13
 *		%i2 = address for rd
 *		%i3 = address for Y-register
 *		%i4 = address for psr (or icc)
 *
 * perform a (signed or unsigned) multiplication of a 32 bit multiplier
 * %i0 and a 32 bit multiplicand %i1 resulting in a 64-bit product. The
 * low order 32-bits of the multiply are placed in [%i2], and the
 * upper 32-bits of the multiply are placed in [%i3]. The condition
 * codes are set as follows:
 * unsigned:  	N	set if product<31> is set
 *		Z	set if product equals zero
 *		V	set if product <63:32> is not zero
 *		C	always cleared
 * signed:  	N	set if product<31> is set
 *		Z	set if product equals zero
 *		V	set if product <63:32> is not product<31>
 *		C	always cleared
 *
 * Traps: (none)
 */
	ENTRY(_ip_umul)
	/*
	 * Unsigned multiply without condition codes: hand %i0 and %i1 to
	 * .umul, then store the low word of the product at [%i2] and the
	 * high word at [%i3].  Always returns 1.
	 */
	save	%sp, -96, %sp
	mov	%i1, %o1		/* multiplicand */
	call	.umul
	mov	%i0, %o0		/* delay slot: multiplier */
	st	%o0, [%i2]		/* low 32 bits of the product */
	st	%o1, [%i3]		/* high 32 bits of the product */
	ret
	restore	%g0, 1, %o0		/* success: 1 in the caller's %o0 */

	ENTRY(_ip_mul)
	/*
	 * Signed multiply without condition codes: hand %i0 and %i1 to
	 * .mul, then store the low word of the product at [%i2] and the
	 * high word at [%i3].  Always returns 1.
	 */
	save	%sp, -96, %sp
	mov	%i1, %o1		/* multiplicand */
	call	.mul
	mov	%i0, %o0		/* delay slot: multiplier */
	st	%o0, [%i2]		/* low 32 bits of the product */
	st	%o1, [%i3]		/* high 32 bits of the product */
	ret
	restore	%g0, 1, %o0		/* success: 1 in the caller's %o0 */

	ENTRY(_ip_umulcc)
	/*
	 * Unsigned multiply that also updates the saved condition codes.
	 * Stores the product like _ip_umul, then falls into unsigned_ccret.
	 */
	save	%sp,-96,%sp
	mov	%i0,%o0
	call	.umul
	mov	%i1,%o1
	st	%o0,[%i2]
	st	%o1,[%i3]

	/*
	 * Build N/Z/V in %l0 from the 64-bit value in %o1:%o0 (C is left
	 * zero), using bit 3 for N, 0x4 for Z and 0x2 for V before the
	 * shift by 20.  _ip_udivcc also branches here with the quotient in
	 * %o1:%o0.  The tail of this sequence (labels 2 and 3) lives in
	 * _ip_mulcc below.
	 * The leading "mov 0,%l0" is overwritten by the sll two
	 * instructions later.
	 * NOTE(review): Z is derived from %o0 | %o1, i.e. the full 64-bit
	 * value, as the header above this group describes; confirm this
	 * matches the architecture definition the caller expects.
	 */
unsigned_ccret:
	mov	0,%l0		! clear NZVC
	srl	%o0,31,%l1	! %l1 = product<31>
	sll	%l1,3,%l0	! N = product<31>
	orcc	%o0,%o1,%g0
	be,a	1f
	or	%l0,0x4,%l0	! set Z if product = 0
1:
	tst	%o1
	bnz,a	2f		! use common code in _ip_mulcc
	or	%l0,0x2,%l0	! set V if product<63:32> is not zero
	b	3f		! use common code in _ip_mulcc
	sll	%l0,20,%l0	! shift to the icc position in psr

	ENTRY(_ip_mulcc)
	/*
	 * Signed multiply that also updates the saved condition codes.
	 * Stores the product like _ip_mul, then falls into signed_ccret.
	 */
	save	%sp,-96,%sp
	mov	%i0,%o0
	call	.mul
	mov	%i1,%o1
	st	%o0,[%i2]
	st	%o1,[%i3]

	/*
	 * Same as unsigned_ccret except for V: %l1 is 0 or 1 (bit 31 of
	 * the low word), so %l1 + %o1 is zero exactly when the high word is
	 * 0 for a non-negative low word or -1 for a negative one.
	 * _ip_divcc also branches here with the quotient in %o1:%o0.
	 */
signed_ccret:
	mov	0,%l0		! clear NZVC
	srl	%o0,31,%l1	! %l1 = result<31>
	sll	%l1,3,%l0	! N = result<31>
	orcc	%o0,%o1,%g0
	be,a	1f
	or	%l0,0x4,%l0	! set Z if result = 0
1:
	addcc	%l1,%o1,%g0
	bnz,a	2f
	or	%l0,0x2,%l0	! set V if result<63:32> is not result<31>
2:
	sll	%l0,20,%l0	! shift to the icc position in psr
	/*
	 * Common exit: replace bits 23:20 of the word at [%i4] with the new
	 * codes and return 1.
	 */
3:
	ld	[%i4],%l1
	set	0x00f00000,%l2
	andn	%l1,%l2,%l1	! clear icc field
	or	%l1,%l0,%l1	! put in icc
	st	%l1,[%i4]	! store to psr
	mov	1, %i0		! success
	ret
	restore

/*
 * integer divide  __ip_udiv, __ip_div, __ip_udivcc, __ip_divcc
 * input: 	%i0 = rs1			-- lower dividend
 *		%i1 = rs2 or simm13		-- divisor
 *		%i2 = address for rd		-- quotient (lower 32-bits)
 *		%i3 = address for Y-register	-- upper dividend
 *		%i4 = address for psr (or icc)
 *
 * On return, %i1 will also contain the upper 32 bits of the quotient.
 *
 * Perform a (signed or unsigned) division of a 64-bit dividend (lower
 * 32 bits in %i0 and upper 32 bits in [%i3]) by a 32-bit divisor %i1,
 * resulting in a 32-bit quotient stored at [%i2]. Overflow is set if the
 * quotient cannot be represented in 32 bits. The upper 32 bits of the
 * quotient are left in %i1 on return.
 * The condition codes are set as follows:
 * 	  	N	set if MSB of quotient is set
 *		Z	set if quotient equals zero
 *		V	set if division overflow
 *		C	always cleared
 *
 * Traps: division by zero
 */
	ENTRY(_ip_udiv)
	/*
	 * Unsigned divide without condition codes.
	 * The divisor is tested in %o1 before any register window is
	 * allocated: the save sits in an annulled delay slot and only runs
	 * when the divisor is non-zero.  A zero divisor leaves through
	 * div_by_zero, which returns -2 with retl instead of trapping.
	 * Otherwise long_udiv(%i0, [%i3], %i1) is called, the low quotient
	 * word is stored at [%i2], the high word is left in %i1, and 1 is
	 * returned.
	 */
	tst	%o1
	bnz,a	1f
	save	%sp,-96,%sp
	b,a	div_by_zero
1:	mov	%i0,%o0
	ld	[%i3],%o1
	call	long_udiv
	mov	%i1,%o2
	st	%o0,[%i2]
	mov	%o1,%i1
	mov	1, %i0		! success
	ret
	restore

	ENTRY(_ip_div)
	/*
	 * Signed divide without condition codes.
	 * The divisor is tested in %o1 before any register window is
	 * allocated: the save sits in an annulled delay slot and only runs
	 * when the divisor is non-zero.  A zero divisor leaves through
	 * div_by_zero, which returns -2 with retl instead of trapping.
	 * Otherwise long_div(%i0, [%i3], %i1) is called, the low quotient
	 * word is stored at [%i2], the high word is left in %i1, and 1 is
	 * returned.
	 */
	tst	%o1
	bnz,a	1f
	save	%sp,-96,%sp
	b,a	div_by_zero
1:	mov	%i0,%o0
	ld	[%i3],%o1
	call	long_div
	mov	%i1,%o2
	st	%o0,[%i2]
	mov	%o1,%i1
	mov	1, %i0		! success
	ret
	restore

	ENTRY(_ip_udivcc)
	/*
	 * Unsigned divide that also updates the saved condition codes.
	 * Same entry sequence as _ip_udiv (annulled save, zero divisor
	 * leaves through div_by_zero).  After storing the low quotient word
	 * at [%i2] it branches to unsigned_ccret, which computes the codes
	 * from %o1:%o0, writes them through %i4 and returns 1.  The high
	 * quotient word is copied to %i1 in the delay slot of that branch.
	 */
	tst	%o1
	bnz,a	1f
	save	%sp,-96,%sp
	b,a	div_by_zero
1:	mov	%i0,%o0
	ld	[%i3],%o1
	call	long_udiv
	mov	%i1,%o2
	st	%o0,[%i2]
	b	unsigned_ccret
	mov	%o1,%i1

	ENTRY(_ip_divcc)
	/*
	 * Signed divide that also updates the saved condition codes.
	 * Same entry sequence as _ip_div (annulled save, zero divisor
	 * leaves through div_by_zero).  After storing the low quotient word
	 * at [%i2] it branches to signed_ccret, which computes the codes
	 * from %o1:%o0, writes them through %i4 and returns 1.  The high
	 * quotient word is copied to %i1 in the delay slot of that branch.
	 */
	tst	%o1
	bnz,a	1f
	save	%sp,-96,%sp
	b,a	div_by_zero
1:	mov	%i0,%o0
	ld	[%i3],%o1
	call	long_div
	mov	%i1,%o2
	st	%o0,[%i2]
	b	signed_ccret
	mov	%o1,%i1

/*
 * Shared zero-divisor exit for the four _ip_*div* entry points.  It is
 * reached before their save has executed, so it returns with retl (leaf
 * return) and hands back -2 in %o0.
 */
div_by_zero:
	retl
	mov	-2, %o0		! dvivide by zero detected, see trap.c

/*
 * long_udiv(L,H,D)	unsigned 64/32 bits divide
 * long_div (L,H,D)	  signed 64/32 bits divide
 * int L,H,D;
 *
 * Input:
 *	L = %i0 -- least 32 bits of dividend
 *	H = %i1 -- most  32 bits of dividend
 *	D = %i2 -- divisor
 * Output:
 *	%i0 -- least 32 bits of quotient
 *	%i1 -- most  32 bits of quotient
 *	%i2 -- remainder
 *
 * local register usage:
 *	%i0	least 32 bits dividend
 * 	%i1	most  32 bits dividend
 *	%i2	divisor
 *	%i3	&QL
 *	%i4	&QH
 *	%i5	&R
 *	%l0	least 32 bits quotient
 *	%l1	most  32 bits quotient
 *	%l2	remainder
 *	%l3	sign
 *	%l4	counter
 *
 * undocumented usage of .udiv:
 *	.udiv return quotient in o0, remainder in o3; if o3 is
 *	negative, then replace o3 by o3 + divisor to get the correct
 *	remainder
 */
/*
 * Unsigned entry: allocate a window, record a non-negative sign in %l3 and
 * join the common code at ldivide.
 */
long_udiv:
	save	%sp,-0x100,%sp
	b	ldivide
	mov	0,%l3		! sign is positive

/*
 * Signed entry: record the result sign in %l3 (divisor xor high dividend
 * word), then make the divisor and the 64-bit dividend non-negative before
 * joining ldivide.  The "neg %i2" is in the delay slot of "bge ldivide" and
 * runs whenever the divisor was negative.
 */
long_div:
	save	%sp,-0x100,%sp
	orcc	%i2,%i1,%g0	! dividend or divisor negative ?
	bge	ldivide		! if not, skip this
	xor	%i2,%i1,%l3	! record sign of result
	tst	%i2
	bge	2f
	tst	%i1
    ! divisor < 0
	bge	ldivide
	neg	%i2
2:
    ! dividend < 0
	subcc	%g0,%i0,%i0
	b	ldivide
	subx	%g0,%i1,%i1


/*
 * Step 1: high quotient word.  If the divisor exceeds the high dividend
 * word the high quotient is 0; otherwise .udiv divides the high word and
 * its leftover in %o3 becomes the new high word in %i1.  When %o3 comes
 * back negative the divisor is added to it, as the header comment above
 * describes.
 * NOTE: the trailing comment on the "mov %o3,%i1" line says %o2, but the
 * register actually moved is %o3.
 */
ldivide:
	cmp	%i2,%i1		! divisor
	bgu,a	leastquo	! if divisor > dividendh then goto leastquo
	mov	0,%l1		! and set quotienth = 0.
    ! quotienth is non-zero
	mov	%i1,%o0
	call	.udiv		! quotienth = dividendh/divisor
	mov	%i2,%o1
	mov	%o0,%l1		! store result in quotienth
	tst	%o3
	bge,a	leastquo
	mov	%o3,%i1		! %o2 is the remainder, set %i1 = %o2
	add	%o3,%i2,%i1
/*
 * Step 2: low quotient word.  With a zero high word a single .udiv of the
 * low word is enough; otherwise fall to label 1 for the bit-by-bit loop
 * ("mov 0,%l0" in the annulled delay slot only runs on that path).
 * NOTE(review): .udiv's early exit (dividend < divisor) leaves the
 * unmodified dividend in %o3; if that value has bit 31 set, the tst/bge
 * below would add the divisor to it.  Only the remainder in %i2 would be
 * affected -- confirm whether any caller relies on it.
 */
leastquo:
    ! computing quotientl, the least sig. 32 bits of the quotient
	tst	%i1		! if 0, then quotientl=dividendl/divisor
	bne,a	1f
	mov	0,%l0		! initialize quotientl
	mov	%i0,%o0
	call	.udiv
	mov	%i2,%o1
	mov	%o0,%i0
	tst	%o3
	bge,a	2f
	mov	%o3,%i2
	add	%o3,%i2,%i2
2:
	ba	endldivide
	mov	%l1,%i1
/*
 * 32 shift-and-subtract steps over %i1:%i0.  Each pass shifts the pair left
 * one bit (addcc on the low word feeds the carry into addxcc on the high
 * word) and shifts the quotient in %l0.  The divisor is subtracted and the
 * quotient bit set when the shift carried out or %i1 >= divisor.
 */
1:
	mov	32,%l4		! initialize counter
	addcc	%i0,%i0,%i0
loop:
	addxcc	%i1,%i1,%i1	! dividend << 1
	bcs	2f
	add	%l0,%l0,%l0	! quo << 1
	cmp	%i1,%i2
	bcs,a	1f		! if dividend < divisor skip inc quotient
	subcc	%l4,1,%l4	! counter -= 1
2:
	sub	%i1,%i2,%i1	! dividendh -= divisor
	inc	%l0		! quo += 1 when divisor < dividendh
	subcc	%l4,1,%l4	! counter -= 1
1:
	bg,a	loop
	addcc	%i0,%i0,%i0
	mov	%i1,%i2		! remainder in %i2
	mov	%l1,%i1		! most  32 bits of quotient in %i1
	mov	%l0,%i0		! least 32 bits of quotient in %i0
/*
 * Apply the sign recorded in %l3: the 64-bit quotient in %i1:%i0 and the
 * remainder in %i2 are all negated when %l3 is negative.
 * NOTE(review): this gives the remainder the sign of the quotient rather
 * than of the dividend; the _ip_*div* callers in this file only read %o0
 * and %o1 after the call -- confirm this is intended.
 */
endldivide:
	tst	%l3		! check sign
	bge	1f
	nop
	neg	%i2
	subcc	%g0,%i0,%i0
	subx	%g0,%i1,%i1
1:
    ! store result to QL,QH and R
! mask off
!	st	%i0,[%i3]
!	st	%i1,[%i4]
!	st	%i2,[%i5]
	ret
	restore