andrastantos.cray-sim/simulator/sim_lib/cray_float.cpp

#include "utils.h"
#include "cray_types.h"
#include "cray_logger.h"

// Cygwin didn't quite like abs(long long) so let's have a local equivalent...
template <typename tType> static tType Abs(const tType &aA) { return aA > tType(0) ? aA : -aA; }

static const size_t cFractShiftAmount = 0;

//static long long StickyShift(long long aFract, size_t aAmount) {
//	long long Mask = (1LL << aAmount) - 1;
//	if ((aFract & Mask) != 0) {
//		return (aFract >> aAmount) | 1;
//	} else {
//		return (aFract >> aAmount);
//	}
//}

static long long Shift(long long aFract, size_t aAmount) {
	return (aFract >> aAmount);
}

// Math operators and functions
CFloat_t operator +(const CFloat_t  &aA, const CFloat_t  &aB) {
	// Handle underflow
	CFloat_t A = aA.UnderflowOrValue();
	CFloat_t B = aB.UnderflowOrValue();

	long ExpA = A.Exp();
	long ExpB = B.Exp();
	long long FractA = A.UFract() << cFractShiftAmount;
	long long FractB = B.UFract() << cFractShiftAmount;

	long long ResultFract;
	long ResultExp;
	if (ExpA > ExpB) {
		FractB = Shift(FractB, std::min<long>(63,(ExpA-ExpB)));
		//if ((FractB & 1) != 0) FractB += 1;
		//FractB >>= 1;
		ResultExp = ExpA;
	} else if (ExpA < ExpB) {
		FractA = Shift(FractA, std::min<long>(63,(ExpB-ExpA)));
		//if ((FractA & 1) != 0) FractA += 1;
		//FractA >>= 1;
		ResultExp = ExpB;
	} else {
		ResultExp = ExpB;
	}
	bool ResultPositive = true;
	if (!A.IsPositive() && !B.IsPositive()) {
		ResultFract = FractA + FractB;
		ResultPositive = false;
	} else if (A.IsPositive() && B.IsPositive()) {
		ResultFract = FractA + FractB;
		ResultPositive = true;
	} else if (A.IsPositive() && !B.IsPositive()) {
		ResultFract = FractA - FractB;
		if (ResultFract < 0) {
			ResultPositive = false;
			ResultFract = -ResultFract;
		} else {
			ResultPositive = true;
		}
	} else {
		ResultFract = FractB - FractA;
		if (ResultFract < 0) {
			ResultPositive = false;
			ResultFract = -ResultFract;
		}
		else {
			ResultPositive = true;
		}
	}

	if ((Abs(ResultFract) & (1LL << (CFloat_t::FractSize + cFractShiftAmount))) != 0) {
		++ResultExp;
//		if ((ResultFract & 1) != 0) ResultFract += 1;
		ResultFract = Shift(ResultFract, 1);
	}
	CFloat_t RetVal(ResultExp, ResultPositive, Shift(ResultFract, cFractShiftAmount));
	RetVal.Normalize();
	return RetVal.UnderflowOrValue();
}

CFloat_t operator -(const CFloat_t  &aA) {
	CFloat_t RetVal(aA);
	RetVal.FlipSign();
	return RetVal;
}

CFloat_t operator -(const CFloat_t  &aA, const CFloat_t  &aB) {
	return aA + (-aB);
}

static CFloat_t Mult(const CFloat_t  &aA, const CFloat_t  &aB, bool aDoRounding, bool aHalfPrecision = false, bool aTwoMinus = false) {
	// Each fractional is between 0 and 2.
	// The result is thus between 0 and 4.
	// Each fractional is 48 bits, so the result is 49 bits, with the potential of one normalization shift in the end if the MSB is '1'.
	// Cut each fractional into two 24-bit portions and implement the multiply that way - this makes carry handling in the addition step easier

	// Handle underflow
	CFloat_t A = aA.RawExp() == 0 ? aA : aA.UnderflowOrValue();
	CFloat_t B = aB.RawExp() == 0 ? aB : aB.UnderflowOrValue();

	if (A.Value == 0) return CFloat_t(CInt_t(0));
	if (B.Value == 0) return CFloat_t(CInt_t(0));

	long Exp = A.Exp() + B.Exp();
	bool ResultPositive = A.IsPositive() == B.IsPositive();

	bool IntegerMult = (A.RawExp() == 0 && B.RawExp() == 0);
	if  (IntegerMult) {
		// Integer multiply domain (1)
		Exp = -CFloat_t::ExpOfs; // Zero-out exponent
	}

	// Multiplier is only 56-bit wide
	uint64_t FractA = A.UFract() << 8;
	uint64_t FractB = B.UFract() << 8;
	uint64_t CorrectionFactor = 18;
	uint64_t ResultFract = 0;
	if (aDoRounding) {
		CorrectionFactor |= 0x80 | 0x40;
	}
	if (aHalfPrecision) {
		CorrectionFactor |= (1 << 24) | (1 << 25);
	}
	if (aTwoMinus) {
		CorrectionFactor = ~CorrectionFactor;
	}
	//std::cout << "Correction factor: " << HexPrinter(CorrectionFactor) << std::endl;
	for (size_t BitIdx = 0; BitIdx < 48; ++BitIdx) {
		if (((FractB >> (47 + 8 - BitIdx)) & 1) != 0) {
			ResultFract = ResultFract + FractA;
		}
		FractA >>= 1;
	}
	//std::cout << "Before adjustment: " << HexPrinter(ResultFract) << std::endl;
	ResultFract += CorrectionFactor;
	//std::cout << "After adjustment: " << HexPrinter(ResultFract) << std::endl;
	if (aTwoMinus) {
		ResultFract = ~ResultFract;
	}
	//std::cout << "After inversion: " << HexPrinter(ResultFract) << std::endl;
	// Re-normalizing if necessary
	if (!IntegerMult) {
		if ((ResultFract & (1ull << 56)) != 0) {
			++Exp;
			// This is most likely wrong: the original implementation almost certainly didn't do this rounding step.
			//ResultFract += ResultFract & 1;
			ResultFract >>= 1;
		}
		if (Exp + CFloat_t::ExpOfs < CFloat_t::ExpMin) {
			Exp = -CFloat_t::ExpOfs;
			ResultFract = 0;
			ResultPositive = true;
		}
		if (aHalfPrecision) {
			ResultFract &= 0xFFFFFFFFF8000000ULL; // Mask out the bottom 19 bits (plus 8 for the shift later)
		}
	} else {
		ResultFract >>= 1;
	}
	CFloat_t RetVal(Exp, ResultPositive, ResultFract >> 8);
	return RetVal;
}

CFloat_t operator *(const CFloat_t  &aA, const CFloat_t  &aB) {
	return Mult(aA,aB,false);
}

static uint64_t ReciprocLookup_s[] = {
	0x0000000007FBF804ull, 0x0000000007EBE824ull, 0x0000000007DBD864ull, 0x0000000007CBC8C4ull, 0x0000000007BBB944ull, 0x0000000007ABA9E4ull, 0x0000000007A3A240ull, 0x0000000007939310ull,
	0x0000000007838400ull, 0x0000000007737510ull, 0x00000000076B6DA4ull, 0x00000000075B5EE4ull, 0x00000000074B5044ull, 0x0000000007434900ull, 0x0000000007333A90ull, 0x0000000007232C40ull,
	0x00000000071B2524ull, 0x00000000070B1704ull, 0x0000000007031000ull, 0x0000000006F30210ull, 0x0000000006EAFB24ull, 0x0000000006DAED64ull, 0x0000000006D2E690ull, 0x0000000006C2D900ull,
	0x0000000006BAD244ull, 0x0000000006AAC4E4ull, 0x0000000006A2BE40ull, 0x00000000069AB7A4ull, 0x00000000068AAA84ull, 0x000000000682A400ull, 0x00000000067A9D84ull, 0x00000000066A90A4ull,

	0x0000000006628A40ull, 0x00000000065A83E4ull, 0x0000000006527D90ull, 0x0000000006427100ull, 0x00000000063A6AC4ull, 0x0000000006326490ull, 0x00000000062A5E64ull, 0x0000000006225840ull,
	0x0000000006124C10ull, 0x00000000060A4604ull, 0x0000000006024000ull, 0x0000000005FA3A04ull, 0x0000000005F23410ull, 0x0000000005EA2E24ull, 0x0000000005E22840ull, 0x0000000005DA2264ull,
	0x0000000005D21C90ull, 0x0000000005CA16C4ull, 0x0000000005C21100ull, 0x0000000005BA0B44ull, 0x0000000005B20590ull, 0x0000000005A9FFE4ull, 0x0000000005A1FA40ull, 0x000000000599F4A4ull,
	0x000000000591EF10ull, 0x000000000589E984ull, 0x000000000581E400ull, 0x000000000579DE84ull, 0x000000000571D910ull, 0x000000000569D3A4ull, 0x000000000561CE40ull, 0x000000000559C8E4ull,

	0x000000000551C390ull, 0x000000000549BE44ull, 0x000000000541B900ull, 0x000000000541B900ull, 0x000000000539B3C4ull, 0x000000000531AE90ull, 0x000000000529A964ull, 0x000000000521A440ull,
	0x0000000005199F24ull, 0x0000000005199F24ull, 0x0000000005119A10ull, 0x0000000005099504ull, 0x0000000005019000ull, 0x0000000004F98B04ull, 0x0000000004F98B04ull, 0x0000000004F18610ull,
	0x0000000004E98124ull, 0x0000000004E17C40ull, 0x0000000004E17C40ull, 0x0000000004D97764ull, 0x0000000004D17290ull, 0x0000000004C96DC4ull, 0x0000000004C96DC4ull, 0x0000000004C16900ull,
	0x0000000004B96444ull, 0x0000000004B96444ull, 0x0000000004B15F90ull, 0x0000000004A95AE4ull, 0x0000000004A95AE4ull, 0x0000000004A15640ull, 0x00000000049951A4ull, 0x00000000049951A4ull,

	0x0000000004914D10ull, 0x0000000004894884ull, 0x0000000004894884ull, 0x0000000004814400ull, 0x0000000004793F84ull, 0x0000000004793F84ull, 0x0000000004713B10ull, 0x0000000004713B10ull,
	0x00000000046936A4ull, 0x0000000004613240ull, 0x0000000004613240ull, 0x0000000004592DE4ull, 0x0000000004592DE4ull, 0x0000000004512990ull, 0x0000000004492544ull, 0x0000000004492544ull,
	0x0000000004412100ull, 0x0000000004412100ull, 0x0000000004391CC4ull, 0x0000000004391CC4ull, 0x0000000004311890ull, 0x0000000004291464ull, 0x0000000004291464ull, 0x0000000004211040ull,
	0x0000000004211040ull, 0x0000000004190C24ull, 0x0000000004190C24ull, 0x0000000004110810ull, 0x0000000004110810ull, 0x0000000004090404ull, 0x0000000004090404ull, 0x0000000004010000ull,
};


static uint64_t Mult(uint64_t &S3, uint64_t &S5, uint32_t &A6) {
	uint64_t S0, S2;
	uint32_t A0;
	// Output: S2 = S5 * S3
	S2 = 0;
	do {
		S0 = S3 << 1;
		S3 = S3 << 1;
		if (int64_t(S0) < 0) {
			S2 = S2 + S5;
		}
		S5 = S5 >> 1;
		A0 = A6 - 1;
		A6 = A6 - 1;
	} while (A0 != 0);
	return S2;
}

static uint64_t Mult(uint64_t &S2, uint64_t &S3, uint64_t &S5, uint32_t &A6) {
	uint64_t S0;
	uint32_t A0;
	// Output: S2 = S5 * S3
	do {
		S0 = S3 << 1;
		S3 = S3 << 1;
		if (int64_t(S0) < 0) {
			S2 = S2 + S5;
		}
		S5 = S5 >> 1;
		A0 = A6 - 1;
		A6 = A6 - 1;
	} while (A0 != 0);
	return S2;
}

static uint64_t MaskRight(uint32_t aValue) {
	CRAY_ASSERT(aValue <= 63);
	return (1ull << aValue) - 1;
}

static uint64_t MaskLeft(uint32_t aValue) {
	CRAY_ASSERT(aValue <= 63);
	return (0xffffffffffffffffULL) << (64 - aValue);
}


CInt_t ReciprocApproxFinalMul(const CInt_t aA, const CInt_t aB, bool aDoFinalRounding) {
	uint64_t A = aA & ((1ull << 37) - 1);
	uint64_t B = (aB & ((1ull << 37) - 1)) << 3;
	uint64_t P = 0;
	uint64_t SubP = 0;

	SubP = 0;
	SubP += (GetBit(A,  0) != 0 ? (B >> 35) : 0ull) & ~3ull;
	SubP += (GetBit(A,  1) != 0 ? (B >> 34) : 0ull) & ~3ull;
////SubP += (GetBit(A,  2) != 0 ? (B >> 33) : 0ull) & ~3ull;
	SubP += (GetBit(A,  3) != 0 ? (B >> 32) : 0ull) & ~3ull;
	SubP += (GetBit(A,  4) != 0 ? (B >> 31) : 0ull) & ~3ull;
	SubP += (GetBit(A,  5) != 0 ? (B >> 30) : 0ull) & ~3ull;
	SubP += (GetBit(A,  6) != 0 ? (B >> 29) : 0ull) & ~3ull;
	SubP += (GetBit(A,  7) != 0 ? (B >> 28) : 0ull) & ~3ull;
	SubP &= ~7ull;
	P += SubP;
	SubP = 0;
	SubP += (GetBit(A,  8) != 0 ? (B >> 27) : 0ull) & ~1ull;
	SubP += (GetBit(A,  9) != 0 ? (B >> 26) : 0ull) & ~0ull;
	SubP += (GetBit(A, 10) != 0 ? (B >> 25) : 0ull) & ~0ull;
	SubP += (GetBit(A, 11) != 0 ? (B >> 24) : 0ull) & ~0ull;
	SubP &= ~3ull;
	P += SubP;
	SubP = 0;
	SubP += (GetBit(A, 12) != 0 ? (B >> 23) : 0ull) & ~3ull;
	SubP += (GetBit(A, 13) != 0 ? (B >> 22) : 0ull) & ~3ull;
	SubP += (GetBit(A, 14) != 0 ? (B >> 21) : 0ull) & ~7ull;
	SubP += (GetBit(A, 15) != 0 ? (B >> 20) : 0ull) & ~15ull;
	P += SubP;
	SubP = 0;
	SubP += (GetBit(A, 16) != 0 ? (B >> 19) : 0ull) & ~3ull;
	SubP += (GetBit(A, 17) != 0 ? (B >> 18) : 0ull) & ~7ull;
	SubP += (GetBit(A, 18) != 0 ? (B >> 17) : 0ull) & ~3ull;
	SubP += (GetBit(A, 19) != 0 ? (B >> 16) : 0ull) & ~3ull;
	SubP += (GetBit(A, 20) != 0 ? (B >> 15) : 0ull) & ~3ull;
	SubP += (GetBit(A, 21) != 0 ? (B >> 14) : 0ull) & ~3ull;
	SubP += (GetBit(A, 22) != 0 ? (B >> 13) : 0ull) & ~3ull;
	SubP += (GetBit(A, 23) != 0 ? (B >> 12) : 0ull) & ~3ull;
	SubP &= ~7ull;
	P += SubP;
	SubP = 0;
	SubP += (GetBit(A, 24) != 0 ? (B >> 11) : 0ull) & ~1ull;
	SubP += (GetBit(A, 25) != 0 ? (B >> 10) : 0ull) & ~0ull;
	SubP += (GetBit(A, 26) != 0 ? (B >>  9) : 0ull) & ~0ull;
	SubP += (GetBit(A, 27) != 0 ? (B >>  8) : 0ull) & ~0ull;
	SubP &= ~3ull;
	P += SubP;
	SubP = 0;
	SubP += (GetBit(A, 28) != 0 ? (B >>  7) : 0ull) & ~3ull;
	SubP += (GetBit(A, 29) != 0 ? (B >>  6) : 0ull) & ~3ull;
	SubP += (GetBit(A, 30) != 0 ? (B >>  5) : 0ull) & ~7ull;
	SubP += (GetBit(A, 31) != 0 ? (B >>  4) : 0ull) & ~15ull;
	P += SubP;
	SubP = 0;
	SubP += (GetBit(A, 32) != 0 ? (B >>  3) : 0ull) & ~3ull;
	SubP += (GetBit(A, 33) != 0 ? (B >>  2) : 0ull) & ~3ull;
	SubP += (GetBit(A, 34) != 0 ? (B >>  1) : 0ull) & ~3ull;
	SubP += (GetBit(A, 35) != 0 ? (B >>  0) : 0ull) & ~3ull;
	SubP &= ~7ull;
	P += SubP;

	P >>= 2;
	if (aDoFinalRounding) {
		uint64_t Bit1 = GetBit(P, 1) != 0 ? 1 : 0;
		P >>= 2;
		P += Bit1;
		P >>= 2;
	}
	return P;
}

uint64_t FractionalMult(uint64_t aA, uint64_t aB, size_t aBitCnt) {
	uint64_t RetVal = 0;
	for (size_t BitIdx = 0; BitIdx < aBitCnt; ++BitIdx) {
		RetVal += (GetBit(aA, BitIdx) != 0) ? (aB >> (aBitCnt - BitIdx - 1)) : 0ull;
	}
	return RetVal;
}

uint64_t ReciprocApprox(const uint64_t aInput) {
	// Start by looking up 2*A0 and A0^2. They're combined into a single lookup table so some bit-twiddling is needed to get to them...
	size_t LookupIdx = (aInput >> 40) & 127;
	CRAY_ASSERT(LookupIdx < sizeof(ReciprocLookup_s) / sizeof(ReciprocLookup_s[0]));
	uint64_t LookupVal = ReciprocLookup_s[LookupIdx];
	uint64_t twiceA0 = (LookupVal >> 18);     // Put top 9 bits to S6 (this is 2A0 according to the comment)
	uint64_t A0sq = (LookupVal >> 2) & 0xffff; // Bottom 16 bits: this is A0^2 according to the comment

	// Do the first iteration: compute 2*A0 - A0^2 * Mantissa, but with some tweaks:
	// 1. Do some funky rounding on 2*A0
	// 2. Have some offset in the addition
	// 3. Shift things to the 'right' place to avoid truncation
	// 4. Truncate the result to the correct number of bits and mask out the top ones (is that needed???)
	uint64_t Mantissa = (aInput & CFloat_t::FractMask);
	uint64_t A0sqB = FractionalMult(A0sq, Mantissa >> 24, 16);
	uint64_t RoundedA0 = (((twiceA0 << 8) + 1) << 8) + (twiceA0 >> 1);
	uint64_t A1 = (RoundedA0 - A0sqB - 256) >> 6;
	A1 = A1 & ((1 << 18) - 1);

	// Do the second iteration: compute A1^2 and extract the proper section of the mantissa (B2)
	uint64_t A1sq = A1 * A1;
	uint64_t B2 = (Mantissa | (1ULL << 47)) >> 11;
	// Compute A1^2 * Mantissa'
	uint64_t A1sqB2 = ReciprocApproxFinalMul(A1sq, B2, true);

	// We have all the pieces to finalize the computation:
	// 1. Compute 2*A1 - A1^2 * B2
	// 2. Mask it to the right size and make sure it's normalized
	// 3. Extract the exponent, check for over/underflow
	// 4. Compute final exponent
	// 5. Assemble floating point result from the pieces
	uint64_t FinalMantissa = ((A1 << 31) - (A1sqB2 << 14) - 1);
	FinalMantissa = (FinalMantissa | (1ULL << 47)) & 0xffffffff8000ULL;
	uint16_t Exp = uint16_t(CFloat_t(aInput).RawExp());
	uint16_t FinalExp;
	bool Sign = (aInput >> CFloat_t::FractSignBitPos) == 0;
	if (Exp < 0x6000 && Exp > 0x2001) {
		FinalExp = 2*CFloat_t::ExpOfs - Exp - 1;
	} else {
		FinalExp = 0x6000;
		FinalMantissa = FinalMantissa & (~(1ULL << 47)); // Clear the normalized bit
	}
	CFloat_t Result = CFloat_t(FinalExp - CFloat_t::ExpOfs, Sign, FinalMantissa);
	return Result.Value;
}


static uint64_t DebugB;
static uint64_t DebugA0sq;
static uint64_t DebugA0;
static uint64_t DebugA0sqB;
static uint64_t DebugA1sq;
static uint64_t DebugA1;
static uint64_t DebugB2;
static uint64_t DebugA1sqB2;
static uint64_t DebugRoundedA0;
static uint64_t DebugResult;
// Actual source code can be found in
//    ...\cray-xmp\unicos\unicos_4.0_xmp\UC-04.0-U#-PL30_UNICOS_4.0_usr_src_diag\diag\crayxmp\xmp\SFR
// starting from label 'SIM'
// That code has lots of comments in it, but all constants are in octal, which is a pain to translate
// The general algorithm is described here:
//    https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
// The iteration seems to be:
// X[i+1] = X[i] * (2 - D*X[i]), where D is the number to be inverted.
// This can be expressed as 2*X[i] - D * X[i]^2, which the algorithm seem to compute according to the notes in the sources
static uint64_t OldReciprocApprox(uint64_t aInput) {
	uint64_t S0, S1, S2, S3, S4, S5, S6, S7;
	uint32_t A6, A7;
	uint64_t T8;
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@ SW implementation of reciproc approximation
	//@@ Input:  S1 - the value to compute the reciprocal for
	//@@ Output : S4 - output S4 ~= 1.0 / S1
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	S1 = aInput;
	S0 = 0;
	S2 = MaskRight(7);   // Mask out bits 40...47 from the input, and use it to look up initial approximation; form 7 ones from right
	S2 = S2 << 40;
	S2 = S1 & S2;
	S2 = S2 >> 40;
	A7 = uint32_t(S2);
	CRAY_ASSERT(A7 < sizeof(ReciprocLookup_s) / sizeof(ReciprocLookup_s[0]));
	S7 = ReciprocLookup_s[A7];
	S6 = S7;                      // Put top 9 bits to S6 (this is 2A0 according to the comment)
	S6 = S6 >> 18;                // ....
	S7 = S7 << 46;                // Put the bottom 16 bits into S7
	S5 = S7;                      // ....
	S5 = S5 >> 48;                // S5 now contains bottom 16 bits: this is A0^2 according to the comment
	S7 = S1;                      // Extract the top bottom 48 bits of mantissa of the input exponent into S7
	S7 = S7 << 16;                // ....
	S7 = S7 >> 40;                // ....
	DebugB = S1;
	DebugA0sq = S5 << 040;
	DebugA0 = S6 << 047;

	// At this point we have :
	// S1 : input
	// S7 : input mantissa
	// S6 : 2A0 (from lookup)
	// S5 : A0^2 (from lookup)
	// Now the comment says: FORM A1=-(A0**2B1-2A0)=2A0-A0**2B1, which translates to:
	// A1 = 2A0 - A0^2 * B1, I think, where B1 is the input mantissa

	// Multiply S5 by S7, put the result in S4, that is A0^2 * B1
	S5 = S5 << 47;                // Align the bits 2...17 of the lookup to the top of S5
	S4 = 0;
	do {
		S0 = S5 << 1;
		S5 = S5 << 1;

		if (int64_t(S0) < 0) {
			S4 = S4 + S7;
		}
		S7 = S7 >> 1;
	} while (S0 != 0);
	DebugA0sqB = S4;
	// According to the comments in the source we do some roungin of 2A0 here (2A0 is 9 bits)
	S7 = S6;
	S6 = S6 << 8;
	S7 = S7 >> 1;
	S3 = 1;
	S6 = S3 + S6;
	S6 = S6 << 8;
	S6 = S6 + S7;                 // We did some rounding of 2A0 apparently and aligned to bit position -17 according to the comment (???)
	DebugRoundedA0 = S6;
	S6 = ~S6;                     // Original comment: COMP S6 W/O S**5 COMP CORR ???? This seems to be changing the sign, somehow
	S7 = S4 + S6;                 // Original comment: FORM (A0**2B1-2A0), so apparently we've formed A0^2 * B1 (in S4) - 2A0 (in S6)
	S5 = 0x0000000000000100ull;   // This is some rounding constant
	S5 = S5 + S7;                 // Do the rounding
	S7 = ~S5;                     // Complement the result for some reason, probably to get the sign-bits right
	S7 = S7 >> 6;                 // Truncate results to 18 bits, according to the comment (here we loose the bottom 'noise' bits)
	S5 = MaskRight(18);           // Do the actual masking
	S7 = S5 & S7;                 // ...
	S7 = S7 << 30;                // This most likely achieves nothing (used in the original code to store A1 into memory
	DebugA1 = S7;
	S7 = S7 >> 30;                // ...

								  // According to original comment, now we create A1^2 in S6; 2*A1 in S7 and B2 in S5
	S5 = S7;
	S6 = 0;
	S7 = S7 << 17;
	S5 = S5 << 45;
	S4 = S7;
	do {
		S0 = S5 << 1;
		S5 = S5 << 1;
		if (int64_t(S0) < 0) {
			S6 = S6 + S4;
		}
		S4 = S4 >> 1;
	} while (S0 != 0);
	// At this point S6 contains A1^2
	// Difference with source!!!!! These two instructions are missing:
	//*
	//*        S6 now contains A1^2.  Clear bits 2 and 1,
	//*        which represent bits 2^-34 and 2^-35.
	//*
	//S5        6
	//S6        #S5&S6
	S5 = 1;                       // We're taking S1 (the input, or B1), extract the mantissa, making sure it's normalized and put it in S5
	S5 = S5 << 47;                // ...
	S1 = S1 | S5;                 // ...
	S5 = MaskRight(48);           // ...
	S5 = S1 & S5;                 // ...
	S5 = S5 >> 11;                // We keep only the top 37 bits, which (according to the source) forms B2
	DebugB2 = S5;
	S7 = S7 << 2;                 // DIFFERENCE WITH THE SOURCE!!!! THIS IS S7 S7<1 there!!!!!!! Still, tests only pass with 2 being here, so this must be accounted for elsewhere
								  // Here we create 2*A1 in S7, but that only makes sense if we follow the original source code!
	S6 = S6 << 12;                // This most likely achieves nothing (used in the original code to store A1^2 into memory
	DebugA1sq = S6;
	S6 = S6 >> 12;                // ...
								  // *FORM A2=-(A1**2B2-2A1)=2A1-A1**2B2
								  // *LOWER 37 BITS OF S5=B2,LOWER 36 BITS OF S6=A1**2
								  // *LOWER 37 BITS OF S7=2A1
								  // *ITERATION FOR RM1,RM8 & RN2
	S4 = 0;
	S6 = S6 << 27;
	// Here the source starts to wildly differ from this disassembly...
	// But, according to notes in the source, the main problem seems to be that the multiplies are
	// larger than 64 bits at this point, so we need to do it in pieces...
	T8 = S7;                      // We save off  2*A1 for later
	S3 = S6;                      // S6 is A1^2


								  //////////////////////////////////////////////////////////////////////////////////////////////
								  //////////////////////////////////////////////////////////////////////////////////////////////
								  //////////////////////////////////////////////////////////////////////////////////////////////
								  // We start the multiply here. The two inputs are:
								  // S3 - top 37 bits are valid, the rest are 0
								  // S5 - bottom 37 bits are valid, the rest are 0
								  // The output ends up in S4
								  // During the multiplication we keep shifting S5 right and S3 left. The MSB of S3 determines if we add S5 to the partial product.
								  // Overall we seem to be doing 35 bits of partial products, but in a rather wierd way. Also, because S5 is (more or less) right-justified
								  // we don't seem to be doing a full product: we keep shifting valid bits out. So: we're doing a similar, truncated product as its done in
								  // the floating-point multiplier.
								  // My best reproduction of the multiplier triangle is this:
								  //     555555555555555555555555555555555555_
								  //     _555555555555555555555555555555555555
								  //     __55555555555555555555555555555555555
								  //     ___5555555555555555555555555555555555
								  //     clear LSB of sum
								  //     ____5555555555555555555555555555555__
								  //     _____5555555555555555555555555555555_
								  //     ______5555555555555555555555555555555
								  //     _______555555555555555555555555555555
								  //
								  //     ________5555555555555555555555555555555
								  //     _________555555555555555555555555555555
								  //     __________55555555555555555555555555555
								  //     clear LSB of partial sum-------------^ (but allow carries before cutting)
								  //     ___________555555555555555555555555555
								  //     clear LSB of partial sum-------------^ (but allow carries before cutting)
								  //
								  //     ____________5555555555555555555555555
								  //     _____________555555555555555555555555
								  //     ______________55555555555555555555555
								  //     _______________5555555555555555555555
								  //     ________________555555555555555555555
								  //     _________________55555555555555555555
								  //     __________________555555555555555555_
								  //     ___________________555555555555555555
								  //     clear LSB of sum
								  //     ____________________555555555555555__
								  //     _____________________555555555555555_
								  //     ______________________555555555555555
								  //     _______________________55555555555555
								  //
								  //     ________________________555555555555555
								  //     _________________________55555555555555
								  //     __________________________5555555555555
								  //     clear LSB of partial sum-------------^ (but allow carries before cutting)
								  //     ___________________________55555555555
								  //     clear LSB of partial sum-------------^ (but allow carries before cutting)
								  //     ____________________________555555555
								  //     _____________________________55555555
								  //     ______________________________5555555
								  //     _______________________________555555
								  //     ________________________________55555
								  //     __________________________________555
								  //     ___________________________________55
								  // This is not for certain, for example I'm left with an extra bit at the end, there's a jump at almost the end, and as you can see there's a bunch of LSB truncation
								  // as well as uneven right-side truncation in the triangle. This is actually reminescent of the original way the floating-point multiplier was described in the Cray-1
								  // manual. It's also interesting why the XMP version of the code seem to have used 128-bit arithemetic for this... Maybe it's worth looking at that code and re-construct
								  // the tree from there as well.
	S5 = S5 << 1;                  // S5 is B2, we multiply it by 2
	A6 = 0x00000004;               // Here, the comments seem to line up again: *ITRATION FOR RM2 & RM9,S5=B2>5,S3=A1**2<4
	S2 = Mult(S3, S5, A6);         // Do 4 partial products
	S2 = S2 >> 1;                  // Zero out the LSB of S2
	S2 = S2 << 1;                  // ...
	S4 = S4 + S2;                  // Add to the sum (well it's 0 so just move)
	S5 = S5 >> 2;                  // Zero out the bottom two LSBs of S5
	S7 = S5;                       // Not sure why we save the shifted version of S5, but it's going to contain two additional bits than what S5 is after the next partial product
	S5 = S5 << 2;                  // ...
	A6 = 0x00000004;               // Another partial product for the next 4 bits
	S2 = Mult(S3, S5, A6);         // ...
	S4 = S4 + S2;                  // ...
	S5 = S7;                       // Restore S5: this is now two more bits than it was after the partial product up there
	A6 = 0x00000003;               // Do three partial products (two of them are something that has been done previuosly, but of course S3 was shifted already
	S2 = Mult(S3, S5, A6);         // ...
	S2 = S2 >> 1;                  // We seem to be doing an extra bit of multiply here, though I'm not sure why we shift S2 first...
	S5 = S5 >> 1;                  // ...
	S0 = S3 << 1;                  // ...
	S3 = S3 << 1;                  // ...
	if (int64_t(S0) < 0) {         // ...
		S2 = S2 + S5;              // ...
	}                              // ...
	S5 = S5 >> 2;                  // We seem to be skipping a few bits here, so S5 lines up with the two shifts of S2. But I still don't get these shifts, especially since it gets added to the unshifted S4
	S2 = S2 >> 1;                  // ...
	S4 = S4 + S2;                  // Yet again, add the partial product to the accumulator
	A6 = 0x00000006;               // Compute another 6 bits of partial product
	S2 = Mult(S3, S5, A6);         // ...
	S5 = S5 >> 1;                  // Clear LSB of S5 yet again
	S5 = S5 << 1;                  // ...
	A6 = 0x00000002;               // Continue the partitial product with the now modified S5 for another two bits
	S2 = Mult(S2, S3, S5, A6);     // ...
	S2 = S2 >> 1;                  // Now clear the LSB of the partial product before adding it to the product
	S2 = S2 << 1;                  // ...
	S4 = S4 + S2;                  // Add it to the accumulator
	S5 = S5 >> 2;                  // Here we do the wierd product creation yet again...
	S7 = S5;
	S5 = S5 << 2;
	A6 = 0x00000004;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S4 = S4 + S2;
	S5 = S7;
	A6 = 0x00000003;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S2 = S2 >> 1;
	S5 = S5 >> 1;
	S0 = S3 << 1;
	S3 = S3 << 1;
	if (int64_t(S0) < 0) {
		S2 = S2 + S5;
	}
	S5 = S5 >> 2;
	S2 = S2 >> 1;
	S4 = S4 + S2;
	A6 = 0x00000005;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S3 = S3 << 1;
	S5 = S5 >> 1;
	A6 = 0x00000002;
	S2 = Mult(S2, S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S2 = S2 >> 1;
	S2 = S2 << 1;
	S4 = S4 + S2;


	S2 = 1;
	S4 = S4 >> 1;
	S2 = S2 & S4;
	S4 = S4 >> 1;
	S4 = S2 + S4;
	S4 = S4 >> 2;
	//////////////////////////////////////////////////////////////////////////////////////////////
	//////////////////////////////////////////////////////////////////////////////////////////////
	//////////////////////////////////////////////////////////////////////////////////////////////


	// I'm guessing here that
	//  S4 finally contains the product of A1^2 * B2
	// Here the sources start to line up again:
	//   *S4=FINAL RECIPROCAL PRIOR TO SHIFT UP
	//   *S4=A1**2B2,RIGHT JUSTIFIED,LOWER 37 BITS OF S7=2A1
	DebugA1sqB2 = S4;
	S7 = T8;                       // Restoring 2*A1
	S4 = S4 << 14;                 // Positioning the two mantissas for final add
	S7 = S7 << 12;                 // ...
	S6 = ~S7;                      // Comment says: S6=-2A1 W/O 2**5 COMP. CORR.
	S5 = S4 + S6;                  // Do the actual add
	S3 = 1;                        // Make it two's complement (original comment says: 2**5 COMP. CORRECTION
	S2 = S3 + S5;                  // ...
	S2 = ~S2;                      // Final compliment, according to orignal source
	S3 = 0400004000000000000000ULL; // 0.4
	S2 = S2 | S3;                  // This - according to the source - is adding in the normalize bit.
	S3 = MaskRight(33);            // We (in a complicated way) extract the mantissa, or at least the top 33 bits of it
	S3 = S3 << 15;                 // ...
	S2 = S2 & S3;                  // ...
	S3 = MaskLeft(15);             // Extract the exponent of the input
	S3 = S3 >> 1;                  // ...
	S3 = S1 & S3;                  // ...
	S7 = 0x0000000000006000ull;    // Test for valid exponent range
	S7 = S7 << 48;                 // ...
	S6 = 0x0000000000002002ull;    // ...
	S6 = S6 << 48;                 // ...
	if (int64_t(S3 - S7) < 0 && int64_t(S3 - S6) >= 0) {
		// Exponent valid, invert it (it's complicated due to the offset nature of the exponent, but that's what's going on here, I believe
		S4 = 0x0000000000000002;
		S4 = S4 << 48;
		S3 = ~S3;
		S3 = S3 + S4;
	}
	else {
		// Exponent invalid, clear the normalize bit in mantissa and saturate exponent to max valid value
		S3 = 0x0000000000006000;
		S3 = S3 << 48;
		S4 = 1;
		S4 = S4 << 47;
		S2 = (~S4) & S2;
	}
	// At this point:
	// S3: exponent
	// S2: mantissa
	// All we need to do is copy the pieces together, including the sign bit from the input
	S4 = MaskRight(63); // form 63 ones from right
	S3 = S3 & S4;
	S4 = S1 & 0x8000000000000000ull;
	S3 = S3 | S4;
	S4 = MaskLeft(16); // form 16 ones from left
	S3 = S3 & S4;
	S4 = S2 | S3;
	DebugResult = S4;
	return S4;
}

#if 0
static uint64_t OldOldReciprocApprox(uint64_t aInput) {
	uint64_t S0, S1, S2, S3, S4, S5, S6, S7;
	uint32_t A6, A7;
	uint64_t T8;
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@ SW implementation of reciproc approximation
	//@@ Input:  S1 - the value to compute the reciprocal for
	//@@ Output : S4 - output S4 ~= 1.0 / S1
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	//@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	S1 = aInput;
	S0 = 0;
	S2 = MaskRight(7);   // Mask out bits 40...47 from the input, and use it to look up initial approximation; form 7 ones from right
	S2 = S2 << 40;
	S2 = S1 & S2;
	S2 = S2 >> 40;
	A7 = uint32_t(S2);
	CRAY_ASSERT(A7 < sizeof(ReciprocLookup_s) / sizeof(ReciprocLookup_s[0]));
	S7 = ReciprocLookup_s[A7];
	S6 = S7; // Put top 10 bits to S6
	S6 = S6 >> 18;
	S7 = S7 << 46; // Put the bottom 18 bits into S7
	S5 = S7;
	S5 = S5 >> 48; // S5 now contains bottom 18 bits divided by 4
	S7 = S1; // Extract the top 24 bits of the input exponent into S7
	S7 = S7 << 16;
	S7 = S7 >> 40;
	//0x00000B00 : p2(0x00000B00:p2) 0130100 : 0006336 : 0000000 - [0x00000CDE, ] S1               @ Store input in some debug location(? )
	S3 = S5; // Store some other data in some debug locations as well...
	S3 = S3 << 32;
	//0x00000B01 : p3(0x00000B01:p3) 0130300 : 0006340 : 0000000 - [0x00000CE0, ] S3
	S3 = S6;
	S3 = S3 << 39;
	//0x00000B03 : p0(0x00000B03:p0) 0130300 : 0006337 : 0000000 - [0x00000CDF, ] S3
	// At this point we have :
	// S1 : input
	// S7 : top 24 bits of input exponent
	// S6 : top 18 bits of lookup
	// S5 : bottom 18 bits of lookup, divded by 4 (or in other words bits 2...17 of lookup)

	// Multiply S5 by S7, put the result in S4.Result is aligned to the bottom of S4
	S5 = S5 << 47; // Align the bits 2...17 of the lookup to the top of S5
	S4 = 0;
	do {
		S0 = S5 << 1;
		S5 = S5 << 1;

		if (int64_t(S0) < 0) {
			S4 = S4 + S7;
		}
		S7 = S7 >> 1;
	} while (S0 != 0);
	// If I have to guess, we calculate the 2 - S5*S7 here, though the math seems rather funky...
	S7 = S6; // S6 is the top 18 bits: we multiply that by 256, add 1 to it, multiply by 256 again and add S6 / 2 to it
	S6 = S6 << 8;
	S7 = S7 >> 1;
	S3 = 1;
	S6 = S3 + S6;
	S6 = S6 << 8;
	S6 = S6 + S7; // S6 = S6 / 2 + (S6 * 256 + 1) * 256
	S6 = ~S6; // S6 not(0) xor S6
	S7 = S4 + S6; // S7 = S5 * S7 - (S6 / 2 + (S6 * 256 + 1) * 256) - 1
	S5 = 0x0000000000000100ull;
	S5 = S5 + S7; // S7 = S5 * S7 - (S6 / 2 + (S6 * 256 + 1) * 256) + 255
	S7 = ~S5; // S7 not(0) xor S5
	S7 = S7 >> 6; // S7 = -(S5 * S7 - (S6 / 2 + (S6 * 256 + 1) * 256) + 255) - 1
	S5 = MaskRight(18); // Masking out the bottom 18 bits of the result, shift it up to the top fo he exponent range and store it in a debug location.
	S7 = S5 & S7;
	S7 = S7 << 30;
	//0x00000B0A : p3(0x00000B0A:p3) 0130700 : 0006341 : 0000000 - [0x00000CE1, ] S7 |
	// Calculating S7 * (S7 << 17), put it in S6
	S7 = S7 >> 30;
	S5 = S7;
	S6 = 0;
	S7 = S7 << 17;
	S5 = S5 << 45;
	S4 = S7;
	do {
		S0 = S5 << 1;
		S5 = S5 << 1;
		if (int64_t(S0) < 0) {
			S6 = S6 + S4;
		}
		S4 = S4 >> 1;
	} while (S0 != 0);
	S5 = 1;
	S5 = S5 << 47;
	S1 = S1 | S5;
	S5 = MaskRight(48);
	S5 = S1 & S5;
	S5 = S5 >> 11;
	S7 = S7 << 2;
	S6 = S6 << 12;
	//0x00000B11 : p0(0x00000B11:p0) 0130600 : 0006342 : 0000000 - [0x00000CE2, ] S6 |
	S6 = S6 >> 12;
	S4 = 0;
	S6 = S6 << 27;
	T8 = S7;
	S3 = S6;
	S5 = S5 << 1;
	A6 = 0x00000004;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S4 = S4 + S2;
	S4 = S4 >> 1;
	S5 = S5 >> 2;
	S4 = S4 << 1;
	S7 = S5;
	S5 = S5 << 2;
	A6 = 0x00000004;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S4 = S4 + S2;
	S5 = S7;
	A6 = 0x00000003;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S2 = S2 >> 1;
	S5 = S5 >> 1;
	S0 = S3 << 1;
	S3 = S3 << 1;
	if (int64_t(S0) < 0) {
		S2 = S2 + S5;
	}
	S5 = S5 >> 2;
	S2 = S2 >> 1;
	S4 = S4 + S2;
	A6 = 0x00000006;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S5 = S5 >> 1;
	S5 = S5 << 1;
	A6 = 0x00000002;
	S2 = Mult(S2, S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S2 = S2 >> 1;
	S2 = S2 << 1;
	S4 = S4 + S2;
	S5 = S5 >> 2;
	S7 = S5;
	S5 = S5 << 2;
	A6 = 0x00000004;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S4 = S4 + S2;
	S5 = S7;
	A6 = 0x00000003;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S2 = S2 >> 1;
	S5 = S5 >> 1;
	S0 = S3 << 1;
	S3 = S3 << 1;
	if (int64_t(S0) < 0) {
		S2 = S2 + S5;
	}
	S5 = S5 >> 2;
	S2 = S2 >> 1;
	S4 = S4 + S2;
	A6 = 0x00000005;
	S2 = Mult(S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S3 = S3 << 1;
	S5 = S5 >> 1;
	A6 = 0x00000002;
	S2 = Mult(S2, S3, S5, A6); // S2 = S3 * S5(A6 number of bits)
	S2 = S2 >> 1;
	S2 = S2 << 1;
	S4 = S4 + S2;
	S2 = 1;
	S4 = S4 >> 1;
	S2 = S2 & S4;
	S4 = S4 >> 1;
	S4 = S2 + S4;
	S4 = S4 >> 2;
	S7 = T8;
	S4 = S4 << 14;
	S7 = S7 << 12;
	S6 = ~S7;
	S5 = S4 + S6;
	S3 = 1;
	S2 = S3 + S5;
	S2 = ~S2;
	S3 = 0400004000000000000000ULL; // 0.4
	S2 = S2 | S3;
	S3 = MaskRight(33); // form 33 ones from right
	S3 = S3 << 15;
	S2 = S2 & S3;
	S3 = MaskLeft(15); // form 15 ones from left
	S3 = S3 >> 1;
	S3 = S1 & S3;
	S7 = 0x0000000000006000ull;
	S7 = S7 << 48;
	S6 = 0x0000000000002002ull;
	S6 = S6 << 48;
	// TODO: this used to be ||, but I believe that's incorrect: we're checking for exponent over or underflow, but the way
	//       the statement is set up is that the 'if' part is the non-overflow case, where it's *between* the bounds.
	//       The original code would under no condition execute the 'else' part.
	if (int64_t(S3 - S7) < 0 && int64_t(S3 - S6) >= 0) {
		S4 = 0x0000000000000002;
		S4 = S4 << 48;
		S3 = ~S3;
		S3 = S3 + S4;
	} else {
		S3 = 0x0000000000006000;
		S3 = S3 << 48;
		S4 = 1;
		S4 = S4 << 47;
		S2 = (~S4) & S2;
	}
	S4 = MaskRight(63); // form 63 ones from right
	S3 = S3 & S4;
	S4 = S1 & 0x8000000000000000ull;
	S3 = S3 | S4;
	S4 = MaskLeft(16); // form 16 ones from left
	S3 = S3 & S4;
	S4 = S2 | S3;
	return S4;
}
#endif // 0


static class InvertLookup {
public:
	InvertLookup() {
		size_t Scale = sizeof(Table) / sizeof(Table[0]);
		for (size_t i = 1; i<Scale; ++i) {
			double Num = 2.0 / Scale*i;
			double Inv = 1.0 / Num;
			long Exp = (long)(log(Inv) / log(2.0) + 0.5);
			long long Coef = (long long)(Inv / (double)(1ULL << std::min<long>(63, Exp)) * (double)(1ULL << (CFloat_t::FractSize - 1)));
			if (Coef >(1LL << 48)) {
				Coef /= 2;
				++Exp;
			}
			Table[i] = CFloat_t(Exp, Coef);
			Table[i].Normalize();
		}
		Table[0] = Table[1];
	};

	CFloat_t Table[256];
} sInvertLookup;

static const CFloat_t CFloat2p0(1, (1ULL << 47));

CFloat_t ReciprocApprox(const CFloat_t &aInput) {
	return CFloat_t(ReciprocApprox(aInput.Value));
}

CFloat_t OldReciprocApprox(const CFloat_t &aInput) {
	return CFloat_t(OldReciprocApprox(aInput.Value));
}


/**************************************************/


void CFloat_t::Normalize() {
	if ((Value & FractMask) == 0) {
		Value = 0;
		return;
	}
	while (((Value & (1ULL << (FractSize-1))) == 0) && (((Value & ExpMask) >> ExpBitPos) > (uint64_t)ExpMin)) {
		Value = (Value & FractSignMask) | (((Value & ExpMask) - (1ULL << ExpBitPos)) & ExpMask) | ((Value << 1) & FractMask);
	}
}

double CFloat_t::ToDouble() const {
	// Doubles have a different binary representation. Important differences:
	// - only 11 bits of exponent
	// - normalized numbers don't store the leading '1' in the fractional
	CFloat_t RetVal = *this;
	RetVal.Normalize();
	unsigned long long URetVal;
	if (RetVal.Value == 0) {
		URetVal = 0;
	} else {
		long MyExp = RetVal.Exp();
		if (MyExp > -1023) {
			// Normalized value
			URetVal = (RetVal.Value & RetVal.FractSignMask) | (((unsigned long long)MyExp + 1023) << 52) | ((RetVal.Value & ((1ULL << (RetVal.FractSize-1)) - 1)) << (53 - RetVal.FractSize));
		} else {
			// Un-normalized value
			unsigned long long MyUFract = RetVal.UFract();
			long Shift = std::min<long>(63,-MyExp - 1023);
			MyUFract >>= Shift;
			URetVal = (RetVal.Value & RetVal.FractSignMask) | (MyUFract << (52 - RetVal.FractSize));
		}
	}
	return *((double*)(&URetVal));
}

CFloat_t ReciprocIterate(const CFloat_t &aLeft,const CFloat_t &aRight) {
	//CFloat_t RetValOri = CFloat2p0 - aLeft * aRight;
	CFloat_t RetValNew = Mult(aLeft, aRight, false, false, true);
	return RetValNew;
}

CFloat_t RoundedMult(const CFloat_t &aLeft, const CFloat_t &aRight) {
	return Mult(aLeft,aRight,true);
}

CFloat_t HalfPrecisionMult(const CFloat_t &aLeft, const CFloat_t &aRight) {
	return Mult(aLeft, aRight, false, true);
}