Please file new bugs on Launchpad: Invirt or XVM (if you're not sure which, just pick one)

Context Navigation

source: trunk/packages/xen-common/xen-common/tools/ioemu/fpu/softfloat.c @ 34

Last change on this file since 34 was 34, checked in by hartmans, 17 years ago
Add xen and xen-common
File size: 188.2 KB

Line
1
2	/*============================================================================
3
4	This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5	Package, Release 2b.
6
7	Written by John R. Hauser. This work was made possible in part by the
8	International Computer Science Institute, located at Suite 600, 1947 Center
9	Street, Berkeley, California 94704. Funding was partially provided by the
10	National Science Foundation under grant MIP-9311980. The original version
11	of this code was written as part of a project to build a fixed-point vector
12	processor in collaboration with the University of California at Berkeley,
13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15	arithmetic/SoftFloat.html'.
16
17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26	Derivative works are acceptable, even for commercial purposes, so long as
27	(1) the source code for the derivative work includes prominent notice that
28	the work is derivative, and (2) the source code includes prominent notice with
29	these four paragraphs for those parts of this code that are retained.
30
31	=============================================================================*/
32
33	#include "softfloat.h"
34
/*----------------------------------------------------------------------------
| Primitive arithmetic functions, including multi-word arithmetic, and
| division and square root approximations.  (Can be specialized to target if
| desired.)
*----------------------------------------------------------------------------*/
40	#include "softfloat-macros.h"
41
/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
50	#include "softfloat-specialize.h"
51
52	void set_float_rounding_mode(int val STATUS_PARAM)
53	{
54	STATUS(float_rounding_mode) = val;
55	}
56
57	void set_float_exception_flags(int val STATUS_PARAM)
58	{
59	STATUS(float_exception_flags) = val;
60	}
61
62	#ifdef FLOATX80
63	void set_floatx80_rounding_precision(int val STATUS_PARAM)
64	{
65	STATUS(floatx80_rounding_precision) = val;
66	}
67	#endif
68
69	/*----------------------------------------------------------------------------
70	\| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71	\| and 7, and returns the properly rounded 32-bit integer corresponding to the
72	\| input. If `zSign' is 1, the input is negated before being converted to an
73	\| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74	\| is simply rounded to an integer, with the inexact exception raised if the
75	\| input cannot be represented exactly as an integer. However, if the fixed-
76	\| point input is too large, the invalid exception is raised and the largest
77	\| positive or negative integer is returned.
78	----------------------------------------------------------------------------/
79
80	static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81	{
82	int8 roundingMode;
83	flag roundNearestEven;
84	int8 roundIncrement, roundBits;
85	int32 z;
86
87	roundingMode = STATUS(float_rounding_mode);
88	roundNearestEven = ( roundingMode == float_round_nearest_even );
89	roundIncrement = 0x40;
90	if ( ! roundNearestEven ) {
91	if ( roundingMode == float_round_to_zero ) {
92	roundIncrement = 0;
93	}
94	else {
95	roundIncrement = 0x7F;
96	if ( zSign ) {
97	if ( roundingMode == float_round_up ) roundIncrement = 0;
98	}
99	else {
100	if ( roundingMode == float_round_down ) roundIncrement = 0;
101	}
102	}
103	}
104	roundBits = absZ & 0x7F;
105	absZ = ( absZ + roundIncrement )>>7;
106	absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107	z = absZ;
108	if ( zSign ) z = - z;
109	if ( ( absZ>>32 ) \|\| ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110	float_raise( float_flag_invalid STATUS_VAR);
111	return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112	}
113	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
114	return z;
115
116	}
117
118	/*----------------------------------------------------------------------------
119	\| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120	\| `absZ1', with binary point between bits 63 and 64 (between the input words),
121	\| and returns the properly rounded 64-bit integer corresponding to the input.
122	\| If `zSign' is 1, the input is negated before being converted to an integer.
123	\| Ordinarily, the fixed-point input is simply rounded to an integer, with
124	\| the inexact exception raised if the input cannot be represented exactly as
125	\| an integer. However, if the fixed-point input is too large, the invalid
126	\| exception is raised and the largest positive or negative integer is
127	\| returned.
128	----------------------------------------------------------------------------/
129
130	static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131	{
132	int8 roundingMode;
133	flag roundNearestEven, increment;
134	int64 z;
135
136	roundingMode = STATUS(float_rounding_mode);
137	roundNearestEven = ( roundingMode == float_round_nearest_even );
138	increment = ( (sbits64) absZ1 < 0 );
139	if ( ! roundNearestEven ) {
140	if ( roundingMode == float_round_to_zero ) {
141	increment = 0;
142	}
143	else {
144	if ( zSign ) {
145	increment = ( roundingMode == float_round_down ) && absZ1;
146	}
147	else {
148	increment = ( roundingMode == float_round_up ) && absZ1;
149	}
150	}
151	}
152	if ( increment ) {
153	++absZ0;
154	if ( absZ0 == 0 ) goto overflow;
155	absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156	}
157	z = absZ0;
158	if ( zSign ) z = - z;
159	if ( z && ( ( z < 0 ) ^ zSign ) ) {
160	overflow:
161	float_raise( float_flag_invalid STATUS_VAR);
162	return
163	zSign ? (sbits64) LIT64( 0x8000000000000000 )
164	: LIT64( 0x7FFFFFFFFFFFFFFF );
165	}
166	if ( absZ1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
167	return z;
168
169	}
170
171	/*----------------------------------------------------------------------------
172	\| Returns the fraction bits of the single-precision floating-point value `a'.
173	----------------------------------------------------------------------------/
174
175	INLINE bits32 extractFloat32Frac( float32 a )
176	{
177
178	return a & 0x007FFFFF;
179
180	}
181
182	/*----------------------------------------------------------------------------
183	\| Returns the exponent bits of the single-precision floating-point value `a'.
184	----------------------------------------------------------------------------/
185
186	INLINE int16 extractFloat32Exp( float32 a )
187	{
188
189	return ( a>>23 ) & 0xFF;
190
191	}
192
193	/*----------------------------------------------------------------------------
194	\| Returns the sign bit of the single-precision floating-point value `a'.
195	----------------------------------------------------------------------------/
196
197	INLINE flag extractFloat32Sign( float32 a )
198	{
199
200	return a>>31;
201
202	}
203
204	/*----------------------------------------------------------------------------
205	\| Normalizes the subnormal single-precision floating-point value represented
206	\| by the denormalized significand `aSig'. The normalized exponent and
207	\| significand are stored at the locations pointed to by `zExpPtr' and
208	\| `zSigPtr', respectively.
209	----------------------------------------------------------------------------/
210
211	static void
212	normalizeFloat32Subnormal( bits32 aSig, int16 zExpPtr, bits32 zSigPtr )
213	{
214	int8 shiftCount;
215
216	shiftCount = countLeadingZeros32( aSig ) - 8;
217	*zSigPtr = aSig<<shiftCount;
218	*zExpPtr = 1 - shiftCount;
219
220	}
221
222	/*----------------------------------------------------------------------------
223	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224	\| single-precision floating-point value, returning the result. After being
225	\| shifted into the proper positions, the three fields are simply added
226	\| together to form the result. This means that any integer portion of `zSig'
227	\| will be added into the exponent. Since a properly normalized significand
228	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
229	\| than the desired result exponent whenever `zSig' is a complete, normalized
230	\| significand.
231	----------------------------------------------------------------------------/
232
233	INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234	{
235
236	return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
237
238	}
239
240	/*----------------------------------------------------------------------------
241	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
242	\| and significand `zSig', and returns the proper single-precision floating-
243	\| point value corresponding to the abstract input. Ordinarily, the abstract
244	\| value is simply rounded and packed into the single-precision format, with
245	\| the inexact exception raised if the abstract input cannot be represented
246	\| exactly. However, if the abstract value is too large, the overflow and
247	\| inexact exceptions are raised and an infinity or maximal finite value is
248	\| returned. If the abstract value is too small, the input value is rounded to
249	\| a subnormal number, and the underflow and inexact exceptions are raised if
250	\| the abstract input cannot be represented exactly as a subnormal single-
251	\| precision floating-point number.
252	\| The input significand `zSig' has its binary point between bits 30
253	\| and 29, which is 7 bits to the left of the usual location. This shifted
254	\| significand must be normalized or smaller. If `zSig' is not normalized,
255	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
256	\| and it must not require rounding. In the usual case that `zSig' is
257	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
258	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
259	\| Binary Floating-Point Arithmetic.
260	----------------------------------------------------------------------------/
261
262	static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
263	{
264	int8 roundingMode;
265	flag roundNearestEven;
266	int8 roundIncrement, roundBits;
267	flag isTiny;
268
269	roundingMode = STATUS(float_rounding_mode);
270	roundNearestEven = ( roundingMode == float_round_nearest_even );
271	roundIncrement = 0x40;
272	if ( ! roundNearestEven ) {
273	if ( roundingMode == float_round_to_zero ) {
274	roundIncrement = 0;
275	}
276	else {
277	roundIncrement = 0x7F;
278	if ( zSign ) {
279	if ( roundingMode == float_round_up ) roundIncrement = 0;
280	}
281	else {
282	if ( roundingMode == float_round_down ) roundIncrement = 0;
283	}
284	}
285	}
286	roundBits = zSig & 0x7F;
287	if ( 0xFD <= (bits16) zExp ) {
288	if ( ( 0xFD < zExp )
289	\|\| ( ( zExp == 0xFD )
290	&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
291	) {
292	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
293	return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
294	}
295	if ( zExp < 0 ) {
296	isTiny =
297	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
298	\|\| ( zExp < -1 )
299	\|\| ( zSig + roundIncrement < 0x80000000 );
300	shift32RightJamming( zSig, - zExp, &zSig );
301	zExp = 0;
302	roundBits = zSig & 0x7F;
303	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
304	}
305	}
306	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
307	zSig = ( zSig + roundIncrement )>>7;
308	zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
309	if ( zSig == 0 ) zExp = 0;
310	return packFloat32( zSign, zExp, zSig );
311
312	}
313
314	/*----------------------------------------------------------------------------
315	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
316	\| and significand `zSig', and returns the proper single-precision floating-
317	\| point value corresponding to the abstract input. This routine is just like
318	\| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
319	\| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
320	\| floating-point exponent.
321	----------------------------------------------------------------------------/
322
323	static float32
324	normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
325	{
326	int8 shiftCount;
327
328	shiftCount = countLeadingZeros32( zSig ) - 1;
329	return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
330
331	}
332
333	/*----------------------------------------------------------------------------
334	\| Returns the fraction bits of the double-precision floating-point value `a'.
335	----------------------------------------------------------------------------/
336
337	INLINE bits64 extractFloat64Frac( float64 a )
338	{
339
340	return a & LIT64( 0x000FFFFFFFFFFFFF );
341
342	}
343
344	/*----------------------------------------------------------------------------
345	\| Returns the exponent bits of the double-precision floating-point value `a'.
346	----------------------------------------------------------------------------/
347
348	INLINE int16 extractFloat64Exp( float64 a )
349	{
350
351	return ( a>>52 ) & 0x7FF;
352
353	}
354
355	/*----------------------------------------------------------------------------
356	\| Returns the sign bit of the double-precision floating-point value `a'.
357	----------------------------------------------------------------------------/
358
359	INLINE flag extractFloat64Sign( float64 a )
360	{
361
362	return a>>63;
363
364	}
365
366	/*----------------------------------------------------------------------------
367	\| Normalizes the subnormal double-precision floating-point value represented
368	\| by the denormalized significand `aSig'. The normalized exponent and
369	\| significand are stored at the locations pointed to by `zExpPtr' and
370	\| `zSigPtr', respectively.
371	----------------------------------------------------------------------------/
372
373	static void
374	normalizeFloat64Subnormal( bits64 aSig, int16 zExpPtr, bits64 zSigPtr )
375	{
376	int8 shiftCount;
377
378	shiftCount = countLeadingZeros64( aSig ) - 11;
379	*zSigPtr = aSig<<shiftCount;
380	*zExpPtr = 1 - shiftCount;
381
382	}
383
384	/*----------------------------------------------------------------------------
385	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
386	\| double-precision floating-point value, returning the result. After being
387	\| shifted into the proper positions, the three fields are simply added
388	\| together to form the result. This means that any integer portion of `zSig'
389	\| will be added into the exponent. Since a properly normalized significand
390	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
391	\| than the desired result exponent whenever `zSig' is a complete, normalized
392	\| significand.
393	----------------------------------------------------------------------------/
394
395	INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
396	{
397
398	return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
399
400	}
401
402	/*----------------------------------------------------------------------------
403	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
404	\| and significand `zSig', and returns the proper double-precision floating-
405	\| point value corresponding to the abstract input. Ordinarily, the abstract
406	\| value is simply rounded and packed into the double-precision format, with
407	\| the inexact exception raised if the abstract input cannot be represented
408	\| exactly. However, if the abstract value is too large, the overflow and
409	\| inexact exceptions are raised and an infinity or maximal finite value is
410	\| returned. If the abstract value is too small, the input value is rounded
411	\| to a subnormal number, and the underflow and inexact exceptions are raised
412	\| if the abstract input cannot be represented exactly as a subnormal double-
413	\| precision floating-point number.
414	\| The input significand `zSig' has its binary point between bits 62
415	\| and 61, which is 10 bits to the left of the usual location. This shifted
416	\| significand must be normalized or smaller. If `zSig' is not normalized,
417	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
418	\| and it must not require rounding. In the usual case that `zSig' is
419	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
420	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
421	\| Binary Floating-Point Arithmetic.
422	----------------------------------------------------------------------------/
423
424	static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
425	{
426	int8 roundingMode;
427	flag roundNearestEven;
428	int16 roundIncrement, roundBits;
429	flag isTiny;
430
431	roundingMode = STATUS(float_rounding_mode);
432	roundNearestEven = ( roundingMode == float_round_nearest_even );
433	roundIncrement = 0x200;
434	if ( ! roundNearestEven ) {
435	if ( roundingMode == float_round_to_zero ) {
436	roundIncrement = 0;
437	}
438	else {
439	roundIncrement = 0x3FF;
440	if ( zSign ) {
441	if ( roundingMode == float_round_up ) roundIncrement = 0;
442	}
443	else {
444	if ( roundingMode == float_round_down ) roundIncrement = 0;
445	}
446	}
447	}
448	roundBits = zSig & 0x3FF;
449	if ( 0x7FD <= (bits16) zExp ) {
450	if ( ( 0x7FD < zExp )
451	\|\| ( ( zExp == 0x7FD )
452	&& ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
453	) {
454	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
455	return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
456	}
457	if ( zExp < 0 ) {
458	isTiny =
459	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
460	\|\| ( zExp < -1 )
461	\|\| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
462	shift64RightJamming( zSig, - zExp, &zSig );
463	zExp = 0;
464	roundBits = zSig & 0x3FF;
465	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
466	}
467	}
468	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
469	zSig = ( zSig + roundIncrement )>>10;
470	zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
471	if ( zSig == 0 ) zExp = 0;
472	return packFloat64( zSign, zExp, zSig );
473
474	}
475
476	/*----------------------------------------------------------------------------
477	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
478	\| and significand `zSig', and returns the proper double-precision floating-
479	\| point value corresponding to the abstract input. This routine is just like
480	\| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
481	\| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
482	\| floating-point exponent.
483	----------------------------------------------------------------------------/
484
485	static float64
486	normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
487	{
488	int8 shiftCount;
489
490	shiftCount = countLeadingZeros64( zSig ) - 1;
491	return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
492
493	}
494
495	#ifdef FLOATX80
496
497	/*----------------------------------------------------------------------------
498	\| Returns the fraction bits of the extended double-precision floating-point
499	\| value `a'.
500	----------------------------------------------------------------------------/
501
502	INLINE bits64 extractFloatx80Frac( floatx80 a )
503	{
504
505	return a.low;
506
507	}
508
509	/*----------------------------------------------------------------------------
510	\| Returns the exponent bits of the extended double-precision floating-point
511	\| value `a'.
512	----------------------------------------------------------------------------/
513
514	INLINE int32 extractFloatx80Exp( floatx80 a )
515	{
516
517	return a.high & 0x7FFF;
518
519	}
520
521	/*----------------------------------------------------------------------------
522	\| Returns the sign bit of the extended double-precision floating-point value
523	\| `a'.
524	----------------------------------------------------------------------------/
525
526	INLINE flag extractFloatx80Sign( floatx80 a )
527	{
528
529	return a.high>>15;
530
531	}
532
533	/*----------------------------------------------------------------------------
534	\| Normalizes the subnormal extended double-precision floating-point value
535	\| represented by the denormalized significand `aSig'. The normalized exponent
536	\| and significand are stored at the locations pointed to by `zExpPtr' and
537	\| `zSigPtr', respectively.
538	----------------------------------------------------------------------------/
539
540	static void
541	normalizeFloatx80Subnormal( bits64 aSig, int32 zExpPtr, bits64 zSigPtr )
542	{
543	int8 shiftCount;
544
545	shiftCount = countLeadingZeros64( aSig );
546	*zSigPtr = aSig<<shiftCount;
547	*zExpPtr = 1 - shiftCount;
548
549	}
550
551	/*----------------------------------------------------------------------------
552	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
553	\| extended double-precision floating-point value, returning the result.
554	----------------------------------------------------------------------------/
555
556	INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
557	{
558	floatx80 z;
559
560	z.low = zSig;
561	z.high = ( ( (bits16) zSign )<<15 ) + zExp;
562	return z;
563
564	}
565
566	/*----------------------------------------------------------------------------
567	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
568	\| and extended significand formed by the concatenation of `zSig0' and `zSig1',
569	\| and returns the proper extended double-precision floating-point value
570	\| corresponding to the abstract input. Ordinarily, the abstract value is
571	\| rounded and packed into the extended double-precision format, with the
572	\| inexact exception raised if the abstract input cannot be represented
573	\| exactly. However, if the abstract value is too large, the overflow and
574	\| inexact exceptions are raised and an infinity or maximal finite value is
575	\| returned. If the abstract value is too small, the input value is rounded to
576	\| a subnormal number, and the underflow and inexact exceptions are raised if
577	\| the abstract input cannot be represented exactly as a subnormal extended
578	\| double-precision floating-point number.
579	\| If `roundingPrecision' is 32 or 64, the result is rounded to the same
580	\| number of bits as single or double precision, respectively. Otherwise, the
581	\| result is rounded to the full precision of the extended double-precision
582	\| format.
583	\| The input significand must be normalized or smaller. If the input
584	\| significand is not normalized, `zExp' must be 0; in that case, the result
585	\| returned is a subnormal number, and it must not require rounding. The
586	\| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
587	\| Floating-Point Arithmetic.
588	----------------------------------------------------------------------------/
589
590	static floatx80
591	roundAndPackFloatx80(
592	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
593	STATUS_PARAM)
594	{
595	int8 roundingMode;
596	flag roundNearestEven, increment, isTiny;
597	int64 roundIncrement, roundMask, roundBits;
598
599	roundingMode = STATUS(float_rounding_mode);
600	roundNearestEven = ( roundingMode == float_round_nearest_even );
601	if ( roundingPrecision == 80 ) goto precision80;
602	if ( roundingPrecision == 64 ) {
603	roundIncrement = LIT64( 0x0000000000000400 );
604	roundMask = LIT64( 0x00000000000007FF );
605	}
606	else if ( roundingPrecision == 32 ) {
607	roundIncrement = LIT64( 0x0000008000000000 );
608	roundMask = LIT64( 0x000000FFFFFFFFFF );
609	}
610	else {
611	goto precision80;
612	}
613	zSig0 \|= ( zSig1 != 0 );
614	if ( ! roundNearestEven ) {
615	if ( roundingMode == float_round_to_zero ) {
616	roundIncrement = 0;
617	}
618	else {
619	roundIncrement = roundMask;
620	if ( zSign ) {
621	if ( roundingMode == float_round_up ) roundIncrement = 0;
622	}
623	else {
624	if ( roundingMode == float_round_down ) roundIncrement = 0;
625	}
626	}
627	}
628	roundBits = zSig0 & roundMask;
629	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
630	if ( ( 0x7FFE < zExp )
631	\|\| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
632	) {
633	goto overflow;
634	}
635	if ( zExp <= 0 ) {
636	isTiny =
637	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
638	\|\| ( zExp < 0 )
639	\|\| ( zSig0 <= zSig0 + roundIncrement );
640	shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
641	zExp = 0;
642	roundBits = zSig0 & roundMask;
643	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
644	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
645	zSig0 += roundIncrement;
646	if ( (sbits64) zSig0 < 0 ) zExp = 1;
647	roundIncrement = roundMask + 1;
648	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
649	roundMask \|= roundIncrement;
650	}
651	zSig0 &= ~ roundMask;
652	return packFloatx80( zSign, zExp, zSig0 );
653	}
654	}
655	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
656	zSig0 += roundIncrement;
657	if ( zSig0 < roundIncrement ) {
658	++zExp;
659	zSig0 = LIT64( 0x8000000000000000 );
660	}
661	roundIncrement = roundMask + 1;
662	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
663	roundMask \|= roundIncrement;
664	}
665	zSig0 &= ~ roundMask;
666	if ( zSig0 == 0 ) zExp = 0;
667	return packFloatx80( zSign, zExp, zSig0 );
668	precision80:
669	increment = ( (sbits64) zSig1 < 0 );
670	if ( ! roundNearestEven ) {
671	if ( roundingMode == float_round_to_zero ) {
672	increment = 0;
673	}
674	else {
675	if ( zSign ) {
676	increment = ( roundingMode == float_round_down ) && zSig1;
677	}
678	else {
679	increment = ( roundingMode == float_round_up ) && zSig1;
680	}
681	}
682	}
683	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
684	if ( ( 0x7FFE < zExp )
685	\|\| ( ( zExp == 0x7FFE )
686	&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
687	&& increment
688	)
689	) {
690	roundMask = 0;
691	overflow:
692	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
693	if ( ( roundingMode == float_round_to_zero )
694	\|\| ( zSign && ( roundingMode == float_round_up ) )
695	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
696	) {
697	return packFloatx80( zSign, 0x7FFE, ~ roundMask );
698	}
699	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
700	}
701	if ( zExp <= 0 ) {
702	isTiny =
703	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
704	\|\| ( zExp < 0 )
705	\|\| ! increment
706	\|\| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
707	shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
708	zExp = 0;
709	if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
710	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
711	if ( roundNearestEven ) {
712	increment = ( (sbits64) zSig1 < 0 );
713	}
714	else {
715	if ( zSign ) {
716	increment = ( roundingMode == float_round_down ) && zSig1;
717	}
718	else {
719	increment = ( roundingMode == float_round_up ) && zSig1;
720	}
721	}
722	if ( increment ) {
723	++zSig0;
724	zSig0 &=
725	~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
726	if ( (sbits64) zSig0 < 0 ) zExp = 1;
727	}
728	return packFloatx80( zSign, zExp, zSig0 );
729	}
730	}
731	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
732	if ( increment ) {
733	++zSig0;
734	if ( zSig0 == 0 ) {
735	++zExp;
736	zSig0 = LIT64( 0x8000000000000000 );
737	}
738	else {
739	zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
740	}
741	}
742	else {
743	if ( zSig0 == 0 ) zExp = 0;
744	}
745	return packFloatx80( zSign, zExp, zSig0 );
746
747	}
748
749	/*----------------------------------------------------------------------------
750	\| Takes an abstract floating-point value having sign `zSign', exponent
751	\| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
752	\| and returns the proper extended double-precision floating-point value
753	\| corresponding to the abstract input. This routine is just like
754	\| `roundAndPackFloatx80' except that the input significand does not have to be
755	\| normalized.
756	----------------------------------------------------------------------------/
757
758	static floatx80
759	normalizeRoundAndPackFloatx80(
760	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
761	STATUS_PARAM)
762	{
763	int8 shiftCount;
764
765	if ( zSig0 == 0 ) {
766	zSig0 = zSig1;
767	zSig1 = 0;
768	zExp -= 64;
769	}
770	shiftCount = countLeadingZeros64( zSig0 );
771	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
772	zExp -= shiftCount;
773	return
774	roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
775
776	}
777
778	#endif
779
780	#ifdef FLOAT128
781
782	/*----------------------------------------------------------------------------
783	\| Returns the least-significant 64 fraction bits of the quadruple-precision
784	\| floating-point value `a'.
785	----------------------------------------------------------------------------/
786
787	INLINE bits64 extractFloat128Frac1( float128 a )
788	{
789
790	return a.low;
791
792	}
793
794	/*----------------------------------------------------------------------------
795	\| Returns the most-significant 48 fraction bits of the quadruple-precision
796	\| floating-point value `a'.
797	----------------------------------------------------------------------------/
798
799	INLINE bits64 extractFloat128Frac0( float128 a )
800	{
801
802	return a.high & LIT64( 0x0000FFFFFFFFFFFF );
803
804	}
805
806	/*----------------------------------------------------------------------------
807	\| Returns the exponent bits of the quadruple-precision floating-point value
808	\| `a'.
809	----------------------------------------------------------------------------/
810
811	INLINE int32 extractFloat128Exp( float128 a )
812	{
813
814	return ( a.high>>48 ) & 0x7FFF;
815
816	}
817
818	/*----------------------------------------------------------------------------
819	\| Returns the sign bit of the quadruple-precision floating-point value `a'.
820	----------------------------------------------------------------------------/
821
822	INLINE flag extractFloat128Sign( float128 a )
823	{
824
825	return a.high>>63;
826
827	}
828
829	/*----------------------------------------------------------------------------
830	\| Normalizes the subnormal quadruple-precision floating-point value
831	\| represented by the denormalized significand formed by the concatenation of
832	\| `aSig0' and `aSig1'. The normalized exponent is stored at the location
833	\| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
834	\| significand are stored at the location pointed to by `zSig0Ptr', and the
835	\| least significant 64 bits of the normalized significand are stored at the
836	\| location pointed to by `zSig1Ptr'.
837	----------------------------------------------------------------------------/
838
839	static void
840	normalizeFloat128Subnormal(
841	bits64 aSig0,
842	bits64 aSig1,
843	int32 *zExpPtr,
844	bits64 *zSig0Ptr,
845	bits64 *zSig1Ptr
846	)
847	{
848	int8 shiftCount;
849
850	if ( aSig0 == 0 ) {
851	shiftCount = countLeadingZeros64( aSig1 ) - 15;
852	if ( shiftCount < 0 ) {
853	*zSig0Ptr = aSig1>>( - shiftCount );
854	*zSig1Ptr = aSig1<<( shiftCount & 63 );
855	}
856	else {
857	*zSig0Ptr = aSig1<<shiftCount;
858	*zSig1Ptr = 0;
859	}
860	*zExpPtr = - shiftCount - 63;
861	}
862	else {
863	shiftCount = countLeadingZeros64( aSig0 ) - 15;
864	shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
865	*zExpPtr = 1 - shiftCount;
866	}
867
868	}
869
870	/*----------------------------------------------------------------------------
871	\| Packs the sign `zSign', the exponent `zExp', and the significand formed
872	\| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
873	\| floating-point value, returning the result. After being shifted into the
874	\| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
875	\| added together to form the most significant 64 bits of the result. This
876	\| means that any integer portion of `zSig0' will be added into the exponent.
877	\| Since a properly normalized significand will have an integer portion equal
878	\| to 1, the `zExp' input should be 1 less than the desired result exponent
879	\| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
880	\| significand.
881	----------------------------------------------------------------------------/
882
883	INLINE float128
884	packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
885	{
886	float128 z;
887
888	z.low = zSig1;
889	z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
890	return z;
891
892	}
893
894	/*----------------------------------------------------------------------------
895	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
896	\| and extended significand formed by the concatenation of `zSig0', `zSig1',
897	\| and `zSig2', and returns the proper quadruple-precision floating-point value
898	\| corresponding to the abstract input. Ordinarily, the abstract value is
899	\| simply rounded and packed into the quadruple-precision format, with the
900	\| inexact exception raised if the abstract input cannot be represented
901	\| exactly. However, if the abstract value is too large, the overflow and
902	\| inexact exceptions are raised and an infinity or maximal finite value is
903	\| returned. If the abstract value is too small, the input value is rounded to
904	\| a subnormal number, and the underflow and inexact exceptions are raised if
905	\| the abstract input cannot be represented exactly as a subnormal quadruple-
906	\| precision floating-point number.
907	\| The input significand must be normalized or smaller. If the input
908	\| significand is not normalized, `zExp' must be 0; in that case, the result
909	\| returned is a subnormal number, and it must not require rounding. In the
910	\| usual case that the input significand is normalized, `zExp' must be 1 less
911	\| than the ``true'' floating-point exponent. The handling of underflow and
912	\| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
913	----------------------------------------------------------------------------/
914
915	static float128
916	roundAndPackFloat128(
917	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
918	{
919	int8 roundingMode;
920	flag roundNearestEven, increment, isTiny;
921
922	roundingMode = STATUS(float_rounding_mode);
923	roundNearestEven = ( roundingMode == float_round_nearest_even );
924	increment = ( (sbits64) zSig2 < 0 );
925	if ( ! roundNearestEven ) {
926	if ( roundingMode == float_round_to_zero ) {
927	increment = 0;
928	}
929	else {
930	if ( zSign ) {
931	increment = ( roundingMode == float_round_down ) && zSig2;
932	}
933	else {
934	increment = ( roundingMode == float_round_up ) && zSig2;
935	}
936	}
937	}
938	if ( 0x7FFD <= (bits32) zExp ) {
939	if ( ( 0x7FFD < zExp )
940	\|\| ( ( zExp == 0x7FFD )
941	&& eq128(
942	LIT64( 0x0001FFFFFFFFFFFF ),
943	LIT64( 0xFFFFFFFFFFFFFFFF ),
944	zSig0,
945	zSig1
946	)
947	&& increment
948	)
949	) {
950	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
951	if ( ( roundingMode == float_round_to_zero )
952	\|\| ( zSign && ( roundingMode == float_round_up ) )
953	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
954	) {
955	return
956	packFloat128(
957	zSign,
958	0x7FFE,
959	LIT64( 0x0000FFFFFFFFFFFF ),
960	LIT64( 0xFFFFFFFFFFFFFFFF )
961	);
962	}
963	return packFloat128( zSign, 0x7FFF, 0, 0 );
964	}
965	if ( zExp < 0 ) {
966	isTiny =
967	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
968	\|\| ( zExp < -1 )
969	\|\| ! increment
970	\|\| lt128(
971	zSig0,
972	zSig1,
973	LIT64( 0x0001FFFFFFFFFFFF ),
974	LIT64( 0xFFFFFFFFFFFFFFFF )
975	);
976	shift128ExtraRightJamming(
977	zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
978	zExp = 0;
979	if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
980	if ( roundNearestEven ) {
981	increment = ( (sbits64) zSig2 < 0 );
982	}
983	else {
984	if ( zSign ) {
985	increment = ( roundingMode == float_round_down ) && zSig2;
986	}
987	else {
988	increment = ( roundingMode == float_round_up ) && zSig2;
989	}
990	}
991	}
992	}
993	if ( zSig2 ) STATUS(float_exception_flags) \|= float_flag_inexact;
994	if ( increment ) {
995	add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
996	zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
997	}
998	else {
999	if ( ( zSig0 \| zSig1 ) == 0 ) zExp = 0;
1000	}
1001	return packFloat128( zSign, zExp, zSig0, zSig1 );
1002
1003	}
1004
1005	/*----------------------------------------------------------------------------
1006	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007	\| and significand formed by the concatenation of `zSig0' and `zSig1', and
1008	\| returns the proper quadruple-precision floating-point value corresponding
1009	\| to the abstract input. This routine is just like `roundAndPackFloat128'
1010	\| except that the input significand has fewer bits and does not have to be
1011	\| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1012	\| point exponent.
1013	----------------------------------------------------------------------------/
1014
1015	static float128
1016	normalizeRoundAndPackFloat128(
1017	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018	{
1019	int8 shiftCount;
1020	bits64 zSig2;
1021
1022	if ( zSig0 == 0 ) {
1023	zSig0 = zSig1;
1024	zSig1 = 0;
1025	zExp -= 64;
1026	}
1027	shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028	if ( 0 <= shiftCount ) {
1029	zSig2 = 0;
1030	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031	}
1032	else {
1033	shift128ExtraRightJamming(
1034	zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035	}
1036	zExp -= shiftCount;
1037	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038
1039	}
1040
1041	#endif
1042
1043	/*----------------------------------------------------------------------------
1044	\| Returns the result of converting the 32-bit two's complement integer `a'
1045	\| to the single-precision floating-point format. The conversion is performed
1046	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047	----------------------------------------------------------------------------/
1048
1049	float32 int32_to_float32( int32 a STATUS_PARAM )
1050	{
1051	flag zSign;
1052
1053	if ( a == 0 ) return 0;
1054	if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055	zSign = ( a < 0 );
1056	return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057
1058	}
1059
1060	/*----------------------------------------------------------------------------
1061	\| Returns the result of converting the 32-bit two's complement integer `a'
1062	\| to the double-precision floating-point format. The conversion is performed
1063	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064	----------------------------------------------------------------------------/
1065
1066	float64 int32_to_float64( int32 a STATUS_PARAM )
1067	{
1068	flag zSign;
1069	uint32 absA;
1070	int8 shiftCount;
1071	bits64 zSig;
1072
1073	if ( a == 0 ) return 0;
1074	zSign = ( a < 0 );
1075	absA = zSign ? - a : a;
1076	shiftCount = countLeadingZeros32( absA ) + 21;
1077	zSig = absA;
1078	return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079
1080	}
1081
1082	#ifdef FLOATX80
1083
1084	/*----------------------------------------------------------------------------
1085	\| Returns the result of converting the 32-bit two's complement integer `a'
1086	\| to the extended double-precision floating-point format. The conversion
1087	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088	\| Arithmetic.
1089	----------------------------------------------------------------------------/
1090
1091	floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092	{
1093	flag zSign;
1094	uint32 absA;
1095	int8 shiftCount;
1096	bits64 zSig;
1097
1098	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099	zSign = ( a < 0 );
1100	absA = zSign ? - a : a;
1101	shiftCount = countLeadingZeros32( absA ) + 32;
1102	zSig = absA;
1103	return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104
1105	}
1106
1107	#endif
1108
1109	#ifdef FLOAT128
1110
1111	/*----------------------------------------------------------------------------
1112	\| Returns the result of converting the 32-bit two's complement integer `a' to
1113	\| the quadruple-precision floating-point format. The conversion is performed
1114	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115	----------------------------------------------------------------------------/
1116
1117	float128 int32_to_float128( int32 a STATUS_PARAM )
1118	{
1119	flag zSign;
1120	uint32 absA;
1121	int8 shiftCount;
1122	bits64 zSig0;
1123
1124	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125	zSign = ( a < 0 );
1126	absA = zSign ? - a : a;
1127	shiftCount = countLeadingZeros32( absA ) + 17;
1128	zSig0 = absA;
1129	return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130
1131	}
1132
1133	#endif
1134
1135	/*----------------------------------------------------------------------------
1136	\| Returns the result of converting the 64-bit two's complement integer `a'
1137	\| to the single-precision floating-point format. The conversion is performed
1138	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139	----------------------------------------------------------------------------/
1140
1141	float32 int64_to_float32( int64 a STATUS_PARAM )
1142	{
1143	flag zSign;
1144	uint64 absA;
1145	int8 shiftCount;
1146
1147	if ( a == 0 ) return 0;
1148	zSign = ( a < 0 );
1149	absA = zSign ? - a : a;
1150	shiftCount = countLeadingZeros64( absA ) - 40;
1151	if ( 0 <= shiftCount ) {
1152	return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153	}
1154	else {
1155	shiftCount += 7;
1156	if ( shiftCount < 0 ) {
1157	shift64RightJamming( absA, - shiftCount, &absA );
1158	}
1159	else {
1160	absA <<= shiftCount;
1161	}
1162	return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163	}
1164
1165	}
1166
1167	/*----------------------------------------------------------------------------
1168	\| Returns the result of converting the 64-bit two's complement integer `a'
1169	\| to the double-precision floating-point format. The conversion is performed
1170	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1171	----------------------------------------------------------------------------/
1172
1173	float64 int64_to_float64( int64 a STATUS_PARAM )
1174	{
1175	flag zSign;
1176
1177	if ( a == 0 ) return 0;
1178	if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1179	return packFloat64( 1, 0x43E, 0 );
1180	}
1181	zSign = ( a < 0 );
1182	return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1183
1184	}
1185
1186	#ifdef FLOATX80
1187
1188	/*----------------------------------------------------------------------------
1189	\| Returns the result of converting the 64-bit two's complement integer `a'
1190	\| to the extended double-precision floating-point format. The conversion
1191	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1192	\| Arithmetic.
1193	----------------------------------------------------------------------------/
1194
1195	floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1196	{
1197	flag zSign;
1198	uint64 absA;
1199	int8 shiftCount;
1200
1201	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1202	zSign = ( a < 0 );
1203	absA = zSign ? - a : a;
1204	shiftCount = countLeadingZeros64( absA );
1205	return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1206
1207	}
1208
1209	#endif
1210
1211	#ifdef FLOAT128
1212
1213	/*----------------------------------------------------------------------------
1214	\| Returns the result of converting the 64-bit two's complement integer `a' to
1215	\| the quadruple-precision floating-point format. The conversion is performed
1216	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1217	----------------------------------------------------------------------------/
1218
1219	float128 int64_to_float128( int64 a STATUS_PARAM )
1220	{
1221	flag zSign;
1222	uint64 absA;
1223	int8 shiftCount;
1224	int32 zExp;
1225	bits64 zSig0, zSig1;
1226
1227	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228	zSign = ( a < 0 );
1229	absA = zSign ? - a : a;
1230	shiftCount = countLeadingZeros64( absA ) + 49;
1231	zExp = 0x406E - shiftCount;
1232	if ( 64 <= shiftCount ) {
1233	zSig1 = 0;
1234	zSig0 = absA;
1235	shiftCount -= 64;
1236	}
1237	else {
1238	zSig1 = absA;
1239	zSig0 = 0;
1240	}
1241	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1242	return packFloat128( zSign, zExp, zSig0, zSig1 );
1243
1244	}
1245
1246	#endif
1247
1248	/*----------------------------------------------------------------------------
1249	\| Returns the result of converting the single-precision floating-point value
1250	\| `a' to the 32-bit two's complement integer format. The conversion is
1251	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1252	\| Arithmetic---which means in particular that the conversion is rounded
1253	\| according to the current rounding mode. If `a' is a NaN, the largest
1254	\| positive integer is returned. Otherwise, if the conversion overflows, the
1255	\| largest integer with the same sign as `a' is returned.
1256	----------------------------------------------------------------------------/
1257
1258	int32 float32_to_int32( float32 a STATUS_PARAM )
1259	{
1260	flag aSign;
1261	int16 aExp, shiftCount;
1262	bits32 aSig;
1263	bits64 aSig64;
1264
1265	aSig = extractFloat32Frac( a );
1266	aExp = extractFloat32Exp( a );
1267	aSign = extractFloat32Sign( a );
1268	if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1269	if ( aExp ) aSig \|= 0x00800000;
1270	shiftCount = 0xAF - aExp;
1271	aSig64 = aSig;
1272	aSig64 <<= 32;
1273	if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1274	return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1275
1276	}
1277
1278	/*----------------------------------------------------------------------------
1279	\| Returns the result of converting the single-precision floating-point value
1280	\| `a' to the 32-bit two's complement integer format. The conversion is
1281	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1282	\| Arithmetic, except that the conversion is always rounded toward zero.
1283	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1284	\| the conversion overflows, the largest integer with the same sign as `a' is
1285	\| returned.
1286	----------------------------------------------------------------------------/
1287
1288	int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1289	{
1290	flag aSign;
1291	int16 aExp, shiftCount;
1292	bits32 aSig;
1293	int32 z;
1294
1295	aSig = extractFloat32Frac( a );
1296	aExp = extractFloat32Exp( a );
1297	aSign = extractFloat32Sign( a );
1298	shiftCount = aExp - 0x9E;
1299	if ( 0 <= shiftCount ) {
1300	if ( a != 0xCF000000 ) {
1301	float_raise( float_flag_invalid STATUS_VAR);
1302	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1303	}
1304	return (sbits32) 0x80000000;
1305	}
1306	else if ( aExp <= 0x7E ) {
1307	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1308	return 0;
1309	}
1310	aSig = ( aSig \| 0x00800000 )<<8;
1311	z = aSig>>( - shiftCount );
1312	if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1313	STATUS(float_exception_flags) \|= float_flag_inexact;
1314	}
1315	if ( aSign ) z = - z;
1316	return z;
1317
1318	}
1319
1320	/*----------------------------------------------------------------------------
1321	\| Returns the result of converting the single-precision floating-point value
1322	\| `a' to the 64-bit two's complement integer format. The conversion is
1323	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1324	\| Arithmetic---which means in particular that the conversion is rounded
1325	\| according to the current rounding mode. If `a' is a NaN, the largest
1326	\| positive integer is returned. Otherwise, if the conversion overflows, the
1327	\| largest integer with the same sign as `a' is returned.
1328	----------------------------------------------------------------------------/
1329
1330	int64 float32_to_int64( float32 a STATUS_PARAM )
1331	{
1332	flag aSign;
1333	int16 aExp, shiftCount;
1334	bits32 aSig;
1335	bits64 aSig64, aSigExtra;
1336
1337	aSig = extractFloat32Frac( a );
1338	aExp = extractFloat32Exp( a );
1339	aSign = extractFloat32Sign( a );
1340	shiftCount = 0xBE - aExp;
1341	if ( shiftCount < 0 ) {
1342	float_raise( float_flag_invalid STATUS_VAR);
1343	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1344	return LIT64( 0x7FFFFFFFFFFFFFFF );
1345	}
1346	return (sbits64) LIT64( 0x8000000000000000 );
1347	}
1348	if ( aExp ) aSig \|= 0x00800000;
1349	aSig64 = aSig;
1350	aSig64 <<= 40;
1351	shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1352	return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1353
1354	}
1355
1356	/*----------------------------------------------------------------------------
1357	\| Returns the result of converting the single-precision floating-point value
1358	\| `a' to the 64-bit two's complement integer format. The conversion is
1359	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360	\| Arithmetic, except that the conversion is always rounded toward zero. If
1361	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1362	\| conversion overflows, the largest integer with the same sign as `a' is
1363	\| returned.
1364	----------------------------------------------------------------------------/
1365
1366	int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1367	{
1368	flag aSign;
1369	int16 aExp, shiftCount;
1370	bits32 aSig;
1371	bits64 aSig64;
1372	int64 z;
1373
1374	aSig = extractFloat32Frac( a );
1375	aExp = extractFloat32Exp( a );
1376	aSign = extractFloat32Sign( a );
1377	shiftCount = aExp - 0xBE;
1378	if ( 0 <= shiftCount ) {
1379	if ( a != 0xDF000000 ) {
1380	float_raise( float_flag_invalid STATUS_VAR);
1381	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1382	return LIT64( 0x7FFFFFFFFFFFFFFF );
1383	}
1384	}
1385	return (sbits64) LIT64( 0x8000000000000000 );
1386	}
1387	else if ( aExp <= 0x7E ) {
1388	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1389	return 0;
1390	}
1391	aSig64 = aSig \| 0x00800000;
1392	aSig64 <<= 40;
1393	z = aSig64>>( - shiftCount );
1394	if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1395	STATUS(float_exception_flags) \|= float_flag_inexact;
1396	}
1397	if ( aSign ) z = - z;
1398	return z;
1399
1400	}
1401
1402	/*----------------------------------------------------------------------------
1403	\| Returns the result of converting the single-precision floating-point value
1404	\| `a' to the double-precision floating-point format. The conversion is
1405	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1406	\| Arithmetic.
1407	----------------------------------------------------------------------------/
1408
1409	float64 float32_to_float64( float32 a STATUS_PARAM )
1410	{
1411	flag aSign;
1412	int16 aExp;
1413	bits32 aSig;
1414
1415	aSig = extractFloat32Frac( a );
1416	aExp = extractFloat32Exp( a );
1417	aSign = extractFloat32Sign( a );
1418	if ( aExp == 0xFF ) {
1419	if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1420	return packFloat64( aSign, 0x7FF, 0 );
1421	}
1422	if ( aExp == 0 ) {
1423	if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1424	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1425	--aExp;
1426	}
1427	return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1428
1429	}
1430
1431	#ifdef FLOATX80
1432
1433	/*----------------------------------------------------------------------------
1434	\| Returns the result of converting the single-precision floating-point value
1435	\| `a' to the extended double-precision floating-point format. The conversion
1436	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1437	\| Arithmetic.
1438	----------------------------------------------------------------------------/
1439
1440	floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1441	{
1442	flag aSign;
1443	int16 aExp;
1444	bits32 aSig;
1445
1446	aSig = extractFloat32Frac( a );
1447	aExp = extractFloat32Exp( a );
1448	aSign = extractFloat32Sign( a );
1449	if ( aExp == 0xFF ) {
1450	if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1451	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1452	}
1453	if ( aExp == 0 ) {
1454	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1455	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1456	}
1457	aSig \|= 0x00800000;
1458	return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1459
1460	}
1461
1462	#endif
1463
1464	#ifdef FLOAT128
1465
1466	/*----------------------------------------------------------------------------
1467	\| Returns the result of converting the single-precision floating-point value
1468	\| `a' to the quadruple-precision floating-point format. The conversion is
1469	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1470	\| Arithmetic.
1471	----------------------------------------------------------------------------/
1472
1473	float128 float32_to_float128( float32 a STATUS_PARAM )
1474	{
1475	flag aSign;
1476	int16 aExp;
1477	bits32 aSig;
1478
1479	aSig = extractFloat32Frac( a );
1480	aExp = extractFloat32Exp( a );
1481	aSign = extractFloat32Sign( a );
1482	if ( aExp == 0xFF ) {
1483	if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1484	return packFloat128( aSign, 0x7FFF, 0, 0 );
1485	}
1486	if ( aExp == 0 ) {
1487	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1488	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1489	--aExp;
1490	}
1491	return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1492
1493	}
1494
1495	#endif
1496
1497	/*----------------------------------------------------------------------------
1498	\| Rounds the single-precision floating-point value `a' to an integer, and
1499	\| returns the result as a single-precision floating-point value. The
1500	\| operation is performed according to the IEC/IEEE Standard for Binary
1501	\| Floating-Point Arithmetic.
1502	----------------------------------------------------------------------------/
1503
1504	float32 float32_round_to_int( float32 a STATUS_PARAM)
1505	{
1506	flag aSign;
1507	int16 aExp;
1508	bits32 lastBitMask, roundBitsMask;
1509	int8 roundingMode;
1510	float32 z;
1511
1512	aExp = extractFloat32Exp( a );
1513	if ( 0x96 <= aExp ) {
1514	if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1515	return propagateFloat32NaN( a, a STATUS_VAR );
1516	}
1517	return a;
1518	}
1519	if ( aExp <= 0x7E ) {
1520	if ( (bits32) ( a<<1 ) == 0 ) return a;
1521	STATUS(float_exception_flags) \|= float_flag_inexact;
1522	aSign = extractFloat32Sign( a );
1523	switch ( STATUS(float_rounding_mode) ) {
1524	case float_round_nearest_even:
1525	if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1526	return packFloat32( aSign, 0x7F, 0 );
1527	}
1528	break;
1529	case float_round_down:
1530	return aSign ? 0xBF800000 : 0;
1531	case float_round_up:
1532	return aSign ? 0x80000000 : 0x3F800000;
1533	}
1534	return packFloat32( aSign, 0, 0 );
1535	}
1536	lastBitMask = 1;
1537	lastBitMask <<= 0x96 - aExp;
1538	roundBitsMask = lastBitMask - 1;
1539	z = a;
1540	roundingMode = STATUS(float_rounding_mode);
1541	if ( roundingMode == float_round_nearest_even ) {
1542	z += lastBitMask>>1;
1543	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1544	}
1545	else if ( roundingMode != float_round_to_zero ) {
1546	if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1547	z += roundBitsMask;
1548	}
1549	}
1550	z &= ~ roundBitsMask;
1551	if ( z != a ) STATUS(float_exception_flags) \|= float_flag_inexact;
1552	return z;
1553
1554	}
1555
1556	/*----------------------------------------------------------------------------
1557	\| Returns the result of adding the absolute values of the single-precision
1558	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1559	\| before being returned. `zSign' is ignored if the result is a NaN.
1560	\| The addition is performed according to the IEC/IEEE Standard for Binary
1561	\| Floating-Point Arithmetic.
1562	----------------------------------------------------------------------------/
1563
1564	static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1565	{
1566	int16 aExp, bExp, zExp;
1567	bits32 aSig, bSig, zSig;
1568	int16 expDiff;
1569
1570	aSig = extractFloat32Frac( a );
1571	aExp = extractFloat32Exp( a );
1572	bSig = extractFloat32Frac( b );
1573	bExp = extractFloat32Exp( b );
1574	expDiff = aExp - bExp;
1575	aSig <<= 6;
1576	bSig <<= 6;
1577	if ( 0 < expDiff ) {
1578	if ( aExp == 0xFF ) {
1579	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1580	return a;
1581	}
1582	if ( bExp == 0 ) {
1583	--expDiff;
1584	}
1585	else {
1586	bSig \|= 0x20000000;
1587	}
1588	shift32RightJamming( bSig, expDiff, &bSig );
1589	zExp = aExp;
1590	}
1591	else if ( expDiff < 0 ) {
1592	if ( bExp == 0xFF ) {
1593	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1594	return packFloat32( zSign, 0xFF, 0 );
1595	}
1596	if ( aExp == 0 ) {
1597	++expDiff;
1598	}
1599	else {
1600	aSig \|= 0x20000000;
1601	}
1602	shift32RightJamming( aSig, - expDiff, &aSig );
1603	zExp = bExp;
1604	}
1605	else {
1606	if ( aExp == 0xFF ) {
1607	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608	return a;
1609	}
1610	if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1611	zSig = 0x40000000 + aSig + bSig;
1612	zExp = aExp;
1613	goto roundAndPack;
1614	}
1615	aSig \|= 0x20000000;
1616	zSig = ( aSig + bSig )<<1;
1617	--zExp;
1618	if ( (sbits32) zSig < 0 ) {
1619	zSig = aSig + bSig;
1620	++zExp;
1621	}
1622	roundAndPack:
1623	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1624
1625	}
1626
1627	/*----------------------------------------------------------------------------
1628	\| Returns the result of subtracting the absolute values of the single-
1629	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
1630	\| difference is negated before being returned. `zSign' is ignored if the
1631	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
1632	\| Standard for Binary Floating-Point Arithmetic.
1633	----------------------------------------------------------------------------/
1634
1635	static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1636	{
1637	int16 aExp, bExp, zExp;
1638	bits32 aSig, bSig, zSig;
1639	int16 expDiff;
1640
1641	aSig = extractFloat32Frac( a );
1642	aExp = extractFloat32Exp( a );
1643	bSig = extractFloat32Frac( b );
1644	bExp = extractFloat32Exp( b );
1645	expDiff = aExp - bExp;
1646	aSig <<= 7;
1647	bSig <<= 7;
1648	if ( 0 < expDiff ) goto aExpBigger;
1649	if ( expDiff < 0 ) goto bExpBigger;
1650	if ( aExp == 0xFF ) {
1651	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1652	float_raise( float_flag_invalid STATUS_VAR);
1653	return float32_default_nan;
1654	}
1655	if ( aExp == 0 ) {
1656	aExp = 1;
1657	bExp = 1;
1658	}
1659	if ( bSig < aSig ) goto aBigger;
1660	if ( aSig < bSig ) goto bBigger;
1661	return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1662	bExpBigger:
1663	if ( bExp == 0xFF ) {
1664	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665	return packFloat32( zSign ^ 1, 0xFF, 0 );
1666	}
1667	if ( aExp == 0 ) {
1668	++expDiff;
1669	}
1670	else {
1671	aSig \|= 0x40000000;
1672	}
1673	shift32RightJamming( aSig, - expDiff, &aSig );
1674	bSig \|= 0x40000000;
1675	bBigger:
1676	zSig = bSig - aSig;
1677	zExp = bExp;
1678	zSign ^= 1;
1679	goto normalizeRoundAndPack;
1680	aExpBigger:
1681	if ( aExp == 0xFF ) {
1682	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1683	return a;
1684	}
1685	if ( bExp == 0 ) {
1686	--expDiff;
1687	}
1688	else {
1689	bSig \|= 0x40000000;
1690	}
1691	shift32RightJamming( bSig, expDiff, &bSig );
1692	aSig \|= 0x40000000;
1693	aBigger:
1694	zSig = aSig - bSig;
1695	zExp = aExp;
1696	normalizeRoundAndPack:
1697	--zExp;
1698	return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1699
1700	}
1701
1702	/*----------------------------------------------------------------------------
1703	\| Returns the result of adding the single-precision floating-point values `a'
1704	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
1705	\| Binary Floating-Point Arithmetic.
1706	----------------------------------------------------------------------------/
1707
1708	float32 float32_add( float32 a, float32 b STATUS_PARAM )
1709	{
1710	flag aSign, bSign;
1711
1712	aSign = extractFloat32Sign( a );
1713	bSign = extractFloat32Sign( b );
1714	if ( aSign == bSign ) {
1715	return addFloat32Sigs( a, b, aSign STATUS_VAR);
1716	}
1717	else {
1718	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1719	}
1720
1721	}
1722
1723	/*----------------------------------------------------------------------------
1724	\| Returns the result of subtracting the single-precision floating-point values
1725	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1726	\| for Binary Floating-Point Arithmetic.
1727	----------------------------------------------------------------------------/
1728
1729	float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1730	{
1731	flag aSign, bSign;
1732
1733	aSign = extractFloat32Sign( a );
1734	bSign = extractFloat32Sign( b );
1735	if ( aSign == bSign ) {
1736	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1737	}
1738	else {
1739	return addFloat32Sigs( a, b, aSign STATUS_VAR );
1740	}
1741
1742	}
1743
1744	/*----------------------------------------------------------------------------
1745	\| Returns the result of multiplying the single-precision floating-point values
1746	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1747	\| for Binary Floating-Point Arithmetic.
1748	----------------------------------------------------------------------------/
1749
1750	float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1751	{
1752	flag aSign, bSign, zSign;
1753	int16 aExp, bExp, zExp;
1754	bits32 aSig, bSig;
1755	bits64 zSig64;
1756	bits32 zSig;
1757
1758	aSig = extractFloat32Frac( a );
1759	aExp = extractFloat32Exp( a );
1760	aSign = extractFloat32Sign( a );
1761	bSig = extractFloat32Frac( b );
1762	bExp = extractFloat32Exp( b );
1763	bSign = extractFloat32Sign( b );
1764	zSign = aSign ^ bSign;
1765	if ( aExp == 0xFF ) {
1766	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1767	return propagateFloat32NaN( a, b STATUS_VAR );
1768	}
1769	if ( ( bExp \| bSig ) == 0 ) {
1770	float_raise( float_flag_invalid STATUS_VAR);
1771	return float32_default_nan;
1772	}
1773	return packFloat32( zSign, 0xFF, 0 );
1774	}
1775	if ( bExp == 0xFF ) {
1776	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1777	if ( ( aExp \| aSig ) == 0 ) {
1778	float_raise( float_flag_invalid STATUS_VAR);
1779	return float32_default_nan;
1780	}
1781	return packFloat32( zSign, 0xFF, 0 );
1782	}
1783	if ( aExp == 0 ) {
1784	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1785	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1786	}
1787	if ( bExp == 0 ) {
1788	if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1789	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1790	}
1791	zExp = aExp + bExp - 0x7F;
1792	aSig = ( aSig \| 0x00800000 )<<7;
1793	bSig = ( bSig \| 0x00800000 )<<8;
1794	shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1795	zSig = zSig64;
1796	if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1797	zSig <<= 1;
1798	--zExp;
1799	}
1800	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1801
1802	}
1803
1804	/*----------------------------------------------------------------------------
1805	\| Returns the result of dividing the single-precision floating-point value `a'
1806	\| by the corresponding value `b'. The operation is performed according to the
1807	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1808	----------------------------------------------------------------------------/
1809
1810	float32 float32_div( float32 a, float32 b STATUS_PARAM )
1811	{
1812	flag aSign, bSign, zSign;
1813	int16 aExp, bExp, zExp;
1814	bits32 aSig, bSig, zSig;
1815
1816	aSig = extractFloat32Frac( a );
1817	aExp = extractFloat32Exp( a );
1818	aSign = extractFloat32Sign( a );
1819	bSig = extractFloat32Frac( b );
1820	bExp = extractFloat32Exp( b );
1821	bSign = extractFloat32Sign( b );
1822	zSign = aSign ^ bSign;
1823	if ( aExp == 0xFF ) {
1824	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825	if ( bExp == 0xFF ) {
1826	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827	float_raise( float_flag_invalid STATUS_VAR);
1828	return float32_default_nan;
1829	}
1830	return packFloat32( zSign, 0xFF, 0 );
1831	}
1832	if ( bExp == 0xFF ) {
1833	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834	return packFloat32( zSign, 0, 0 );
1835	}
1836	if ( bExp == 0 ) {
1837	if ( bSig == 0 ) {
1838	if ( ( aExp \| aSig ) == 0 ) {
1839	float_raise( float_flag_invalid STATUS_VAR);
1840	return float32_default_nan;
1841	}
1842	float_raise( float_flag_divbyzero STATUS_VAR);
1843	return packFloat32( zSign, 0xFF, 0 );
1844	}
1845	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1846	}
1847	if ( aExp == 0 ) {
1848	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1849	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1850	}
1851	zExp = aExp - bExp + 0x7D;
1852	aSig = ( aSig \| 0x00800000 )<<7;
1853	bSig = ( bSig \| 0x00800000 )<<8;
1854	if ( bSig <= ( aSig + aSig ) ) {
1855	aSig >>= 1;
1856	++zExp;
1857	}
1858	zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1859	if ( ( zSig & 0x3F ) == 0 ) {
1860	zSig \|= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1861	}
1862	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1863
1864	}
1865
1866	/*----------------------------------------------------------------------------
1867	\| Returns the remainder of the single-precision floating-point value `a'
1868	\| with respect to the corresponding value `b'. The operation is performed
1869	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1870	----------------------------------------------------------------------------/
1871
1872	float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1873	{
1874	flag aSign, bSign, zSign;
1875	int16 aExp, bExp, expDiff;
1876	bits32 aSig, bSig;
1877	bits32 q;
1878	bits64 aSig64, bSig64, q64;
1879	bits32 alternateASig;
1880	sbits32 sigMean;
1881
1882	aSig = extractFloat32Frac( a );
1883	aExp = extractFloat32Exp( a );
1884	aSign = extractFloat32Sign( a );
1885	bSig = extractFloat32Frac( b );
1886	bExp = extractFloat32Exp( b );
1887	bSign = extractFloat32Sign( b );
1888	if ( aExp == 0xFF ) {
1889	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1890	return propagateFloat32NaN( a, b STATUS_VAR );
1891	}
1892	float_raise( float_flag_invalid STATUS_VAR);
1893	return float32_default_nan;
1894	}
1895	if ( bExp == 0xFF ) {
1896	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897	return a;
1898	}
1899	if ( bExp == 0 ) {
1900	if ( bSig == 0 ) {
1901	float_raise( float_flag_invalid STATUS_VAR);
1902	return float32_default_nan;
1903	}
1904	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1905	}
1906	if ( aExp == 0 ) {
1907	if ( aSig == 0 ) return a;
1908	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1909	}
1910	expDiff = aExp - bExp;
1911	aSig \|= 0x00800000;
1912	bSig \|= 0x00800000;
1913	if ( expDiff < 32 ) {
1914	aSig <<= 8;
1915	bSig <<= 8;
1916	if ( expDiff < 0 ) {
1917	if ( expDiff < -1 ) return a;
1918	aSig >>= 1;
1919	}
1920	q = ( bSig <= aSig );
1921	if ( q ) aSig -= bSig;
1922	if ( 0 < expDiff ) {
1923	q = ( ( (bits64) aSig )<<32 ) / bSig;
1924	q >>= 32 - expDiff;
1925	bSig >>= 2;
1926	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1927	}
1928	else {
1929	aSig >>= 2;
1930	bSig >>= 2;
1931	}
1932	}
1933	else {
1934	if ( bSig <= aSig ) aSig -= bSig;
1935	aSig64 = ( (bits64) aSig )<<40;
1936	bSig64 = ( (bits64) bSig )<<40;
1937	expDiff -= 64;
1938	while ( 0 < expDiff ) {
1939	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1940	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1941	aSig64 = - ( ( bSig * q64 )<<38 );
1942	expDiff -= 62;
1943	}
1944	expDiff += 64;
1945	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1946	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1947	q = q64>>( 64 - expDiff );
1948	bSig <<= 6;
1949	aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1950	}
1951	do {
1952	alternateASig = aSig;
1953	++q;
1954	aSig -= bSig;
1955	} while ( 0 <= (sbits32) aSig );
1956	sigMean = aSig + alternateASig;
1957	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1958	aSig = alternateASig;
1959	}
1960	zSign = ( (sbits32) aSig < 0 );
1961	if ( zSign ) aSig = - aSig;
1962	return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1963
1964	}
1965
1966	/*----------------------------------------------------------------------------
1967	\| Returns the square root of the single-precision floating-point value `a'.
1968	\| The operation is performed according to the IEC/IEEE Standard for Binary
1969	\| Floating-Point Arithmetic.
1970	----------------------------------------------------------------------------/
1971
1972	float32 float32_sqrt( float32 a STATUS_PARAM )
1973	{
1974	flag aSign;
1975	int16 aExp, zExp;
1976	bits32 aSig, zSig;
1977	bits64 rem, term;
1978
1979	aSig = extractFloat32Frac( a );
1980	aExp = extractFloat32Exp( a );
1981	aSign = extractFloat32Sign( a );
1982	if ( aExp == 0xFF ) {
1983	if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1984	if ( ! aSign ) return a;
1985	float_raise( float_flag_invalid STATUS_VAR);
1986	return float32_default_nan;
1987	}
1988	if ( aSign ) {
1989	if ( ( aExp \| aSig ) == 0 ) return a;
1990	float_raise( float_flag_invalid STATUS_VAR);
1991	return float32_default_nan;
1992	}
1993	if ( aExp == 0 ) {
1994	if ( aSig == 0 ) return 0;
1995	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996	}
1997	zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1998	aSig = ( aSig \| 0x00800000 )<<8;
1999	zSig = estimateSqrt32( aExp, aSig ) + 2;
2000	if ( ( zSig & 0x7F ) <= 5 ) {
2001	if ( zSig < 2 ) {
2002	zSig = 0x7FFFFFFF;
2003	goto roundAndPack;
2004	}
2005	aSig >>= aExp & 1;
2006	term = ( (bits64) zSig ) * zSig;
2007	rem = ( ( (bits64) aSig )<<32 ) - term;
2008	while ( (sbits64) rem < 0 ) {
2009	--zSig;
2010	rem += ( ( (bits64) zSig )<<1 ) \| 1;
2011	}
2012	zSig \|= ( rem != 0 );
2013	}
2014	shift32RightJamming( zSig, 1, &zSig );
2015	roundAndPack:
2016	return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2017
2018	}
2019
2020	/*----------------------------------------------------------------------------
2021	\| Returns 1 if the single-precision floating-point value `a' is equal to
2022	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2023	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024	----------------------------------------------------------------------------/
2025
2026	flag float32_eq( float32 a, float32 b STATUS_PARAM )
2027	{
2028
2029	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2030	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2031	) {
2032	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2033	float_raise( float_flag_invalid STATUS_VAR);
2034	}
2035	return 0;
2036	}
2037	return ( a == b ) \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
2038
2039	}
2040
2041	/*----------------------------------------------------------------------------
2042	\| Returns 1 if the single-precision floating-point value `a' is less than
2043	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
2044	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2045	\| Arithmetic.
2046	----------------------------------------------------------------------------/
2047
2048	flag float32_le( float32 a, float32 b STATUS_PARAM )
2049	{
2050	flag aSign, bSign;
2051
2052	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2053	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2054	) {
2055	float_raise( float_flag_invalid STATUS_VAR);
2056	return 0;
2057	}
2058	aSign = extractFloat32Sign( a );
2059	bSign = extractFloat32Sign( b );
2060	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
2061	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
2062
2063	}
2064
2065	/*----------------------------------------------------------------------------
2066	\| Returns 1 if the single-precision floating-point value `a' is less than
2067	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2068	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2069	----------------------------------------------------------------------------/
2070
2071	flag float32_lt( float32 a, float32 b STATUS_PARAM )
2072	{
2073	flag aSign, bSign;
2074
2075	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2076	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2077	) {
2078	float_raise( float_flag_invalid STATUS_VAR);
2079	return 0;
2080	}
2081	aSign = extractFloat32Sign( a );
2082	bSign = extractFloat32Sign( b );
2083	if ( aSign != bSign ) return aSign && ( (bits32) ( ( a \| b )<<1 ) != 0 );
2084	return ( a != b ) && ( aSign ^ ( a < b ) );
2085
2086	}
2087
2088	/*----------------------------------------------------------------------------
2089	\| Returns 1 if the single-precision floating-point value `a' is equal to
2090	\| the corresponding value `b', and 0 otherwise. The invalid exception is
2091	\| raised if either operand is a NaN. Otherwise, the comparison is performed
2092	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2093	----------------------------------------------------------------------------/
2094
2095	flag float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2096	{
2097
2098	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2099	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2100	) {
2101	float_raise( float_flag_invalid STATUS_VAR);
2102	return 0;
2103	}
2104	return ( a == b ) \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
2105
2106	}
2107
2108	/*----------------------------------------------------------------------------
2109	\| Returns 1 if the single-precision floating-point value `a' is less than or
2110	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2111	\| cause an exception. Otherwise, the comparison is performed according to the
2112	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2113	----------------------------------------------------------------------------/
2114
2115	flag float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2116	{
2117	flag aSign, bSign;
2118
2119	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2121	) {
2122	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2123	float_raise( float_flag_invalid STATUS_VAR);
2124	}
2125	return 0;
2126	}
2127	aSign = extractFloat32Sign( a );
2128	bSign = extractFloat32Sign( b );
2129	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
2130	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
2131
2132	}
2133
2134	/*----------------------------------------------------------------------------
2135	\| Returns 1 if the single-precision floating-point value `a' is less than
2136	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2137	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2138	\| Standard for Binary Floating-Point Arithmetic.
2139	----------------------------------------------------------------------------/
2140
2141	flag float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2142	{
2143	flag aSign, bSign;
2144
2145	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147	) {
2148	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2149	float_raise( float_flag_invalid STATUS_VAR);
2150	}
2151	return 0;
2152	}
2153	aSign = extractFloat32Sign( a );
2154	bSign = extractFloat32Sign( b );
2155	if ( aSign != bSign ) return aSign && ( (bits32) ( ( a \| b )<<1 ) != 0 );
2156	return ( a != b ) && ( aSign ^ ( a < b ) );
2157
2158	}
2159
2160	/*----------------------------------------------------------------------------
2161	\| Returns the result of converting the double-precision floating-point value
2162	\| `a' to the 32-bit two's complement integer format. The conversion is
2163	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2164	\| Arithmetic---which means in particular that the conversion is rounded
2165	\| according to the current rounding mode. If `a' is a NaN, the largest
2166	\| positive integer is returned. Otherwise, if the conversion overflows, the
2167	\| largest integer with the same sign as `a' is returned.
2168	----------------------------------------------------------------------------/
2169
2170	int32 float64_to_int32( float64 a STATUS_PARAM )
2171	{
2172	flag aSign;
2173	int16 aExp, shiftCount;
2174	bits64 aSig;
2175
2176	aSig = extractFloat64Frac( a );
2177	aExp = extractFloat64Exp( a );
2178	aSign = extractFloat64Sign( a );
2179	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2180	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2181	shiftCount = 0x42C - aExp;
2182	if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2183	return roundAndPackInt32( aSign, aSig STATUS_VAR );
2184
2185	}
2186
2187	/*----------------------------------------------------------------------------
2188	\| Returns the result of converting the double-precision floating-point value
2189	\| `a' to the 32-bit two's complement integer format. The conversion is
2190	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2191	\| Arithmetic, except that the conversion is always rounded toward zero.
2192	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2193	\| the conversion overflows, the largest integer with the same sign as `a' is
2194	\| returned.
2195	----------------------------------------------------------------------------/
2196
2197	int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2198	{
2199	flag aSign;
2200	int16 aExp, shiftCount;
2201	bits64 aSig, savedASig;
2202	int32 z;
2203
2204	aSig = extractFloat64Frac( a );
2205	aExp = extractFloat64Exp( a );
2206	aSign = extractFloat64Sign( a );
2207	if ( 0x41E < aExp ) {
2208	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2209	goto invalid;
2210	}
2211	else if ( aExp < 0x3FF ) {
2212	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2213	return 0;
2214	}
2215	aSig \|= LIT64( 0x0010000000000000 );
2216	shiftCount = 0x433 - aExp;
2217	savedASig = aSig;
2218	aSig >>= shiftCount;
2219	z = aSig;
2220	if ( aSign ) z = - z;
2221	if ( ( z < 0 ) ^ aSign ) {
2222	invalid:
2223	float_raise( float_flag_invalid STATUS_VAR);
2224	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2225	}
2226	if ( ( aSig<<shiftCount ) != savedASig ) {
2227	STATUS(float_exception_flags) \|= float_flag_inexact;
2228	}
2229	return z;
2230
2231	}
2232
2233	/*----------------------------------------------------------------------------
2234	\| Returns the result of converting the double-precision floating-point value
2235	\| `a' to the 64-bit two's complement integer format. The conversion is
2236	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2237	\| Arithmetic---which means in particular that the conversion is rounded
2238	\| according to the current rounding mode. If `a' is a NaN, the largest
2239	\| positive integer is returned. Otherwise, if the conversion overflows, the
2240	\| largest integer with the same sign as `a' is returned.
2241	----------------------------------------------------------------------------/
2242
2243	int64 float64_to_int64( float64 a STATUS_PARAM )
2244	{
2245	flag aSign;
2246	int16 aExp, shiftCount;
2247	bits64 aSig, aSigExtra;
2248
2249	aSig = extractFloat64Frac( a );
2250	aExp = extractFloat64Exp( a );
2251	aSign = extractFloat64Sign( a );
2252	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2253	shiftCount = 0x433 - aExp;
2254	if ( shiftCount <= 0 ) {
2255	if ( 0x43E < aExp ) {
2256	float_raise( float_flag_invalid STATUS_VAR);
2257	if ( ! aSign
2258	\|\| ( ( aExp == 0x7FF )
2259	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2260	) {
2261	return LIT64( 0x7FFFFFFFFFFFFFFF );
2262	}
2263	return (sbits64) LIT64( 0x8000000000000000 );
2264	}
2265	aSigExtra = 0;
2266	aSig <<= - shiftCount;
2267	}
2268	else {
2269	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2270	}
2271	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2272
2273	}
2274
2275	/*----------------------------------------------------------------------------
2276	\| Returns the result of converting the double-precision floating-point value
2277	\| `a' to the 64-bit two's complement integer format. The conversion is
2278	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2279	\| Arithmetic, except that the conversion is always rounded toward zero.
2280	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2281	\| the conversion overflows, the largest integer with the same sign as `a' is
2282	\| returned.
2283	----------------------------------------------------------------------------/
2284
2285	int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2286	{
2287	flag aSign;
2288	int16 aExp, shiftCount;
2289	bits64 aSig;
2290	int64 z;
2291
2292	aSig = extractFloat64Frac( a );
2293	aExp = extractFloat64Exp( a );
2294	aSign = extractFloat64Sign( a );
2295	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2296	shiftCount = aExp - 0x433;
2297	if ( 0 <= shiftCount ) {
2298	if ( 0x43E <= aExp ) {
2299	if ( a != LIT64( 0xC3E0000000000000 ) ) {
2300	float_raise( float_flag_invalid STATUS_VAR);
2301	if ( ! aSign
2302	\|\| ( ( aExp == 0x7FF )
2303	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2304	) {
2305	return LIT64( 0x7FFFFFFFFFFFFFFF );
2306	}
2307	}
2308	return (sbits64) LIT64( 0x8000000000000000 );
2309	}
2310	z = aSig<<shiftCount;
2311	}
2312	else {
2313	if ( aExp < 0x3FE ) {
2314	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2315	return 0;
2316	}
2317	z = aSig>>( - shiftCount );
2318	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2319	STATUS(float_exception_flags) \|= float_flag_inexact;
2320	}
2321	}
2322	if ( aSign ) z = - z;
2323	return z;
2324
2325	}
2326
2327	/*----------------------------------------------------------------------------
2328	\| Returns the result of converting the double-precision floating-point value
2329	\| `a' to the single-precision floating-point format. The conversion is
2330	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2331	\| Arithmetic.
2332	----------------------------------------------------------------------------/
2333
2334	float32 float64_to_float32( float64 a STATUS_PARAM )
2335	{
2336	flag aSign;
2337	int16 aExp;
2338	bits64 aSig;
2339	bits32 zSig;
2340
2341	aSig = extractFloat64Frac( a );
2342	aExp = extractFloat64Exp( a );
2343	aSign = extractFloat64Sign( a );
2344	if ( aExp == 0x7FF ) {
2345	if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2346	return packFloat32( aSign, 0xFF, 0 );
2347	}
2348	shift64RightJamming( aSig, 22, &aSig );
2349	zSig = aSig;
2350	if ( aExp \|\| zSig ) {
2351	zSig \|= 0x40000000;
2352	aExp -= 0x381;
2353	}
2354	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2355
2356	}
2357
#ifdef FLOATX80

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
{
    bits64 aSig = extractFloat64Frac( a );
    int16 aExp = extractFloat64Exp( a );
    flag aSign = extractFloat64Sign( a );

    /* Maximum exponent field: convert the NaN, or return an infinity. */
    if ( aExp == 0x7FF ) {
        if ( aSig ) {
            return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
        }
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }

    /* Zero exponent field: a zero, or a subnormal to be normalized. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) {
            return packFloatx80( aSign, 0, 0 );
        }
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }

    /* Add 0x3C00 to the exponent, set the leading bit, and move the
       significand up by 11 bits. */
    return packFloatx80(
               aSign,
               aExp + 0x3C00,
               ( aSig | LIT64( 0x0010000000000000 ) ) << 11 );

}

#endif
2391
#ifdef FLOAT128

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float64_to_float128( float64 a STATUS_PARAM )
{
    bits64 aSig = extractFloat64Frac( a );
    int16 aExp = extractFloat64Exp( a );
    flag aSign = extractFloat64Sign( a );
    bits64 zSig0, zSig1;

    /* Maximum exponent field: convert the NaN, or return an infinity. */
    if ( aExp == 0x7FF ) {
        if ( aSig ) {
            return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }

    /* Zero exponent field: a zero, or a subnormal to be normalized. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) {
            return packFloat128( aSign, 0, 0, 0 );
        }
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
        --aExp;
    }

    /* Spread the fraction over the two 64-bit halves of the result. */
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

}

#endif
2425
2426	/*----------------------------------------------------------------------------
2427	\| Rounds the double-precision floating-point value `a' to an integer, and
2428	\| returns the result as a double-precision floating-point value. The
2429	\| operation is performed according to the IEC/IEEE Standard for Binary
2430	\| Floating-Point Arithmetic.
2431	----------------------------------------------------------------------------/
2432
2433	float64 float64_round_to_int( float64 a STATUS_PARAM )
2434	{
2435	flag aSign;
2436	int16 aExp;
2437	bits64 lastBitMask, roundBitsMask;
2438	int8 roundingMode;
2439	float64 z;
2440
2441	aExp = extractFloat64Exp( a );
2442	if ( 0x433 <= aExp ) {
2443	if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2444	return propagateFloat64NaN( a, a STATUS_VAR );
2445	}
2446	return a;
2447	}
2448	if ( aExp < 0x3FF ) {
2449	if ( (bits64) ( a<<1 ) == 0 ) return a;
2450	STATUS(float_exception_flags) \|= float_flag_inexact;
2451	aSign = extractFloat64Sign( a );
2452	switch ( STATUS(float_rounding_mode) ) {
2453	case float_round_nearest_even:
2454	if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2455	return packFloat64( aSign, 0x3FF, 0 );
2456	}
2457	break;
2458	case float_round_down:
2459	return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2460	case float_round_up:
2461	return
2462	aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2463	}
2464	return packFloat64( aSign, 0, 0 );
2465	}
2466	lastBitMask = 1;
2467	lastBitMask <<= 0x433 - aExp;
2468	roundBitsMask = lastBitMask - 1;
2469	z = a;
2470	roundingMode = STATUS(float_rounding_mode);
2471	if ( roundingMode == float_round_nearest_even ) {
2472	z += lastBitMask>>1;
2473	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2474	}
2475	else if ( roundingMode != float_round_to_zero ) {
2476	if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2477	z += roundBitsMask;
2478	}
2479	}
2480	z &= ~ roundBitsMask;
2481	if ( z != a ) STATUS(float_exception_flags) \|= float_flag_inexact;
2482	return z;
2483
2484	}
2485
2486	/*----------------------------------------------------------------------------
2487	\| Returns the result of adding the absolute values of the double-precision
2488	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2489	\| before being returned. `zSign' is ignored if the result is a NaN.
2490	\| The addition is performed according to the IEC/IEEE Standard for Binary
2491	\| Floating-Point Arithmetic.
2492	----------------------------------------------------------------------------/
2493
2494	static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2495	{
2496	int16 aExp, bExp, zExp;
2497	bits64 aSig, bSig, zSig;
2498	int16 expDiff;
2499
2500	aSig = extractFloat64Frac( a );
2501	aExp = extractFloat64Exp( a );
2502	bSig = extractFloat64Frac( b );
2503	bExp = extractFloat64Exp( b );
2504	expDiff = aExp - bExp;
2505	aSig <<= 9;
2506	bSig <<= 9;
2507	if ( 0 < expDiff ) {
2508	if ( aExp == 0x7FF ) {
2509	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2510	return a;
2511	}
2512	if ( bExp == 0 ) {
2513	--expDiff;
2514	}
2515	else {
2516	bSig \|= LIT64( 0x2000000000000000 );
2517	}
2518	shift64RightJamming( bSig, expDiff, &bSig );
2519	zExp = aExp;
2520	}
2521	else if ( expDiff < 0 ) {
2522	if ( bExp == 0x7FF ) {
2523	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2524	return packFloat64( zSign, 0x7FF, 0 );
2525	}
2526	if ( aExp == 0 ) {
2527	++expDiff;
2528	}
2529	else {
2530	aSig \|= LIT64( 0x2000000000000000 );
2531	}
2532	shift64RightJamming( aSig, - expDiff, &aSig );
2533	zExp = bExp;
2534	}
2535	else {
2536	if ( aExp == 0x7FF ) {
2537	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2538	return a;
2539	}
2540	if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2541	zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2542	zExp = aExp;
2543	goto roundAndPack;
2544	}
2545	aSig \|= LIT64( 0x2000000000000000 );
2546	zSig = ( aSig + bSig )<<1;
2547	--zExp;
2548	if ( (sbits64) zSig < 0 ) {
2549	zSig = aSig + bSig;
2550	++zExp;
2551	}
2552	roundAndPack:
2553	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2554
2555	}
2556
2557	/*----------------------------------------------------------------------------
2558	\| Returns the result of subtracting the absolute values of the double-
2559	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
2560	\| difference is negated before being returned. `zSign' is ignored if the
2561	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
2562	\| Standard for Binary Floating-Point Arithmetic.
2563	----------------------------------------------------------------------------/
2564
2565	static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2566	{
2567	int16 aExp, bExp, zExp;
2568	bits64 aSig, bSig, zSig;
2569	int16 expDiff;
2570
2571	aSig = extractFloat64Frac( a );
2572	aExp = extractFloat64Exp( a );
2573	bSig = extractFloat64Frac( b );
2574	bExp = extractFloat64Exp( b );
2575	expDiff = aExp - bExp;
2576	aSig <<= 10;
2577	bSig <<= 10;
2578	if ( 0 < expDiff ) goto aExpBigger;
2579	if ( expDiff < 0 ) goto bExpBigger;
2580	if ( aExp == 0x7FF ) {
2581	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2582	float_raise( float_flag_invalid STATUS_VAR);
2583	return float64_default_nan;
2584	}
2585	if ( aExp == 0 ) {
2586	aExp = 1;
2587	bExp = 1;
2588	}
2589	if ( bSig < aSig ) goto aBigger;
2590	if ( aSig < bSig ) goto bBigger;
2591	return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2592	bExpBigger:
2593	if ( bExp == 0x7FF ) {
2594	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2595	return packFloat64( zSign ^ 1, 0x7FF, 0 );
2596	}
2597	if ( aExp == 0 ) {
2598	++expDiff;
2599	}
2600	else {
2601	aSig \|= LIT64( 0x4000000000000000 );
2602	}
2603	shift64RightJamming( aSig, - expDiff, &aSig );
2604	bSig \|= LIT64( 0x4000000000000000 );
2605	bBigger:
2606	zSig = bSig - aSig;
2607	zExp = bExp;
2608	zSign ^= 1;
2609	goto normalizeRoundAndPack;
2610	aExpBigger:
2611	if ( aExp == 0x7FF ) {
2612	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2613	return a;
2614	}
2615	if ( bExp == 0 ) {
2616	--expDiff;
2617	}
2618	else {
2619	bSig \|= LIT64( 0x4000000000000000 );
2620	}
2621	shift64RightJamming( bSig, expDiff, &bSig );
2622	aSig \|= LIT64( 0x4000000000000000 );
2623	aBigger:
2624	zSig = aSig - bSig;
2625	zExp = aExp;
2626	normalizeRoundAndPack:
2627	--zExp;
2628	return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2629
2630	}
2631
2632	/*----------------------------------------------------------------------------
2633	\| Returns the result of adding the double-precision floating-point values `a'
2634	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
2635	\| Binary Floating-Point Arithmetic.
2636	----------------------------------------------------------------------------/
2637
2638	float64 float64_add( float64 a, float64 b STATUS_PARAM )
2639	{
2640	flag aSign, bSign;
2641
2642	aSign = extractFloat64Sign( a );
2643	bSign = extractFloat64Sign( b );
2644	if ( aSign == bSign ) {
2645	return addFloat64Sigs( a, b, aSign STATUS_VAR );
2646	}
2647	else {
2648	return subFloat64Sigs( a, b, aSign STATUS_VAR );
2649	}
2650
2651	}
2652
2653	/*----------------------------------------------------------------------------
2654	\| Returns the result of subtracting the double-precision floating-point values
2655	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2656	\| for Binary Floating-Point Arithmetic.
2657	----------------------------------------------------------------------------/
2658
2659	float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2660	{
2661	flag aSign, bSign;
2662
2663	aSign = extractFloat64Sign( a );
2664	bSign = extractFloat64Sign( b );
2665	if ( aSign == bSign ) {
2666	return subFloat64Sigs( a, b, aSign STATUS_VAR );
2667	}
2668	else {
2669	return addFloat64Sigs( a, b, aSign STATUS_VAR );
2670	}
2671
2672	}
2673
2674	/*----------------------------------------------------------------------------
2675	\| Returns the result of multiplying the double-precision floating-point values
2676	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2677	\| for Binary Floating-Point Arithmetic.
2678	----------------------------------------------------------------------------/
2679
2680	float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2681	{
2682	flag aSign, bSign, zSign;
2683	int16 aExp, bExp, zExp;
2684	bits64 aSig, bSig, zSig0, zSig1;
2685
2686	aSig = extractFloat64Frac( a );
2687	aExp = extractFloat64Exp( a );
2688	aSign = extractFloat64Sign( a );
2689	bSig = extractFloat64Frac( b );
2690	bExp = extractFloat64Exp( b );
2691	bSign = extractFloat64Sign( b );
2692	zSign = aSign ^ bSign;
2693	if ( aExp == 0x7FF ) {
2694	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
2695	return propagateFloat64NaN( a, b STATUS_VAR );
2696	}
2697	if ( ( bExp \| bSig ) == 0 ) {
2698	float_raise( float_flag_invalid STATUS_VAR);
2699	return float64_default_nan;
2700	}
2701	return packFloat64( zSign, 0x7FF, 0 );
2702	}
2703	if ( bExp == 0x7FF ) {
2704	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2705	if ( ( aExp \| aSig ) == 0 ) {
2706	float_raise( float_flag_invalid STATUS_VAR);
2707	return float64_default_nan;
2708	}
2709	return packFloat64( zSign, 0x7FF, 0 );
2710	}
2711	if ( aExp == 0 ) {
2712	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2713	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2714	}
2715	if ( bExp == 0 ) {
2716	if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2717	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2718	}
2719	zExp = aExp + bExp - 0x3FF;
2720	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
2721	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2722	mul64To128( aSig, bSig, &zSig0, &zSig1 );
2723	zSig0 \|= ( zSig1 != 0 );
2724	if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2725	zSig0 <<= 1;
2726	--zExp;
2727	}
2728	return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2729
2730	}
2731
2732	/*----------------------------------------------------------------------------
2733	\| Returns the result of dividing the double-precision floating-point value `a'
2734	\| by the corresponding value `b'. The operation is performed according to
2735	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2736	----------------------------------------------------------------------------/
2737
2738	float64 float64_div( float64 a, float64 b STATUS_PARAM )
2739	{
2740	flag aSign, bSign, zSign;
2741	int16 aExp, bExp, zExp;
2742	bits64 aSig, bSig, zSig;
2743	bits64 rem0, rem1;
2744	bits64 term0, term1;
2745
2746	aSig = extractFloat64Frac( a );
2747	aExp = extractFloat64Exp( a );
2748	aSign = extractFloat64Sign( a );
2749	bSig = extractFloat64Frac( b );
2750	bExp = extractFloat64Exp( b );
2751	bSign = extractFloat64Sign( b );
2752	zSign = aSign ^ bSign;
2753	if ( aExp == 0x7FF ) {
2754	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2755	if ( bExp == 0x7FF ) {
2756	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2757	float_raise( float_flag_invalid STATUS_VAR);
2758	return float64_default_nan;
2759	}
2760	return packFloat64( zSign, 0x7FF, 0 );
2761	}
2762	if ( bExp == 0x7FF ) {
2763	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2764	return packFloat64( zSign, 0, 0 );
2765	}
2766	if ( bExp == 0 ) {
2767	if ( bSig == 0 ) {
2768	if ( ( aExp \| aSig ) == 0 ) {
2769	float_raise( float_flag_invalid STATUS_VAR);
2770	return float64_default_nan;
2771	}
2772	float_raise( float_flag_divbyzero STATUS_VAR);
2773	return packFloat64( zSign, 0x7FF, 0 );
2774	}
2775	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2776	}
2777	if ( aExp == 0 ) {
2778	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2779	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2780	}
2781	zExp = aExp - bExp + 0x3FD;
2782	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
2783	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2784	if ( bSig <= ( aSig + aSig ) ) {
2785	aSig >>= 1;
2786	++zExp;
2787	}
2788	zSig = estimateDiv128To64( aSig, 0, bSig );
2789	if ( ( zSig & 0x1FF ) <= 2 ) {
2790	mul64To128( bSig, zSig, &term0, &term1 );
2791	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2792	while ( (sbits64) rem0 < 0 ) {
2793	--zSig;
2794	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2795	}
2796	zSig \|= ( rem1 != 0 );
2797	}
2798	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2799
2800	}
2801
2802	/*----------------------------------------------------------------------------
2803	\| Returns the remainder of the double-precision floating-point value `a'
2804	\| with respect to the corresponding value `b'. The operation is performed
2805	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2806	----------------------------------------------------------------------------/
2807
2808	float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2809	{
2810	flag aSign, bSign, zSign;
2811	int16 aExp, bExp, expDiff;
2812	bits64 aSig, bSig;
2813	bits64 q, alternateASig;
2814	sbits64 sigMean;
2815
2816	aSig = extractFloat64Frac( a );
2817	aExp = extractFloat64Exp( a );
2818	aSign = extractFloat64Sign( a );
2819	bSig = extractFloat64Frac( b );
2820	bExp = extractFloat64Exp( b );
2821	bSign = extractFloat64Sign( b );
2822	if ( aExp == 0x7FF ) {
2823	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
2824	return propagateFloat64NaN( a, b STATUS_VAR );
2825	}
2826	float_raise( float_flag_invalid STATUS_VAR);
2827	return float64_default_nan;
2828	}
2829	if ( bExp == 0x7FF ) {
2830	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2831	return a;
2832	}
2833	if ( bExp == 0 ) {
2834	if ( bSig == 0 ) {
2835	float_raise( float_flag_invalid STATUS_VAR);
2836	return float64_default_nan;
2837	}
2838	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2839	}
2840	if ( aExp == 0 ) {
2841	if ( aSig == 0 ) return a;
2842	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2843	}
2844	expDiff = aExp - bExp;
2845	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<11;
2846	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2847	if ( expDiff < 0 ) {
2848	if ( expDiff < -1 ) return a;
2849	aSig >>= 1;
2850	}
2851	q = ( bSig <= aSig );
2852	if ( q ) aSig -= bSig;
2853	expDiff -= 64;
2854	while ( 0 < expDiff ) {
2855	q = estimateDiv128To64( aSig, 0, bSig );
2856	q = ( 2 < q ) ? q - 2 : 0;
2857	aSig = - ( ( bSig>>2 ) * q );
2858	expDiff -= 62;
2859	}
2860	expDiff += 64;
2861	if ( 0 < expDiff ) {
2862	q = estimateDiv128To64( aSig, 0, bSig );
2863	q = ( 2 < q ) ? q - 2 : 0;
2864	q >>= 64 - expDiff;
2865	bSig >>= 2;
2866	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2867	}
2868	else {
2869	aSig >>= 2;
2870	bSig >>= 2;
2871	}
2872	do {
2873	alternateASig = aSig;
2874	++q;
2875	aSig -= bSig;
2876	} while ( 0 <= (sbits64) aSig );
2877	sigMean = aSig + alternateASig;
2878	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2879	aSig = alternateASig;
2880	}
2881	zSign = ( (sbits64) aSig < 0 );
2882	if ( zSign ) aSig = - aSig;
2883	return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2884
2885	}
2886
2887	/*----------------------------------------------------------------------------
2888	\| Returns the square root of the double-precision floating-point value `a'.
2889	\| The operation is performed according to the IEC/IEEE Standard for Binary
2890	\| Floating-Point Arithmetic.
2891	----------------------------------------------------------------------------/
2892
2893	float64 float64_sqrt( float64 a STATUS_PARAM )
2894	{
2895	flag aSign;
2896	int16 aExp, zExp;
2897	bits64 aSig, zSig, doubleZSig;
2898	bits64 rem0, rem1, term0, term1;
2899
2900	aSig = extractFloat64Frac( a );
2901	aExp = extractFloat64Exp( a );
2902	aSign = extractFloat64Sign( a );
2903	if ( aExp == 0x7FF ) {
2904	if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2905	if ( ! aSign ) return a;
2906	float_raise( float_flag_invalid STATUS_VAR);
2907	return float64_default_nan;
2908	}
2909	if ( aSign ) {
2910	if ( ( aExp \| aSig ) == 0 ) return a;
2911	float_raise( float_flag_invalid STATUS_VAR);
2912	return float64_default_nan;
2913	}
2914	if ( aExp == 0 ) {
2915	if ( aSig == 0 ) return 0;
2916	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2917	}
2918	zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2919	aSig \|= LIT64( 0x0010000000000000 );
2920	zSig = estimateSqrt32( aExp, aSig>>21 );
2921	aSig <<= 9 - ( aExp & 1 );
2922	zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2923	if ( ( zSig & 0x1FF ) <= 5 ) {
2924	doubleZSig = zSig<<1;
2925	mul64To128( zSig, zSig, &term0, &term1 );
2926	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2927	while ( (sbits64) rem0 < 0 ) {
2928	--zSig;
2929	doubleZSig -= 2;
2930	add128( rem0, rem1, zSig>>63, doubleZSig \| 1, &rem0, &rem1 );
2931	}
2932	zSig \|= ( ( rem0 \| rem1 ) != 0 );
2933	}
2934	return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2935
2936	}
2937
2938	/*----------------------------------------------------------------------------
2939	\| Returns 1 if the double-precision floating-point value `a' is equal to the
2940	\| corresponding value `b', and 0 otherwise. The comparison is performed
2941	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2942	----------------------------------------------------------------------------/
2943
2944	flag float64_eq( float64 a, float64 b STATUS_PARAM )
2945	{
2946
2947	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2948	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2949	) {
2950	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
2951	float_raise( float_flag_invalid STATUS_VAR);
2952	}
2953	return 0;
2954	}
2955	return ( a == b ) \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
2956
2957	}
2958
2959	/*----------------------------------------------------------------------------
2960	\| Returns 1 if the double-precision floating-point value `a' is less than or
2961	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
2962	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2963	\| Arithmetic.
2964	----------------------------------------------------------------------------/
2965
2966	flag float64_le( float64 a, float64 b STATUS_PARAM )
2967	{
2968	flag aSign, bSign;
2969
2970	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2971	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2972	) {
2973	float_raise( float_flag_invalid STATUS_VAR);
2974	return 0;
2975	}
2976	aSign = extractFloat64Sign( a );
2977	bSign = extractFloat64Sign( b );
2978	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
2979	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
2980
2981	}
2982
2983	/*----------------------------------------------------------------------------
2984	\| Returns 1 if the double-precision floating-point value `a' is less than
2985	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2986	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2987	----------------------------------------------------------------------------/
2988
2989	flag float64_lt( float64 a, float64 b STATUS_PARAM )
2990	{
2991	flag aSign, bSign;
2992
2993	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2994	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2995	) {
2996	float_raise( float_flag_invalid STATUS_VAR);
2997	return 0;
2998	}
2999	aSign = extractFloat64Sign( a );
3000	bSign = extractFloat64Sign( b );
3001	if ( aSign != bSign ) return aSign && ( (bits64) ( ( a \| b )<<1 ) != 0 );
3002	return ( a != b ) && ( aSign ^ ( a < b ) );
3003
3004	}
3005
3006	/*----------------------------------------------------------------------------
3007	\| Returns 1 if the double-precision floating-point value `a' is equal to the
3008	\| corresponding value `b', and 0 otherwise. The invalid exception is raised
3009	\| if either operand is a NaN. Otherwise, the comparison is performed
3010	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011	----------------------------------------------------------------------------/
3012
3013	flag float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3014	{
3015
3016	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3017	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3018	) {
3019	float_raise( float_flag_invalid STATUS_VAR);
3020	return 0;
3021	}
3022	return ( a == b ) \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
3023
3024	}
3025
3026	/*----------------------------------------------------------------------------
3027	\| Returns 1 if the double-precision floating-point value `a' is less than or
3028	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3029	\| cause an exception. Otherwise, the comparison is performed according to the
3030	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3031	----------------------------------------------------------------------------/
3032
3033	flag float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3034	{
3035	flag aSign, bSign;
3036
3037	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3038	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3039	) {
3040	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3041	float_raise( float_flag_invalid STATUS_VAR);
3042	}
3043	return 0;
3044	}
3045	aSign = extractFloat64Sign( a );
3046	bSign = extractFloat64Sign( b );
3047	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
3048	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
3049
3050	}
3051
3052	/*----------------------------------------------------------------------------
3053	\| Returns 1 if the double-precision floating-point value `a' is less than
3054	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3055	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3056	\| Standard for Binary Floating-Point Arithmetic.
3057	----------------------------------------------------------------------------/
3058
3059	flag float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3060	{
3061	flag aSign, bSign;
3062
3063	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3064	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3065	) {
3066	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3067	float_raise( float_flag_invalid STATUS_VAR);
3068	}
3069	return 0;
3070	}
3071	aSign = extractFloat64Sign( a );
3072	bSign = extractFloat64Sign( b );
3073	if ( aSign != bSign ) return aSign && ( (bits64) ( ( a \| b )<<1 ) != 0 );
3074	return ( a != b ) && ( aSign ^ ( a < b ) );
3075
3076	}
3077
3078	#ifdef FLOATX80
3079
3080	/*----------------------------------------------------------------------------
3081	\| Returns the result of converting the extended double-precision floating-
3082	\| point value `a' to the 32-bit two's complement integer format. The
3083	\| conversion is performed according to the IEC/IEEE Standard for Binary
3084	\| Floating-Point Arithmetic---which means in particular that the conversion
3085	\| is rounded according to the current rounding mode. If `a' is a NaN, the
3086	\| largest positive integer is returned. Otherwise, if the conversion
3087	\| overflows, the largest integer with the same sign as `a' is returned.
3088	----------------------------------------------------------------------------/
3089
3090	int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3091	{
3092	flag aSign;
3093	int32 aExp, shiftCount;
3094	bits64 aSig;
3095
3096	aSig = extractFloatx80Frac( a );
3097	aExp = extractFloatx80Exp( a );
3098	aSign = extractFloatx80Sign( a );
3099	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3100	shiftCount = 0x4037 - aExp;
3101	if ( shiftCount <= 0 ) shiftCount = 1;
3102	shift64RightJamming( aSig, shiftCount, &aSig );
3103	return roundAndPackInt32( aSign, aSig STATUS_VAR );
3104
3105	}
3106
3107	/*----------------------------------------------------------------------------
3108	\| Returns the result of converting the extended double-precision floating-
3109	\| point value `a' to the 32-bit two's complement integer format. The
3110	\| conversion is performed according to the IEC/IEEE Standard for Binary
3111	\| Floating-Point Arithmetic, except that the conversion is always rounded
3112	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3113	\| Otherwise, if the conversion overflows, the largest integer with the same
3114	\| sign as `a' is returned.
3115	----------------------------------------------------------------------------/
3116
3117	int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3118	{
3119	flag aSign;
3120	int32 aExp, shiftCount;
3121	bits64 aSig, savedASig;
3122	int32 z;
3123
3124	aSig = extractFloatx80Frac( a );
3125	aExp = extractFloatx80Exp( a );
3126	aSign = extractFloatx80Sign( a );
3127	if ( 0x401E < aExp ) {
3128	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3129	goto invalid;
3130	}
3131	else if ( aExp < 0x3FFF ) {
3132	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3133	return 0;
3134	}
3135	shiftCount = 0x403E - aExp;
3136	savedASig = aSig;
3137	aSig >>= shiftCount;
3138	z = aSig;
3139	if ( aSign ) z = - z;
3140	if ( ( z < 0 ) ^ aSign ) {
3141	invalid:
3142	float_raise( float_flag_invalid STATUS_VAR);
3143	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3144	}
3145	if ( ( aSig<<shiftCount ) != savedASig ) {
3146	STATUS(float_exception_flags) \|= float_flag_inexact;
3147	}
3148	return z;
3149
3150	}
3151
3152	/*----------------------------------------------------------------------------
3153	\| Returns the result of converting the extended double-precision floating-
3154	\| point value `a' to the 64-bit two's complement integer format. The
3155	\| conversion is performed according to the IEC/IEEE Standard for Binary
3156	\| Floating-Point Arithmetic---which means in particular that the conversion
3157	\| is rounded according to the current rounding mode. If `a' is a NaN,
3158	\| the largest positive integer is returned. Otherwise, if the conversion
3159	\| overflows, the largest integer with the same sign as `a' is returned.
3160	----------------------------------------------------------------------------/
3161
3162	int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3163	{
3164	flag aSign;
3165	int32 aExp, shiftCount;
3166	bits64 aSig, aSigExtra;
3167
3168	aSig = extractFloatx80Frac( a );
3169	aExp = extractFloatx80Exp( a );
3170	aSign = extractFloatx80Sign( a );
3171	shiftCount = 0x403E - aExp;
3172	if ( shiftCount <= 0 ) {
3173	if ( shiftCount ) {
3174	float_raise( float_flag_invalid STATUS_VAR);
3175	if ( ! aSign
3176	\|\| ( ( aExp == 0x7FFF )
3177	&& ( aSig != LIT64( 0x8000000000000000 ) ) )
3178	) {
3179	return LIT64( 0x7FFFFFFFFFFFFFFF );
3180	}
3181	return (sbits64) LIT64( 0x8000000000000000 );
3182	}
3183	aSigExtra = 0;
3184	}
3185	else {
3186	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3187	}
3188	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3189
3190	}
3191
3192	/*----------------------------------------------------------------------------
3193	\| Returns the result of converting the extended double-precision floating-
3194	\| point value `a' to the 64-bit two's complement integer format. The
3195	\| conversion is performed according to the IEC/IEEE Standard for Binary
3196	\| Floating-Point Arithmetic, except that the conversion is always rounded
3197	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3198	\| Otherwise, if the conversion overflows, the largest integer with the same
3199	\| sign as `a' is returned.
3200	----------------------------------------------------------------------------/
3201
3202	int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3203	{
3204	flag aSign;
3205	int32 aExp, shiftCount;
3206	bits64 aSig;
3207	int64 z;
3208
3209	aSig = extractFloatx80Frac( a );
3210	aExp = extractFloatx80Exp( a );
3211	aSign = extractFloatx80Sign( a );
3212	shiftCount = aExp - 0x403E;
3213	if ( 0 <= shiftCount ) {
3214	aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3215	if ( ( a.high != 0xC03E ) \|\| aSig ) {
3216	float_raise( float_flag_invalid STATUS_VAR);
3217	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && aSig ) ) {
3218	return LIT64( 0x7FFFFFFFFFFFFFFF );
3219	}
3220	}
3221	return (sbits64) LIT64( 0x8000000000000000 );
3222	}
3223	else if ( aExp < 0x3FFF ) {
3224	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3225	return 0;
3226	}
3227	z = aSig>>( - shiftCount );
3228	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3229	STATUS(float_exception_flags) \|= float_flag_inexact;
3230	}
3231	if ( aSign ) z = - z;
3232	return z;
3233
3234	}
3235
3236	/*----------------------------------------------------------------------------
3237	\| Returns the result of converting the extended double-precision floating-
3238	\| point value `a' to the single-precision floating-point format. The
3239	\| conversion is performed according to the IEC/IEEE Standard for Binary
3240	\| Floating-Point Arithmetic.
3241	----------------------------------------------------------------------------/
3242
3243	float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3244	{
3245	flag aSign;
3246	int32 aExp;
3247	bits64 aSig;
3248
3249	aSig = extractFloatx80Frac( a );
3250	aExp = extractFloatx80Exp( a );
3251	aSign = extractFloatx80Sign( a );
3252	if ( aExp == 0x7FFF ) {
3253	if ( (bits64) ( aSig<<1 ) ) {
3254	return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3255	}
3256	return packFloat32( aSign, 0xFF, 0 );
3257	}
3258	shift64RightJamming( aSig, 33, &aSig );
3259	if ( aExp \|\| aSig ) aExp -= 0x3F81;
3260	return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3261
3262	}
3263
3264	/*----------------------------------------------------------------------------
3265	\| Returns the result of converting the extended double-precision floating-
3266	\| point value `a' to the double-precision floating-point format. The
3267	\| conversion is performed according to the IEC/IEEE Standard for Binary
3268	\| Floating-Point Arithmetic.
3269	----------------------------------------------------------------------------/
3270
3271	float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3272	{
3273	flag aSign;
3274	int32 aExp;
3275	bits64 aSig, zSig;
3276
3277	aSig = extractFloatx80Frac( a );
3278	aExp = extractFloatx80Exp( a );
3279	aSign = extractFloatx80Sign( a );
3280	if ( aExp == 0x7FFF ) {
3281	if ( (bits64) ( aSig<<1 ) ) {
3282	return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3283	}
3284	return packFloat64( aSign, 0x7FF, 0 );
3285	}
3286	shift64RightJamming( aSig, 1, &zSig );
3287	if ( aExp \|\| aSig ) aExp -= 0x3C01;
3288	return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3289
3290	}
3291
3292	#ifdef FLOAT128
3293
3294	/*----------------------------------------------------------------------------
3295	\| Returns the result of converting the extended double-precision floating-
3296	\| point value `a' to the quadruple-precision floating-point format. The
3297	\| conversion is performed according to the IEC/IEEE Standard for Binary
3298	\| Floating-Point Arithmetic.
3299	----------------------------------------------------------------------------/
3300
3301	float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3302	{
3303	flag aSign;
3304	int16 aExp;
3305	bits64 aSig, zSig0, zSig1;
3306
3307	aSig = extractFloatx80Frac( a );
3308	aExp = extractFloatx80Exp( a );
3309	aSign = extractFloatx80Sign( a );
3310	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3311	return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3312	}
3313	shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3314	return packFloat128( aSign, aExp, zSig0, zSig1 );
3315
3316	}
3317
3318	#endif
3319
3320	/*----------------------------------------------------------------------------
3321	\| Rounds the extended double-precision floating-point value `a' to an integer,
3322	\| and returns the result as an extended quadruple-precision floating-point
3323	\| value. The operation is performed according to the IEC/IEEE Standard for
3324	\| Binary Floating-Point Arithmetic.
3325	----------------------------------------------------------------------------/
3326
3327	floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3328	{
3329	flag aSign;
3330	int32 aExp;
3331	bits64 lastBitMask, roundBitsMask;
3332	int8 roundingMode;
3333	floatx80 z;
3334
3335	aExp = extractFloatx80Exp( a );
3336	if ( 0x403E <= aExp ) {
3337	if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3338	return propagateFloatx80NaN( a, a STATUS_VAR );
3339	}
3340	return a;
3341	}
3342	if ( aExp < 0x3FFF ) {
3343	if ( ( aExp == 0 )
3344	&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3345	return a;
3346	}
3347	STATUS(float_exception_flags) \|= float_flag_inexact;
3348	aSign = extractFloatx80Sign( a );
3349	switch ( STATUS(float_rounding_mode) ) {
3350	case float_round_nearest_even:
3351	if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3352	) {
3353	return
3354	packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3355	}
3356	break;
3357	case float_round_down:
3358	return
3359	aSign ?
3360	packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3361	: packFloatx80( 0, 0, 0 );
3362	case float_round_up:
3363	return
3364	aSign ? packFloatx80( 1, 0, 0 )
3365	: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3366	}
3367	return packFloatx80( aSign, 0, 0 );
3368	}
3369	lastBitMask = 1;
3370	lastBitMask <<= 0x403E - aExp;
3371	roundBitsMask = lastBitMask - 1;
3372	z = a;
3373	roundingMode = STATUS(float_rounding_mode);
3374	if ( roundingMode == float_round_nearest_even ) {
3375	z.low += lastBitMask>>1;
3376	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3377	}
3378	else if ( roundingMode != float_round_to_zero ) {
3379	if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3380	z.low += roundBitsMask;
3381	}
3382	}
3383	z.low &= ~ roundBitsMask;
3384	if ( z.low == 0 ) {
3385	++z.high;
3386	z.low = LIT64( 0x8000000000000000 );
3387	}
3388	if ( z.low != a.low ) STATUS(float_exception_flags) \|= float_flag_inexact;
3389	return z;
3390
3391	}
3392
3393	/*----------------------------------------------------------------------------
3394	\| Returns the result of adding the absolute values of the extended double-
3395	\| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3396	\| negated before being returned. `zSign' is ignored if the result is a NaN.
3397	\| The addition is performed according to the IEC/IEEE Standard for Binary
3398	\| Floating-Point Arithmetic.
3399	----------------------------------------------------------------------------/
3400
3401	static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3402	{
3403	int32 aExp, bExp, zExp;
3404	bits64 aSig, bSig, zSig0, zSig1;
3405	int32 expDiff;
3406
3407	aSig = extractFloatx80Frac( a );
3408	aExp = extractFloatx80Exp( a );
3409	bSig = extractFloatx80Frac( b );
3410	bExp = extractFloatx80Exp( b );
3411	expDiff = aExp - bExp;
3412	if ( 0 < expDiff ) {
3413	if ( aExp == 0x7FFF ) {
3414	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3415	return a;
3416	}
3417	if ( bExp == 0 ) --expDiff;
3418	shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3419	zExp = aExp;
3420	}
3421	else if ( expDiff < 0 ) {
3422	if ( bExp == 0x7FFF ) {
3423	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3424	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3425	}
3426	if ( aExp == 0 ) ++expDiff;
3427	shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3428	zExp = bExp;
3429	}
3430	else {
3431	if ( aExp == 0x7FFF ) {
3432	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3433	return propagateFloatx80NaN( a, b STATUS_VAR );
3434	}
3435	return a;
3436	}
3437	zSig1 = 0;
3438	zSig0 = aSig + bSig;
3439	if ( aExp == 0 ) {
3440	normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3441	goto roundAndPack;
3442	}
3443	zExp = aExp;
3444	goto shiftRight1;
3445	}
3446	zSig0 = aSig + bSig;
3447	if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3448	shiftRight1:
3449	shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3450	zSig0 \|= LIT64( 0x8000000000000000 );
3451	++zExp;
3452	roundAndPack:
3453	return
3454	roundAndPackFloatx80(
3455	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3456
3457	}
3458
3459	/*----------------------------------------------------------------------------
3460	\| Returns the result of subtracting the absolute values of the extended
3461	\| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3462	\| difference is negated before being returned. `zSign' is ignored if the
3463	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
3464	\| Standard for Binary Floating-Point Arithmetic.
3465	----------------------------------------------------------------------------/
3466
3467	static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3468	{
3469	int32 aExp, bExp, zExp;
3470	bits64 aSig, bSig, zSig0, zSig1;
3471	int32 expDiff;
3472	floatx80 z;
3473
3474	aSig = extractFloatx80Frac( a );
3475	aExp = extractFloatx80Exp( a );
3476	bSig = extractFloatx80Frac( b );
3477	bExp = extractFloatx80Exp( b );
3478	expDiff = aExp - bExp;
3479	if ( 0 < expDiff ) goto aExpBigger;
3480	if ( expDiff < 0 ) goto bExpBigger;
3481	if ( aExp == 0x7FFF ) {
3482	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3483	return propagateFloatx80NaN( a, b STATUS_VAR );
3484	}
3485	float_raise( float_flag_invalid STATUS_VAR);
3486	z.low = floatx80_default_nan_low;
3487	z.high = floatx80_default_nan_high;
3488	return z;
3489	}
3490	if ( aExp == 0 ) {
3491	aExp = 1;
3492	bExp = 1;
3493	}
3494	zSig1 = 0;
3495	if ( bSig < aSig ) goto aBigger;
3496	if ( aSig < bSig ) goto bBigger;
3497	return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3498	bExpBigger:
3499	if ( bExp == 0x7FFF ) {
3500	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3501	return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3502	}
3503	if ( aExp == 0 ) ++expDiff;
3504	shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3505	bBigger:
3506	sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3507	zExp = bExp;
3508	zSign ^= 1;
3509	goto normalizeRoundAndPack;
3510	aExpBigger:
3511	if ( aExp == 0x7FFF ) {
3512	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3513	return a;
3514	}
3515	if ( bExp == 0 ) --expDiff;
3516	shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3517	aBigger:
3518	sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3519	zExp = aExp;
3520	normalizeRoundAndPack:
3521	return
3522	normalizeRoundAndPackFloatx80(
3523	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3524
3525	}
3526
3527	/*----------------------------------------------------------------------------
3528	\| Returns the result of adding the extended double-precision floating-point
3529	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
3530	\| Standard for Binary Floating-Point Arithmetic.
3531	----------------------------------------------------------------------------/
3532
3533	floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3534	{
3535	flag aSign, bSign;
3536
3537	aSign = extractFloatx80Sign( a );
3538	bSign = extractFloatx80Sign( b );
3539	if ( aSign == bSign ) {
3540	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3541	}
3542	else {
3543	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3544	}
3545
3546	}
3547
3548	/*----------------------------------------------------------------------------
3549	\| Returns the result of subtracting the extended double-precision floating-
3550	\| point values `a' and `b'. The operation is performed according to the
3551	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3552	----------------------------------------------------------------------------/
3553
3554	floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3555	{
3556	flag aSign, bSign;
3557
3558	aSign = extractFloatx80Sign( a );
3559	bSign = extractFloatx80Sign( b );
3560	if ( aSign == bSign ) {
3561	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3562	}
3563	else {
3564	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3565	}
3566
3567	}
3568
3569	/*----------------------------------------------------------------------------
3570	\| Returns the result of multiplying the extended double-precision floating-
3571	\| point values `a' and `b'. The operation is performed according to the
3572	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3573	----------------------------------------------------------------------------/
3574
3575	floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3576	{
3577	flag aSign, bSign, zSign;
3578	int32 aExp, bExp, zExp;
3579	bits64 aSig, bSig, zSig0, zSig1;
3580	floatx80 z;
3581
3582	aSig = extractFloatx80Frac( a );
3583	aExp = extractFloatx80Exp( a );
3584	aSign = extractFloatx80Sign( a );
3585	bSig = extractFloatx80Frac( b );
3586	bExp = extractFloatx80Exp( b );
3587	bSign = extractFloatx80Sign( b );
3588	zSign = aSign ^ bSign;
3589	if ( aExp == 0x7FFF ) {
3590	if ( (bits64) ( aSig<<1 )
3591	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3592	return propagateFloatx80NaN( a, b STATUS_VAR );
3593	}
3594	if ( ( bExp \| bSig ) == 0 ) goto invalid;
3595	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3596	}
3597	if ( bExp == 0x7FFF ) {
3598	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3599	if ( ( aExp \| aSig ) == 0 ) {
3600	invalid:
3601	float_raise( float_flag_invalid STATUS_VAR);
3602	z.low = floatx80_default_nan_low;
3603	z.high = floatx80_default_nan_high;
3604	return z;
3605	}
3606	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3607	}
3608	if ( aExp == 0 ) {
3609	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3610	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3611	}
3612	if ( bExp == 0 ) {
3613	if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3614	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3615	}
3616	zExp = aExp + bExp - 0x3FFE;
3617	mul64To128( aSig, bSig, &zSig0, &zSig1 );
3618	if ( 0 < (sbits64) zSig0 ) {
3619	shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3620	--zExp;
3621	}
3622	return
3623	roundAndPackFloatx80(
3624	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3625
3626	}
3627
3628	/*----------------------------------------------------------------------------
3629	\| Returns the result of dividing the extended double-precision floating-point
3630	\| value `a' by the corresponding value `b'. The operation is performed
3631	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3632	----------------------------------------------------------------------------/
3633
3634	floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3635	{
3636	flag aSign, bSign, zSign;
3637	int32 aExp, bExp, zExp;
3638	bits64 aSig, bSig, zSig0, zSig1;
3639	bits64 rem0, rem1, rem2, term0, term1, term2;
3640	floatx80 z;
3641
3642	aSig = extractFloatx80Frac( a );
3643	aExp = extractFloatx80Exp( a );
3644	aSign = extractFloatx80Sign( a );
3645	bSig = extractFloatx80Frac( b );
3646	bExp = extractFloatx80Exp( b );
3647	bSign = extractFloatx80Sign( b );
3648	zSign = aSign ^ bSign;
3649	if ( aExp == 0x7FFF ) {
3650	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3651	if ( bExp == 0x7FFF ) {
3652	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3653	goto invalid;
3654	}
3655	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3656	}
3657	if ( bExp == 0x7FFF ) {
3658	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3659	return packFloatx80( zSign, 0, 0 );
3660	}
3661	if ( bExp == 0 ) {
3662	if ( bSig == 0 ) {
3663	if ( ( aExp \| aSig ) == 0 ) {
3664	invalid:
3665	float_raise( float_flag_invalid STATUS_VAR);
3666	z.low = floatx80_default_nan_low;
3667	z.high = floatx80_default_nan_high;
3668	return z;
3669	}
3670	float_raise( float_flag_divbyzero STATUS_VAR);
3671	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672	}
3673	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3674	}
3675	if ( aExp == 0 ) {
3676	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3677	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3678	}
3679	zExp = aExp - bExp + 0x3FFE;
3680	rem1 = 0;
3681	if ( bSig <= aSig ) {
3682	shift128Right( aSig, 0, 1, &aSig, &rem1 );
3683	++zExp;
3684	}
3685	zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3686	mul64To128( bSig, zSig0, &term0, &term1 );
3687	sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3688	while ( (sbits64) rem0 < 0 ) {
3689	--zSig0;
3690	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3691	}
3692	zSig1 = estimateDiv128To64( rem1, 0, bSig );
3693	if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3694	mul64To128( bSig, zSig1, &term1, &term2 );
3695	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3696	while ( (sbits64) rem1 < 0 ) {
3697	--zSig1;
3698	add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3699	}
3700	zSig1 \|= ( ( rem1 \| rem2 ) != 0 );
3701	}
3702	return
3703	roundAndPackFloatx80(
3704	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3705
3706	}
3707
3708	/*----------------------------------------------------------------------------
3709	\| Returns the remainder of the extended double-precision floating-point value
3710	\| `a' with respect to the corresponding value `b'. The operation is performed
3711	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3712	----------------------------------------------------------------------------/
3713
3714	floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3715	{
3716	flag aSign, bSign, zSign;
3717	int32 aExp, bExp, expDiff;
3718	bits64 aSig0, aSig1, bSig;
3719	bits64 q, term0, term1, alternateASig0, alternateASig1;
3720	floatx80 z;
3721
3722	aSig0 = extractFloatx80Frac( a );
3723	aExp = extractFloatx80Exp( a );
3724	aSign = extractFloatx80Sign( a );
3725	bSig = extractFloatx80Frac( b );
3726	bExp = extractFloatx80Exp( b );
3727	bSign = extractFloatx80Sign( b );
3728	if ( aExp == 0x7FFF ) {
3729	if ( (bits64) ( aSig0<<1 )
3730	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3731	return propagateFloatx80NaN( a, b STATUS_VAR );
3732	}
3733	goto invalid;
3734	}
3735	if ( bExp == 0x7FFF ) {
3736	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3737	return a;
3738	}
3739	if ( bExp == 0 ) {
3740	if ( bSig == 0 ) {
3741	invalid:
3742	float_raise( float_flag_invalid STATUS_VAR);
3743	z.low = floatx80_default_nan_low;
3744	z.high = floatx80_default_nan_high;
3745	return z;
3746	}
3747	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3748	}
3749	if ( aExp == 0 ) {
3750	if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3751	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3752	}
3753	bSig \|= LIT64( 0x8000000000000000 );
3754	zSign = aSign;
3755	expDiff = aExp - bExp;
3756	aSig1 = 0;
3757	if ( expDiff < 0 ) {
3758	if ( expDiff < -1 ) return a;
3759	shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3760	expDiff = 0;
3761	}
3762	q = ( bSig <= aSig0 );
3763	if ( q ) aSig0 -= bSig;
3764	expDiff -= 64;
3765	while ( 0 < expDiff ) {
3766	q = estimateDiv128To64( aSig0, aSig1, bSig );
3767	q = ( 2 < q ) ? q - 2 : 0;
3768	mul64To128( bSig, q, &term0, &term1 );
3769	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3770	shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3771	expDiff -= 62;
3772	}
3773	expDiff += 64;
3774	if ( 0 < expDiff ) {
3775	q = estimateDiv128To64( aSig0, aSig1, bSig );
3776	q = ( 2 < q ) ? q - 2 : 0;
3777	q >>= 64 - expDiff;
3778	mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3779	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3780	shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3781	while ( le128( term0, term1, aSig0, aSig1 ) ) {
3782	++q;
3783	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3784	}
3785	}
3786	else {
3787	term1 = 0;
3788	term0 = bSig;
3789	}
3790	sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3791	if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3792	\|\| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3793	&& ( q & 1 ) )
3794	) {
3795	aSig0 = alternateASig0;
3796	aSig1 = alternateASig1;
3797	zSign = ! zSign;
3798	}
3799	return
3800	normalizeRoundAndPackFloatx80(
3801	80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3802
3803	}
3804
3805	/*----------------------------------------------------------------------------
3806	\| Returns the square root of the extended double-precision floating-point
3807	\| value `a'. The operation is performed according to the IEC/IEEE Standard
3808	\| for Binary Floating-Point Arithmetic.
3809	----------------------------------------------------------------------------/
3810
3811	floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3812	{
3813	flag aSign;
3814	int32 aExp, zExp;
3815	bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3816	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3817	floatx80 z;
3818
3819	aSig0 = extractFloatx80Frac( a );
3820	aExp = extractFloatx80Exp( a );
3821	aSign = extractFloatx80Sign( a );
3822	if ( aExp == 0x7FFF ) {
3823	if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3824	if ( ! aSign ) return a;
3825	goto invalid;
3826	}
3827	if ( aSign ) {
3828	if ( ( aExp \| aSig0 ) == 0 ) return a;
3829	invalid:
3830	float_raise( float_flag_invalid STATUS_VAR);
3831	z.low = floatx80_default_nan_low;
3832	z.high = floatx80_default_nan_high;
3833	return z;
3834	}
3835	if ( aExp == 0 ) {
3836	if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3837	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3838	}
3839	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3840	zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3841	shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3842	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3843	doubleZSig0 = zSig0<<1;
3844	mul64To128( zSig0, zSig0, &term0, &term1 );
3845	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3846	while ( (sbits64) rem0 < 0 ) {
3847	--zSig0;
3848	doubleZSig0 -= 2;
3849	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
3850	}
3851	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3852	if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3853	if ( zSig1 == 0 ) zSig1 = 1;
3854	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3855	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3856	mul64To128( zSig1, zSig1, &term2, &term3 );
3857	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3858	while ( (sbits64) rem1 < 0 ) {
3859	--zSig1;
3860	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3861	term3 \|= 1;
3862	term2 \|= doubleZSig0;
3863	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3864	}
3865	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
3866	}
3867	shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3868	zSig0 \|= doubleZSig0;
3869	return
3870	roundAndPackFloatx80(
3871	STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3872
3873	}
3874
3875	/*----------------------------------------------------------------------------
3876	\| Returns 1 if the extended double-precision floating-point value `a' is
3877	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
3878	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
3879	\| Arithmetic.
3880	----------------------------------------------------------------------------/
3881
3882	flag floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3883	{
3884
3885	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3886	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3887	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3888	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3889	) {
3890	if ( floatx80_is_signaling_nan( a )
3891	\|\| floatx80_is_signaling_nan( b ) ) {
3892	float_raise( float_flag_invalid STATUS_VAR);
3893	}
3894	return 0;
3895	}
3896	return
3897	( a.low == b.low )
3898	&& ( ( a.high == b.high )
3899	\|\| ( ( a.low == 0 )
3900	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
3901	);
3902
3903	}
3904
3905	/*----------------------------------------------------------------------------
3906	\| Returns 1 if the extended double-precision floating-point value `a' is
3907	\| less than or equal to the corresponding value `b', and 0 otherwise. The
3908	\| comparison is performed according to the IEC/IEEE Standard for Binary
3909	\| Floating-Point Arithmetic.
3910	----------------------------------------------------------------------------/
3911
3912	flag floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3913	{
3914	flag aSign, bSign;
3915
3916	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3917	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3918	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3919	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3920	) {
3921	float_raise( float_flag_invalid STATUS_VAR);
3922	return 0;
3923	}
3924	aSign = extractFloatx80Sign( a );
3925	bSign = extractFloatx80Sign( b );
3926	if ( aSign != bSign ) {
3927	return
3928	aSign
3929	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
3930	== 0 );
3931	}
3932	return
3933	aSign ? le128( b.high, b.low, a.high, a.low )
3934	: le128( a.high, a.low, b.high, b.low );
3935
3936	}
3937
3938	/*----------------------------------------------------------------------------
3939	\| Returns 1 if the extended double-precision floating-point value `a' is
3940	\| less than the corresponding value `b', and 0 otherwise. The comparison
3941	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3942	\| Arithmetic.
3943	----------------------------------------------------------------------------/
3944
3945	flag floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3946	{
3947	flag aSign, bSign;
3948
3949	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3950	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3951	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3952	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3953	) {
3954	float_raise( float_flag_invalid STATUS_VAR);
3955	return 0;
3956	}
3957	aSign = extractFloatx80Sign( a );
3958	bSign = extractFloatx80Sign( b );
3959	if ( aSign != bSign ) {
3960	return
3961	aSign
3962	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
3963	!= 0 );
3964	}
3965	return
3966	aSign ? lt128( b.high, b.low, a.high, a.low )
3967	: lt128( a.high, a.low, b.high, b.low );
3968
3969	}
3970
3971	/*----------------------------------------------------------------------------
3972	\| Returns 1 if the extended double-precision floating-point value `a' is equal
3973	\| to the corresponding value `b', and 0 otherwise. The invalid exception is
3974	\| raised if either operand is a NaN. Otherwise, the comparison is performed
3975	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3976	----------------------------------------------------------------------------/
3977
3978	flag floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3979	{
3980
3981	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3982	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3983	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3984	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3985	) {
3986	float_raise( float_flag_invalid STATUS_VAR);
3987	return 0;
3988	}
3989	return
3990	( a.low == b.low )
3991	&& ( ( a.high == b.high )
3992	\|\| ( ( a.low == 0 )
3993	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
3994	);
3995
3996	}
3997
3998	/*----------------------------------------------------------------------------
3999	\| Returns 1 if the extended double-precision floating-point value `a' is less
4000	\| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4001	\| do not cause an exception. Otherwise, the comparison is performed according
4002	\| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4003	----------------------------------------------------------------------------/
4004
4005	flag floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4006	{
4007	flag aSign, bSign;
4008
4009	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4010	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4011	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4012	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4013	) {
4014	if ( floatx80_is_signaling_nan( a )
4015	\|\| floatx80_is_signaling_nan( b ) ) {
4016	float_raise( float_flag_invalid STATUS_VAR);
4017	}
4018	return 0;
4019	}
4020	aSign = extractFloatx80Sign( a );
4021	bSign = extractFloatx80Sign( b );
4022	if ( aSign != bSign ) {
4023	return
4024	aSign
4025	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4026	== 0 );
4027	}
4028	return
4029	aSign ? le128( b.high, b.low, a.high, a.low )
4030	: le128( a.high, a.low, b.high, b.low );
4031
4032	}
4033
4034	/*----------------------------------------------------------------------------
4035	\| Returns 1 if the extended double-precision floating-point value `a' is less
4036	\| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4037	\| an exception. Otherwise, the comparison is performed according to the
4038	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4039	----------------------------------------------------------------------------/
4040
4041	flag floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4042	{
4043	flag aSign, bSign;
4044
4045	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4046	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4047	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4048	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4049	) {
4050	if ( floatx80_is_signaling_nan( a )
4051	\|\| floatx80_is_signaling_nan( b ) ) {
4052	float_raise( float_flag_invalid STATUS_VAR);
4053	}
4054	return 0;
4055	}
4056	aSign = extractFloatx80Sign( a );
4057	bSign = extractFloatx80Sign( b );
4058	if ( aSign != bSign ) {
4059	return
4060	aSign
4061	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4062	!= 0 );
4063	}
4064	return
4065	aSign ? lt128( b.high, b.low, a.high, a.low )
4066	: lt128( a.high, a.low, b.high, b.low );
4067
4068	}
4069
4070	#endif
4071
4072	#ifdef FLOAT128
4073
4074	/*----------------------------------------------------------------------------
4075	\| Returns the result of converting the quadruple-precision floating-point
4076	\| value `a' to the 32-bit two's complement integer format. The conversion
4077	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4078	\| Arithmetic---which means in particular that the conversion is rounded
4079	\| according to the current rounding mode. If `a' is a NaN, the largest
4080	\| positive integer is returned. Otherwise, if the conversion overflows, the
4081	\| largest integer with the same sign as `a' is returned.
4082	----------------------------------------------------------------------------/
4083
4084	int32 float128_to_int32( float128 a STATUS_PARAM )
4085	{
4086	flag aSign;
4087	int32 aExp, shiftCount;
4088	bits64 aSig0, aSig1;
4089
4090	aSig1 = extractFloat128Frac1( a );
4091	aSig0 = extractFloat128Frac0( a );
4092	aExp = extractFloat128Exp( a );
4093	aSign = extractFloat128Sign( a );
4094	if ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) aSign = 0;
4095	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4096	aSig0 \|= ( aSig1 != 0 );
4097	shiftCount = 0x4028 - aExp;
4098	if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4099	return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4100
4101	}
4102
4103	/*----------------------------------------------------------------------------
4104	\| Returns the result of converting the quadruple-precision floating-point
4105	\| value `a' to the 32-bit two's complement integer format. The conversion
4106	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4107	\| Arithmetic, except that the conversion is always rounded toward zero. If
4108	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4109	\| conversion overflows, the largest integer with the same sign as `a' is
4110	\| returned.
4111	----------------------------------------------------------------------------/
4112
4113	int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4114	{
4115	flag aSign;
4116	int32 aExp, shiftCount;
4117	bits64 aSig0, aSig1, savedASig;
4118	int32 z;
4119
4120	aSig1 = extractFloat128Frac1( a );
4121	aSig0 = extractFloat128Frac0( a );
4122	aExp = extractFloat128Exp( a );
4123	aSign = extractFloat128Sign( a );
4124	aSig0 \|= ( aSig1 != 0 );
4125	if ( 0x401E < aExp ) {
4126	if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4127	goto invalid;
4128	}
4129	else if ( aExp < 0x3FFF ) {
4130	if ( aExp \|\| aSig0 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4131	return 0;
4132	}
4133	aSig0 \|= LIT64( 0x0001000000000000 );
4134	shiftCount = 0x402F - aExp;
4135	savedASig = aSig0;
4136	aSig0 >>= shiftCount;
4137	z = aSig0;
4138	if ( aSign ) z = - z;
4139	if ( ( z < 0 ) ^ aSign ) {
4140	invalid:
4141	float_raise( float_flag_invalid STATUS_VAR);
4142	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4143	}
4144	if ( ( aSig0<<shiftCount ) != savedASig ) {
4145	STATUS(float_exception_flags) \|= float_flag_inexact;
4146	}
4147	return z;
4148
4149	}
4150
4151	/*----------------------------------------------------------------------------
4152	\| Returns the result of converting the quadruple-precision floating-point
4153	\| value `a' to the 64-bit two's complement integer format. The conversion
4154	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4155	\| Arithmetic---which means in particular that the conversion is rounded
4156	\| according to the current rounding mode. If `a' is a NaN, the largest
4157	\| positive integer is returned. Otherwise, if the conversion overflows, the
4158	\| largest integer with the same sign as `a' is returned.
4159	----------------------------------------------------------------------------/
4160
4161	int64 float128_to_int64( float128 a STATUS_PARAM )
4162	{
4163	flag aSign;
4164	int32 aExp, shiftCount;
4165	bits64 aSig0, aSig1;
4166
4167	aSig1 = extractFloat128Frac1( a );
4168	aSig0 = extractFloat128Frac0( a );
4169	aExp = extractFloat128Exp( a );
4170	aSign = extractFloat128Sign( a );
4171	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4172	shiftCount = 0x402F - aExp;
4173	if ( shiftCount <= 0 ) {
4174	if ( 0x403E < aExp ) {
4175	float_raise( float_flag_invalid STATUS_VAR);
4176	if ( ! aSign
4177	\|\| ( ( aExp == 0x7FFF )
4178	&& ( aSig1 \|\| ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4179	)
4180	) {
4181	return LIT64( 0x7FFFFFFFFFFFFFFF );
4182	}
4183	return (sbits64) LIT64( 0x8000000000000000 );
4184	}
4185	shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4186	}
4187	else {
4188	shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4189	}
4190	return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4191
4192	}
4193
4194	/*----------------------------------------------------------------------------
4195	\| Returns the result of converting the quadruple-precision floating-point
4196	\| value `a' to the 64-bit two's complement integer format. The conversion
4197	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4198	\| Arithmetic, except that the conversion is always rounded toward zero.
4199	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4200	\| the conversion overflows, the largest integer with the same sign as `a' is
4201	\| returned.
4202	----------------------------------------------------------------------------/
4203
4204	int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4205	{
4206	flag aSign;
4207	int32 aExp, shiftCount;
4208	bits64 aSig0, aSig1;
4209	int64 z;
4210
4211	aSig1 = extractFloat128Frac1( a );
4212	aSig0 = extractFloat128Frac0( a );
4213	aExp = extractFloat128Exp( a );
4214	aSign = extractFloat128Sign( a );
4215	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4216	shiftCount = aExp - 0x402F;
4217	if ( 0 < shiftCount ) {
4218	if ( 0x403E <= aExp ) {
4219	aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4220	if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4221	&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4222	if ( aSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4223	}
4224	else {
4225	float_raise( float_flag_invalid STATUS_VAR);
4226	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) ) {
4227	return LIT64( 0x7FFFFFFFFFFFFFFF );
4228	}
4229	}
4230	return (sbits64) LIT64( 0x8000000000000000 );
4231	}
4232	z = ( aSig0<<shiftCount ) \| ( aSig1>>( ( - shiftCount ) & 63 ) );
4233	if ( (bits64) ( aSig1<<shiftCount ) ) {
4234	STATUS(float_exception_flags) \|= float_flag_inexact;
4235	}
4236	}
4237	else {
4238	if ( aExp < 0x3FFF ) {
4239	if ( aExp \| aSig0 \| aSig1 ) {
4240	STATUS(float_exception_flags) \|= float_flag_inexact;
4241	}
4242	return 0;
4243	}
4244	z = aSig0>>( - shiftCount );
4245	if ( aSig1
4246	\|\| ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4247	STATUS(float_exception_flags) \|= float_flag_inexact;
4248	}
4249	}
4250	if ( aSign ) z = - z;
4251	return z;
4252
4253	}
4254
4255	/*----------------------------------------------------------------------------
4256	\| Returns the result of converting the quadruple-precision floating-point
4257	\| value `a' to the single-precision floating-point format. The conversion
4258	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4259	\| Arithmetic.
4260	----------------------------------------------------------------------------/
4261
4262	float32 float128_to_float32( float128 a STATUS_PARAM )
4263	{
4264	flag aSign;
4265	int32 aExp;
4266	bits64 aSig0, aSig1;
4267	bits32 zSig;
4268
4269	aSig1 = extractFloat128Frac1( a );
4270	aSig0 = extractFloat128Frac0( a );
4271	aExp = extractFloat128Exp( a );
4272	aSign = extractFloat128Sign( a );
4273	if ( aExp == 0x7FFF ) {
4274	if ( aSig0 \| aSig1 ) {
4275	return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4276	}
4277	return packFloat32( aSign, 0xFF, 0 );
4278	}
4279	aSig0 \|= ( aSig1 != 0 );
4280	shift64RightJamming( aSig0, 18, &aSig0 );
4281	zSig = aSig0;
4282	if ( aExp \|\| zSig ) {
4283	zSig \|= 0x40000000;
4284	aExp -= 0x3F81;
4285	}
4286	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4287
4288	}
4289
4290	/*----------------------------------------------------------------------------
4291	\| Returns the result of converting the quadruple-precision floating-point
4292	\| value `a' to the double-precision floating-point format. The conversion
4293	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4294	\| Arithmetic.
4295	----------------------------------------------------------------------------/
4296
4297	float64 float128_to_float64( float128 a STATUS_PARAM )
4298	{
4299	flag aSign;
4300	int32 aExp;
4301	bits64 aSig0, aSig1;
4302
4303	aSig1 = extractFloat128Frac1( a );
4304	aSig0 = extractFloat128Frac0( a );
4305	aExp = extractFloat128Exp( a );
4306	aSign = extractFloat128Sign( a );
4307	if ( aExp == 0x7FFF ) {
4308	if ( aSig0 \| aSig1 ) {
4309	return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4310	}
4311	return packFloat64( aSign, 0x7FF, 0 );
4312	}
4313	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4314	aSig0 \|= ( aSig1 != 0 );
4315	if ( aExp \|\| aSig0 ) {
4316	aSig0 \|= LIT64( 0x4000000000000000 );
4317	aExp -= 0x3C01;
4318	}
4319	return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4320
4321	}
4322
4323	#ifdef FLOATX80
4324
4325	/*----------------------------------------------------------------------------
4326	\| Returns the result of converting the quadruple-precision floating-point
4327	\| value `a' to the extended double-precision floating-point format. The
4328	\| conversion is performed according to the IEC/IEEE Standard for Binary
4329	\| Floating-Point Arithmetic.
4330	----------------------------------------------------------------------------/
4331
4332	floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4333	{
4334	flag aSign;
4335	int32 aExp;
4336	bits64 aSig0, aSig1;
4337
4338	aSig1 = extractFloat128Frac1( a );
4339	aSig0 = extractFloat128Frac0( a );
4340	aExp = extractFloat128Exp( a );
4341	aSign = extractFloat128Sign( a );
4342	if ( aExp == 0x7FFF ) {
4343	if ( aSig0 \| aSig1 ) {
4344	return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4345	}
4346	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4347	}
4348	if ( aExp == 0 ) {
4349	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4350	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4351	}
4352	else {
4353	aSig0 \|= LIT64( 0x0001000000000000 );
4354	}
4355	shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4356	return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4357
4358	}
4359
4360	#endif
4361
4362	/*----------------------------------------------------------------------------
4363	\| Rounds the quadruple-precision floating-point value `a' to an integer, and
4364	\| returns the result as a quadruple-precision floating-point value. The
4365	\| operation is performed according to the IEC/IEEE Standard for Binary
4366	\| Floating-Point Arithmetic.
4367	----------------------------------------------------------------------------/
4368
4369	float128 float128_round_to_int( float128 a STATUS_PARAM )
4370	{
4371	flag aSign;
4372	int32 aExp;
4373	bits64 lastBitMask, roundBitsMask;
4374	int8 roundingMode;
4375	float128 z;
4376
4377	aExp = extractFloat128Exp( a );
4378	if ( 0x402F <= aExp ) {
4379	if ( 0x406F <= aExp ) {
4380	if ( ( aExp == 0x7FFF )
4381	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) )
4382	) {
4383	return propagateFloat128NaN( a, a STATUS_VAR );
4384	}
4385	return a;
4386	}
4387	lastBitMask = 1;
4388	lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4389	roundBitsMask = lastBitMask - 1;
4390	z = a;
4391	roundingMode = STATUS(float_rounding_mode);
4392	if ( roundingMode == float_round_nearest_even ) {
4393	if ( lastBitMask ) {
4394	add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4395	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4396	}
4397	else {
4398	if ( (sbits64) z.low < 0 ) {
4399	++z.high;
4400	if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4401	}
4402	}
4403	}
4404	else if ( roundingMode != float_round_to_zero ) {
4405	if ( extractFloat128Sign( z )
4406	^ ( roundingMode == float_round_up ) ) {
4407	add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4408	}
4409	}
4410	z.low &= ~ roundBitsMask;
4411	}
4412	else {
4413	if ( aExp < 0x3FFF ) {
4414	if ( ( ( (bits64) ( a.high<<1 ) ) \| a.low ) == 0 ) return a;
4415	STATUS(float_exception_flags) \|= float_flag_inexact;
4416	aSign = extractFloat128Sign( a );
4417	switch ( STATUS(float_rounding_mode) ) {
4418	case float_round_nearest_even:
4419	if ( ( aExp == 0x3FFE )
4420	&& ( extractFloat128Frac0( a )
4421	\| extractFloat128Frac1( a ) )
4422	) {
4423	return packFloat128( aSign, 0x3FFF, 0, 0 );
4424	}
4425	break;
4426	case float_round_down:
4427	return
4428	aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4429	: packFloat128( 0, 0, 0, 0 );
4430	case float_round_up:
4431	return
4432	aSign ? packFloat128( 1, 0, 0, 0 )
4433	: packFloat128( 0, 0x3FFF, 0, 0 );
4434	}
4435	return packFloat128( aSign, 0, 0, 0 );
4436	}
4437	lastBitMask = 1;
4438	lastBitMask <<= 0x402F - aExp;
4439	roundBitsMask = lastBitMask - 1;
4440	z.low = 0;
4441	z.high = a.high;
4442	roundingMode = STATUS(float_rounding_mode);
4443	if ( roundingMode == float_round_nearest_even ) {
4444	z.high += lastBitMask>>1;
4445	if ( ( ( z.high & roundBitsMask ) \| a.low ) == 0 ) {
4446	z.high &= ~ lastBitMask;
4447	}
4448	}
4449	else if ( roundingMode != float_round_to_zero ) {
4450	if ( extractFloat128Sign( z )
4451	^ ( roundingMode == float_round_up ) ) {
4452	z.high \|= ( a.low != 0 );
4453	z.high += roundBitsMask;
4454	}
4455	}
4456	z.high &= ~ roundBitsMask;
4457	}
4458	if ( ( z.low != a.low ) \|\| ( z.high != a.high ) ) {
4459	STATUS(float_exception_flags) \|= float_flag_inexact;
4460	}
4461	return z;
4462
4463	}
4464
4465	/*----------------------------------------------------------------------------
4466	\| Returns the result of adding the absolute values of the quadruple-precision
4467	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4468	\| before being returned. `zSign' is ignored if the result is a NaN.
4469	\| The addition is performed according to the IEC/IEEE Standard for Binary
4470	\| Floating-Point Arithmetic.
4471	----------------------------------------------------------------------------/
4472
4473	static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4474	{
4475	int32 aExp, bExp, zExp;
4476	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4477	int32 expDiff;
4478
4479	aSig1 = extractFloat128Frac1( a );
4480	aSig0 = extractFloat128Frac0( a );
4481	aExp = extractFloat128Exp( a );
4482	bSig1 = extractFloat128Frac1( b );
4483	bSig0 = extractFloat128Frac0( b );
4484	bExp = extractFloat128Exp( b );
4485	expDiff = aExp - bExp;
4486	if ( 0 < expDiff ) {
4487	if ( aExp == 0x7FFF ) {
4488	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4489	return a;
4490	}
4491	if ( bExp == 0 ) {
4492	--expDiff;
4493	}
4494	else {
4495	bSig0 \|= LIT64( 0x0001000000000000 );
4496	}
4497	shift128ExtraRightJamming(
4498	bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4499	zExp = aExp;
4500	}
4501	else if ( expDiff < 0 ) {
4502	if ( bExp == 0x7FFF ) {
4503	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4504	return packFloat128( zSign, 0x7FFF, 0, 0 );
4505	}
4506	if ( aExp == 0 ) {
4507	++expDiff;
4508	}
4509	else {
4510	aSig0 \|= LIT64( 0x0001000000000000 );
4511	}
4512	shift128ExtraRightJamming(
4513	aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4514	zExp = bExp;
4515	}
4516	else {
4517	if ( aExp == 0x7FFF ) {
4518	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4519	return propagateFloat128NaN( a, b STATUS_VAR );
4520	}
4521	return a;
4522	}
4523	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4524	if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4525	zSig2 = 0;
4526	zSig0 \|= LIT64( 0x0002000000000000 );
4527	zExp = aExp;
4528	goto shiftRight1;
4529	}
4530	aSig0 \|= LIT64( 0x0001000000000000 );
4531	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4532	--zExp;
4533	if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4534	++zExp;
4535	shiftRight1:
4536	shift128ExtraRightJamming(
4537	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4538	roundAndPack:
4539	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4540
4541	}
4542
4543	/*----------------------------------------------------------------------------
4544	\| Returns the result of subtracting the absolute values of the quadruple-
4545	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
4546	\| difference is negated before being returned. `zSign' is ignored if the
4547	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
4548	\| Standard for Binary Floating-Point Arithmetic.
4549	----------------------------------------------------------------------------/
4550
4551	static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4552	{
4553	int32 aExp, bExp, zExp;
4554	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4555	int32 expDiff;
4556	float128 z;
4557
4558	aSig1 = extractFloat128Frac1( a );
4559	aSig0 = extractFloat128Frac0( a );
4560	aExp = extractFloat128Exp( a );
4561	bSig1 = extractFloat128Frac1( b );
4562	bSig0 = extractFloat128Frac0( b );
4563	bExp = extractFloat128Exp( b );
4564	expDiff = aExp - bExp;
4565	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4566	shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4567	if ( 0 < expDiff ) goto aExpBigger;
4568	if ( expDiff < 0 ) goto bExpBigger;
4569	if ( aExp == 0x7FFF ) {
4570	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4571	return propagateFloat128NaN( a, b STATUS_VAR );
4572	}
4573	float_raise( float_flag_invalid STATUS_VAR);
4574	z.low = float128_default_nan_low;
4575	z.high = float128_default_nan_high;
4576	return z;
4577	}
4578	if ( aExp == 0 ) {
4579	aExp = 1;
4580	bExp = 1;
4581	}
4582	if ( bSig0 < aSig0 ) goto aBigger;
4583	if ( aSig0 < bSig0 ) goto bBigger;
4584	if ( bSig1 < aSig1 ) goto aBigger;
4585	if ( aSig1 < bSig1 ) goto bBigger;
4586	return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4587	bExpBigger:
4588	if ( bExp == 0x7FFF ) {
4589	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4590	return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4591	}
4592	if ( aExp == 0 ) {
4593	++expDiff;
4594	}
4595	else {
4596	aSig0 \|= LIT64( 0x4000000000000000 );
4597	}
4598	shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4599	bSig0 \|= LIT64( 0x4000000000000000 );
4600	bBigger:
4601	sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4602	zExp = bExp;
4603	zSign ^= 1;
4604	goto normalizeRoundAndPack;
4605	aExpBigger:
4606	if ( aExp == 0x7FFF ) {
4607	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4608	return a;
4609	}
4610	if ( bExp == 0 ) {
4611	--expDiff;
4612	}
4613	else {
4614	bSig0 \|= LIT64( 0x4000000000000000 );
4615	}
4616	shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4617	aSig0 \|= LIT64( 0x4000000000000000 );
4618	aBigger:
4619	sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4620	zExp = aExp;
4621	normalizeRoundAndPack:
4622	--zExp;
4623	return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4624
4625	}
4626
4627	/*----------------------------------------------------------------------------
4628	\| Returns the result of adding the quadruple-precision floating-point values
4629	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4630	\| for Binary Floating-Point Arithmetic.
4631	----------------------------------------------------------------------------/
4632
4633	float128 float128_add( float128 a, float128 b STATUS_PARAM )
4634	{
4635	flag aSign, bSign;
4636
4637	aSign = extractFloat128Sign( a );
4638	bSign = extractFloat128Sign( b );
4639	if ( aSign == bSign ) {
4640	return addFloat128Sigs( a, b, aSign STATUS_VAR );
4641	}
4642	else {
4643	return subFloat128Sigs( a, b, aSign STATUS_VAR );
4644	}
4645
4646	}
4647
4648	/*----------------------------------------------------------------------------
4649	\| Returns the result of subtracting the quadruple-precision floating-point
4650	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
4651	\| Standard for Binary Floating-Point Arithmetic.
4652	----------------------------------------------------------------------------/
4653
4654	float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4655	{
4656	flag aSign, bSign;
4657
4658	aSign = extractFloat128Sign( a );
4659	bSign = extractFloat128Sign( b );
4660	if ( aSign == bSign ) {
4661	return subFloat128Sigs( a, b, aSign STATUS_VAR );
4662	}
4663	else {
4664	return addFloat128Sigs( a, b, aSign STATUS_VAR );
4665	}
4666
4667	}
4668
4669	/*----------------------------------------------------------------------------
4670	\| Returns the result of multiplying the quadruple-precision floating-point
4671	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
4672	\| Standard for Binary Floating-Point Arithmetic.
4673	----------------------------------------------------------------------------/
4674
4675	float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4676	{
4677	flag aSign, bSign, zSign;
4678	int32 aExp, bExp, zExp;
4679	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4680	float128 z;
4681
4682	aSig1 = extractFloat128Frac1( a );
4683	aSig0 = extractFloat128Frac0( a );
4684	aExp = extractFloat128Exp( a );
4685	aSign = extractFloat128Sign( a );
4686	bSig1 = extractFloat128Frac1( b );
4687	bSig0 = extractFloat128Frac0( b );
4688	bExp = extractFloat128Exp( b );
4689	bSign = extractFloat128Sign( b );
4690	zSign = aSign ^ bSign;
4691	if ( aExp == 0x7FFF ) {
4692	if ( ( aSig0 \| aSig1 )
4693	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
4694	return propagateFloat128NaN( a, b STATUS_VAR );
4695	}
4696	if ( ( bExp \| bSig0 \| bSig1 ) == 0 ) goto invalid;
4697	return packFloat128( zSign, 0x7FFF, 0, 0 );
4698	}
4699	if ( bExp == 0x7FFF ) {
4700	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4701	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
4702	invalid:
4703	float_raise( float_flag_invalid STATUS_VAR);
4704	z.low = float128_default_nan_low;
4705	z.high = float128_default_nan_high;
4706	return z;
4707	}
4708	return packFloat128( zSign, 0x7FFF, 0, 0 );
4709	}
4710	if ( aExp == 0 ) {
4711	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4712	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4713	}
4714	if ( bExp == 0 ) {
4715	if ( ( bSig0 \| bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4716	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4717	}
4718	zExp = aExp + bExp - 0x4000;
4719	aSig0 \|= LIT64( 0x0001000000000000 );
4720	shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4721	mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4722	add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4723	zSig2 \|= ( zSig3 != 0 );
4724	if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4725	shift128ExtraRightJamming(
4726	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4727	++zExp;
4728	}
4729	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4730
4731	}
4732
4733	/*----------------------------------------------------------------------------
4734	\| Returns the result of dividing the quadruple-precision floating-point value
4735	\| `a' by the corresponding value `b'. The operation is performed according to
4736	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737	----------------------------------------------------------------------------/
4738
4739	float128 float128_div( float128 a, float128 b STATUS_PARAM )
4740	{
4741	flag aSign, bSign, zSign;
4742	int32 aExp, bExp, zExp;
4743	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4744	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4745	float128 z;
4746
4747	aSig1 = extractFloat128Frac1( a );
4748	aSig0 = extractFloat128Frac0( a );
4749	aExp = extractFloat128Exp( a );
4750	aSign = extractFloat128Sign( a );
4751	bSig1 = extractFloat128Frac1( b );
4752	bSig0 = extractFloat128Frac0( b );
4753	bExp = extractFloat128Exp( b );
4754	bSign = extractFloat128Sign( b );
4755	zSign = aSign ^ bSign;
4756	if ( aExp == 0x7FFF ) {
4757	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4758	if ( bExp == 0x7FFF ) {
4759	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4760	goto invalid;
4761	}
4762	return packFloat128( zSign, 0x7FFF, 0, 0 );
4763	}
4764	if ( bExp == 0x7FFF ) {
4765	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4766	return packFloat128( zSign, 0, 0, 0 );
4767	}
4768	if ( bExp == 0 ) {
4769	if ( ( bSig0 \| bSig1 ) == 0 ) {
4770	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
4771	invalid:
4772	float_raise( float_flag_invalid STATUS_VAR);
4773	z.low = float128_default_nan_low;
4774	z.high = float128_default_nan_high;
4775	return z;
4776	}
4777	float_raise( float_flag_divbyzero STATUS_VAR);
4778	return packFloat128( zSign, 0x7FFF, 0, 0 );
4779	}
4780	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4781	}
4782	if ( aExp == 0 ) {
4783	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4784	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4785	}
4786	zExp = aExp - bExp + 0x3FFD;
4787	shortShift128Left(
4788	aSig0 \| LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4789	shortShift128Left(
4790	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4791	if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4792	shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4793	++zExp;
4794	}
4795	zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4796	mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4797	sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4798	while ( (sbits64) rem0 < 0 ) {
4799	--zSig0;
4800	add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4801	}
4802	zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4803	if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4804	mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4805	sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4806	while ( (sbits64) rem1 < 0 ) {
4807	--zSig1;
4808	add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4809	}
4810	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
4811	}
4812	shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4813	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4814
4815	}
4816
4817	/*----------------------------------------------------------------------------
4818	\| Returns the remainder of the quadruple-precision floating-point value `a'
4819	\| with respect to the corresponding value `b'. The operation is performed
4820	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4821	----------------------------------------------------------------------------/
4822
4823	float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4824	{
4825	flag aSign, bSign, zSign;
4826	int32 aExp, bExp, expDiff;
4827	bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4828	bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4829	sbits64 sigMean0;
4830	float128 z;
4831
4832	aSig1 = extractFloat128Frac1( a );
4833	aSig0 = extractFloat128Frac0( a );
4834	aExp = extractFloat128Exp( a );
4835	aSign = extractFloat128Sign( a );
4836	bSig1 = extractFloat128Frac1( b );
4837	bSig0 = extractFloat128Frac0( b );
4838	bExp = extractFloat128Exp( b );
4839	bSign = extractFloat128Sign( b );
4840	if ( aExp == 0x7FFF ) {
4841	if ( ( aSig0 \| aSig1 )
4842	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
4843	return propagateFloat128NaN( a, b STATUS_VAR );
4844	}
4845	goto invalid;
4846	}
4847	if ( bExp == 0x7FFF ) {
4848	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4849	return a;
4850	}
4851	if ( bExp == 0 ) {
4852	if ( ( bSig0 \| bSig1 ) == 0 ) {
4853	invalid:
4854	float_raise( float_flag_invalid STATUS_VAR);
4855	z.low = float128_default_nan_low;
4856	z.high = float128_default_nan_high;
4857	return z;
4858	}
4859	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4860	}
4861	if ( aExp == 0 ) {
4862	if ( ( aSig0 \| aSig1 ) == 0 ) return a;
4863	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4864	}
4865	expDiff = aExp - bExp;
4866	if ( expDiff < -1 ) return a;
4867	shortShift128Left(
4868	aSig0 \| LIT64( 0x0001000000000000 ),
4869	aSig1,
4870	15 - ( expDiff < 0 ),
4871	&aSig0,
4872	&aSig1
4873	);
4874	shortShift128Left(
4875	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4876	q = le128( bSig0, bSig1, aSig0, aSig1 );
4877	if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4878	expDiff -= 64;
4879	while ( 0 < expDiff ) {
4880	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4881	q = ( 4 < q ) ? q - 4 : 0;
4882	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4883	shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4884	shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4885	sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4886	expDiff -= 61;
4887	}
4888	if ( -64 < expDiff ) {
4889	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4890	q = ( 4 < q ) ? q - 4 : 0;
4891	q >>= - expDiff;
4892	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4893	expDiff += 52;
4894	if ( expDiff < 0 ) {
4895	shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4896	}
4897	else {
4898	shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4899	}
4900	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4901	sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4902	}
4903	else {
4904	shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4905	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4906	}
4907	do {
4908	alternateASig0 = aSig0;
4909	alternateASig1 = aSig1;
4910	++q;
4911	sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4912	} while ( 0 <= (sbits64) aSig0 );
4913	add128(
4914	aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4915	if ( ( sigMean0 < 0 )
4916	\|\| ( ( ( sigMean0 \| sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4917	aSig0 = alternateASig0;
4918	aSig1 = alternateASig1;
4919	}
4920	zSign = ( (sbits64) aSig0 < 0 );
4921	if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4922	return
4923	normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4924
4925	}
4926
4927	/*----------------------------------------------------------------------------
4928	\| Returns the square root of the quadruple-precision floating-point value `a'.
4929	\| The operation is performed according to the IEC/IEEE Standard for Binary
4930	\| Floating-Point Arithmetic.
4931	----------------------------------------------------------------------------/
4932
4933	float128 float128_sqrt( float128 a STATUS_PARAM )
4934	{
4935	flag aSign;
4936	int32 aExp, zExp;
4937	bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4938	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4939	float128 z;
4940
4941	aSig1 = extractFloat128Frac1( a );
4942	aSig0 = extractFloat128Frac0( a );
4943	aExp = extractFloat128Exp( a );
4944	aSign = extractFloat128Sign( a );
4945	if ( aExp == 0x7FFF ) {
4946	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4947	if ( ! aSign ) return a;
4948	goto invalid;
4949	}
4950	if ( aSign ) {
4951	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) return a;
4952	invalid:
4953	float_raise( float_flag_invalid STATUS_VAR);
4954	z.low = float128_default_nan_low;
4955	z.high = float128_default_nan_high;
4956	return z;
4957	}
4958	if ( aExp == 0 ) {
4959	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4960	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4961	}
4962	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4963	aSig0 \|= LIT64( 0x0001000000000000 );
4964	zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4965	shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4966	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4967	doubleZSig0 = zSig0<<1;
4968	mul64To128( zSig0, zSig0, &term0, &term1 );
4969	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4970	while ( (sbits64) rem0 < 0 ) {
4971	--zSig0;
4972	doubleZSig0 -= 2;
4973	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
4974	}
4975	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4976	if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4977	if ( zSig1 == 0 ) zSig1 = 1;
4978	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4979	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4980	mul64To128( zSig1, zSig1, &term2, &term3 );
4981	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4982	while ( (sbits64) rem1 < 0 ) {
4983	--zSig1;
4984	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4985	term3 \|= 1;
4986	term2 \|= doubleZSig0;
4987	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4988	}
4989	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
4990	}
4991	shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4992	return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4993
4994	}
4995
4996	/*----------------------------------------------------------------------------
4997	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
4998	\| the corresponding value `b', and 0 otherwise. The comparison is performed
4999	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5000	----------------------------------------------------------------------------/
5001
5002	flag float128_eq( float128 a, float128 b STATUS_PARAM )
5003	{
5004
5005	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5006	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5007	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5008	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5009	) {
5010	if ( float128_is_signaling_nan( a )
5011	\|\| float128_is_signaling_nan( b ) ) {
5012	float_raise( float_flag_invalid STATUS_VAR);
5013	}
5014	return 0;
5015	}
5016	return
5017	( a.low == b.low )
5018	&& ( ( a.high == b.high )
5019	\|\| ( ( a.low == 0 )
5020	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5021	);
5022
5023	}
5024
5025	/*----------------------------------------------------------------------------
5026	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5027	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
5028	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5029	\| Arithmetic.
5030	----------------------------------------------------------------------------/
5031
5032	flag float128_le( float128 a, float128 b STATUS_PARAM )
5033	{
5034	flag aSign, bSign;
5035
5036	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5037	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5038	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5039	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5040	) {
5041	float_raise( float_flag_invalid STATUS_VAR);
5042	return 0;
5043	}
5044	aSign = extractFloat128Sign( a );
5045	bSign = extractFloat128Sign( b );
5046	if ( aSign != bSign ) {
5047	return
5048	aSign
5049	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5050	== 0 );
5051	}
5052	return
5053	aSign ? le128( b.high, b.low, a.high, a.low )
5054	: le128( a.high, a.low, b.high, b.low );
5055
5056	}
5057
5058	/*----------------------------------------------------------------------------
5059	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5060	\| the corresponding value `b', and 0 otherwise. The comparison is performed
5061	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5062	----------------------------------------------------------------------------/
5063
5064	flag float128_lt( float128 a, float128 b STATUS_PARAM )
5065	{
5066	flag aSign, bSign;
5067
5068	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5069	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5070	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5071	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5072	) {
5073	float_raise( float_flag_invalid STATUS_VAR);
5074	return 0;
5075	}
5076	aSign = extractFloat128Sign( a );
5077	bSign = extractFloat128Sign( b );
5078	if ( aSign != bSign ) {
5079	return
5080	aSign
5081	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5082	!= 0 );
5083	}
5084	return
5085	aSign ? lt128( b.high, b.low, a.high, a.low )
5086	: lt128( a.high, a.low, b.high, b.low );
5087
5088	}
5089
5090	/*----------------------------------------------------------------------------
5091	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5092	\| the corresponding value `b', and 0 otherwise. The invalid exception is
5093	\| raised if either operand is a NaN. Otherwise, the comparison is performed
5094	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5095	----------------------------------------------------------------------------/
5096
5097	flag float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5098	{
5099
5100	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5101	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5102	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5103	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5104	) {
5105	float_raise( float_flag_invalid STATUS_VAR);
5106	return 0;
5107	}
5108	return
5109	( a.low == b.low )
5110	&& ( ( a.high == b.high )
5111	\|\| ( ( a.low == 0 )
5112	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5113	);
5114
5115	}
5116
5117	/*----------------------------------------------------------------------------
5118	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5119	\| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5120	\| cause an exception. Otherwise, the comparison is performed according to the
5121	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5122	----------------------------------------------------------------------------/
5123
5124	flag float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5125	{
5126	flag aSign, bSign;
5127
5128	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5129	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5130	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5131	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5132	) {
5133	if ( float128_is_signaling_nan( a )
5134	\|\| float128_is_signaling_nan( b ) ) {
5135	float_raise( float_flag_invalid STATUS_VAR);
5136	}
5137	return 0;
5138	}
5139	aSign = extractFloat128Sign( a );
5140	bSign = extractFloat128Sign( b );
5141	if ( aSign != bSign ) {
5142	return
5143	aSign
5144	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5145	== 0 );
5146	}
5147	return
5148	aSign ? le128( b.high, b.low, a.high, a.low )
5149	: le128( a.high, a.low, b.high, b.low );
5150
5151	}
5152
5153	/*----------------------------------------------------------------------------
5154	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5155	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5156	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5157	\| Standard for Binary Floating-Point Arithmetic.
5158	----------------------------------------------------------------------------/
5159
5160	flag float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5161	{
5162	flag aSign, bSign;
5163
5164	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5165	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5166	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5167	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5168	) {
5169	if ( float128_is_signaling_nan( a )
5170	\|\| float128_is_signaling_nan( b ) ) {
5171	float_raise( float_flag_invalid STATUS_VAR);
5172	}
5173	return 0;
5174	}
5175	aSign = extractFloat128Sign( a );
5176	bSign = extractFloat128Sign( b );
5177	if ( aSign != bSign ) {
5178	return
5179	aSign
5180	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5181	!= 0 );
5182	}
5183	return
5184	aSign ? lt128( b.high, b.low, a.high, a.low )
5185	: lt128( a.high, a.low, b.high, b.low );
5186
5187	}
5188
5189	#endif
5190
5191	/* misc functions */
5192	float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5193	{
5194	return int64_to_float32(a STATUS_VAR);
5195	}
5196
5197	float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5198	{
5199	return int64_to_float64(a STATUS_VAR);
5200	}
5201
5202	unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5203	{
5204	int64_t v;
5205	unsigned int res;
5206
5207	v = float32_to_int64(a STATUS_VAR);
5208	if (v < 0) {
5209	res = 0;
5210	float_raise( float_flag_invalid STATUS_VAR);
5211	} else if (v > 0xffffffff) {
5212	res = 0xffffffff;
5213	float_raise( float_flag_invalid STATUS_VAR);
5214	} else {
5215	res = v;
5216	}
5217	return res;
5218	}
5219
5220	unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5221	{
5222	int64_t v;
5223	unsigned int res;
5224
5225	v = float32_to_int64_round_to_zero(a STATUS_VAR);
5226	if (v < 0) {
5227	res = 0;
5228	float_raise( float_flag_invalid STATUS_VAR);
5229	} else if (v > 0xffffffff) {
5230	res = 0xffffffff;
5231	float_raise( float_flag_invalid STATUS_VAR);
5232	} else {
5233	res = v;
5234	}
5235	return res;
5236	}
5237
5238	unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5239	{
5240	int64_t v;
5241	unsigned int res;
5242
5243	v = float64_to_int64(a STATUS_VAR);
5244	if (v < 0) {
5245	res = 0;
5246	float_raise( float_flag_invalid STATUS_VAR);
5247	} else if (v > 0xffffffff) {
5248	res = 0xffffffff;
5249	float_raise( float_flag_invalid STATUS_VAR);
5250	} else {
5251	res = v;
5252	}
5253	return res;
5254	}
5255
5256	unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5257	{
5258	int64_t v;
5259	unsigned int res;
5260
5261	v = float64_to_int64_round_to_zero(a STATUS_VAR);
5262	if (v < 0) {
5263	res = 0;
5264	float_raise( float_flag_invalid STATUS_VAR);
5265	} else if (v > 0xffffffff) {
5266	res = 0xffffffff;
5267	float_raise( float_flag_invalid STATUS_VAR);
5268	} else {
5269	res = v;
5270	}
5271	return res;
5272	}
5273
5274	#define COMPARE(s, nan_exp) \
5275	INLINE char float ## s ## _compare_internal( float ## s a, float ## s b, \
5276	int is_quiet STATUS_PARAM ) \
5277	{ \
5278	flag aSign, bSign; \
5279	\
5280	if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5281	extractFloat ## s ## Frac( a ) ) \|\| \
5282	( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5283	extractFloat ## s ## Frac( b ) )) { \
5284	if (!is_quiet \|\| \
5285	float ## s ## _is_signaling_nan( a ) \|\| \
5286	float ## s ## _is_signaling_nan( b ) ) { \
5287	float_raise( float_flag_invalid STATUS_VAR); \
5288	} \
5289	return float_relation_unordered; \
5290	} \
5291	aSign = extractFloat ## s ## Sign( a ); \
5292	bSign = extractFloat ## s ## Sign( b ); \
5293	if ( aSign != bSign ) { \
5294	if ( (bits ## s) ( ( a \| b )<<1 ) == 0 ) { \
5295	/* zero case */ \
5296	return float_relation_equal; \
5297	} else { \
5298	return 1 - (2 * aSign); \
5299	} \
5300	} else { \
5301	if (a == b) { \
5302	return float_relation_equal; \
5303	} else { \
5304	return 1 - 2 * (aSign ^ ( a < b )); \
5305	} \
5306	} \
5307	} \
5308	\
5309	char float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5310	{ \
5311	return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5312	} \
5313	\
5314	char float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5315	{ \
5316	return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5317	}
5318
5319	COMPARE(32, 0xff)
5320	COMPARE(64, 0x7ff)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: