@@ -32,19 +32,80 @@ fn power_of_ten(e: i16) -> Fp {
32
32
Fp { f : sig, e : exp }
33
33
}
34
34
35
+ // In most architectures, floating point operations have an explicit bit size, therefore the
36
+ // precision of the computation is determined on a per-operation basis.
37
+ #[ cfg( any( not( target_arch="x86" ) , target_feature="sse2" ) ) ]
38
+ mod fpu_precision {
39
+ pub fn set_precision < T > ( ) { }
40
+ }
41
+
42
+ // On x86, the x87 FPU is used for float operations if the SSE/SSE2 extensions are not available.
43
+ // The x87 FPU operates with 80 bits of precision by default, which means that operations will
44
+ // round to 80 bits causing double rounding to happen when values are eventually represented as
45
+ // 32/64 bit float values. To overcome this, the FPU control word can be set so that the
46
+ // computations are performed in the desired precision.
47
+ #[ cfg( all( target_arch="x86" , not( target_feature="sse2" ) ) ) ]
48
+ mod fpu_precision {
49
+ use mem:: size_of;
50
+ use ops:: Drop ;
51
+
52
+ /// A structure used to preserve the original value of the FPU control word, so that it can be
53
+ /// restored when the structure is dropped.
54
+ ///
55
+ /// The x87 FPU is a 16-bits register whose fields are as follows:
56
+ ///
57
+ /// | 12-15 | 10-11 | 8-9 | 6-7 | 5 | 4 | 3 | 2 | 1 | 0 |
58
+ /// |------:|------:|----:|----:|---:|---:|---:|---:|---:|---:|
59
+ /// | | RC | PC | | PM | UM | OM | ZM | DM | IM |
60
+ ///
61
+ /// The documentation for all of the fields is available in the IA-32 Architectures Software
62
+ /// Developer's Manual (Volume 1).
63
+ ///
64
+ /// The only field which is relevant for the following code is PC, Precision Control. This
65
+ /// field determines the precision of the operations performed by the FPU. It can be set to:
66
+ /// - 0b00, single precision i.e. 32-bits
67
+ /// - 0b10, double precision i.e. 64-bits
68
+ /// - 0b11, double extended precision i.e. 80-bits (default state)
69
+ /// The 0b01 value is reserved and should not be used.
70
+ pub struct FPUControlWord ( u16 ) ;
71
+
72
+ fn set_cw ( cw : u16 ) {
73
+ unsafe { asm ! ( "fldcw $0" :: "m" ( cw) :: "volatile" ) }
74
+ }
75
+
76
+ /// Set the precision field of the FPU to `T` and return a `FPUControlWord`
77
+ pub fn set_precision < T > ( ) -> FPUControlWord {
78
+ let cw = 0u16 ;
79
+
80
+ // Compute the value for the Precision Control field that is appropriate for `T`.
81
+ let cw_precision = match size_of :: < T > ( ) {
82
+ 4 => 0x0000 , // 32 bits
83
+ 8 => 0x0200 , // 64 bits
84
+ _ => 0x0300 , // default, 80 bits
85
+ } ;
86
+
87
+ // Get the original value of the control word to restore it later, when the
88
+ // `FPUControlWord` structure is dropped
89
+ unsafe { asm ! ( "fnstcw $0" : "=*m" ( & cw) :: : "volatile" ) }
90
+
91
+ // Set the control word to the desired precision. This is achieved by masking away the old
92
+ // precision (bits 8 and 9, 0x300) and replacing it with the precision flag computed above.
93
+ set_cw ( ( cw & 0xFCFF ) | cw_precision) ;
94
+
95
+ FPUControlWord ( cw)
96
+ }
97
+
98
+ impl Drop for FPUControlWord {
99
+ fn drop ( & mut self ) {
100
+ set_cw ( self . 0 )
101
+ }
102
+ }
103
+ }
104
+
35
105
/// The fast path of Bellerophon using machine-sized integers and floats.
36
106
///
37
107
/// This is extracted into a separate function so that it can be attempted before constructing
38
108
/// a bignum.
39
- ///
40
- /// The fast path crucially depends on arithmetic being correctly rounded, so on x86
41
- /// without SSE or SSE2 it will be **wrong** (as in, off by one ULP occasionally), because the x87
42
- /// FPU stack will round to 80 bit first before rounding to 64/32 bit. However, as such hardware
43
- /// is extremely rare nowadays and in fact all in-tree target triples assume an SSE2-capable
44
- /// microarchitecture, there is little incentive to deal with that. There's a test that will fail
45
- /// when SSE or SSE2 is disabled, so people building their own non-SSE copy will get a heads up.
46
- ///
47
- /// FIXME: It would nevertheless be nice if we had a good way to detect and deal with x87.
48
109
pub fn fast_path < T : RawFloat > ( integral : & [ u8 ] , fractional : & [ u8 ] , e : i64 ) -> Option < T > {
49
110
let num_digits = integral. len ( ) + fractional. len ( ) ;
50
111
// log_10(f64::max_sig) ~ 15.95. We compare the exact value to max_sig near the end,
@@ -60,9 +121,17 @@ pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Opt
60
121
if f > T :: max_sig ( ) {
61
122
return None ;
62
123
}
124
+
125
+ // The fast path crucially depends on arithmetic being rounded to the correct number of bits
126
+ // without any intermediate rounding. On x86 (without SSE or SSE2) this requires the precision
127
+ // of the x87 FPU stack to be changed so that it directly rounds to 64/32 bit.
128
+ // The `set_precision` function takes care of setting the precision on architectures which
129
+ // require setting it by changing the global state (like the control word of the x87 FPU).
130
+ let _cw = fpu_precision:: set_precision :: < T > ( ) ;
131
+
63
132
// The case e < 0 cannot be folded into the other branch. Negative powers result in
64
133
// a repeating fractional part in binary, which are rounded, which causes real
65
- // (and occasioally quite significant!) errors in the final result.
134
+ // (and occasionally quite significant!) errors in the final result.
66
135
if e >= 0 {
67
136
Some ( T :: from_int ( f) * T :: short_fast_pow10 ( e as usize ) )
68
137
} else {
0 commit comments