  1. @ignore
  2. Copyright @copyright{} 2017,2019,2020 Free Software Foundation, Inc.
  3. DRAFT --- DO NOT REDISTRIBUTE --- DRAFT --- DO NOT REDISTRIBUTE --- DRAFT
  4. @end ignore
  5. @node Floating Point in Depth
  6. @chapter Floating Point in Depth
  7. @menu
  8. * Floating Representations::
  9. * Floating Type Specs::
  10. * Special Float Values::
  11. * Invalid Optimizations::
  12. * Exception Flags::
  13. * Exact Floating-Point::
  14. * Rounding::
  15. * Rounding Issues::
  16. * Significance Loss::
  17. * Fused Multiply-Add::
  18. * Error Recovery::
  19. @c * Double-Rounding Problems::
  20. * Exact Floating Constants::
  21. * Handling Infinity::
  22. * Handling NaN::
  23. * Signed Zeros::
  24. * Scaling by the Base::
  25. * Rounding Control::
  26. * Machine Epsilon::
  27. * Complex Arithmetic::
  28. * Round-Trip Base Conversion::
  29. * Further Reading::
  30. @end menu
  31. @node Floating Representations
  32. @section Floating-Point Representations
  33. @cindex floating-point representations
  34. @cindex representation of floating-point numbers
  35. @cindex IEEE 754-2008 Standard
  36. Storing numbers as @dfn{floating point} allows representation of
  37. numbers with fractional values, in a range larger than that of
  38. hardware integers. A floating-point number consists of a sign bit, a
  39. @dfn{significand} (also called the @dfn{mantissa}), and a power of a
  40. fixed base. GNU C uses the floating-point representations specified by
  41. the @cite{IEEE 754-2008 Standard for Floating-Point Arithmetic}.
  42. The IEEE 754-2008 specification defines basic binary floating-point
  43. formats of five different sizes: 16-bit, 32-bit, 64-bit, 128-bit, and
  44. 256-bit. The formats of 32, 64, and 128 bits are used for the
  45. standard C types @code{float}, @code{double}, and @code{long double}.
  46. GNU C supports the 16-bit floating point type @code{_Float16} on some
  47. platforms, but does not support the 256-bit floating point type.
  48. Each of the formats encodes the floating-point number as a sign bit.
  49. After this comes an exponent that specifies a power of 2 (with a fixed
  50. offset). Then comes the significand.
  51. The first bit of the significand, before the binary point, is always
  52. 1, so there is no need to store it in memory. It is called the
  53. @dfn{hidden bit} because it doesn't appear in the floating-point
  54. number as used in the computer itself.
  55. All of those floating-point formats are sign-magnitude representations,
  56. so +0 and @minus{}0 are different values.
Besides the IEEE 754 128-bit format, GNU C also offers a format
  58. consisting of a pair of 64-bit floating point numbers. This lacks the
  59. full exponent range of the IEEE 128-bit format, but is useful when the
  60. underlying hardware platform does not support that.
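A short sketch can show the storage sizes of these types on the host;
the exact sizes, especially for @code{long double}, vary between
platforms.
@example
#include <stdio.h>

int main (void)
@{
  printf ("float:       %zu bits\n", 8 * sizeof (float));
  printf ("double:      %zu bits\n", 8 * sizeof (double));
  printf ("long double: %zu bits\n", 8 * sizeof (long double));
  return 0;
@}
@end example
Note that the storage size of @code{long double} may be larger than
the number of bits the format actually uses.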
  61. @node Floating Type Specs
  62. @section Floating-Point Type Specifications
  63. The standard library header file @file{float.h} defines a number of
  64. constants that describe the platform's implementation of
  65. floating-point types @code{float}, @code{double} and @code{long
  66. double}. They include:
  67. @findex FLT_MIN
  68. @findex DBL_MIN
  69. @findex LDBL_MIN
  70. @findex FLT_HAS_SUBNORM
  71. @findex DBL_HAS_SUBNORM
  72. @findex LDBL_HAS_SUBNORM
  73. @findex FLT_TRUE_MIN
  74. @findex DBL_TRUE_MIN
  75. @findex LDBL_TRUE_MIN
  76. @findex FLT_MAX
  77. @findex DBL_MAX
  78. @findex LDBL_MAX
  79. @findex FLT_DECIMAL_DIG
  80. @findex DBL_DECIMAL_DIG
  81. @findex LDBL_DECIMAL_DIG
  82. @table @code
  83. @item FLT_MIN
  84. @itemx DBL_MIN
  85. @itemx LDBL_MIN
  86. Defines the minimum normalized positive floating-point values that can
  87. be represented with the type.
  88. @item FLT_HAS_SUBNORM
  89. @itemx DBL_HAS_SUBNORM
  90. @itemx LDBL_HAS_SUBNORM
  91. Defines if the floating-point type supports subnormal (or ``denormalized'')
  92. numbers or not (@pxref{subnormal numbers}).
  93. @item FLT_TRUE_MIN
  94. @itemx DBL_TRUE_MIN
  95. @itemx LDBL_TRUE_MIN
  96. Defines the minimum positive values (including subnormal values) that
  97. can be represented with the type.
  98. @item FLT_MAX
  99. @itemx DBL_MAX
  100. @itemx LDBL_MAX
  101. Defines the largest values that can be represented with the type.
  102. @item FLT_DECIMAL_DIG
  103. @itemx DBL_DECIMAL_DIG
  104. @itemx LDBL_DECIMAL_DIG
  105. Defines the number of decimal digits @code{n} such that any
  106. floating-point number that can be represented in the type can be
  107. rounded to a floating-point number with @code{n} decimal digits, and
  108. back again, without losing any precision of the value.
  109. @end table
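A small program can print these limits for the @code{double} type.
This is only a sketch: it assumes a hosted C11 environment, since
@code{DBL_TRUE_MIN}, @code{DBL_HAS_SUBNORM}, and
@code{DBL_DECIMAL_DIG} were added in C11.
@example
#include <float.h>
#include <stdio.h>

int main (void)
@{
  printf ("DBL_MIN         = %g\n", DBL_MIN);
  printf ("DBL_TRUE_MIN    = %g\n", DBL_TRUE_MIN);
  printf ("DBL_MAX         = %g\n", DBL_MAX);
  printf ("DBL_HAS_SUBNORM = %d\n", DBL_HAS_SUBNORM);
  printf ("DBL_DECIMAL_DIG = %d\n", DBL_DECIMAL_DIG);
  return 0;
@}
@end example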
  110. @node Special Float Values
  111. @section Special Floating-Point Values
  112. @cindex special floating-point values
  113. @cindex floating-point values, special
  114. IEEE floating point provides for special values that are not ordinary
  115. numbers.
  116. @table @asis
  117. @item infinities
  118. @code{+Infinity} and @code{-Infinity} are two different infinite
  119. values, one positive and one negative. These result from
operations such as @code{1.0 / 0.0}, @code{Infinity + Infinity},
  121. @code{Infinity * Infinity}, and @code{Infinity + @var{finite}}, and also
  122. from a result that is finite, but larger than the most positive possible
  123. value or smaller than the most negative possible value.
  124. @xref{Handling Infinity}, for more about working with infinities.
  125. @item NaNs (not a number)
  126. @cindex QNaN
  127. @cindex SNaN
  128. There are two special values, called Not-a-Number (NaN): a quiet
  129. NaN (QNaN), and a signaling NaN (SNaN).
  130. A QNaN is produced by operations for which the value is undefined
in real arithmetic, such as @code{0.0 / 0.0}, @code{sqrt (-1.0)},
  132. @code{Infinity - Infinity}, and any basic operation in which an
  133. operand is a QNaN.
  134. The signaling NaN is intended for initializing
  135. otherwise-unassigned storage, and the goal is that unlike a
  136. QNaN, an SNaN @emph{does} cause an interrupt that can be caught
  137. by a software handler, diagnosed, and reported. In practice,
  138. little use has been made of signaling NaNs, because the most
  139. common CPUs in desktop and portable computers fail to implement
  140. the full IEEE 754 Standard, and supply only one kind of NaN, the
  141. quiet one. Also, programming-language standards have taken
  142. decades to catch up to the IEEE 754 standard, and implementations
  143. of those language standards make an additional delay before
  144. programmers become willing to use these features.
  145. To enable support for signaling NaNs, use the GCC command-line option
  146. @option{-fsignaling-nans}, but this is an experimental feature and may
  147. not work as expected in every situation.
  148. A NaN has a sign bit, but its value means nothing.
  149. @xref{Handling NaN}, for more about working with NaNs.
  150. @item subnormal numbers
  151. @cindex subnormal numbers
  152. @cindex underflow, floating
  153. @cindex floating underflow
  154. @anchor{subnormal numbers}
  155. It can happen that a computed floating-point value is too small to
  156. represent, such as when two tiny numbers are multiplied. The result
  157. is then said to @dfn{underflow}. The traditional behavior before
  158. the IEEE 754 Standard was to use zero as the result, and possibly to report
  159. the underflow in some sort of program output.
  160. The IEEE 754 Standard is vague about whether rounding happens
  161. before detection of floating underflow and overflow, or after, and CPU
  162. designers may choose either.
  163. However, the Standard does something unusual compared to earlier
  164. designs, and that is that when the result is smaller than the
  165. smallest @dfn{normalized} representable value (i.e., one in
  166. which the leading significand bit is @code{1}), the normalization
  167. requirement is relaxed, leading zero bits are permitted, and
  168. precision is gradually lost until there are no more bits in the
  169. significand. That phenomenon is called @dfn{gradual underflow},
  170. and it serves important numerical purposes, although it does
  171. reduce the precision of the final result. Some floating-point
  172. designs allow you to choose at compile time, or even at
  173. run time, whether underflows are gradual, or are flushed abruptly
  174. to zero. Numbers that have entered the region of gradual
  175. underflow are called @dfn{subnormal}.
  176. You can use the library functions @code{fesetround} and
  177. @code{fegetround} to set and get the rounding mode. Rounding modes
  178. are defined (if supported by the platform) in @code{fenv.h} as:
  179. @code{FE_UPWARD} to round toward positive infinity; @code{FE_DOWNWARD}
  180. to round toward negative infinity; @code{FE_TOWARDZERO} to round
  181. toward zero; and @code{FE_TONEAREST} to round to the nearest
  182. representable value, the default mode. It is best to use
  183. @code{FE_TONEAREST} except when there is a special need for some other
  184. mode.
  185. @end table
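As a small illustration of gradual underflow, the following sketch
produces a subnormal value by dividing the smallest normalized
@code{double}, and detects it with the C99 @code{fpclassify} macro;
the particular starting point is chosen only for the example.
@example
#include <float.h>
#include <math.h>
#include <stdio.h>

int main (void)
@{
  double tiny = DBL_MIN / 16.0;   /* gradual underflow: a subnormal value */

  if (fpclassify (tiny) == FP_SUBNORMAL)
    printf ("%a is subnormal\n", tiny);
  return 0;
@}
@end example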
  186. @node Invalid Optimizations
  187. @section Invalid Optimizations
  188. @cindex invalid optimizations in floating-point arithmetic
  189. @cindex floating-point arithmetic invalid optimizations
  190. Signed zeros, Infinity, and NaN invalidate some optimizations by
  191. programmers and compilers that might otherwise have seemed obvious:
  192. @itemize @bullet
  193. @item
  194. @code{x + 0} and @code{x - 0} are not the same as @code{x} when
  195. @code{x} is zero, because the result depends on the rounding rule.
  196. @xref{Rounding}, for more about rounding rules.
  197. @item
  198. @code{x * 0.0} is not the same as @code{0.0} when @code{x} is
  199. Infinity, a NaN, or negative zero.
  200. @item
  201. @code{x / x} is not the same as @code{1.0} when @code{x} is Infinity,
  202. a NaN, or zero.
  203. @item
  204. @code{(x - y)} is not the same as @code{-(y - x)} because when the
  205. operands are finite and equal, one evaluates to @code{+0} and the
  206. other to @code{-0}.
  207. @item
  208. @code{x - x} is not the same as @code{0.0} when @var{x} is Infinity or
  209. a NaN.
  210. @item
  211. @code{x == x} and @code{x != x} are not equivalent to @code{1} and
  212. @code{0} when @var{x} is a NaN.
  213. @item
  214. @code{x < y} and @code{isless (x, y)} are not equivalent, because the
  215. first sets a sticky exception flag (@pxref{Exception Flags}) when an
  216. operand is a NaN, whereas the second does not affect that flag. The
  217. same holds for the other @code{isxxx} functions that are companions to
  218. relational operators. @xref{FP Comparison Functions, , , libc, The
  219. GNU C Library Reference Manual}.
  220. @end itemize
  221. The @option{-funsafe-math-optimizations} option enables
  222. these optimizations.
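The following sketch makes several of the points in the list above
concrete; the @code{volatile} qualifier merely keeps the compiler from
folding the expressions at compile time, and the exact text printed
for a NaN can vary between platforms.
@example
#include <math.h>
#include <stdio.h>

int main (void)
@{
  volatile double inf = INFINITY, qnan = NAN, negzero = -0.0;

  printf ("inf * 0.0    = %f\n", inf * 0.0);     /* a NaN, not 0 */
  printf ("inf - inf    = %f\n", inf - inf);     /* a NaN, not 0 */
  printf ("qnan == qnan = %d\n", qnan == qnan);  /* 0, not 1 */
  printf ("signbit (-0.0)       = %d\n", signbit (negzero) != 0);        /* 1 */
  printf ("signbit (-0.0 + 0.0) = %d\n", signbit (negzero + 0.0) != 0);  /* 0 */
  return 0;
@}
@end example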
  223. @node Exception Flags
  224. @section Floating Arithmetic Exception Flags
  225. @cindex floating arithmetic exception flags
  226. @cindex exception flags (floating point)
  227. @cindex sticky exception flags (floating point)
  228. @cindex floating overflow
  229. @cindex overflow, floating
  230. @cindex floating underflow
  231. @cindex underflow, floating
  232. @dfn{Sticky exception flags} record the occurrence of particular
  233. conditions: once set, they remain set until the program explicitly
  234. clears them.
  235. The conditions include @emph{invalid operand},
@emph{division-by-zero}, @emph{inexact result} (i.e., one that
  237. required rounding), @emph{underflow}, and @emph{overflow}. Some
  238. extended floating-point designs offer several additional exception
  239. flags. The functions @code{feclearexcept}, @code{feraiseexcept},
  240. @code{fetestexcept}, @code{fegetexceptflags}, and
  241. @code{fesetexceptflags} provide a standardized interface to those
  242. flags. @xref{Status bit operations, , , libc, The GNU C Library
  243. Reference Manual}.
  244. One important use of those @anchor{fetestexcept}flags is to do a
  245. computation that is normally expected to be exact in floating-point
  246. arithmetic, but occasionally might not be, in which case, corrective
  247. action is needed. You can clear the @emph{inexact result} flag with a
  248. call to @code{feclearexcept (FE_INEXACT)}, do the computation, and
  249. then test the flag with @code{fetestexcept (FE_INEXACT)}; the result
of that call is 0 if the flag is not set (there was no rounding), and
nonzero when there was rounding (which, we presume, implies the
program has to correct for that).
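Here is a sketch of that pattern. Strictly, ISO C also wants
@code{#pragma STDC FENV_ACCESS ON} in code that tests exception flags;
the @code{volatile} qualifiers keep the additions from being folded
away at compile time.
@example
#include <fenv.h>
#include <stdio.h>

int main (void)
@{
  volatile double a = 0.1, b = 0.2;
  volatile double sum;

  feclearexcept (FE_INEXACT);
  sum = a + b;                     /* cannot be exact in binary */
  printf ("inexact: %d\n", fetestexcept (FE_INEXACT) != 0);

  feclearexcept (FE_INEXACT);
  sum = 0.5 + 0.25;                /* exact: no rounding needed */
  printf ("inexact: %d\n", fetestexcept (FE_INEXACT) != 0);
  return 0;
@}
@end example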
  253. @c =====================================================================
  254. @ignore
  255. @node IEEE 754 Decimal Arithmetic
  256. @section IEEE 754 Decimal Arithmetic
  257. @cindex IEEE 754 decimal arithmetic
  258. One of the difficulties that users of computers for numerical
  259. work face, whether they realize it or not, is that the computer
  260. does not operate in the number base that most people are familiar
  261. with. As a result, input decimal fractions must be converted to
  262. binary floating-point values for use in computations, and then
  263. the final results converted back to decimal for humans. Because the
  264. precision is finite and limited, and because algorithms for correct
  265. round-trip conversion between number bases were not known until the
  266. 1990s, and are still not implemented on most systems and most
  267. programming languages, the result is frequent confusion for users,
  268. when a simple expression like @code{3.0*(1.0/3.0)} evaluates to
  269. 0.999999 instead of the expected 1.0. Here is an
  270. example from a floating-point calculator that allows rounding-mode
  271. control, with the mode set to @emph{round-towards-zero}:
  272. @example
  273. for (k = 1; k <= 10; ++k)
  274. (void)printf ("%2d\t%10.6f\n", k, k*(1.0/k))
  275. 1 1.000000
  276. 2 1.000000
  277. 3 0.999999
  278. 4 1.000000
  279. 5 0.999999
  280. 6 0.999999
  281. 7 0.999999
  282. 8 1.000000
  283. 9 0.999999
  284. 10 0.999999
  285. @end example
  286. Increasing working precision can sometimes help by reducing
  287. intermediate rounding errors, but the reality is that when
  288. fractional values are involved, @emph{no amount} of extra
  289. precision can suffice for some computations. For example, the
  290. nice decimal value @code{1/10} in C99-style binary representation
  291. is @code{+0x1.999999999999ap-4}; that final digit @code{a} is the
  292. rounding of an infinite string of @code{9}'s.
  293. Financial computations in particular depend critically on correct
  294. arithmetic, and the losses due to rounding errors can be
  295. large, especially for businesses with large numbers of small
  296. transactions, such as grocery stores and telephone companies.
  297. Tax authorities are particularly picky, and demand specific
  298. rounding rules, including one that instead of rounding ties to
  299. the nearest number, rounds instead in the direction that favors
  300. the taxman.
  301. Programming languages used for business applications, notably the
  302. venerable Cobol language, have therefore always implemented
  303. financial computations in @emph{fixed-point decimal arithmetic}
  304. in software, and because of the large monetary amounts that must be
  305. processed, successive Cobol standards have increased the minimum
  306. number size from 18 to 32 decimal digits, and the most recent one
  307. requires a decimal exponent range of at least @code{[-999, +999]}.
  308. The revised IEEE 754-2008 standard therefore requires decimal
  309. floating-point arithmetic, as well as the now-widely used binary
  310. formats from 1985. Like the binary formats, the decimal formats
  311. also support Infinity, NaN, and signed zero, and the five basic
  312. operations are also required to produce correctly rounded
  313. representations of infinitely precise exact results.
  314. However, the financial applications of decimal arithmetic
  315. introduce some new features:
  316. @itemize @bullet
  317. @item
  318. There are three decimal formats occupying 32, 64, and 128 bits of
  319. storage, and offering exactly 7, 16, and 34 decimal digits of
  320. precision. If one size has @code{n} digits, the next larger size
  321. has @code{2 n + 2} digits. Thus, a future 256-bit format would
  322. supply 70 decimal digits, and at least one library already
  323. supports the 256-bit binary and decimal formats.
  324. @item
  325. Decimal arithmetic has an additional rounding mode, called
  326. @emph{round-ties-to-away-from-zero}, meaning that a four-digit
  327. rounding of @code{1.2345} is @code{1.235}, and @code{-1.2345}
  328. becomes @code{-1.235}. That rounding mode is mandated by
  329. financial laws in several countries.
  330. @item
  331. The decimal significand is an
  332. @anchor{decimal-significand}@emph{integer}, instead of a fractional
  333. value, and trailing zeros are only removed at user request. That
  334. feature allows floating-point arithmetic to emulate the
  335. @emph{fixed-point arithmetic} traditionally used in financial
  336. computations.
  337. @end itemize
  338. @noindent
  339. We can easily estimate how many digits are likely to be needed for
  340. financial work: seven billion people on Earth, with an average annual
  341. income of less than US$10,000, means a world financial base that can
  342. be represented in just 15 decimal digits. Even allowing for alternate
  343. currencies, future growth, multiyear accounting, and intermediate
  344. computations, the 34 digits supplied by the 128-bit format are more
  345. than enough for financial purposes.
  346. We return to decimal arithmetic later in this chapter
  347. (@pxref{More on decimal floating-point arithmetic}), after we have
  348. covered more about floating-point arithmetic in general.
  349. @c =====================================================================
  350. @end ignore
  351. @node Exact Floating-Point
  352. @section Exact Floating-Point Arithmetic
  353. @cindex exact floating-point arithmetic
  354. @cindex floating-point arithmetic, exact
  355. As long as the numbers are exactly representable (fractions whose
  356. denominator is a power of 2), and intermediate results do not require
  357. rounding, then floating-point arithmetic is @emph{exact}. It is easy
  358. to predict how many digits are needed for the results of arithmetic
  359. operations:
  360. @itemize @bullet
  361. @item
  362. addition and subtraction of two @var{n}-digit values with the
  363. @emph{same} exponent require at most @code{@var{n} + 1} digits, but
  364. when the exponents differ, many more digits may be needed;
  365. @item
multiplication of two @var{n}-digit values requires at most
2 @var{n} digits;
  368. @item
  369. although integer division produces a quotient and a remainder of
  370. no more than @var{n}-digits, floating-point remainder and square
  371. root may require an unbounded number of digits, and the quotient
  372. can need many more digits than can be stored.
  373. @end itemize
  374. Whenever a result requires more than @var{n} digits, rounding
  375. is needed.
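For instance, sums and products of small integers and of fractions
whose denominators are powers of 2 are computed exactly, so equality
comparisons on them are reliable; a value like @code{0.1} is not
exactly representable and is already rounded on input. A brief sketch:
@example
#include <stdio.h>

int main (void)
@{
  printf ("%d\n", 0.5 + 0.25 == 0.75);    /* prints 1: exact */
  printf ("%d\n", 3.0 * 7.0 == 21.0);     /* prints 1: exact */
  printf ("%d\n", 0.1 + 0.2 == 0.3);      /* prints 0: all three are rounded */
  return 0;
@}
@end example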
  376. @c =====================================================================
  377. @node Rounding
  378. @section Rounding
  379. @cindex rounding
  380. When floating-point arithmetic produces a result that can't fit
  381. exactly in the significand of the type that's in use, it has to
  382. @dfn{round} the value. The basic arithmetic operations---addition,
  383. subtraction, multiplication, division, and square root---always produce
  384. a result that is equivalent to the exact, possibly infinite-precision
  385. result rounded to storage precision according to the current rounding
  386. rule.
  387. Rounding sets the @code{FE_INEXACT} exception flag (@pxref{Exception
  388. Flags}). This enables programs to determine that rounding has
  389. occurred.
  390. Rounding consists of adjusting the exponent to bring the significand
  391. back to the required base-point alignment, then applying the current
  392. @dfn{rounding rule} to squeeze the significand into the fixed
  393. available size.
  394. The current rule is selected at run time from four options. Here they
  395. are:
  396. @itemize *
  397. @item
@emph{round-to-nearest}, with ties rounded to the neighbor whose last digit is even;
  399. @item
  400. @emph{round-up}, towards @code{+Infinity};
  401. @item
  402. @emph{round-down}, towards @code{-Infinity};
  403. @item
  404. @emph{round-towards-zero}.
  405. @end itemize
  406. Under those four rounding rules, a decimal value
  407. @code{-1.2345} that is to be rounded to a four-digit result would
  408. become @code{-1.234}, @code{-1.234}, @code{-1.235}, and
  409. @code{-1.234}, respectively.
  410. The default rounding rule is @emph{round-to-nearest}, because that has
  411. the least bias, and produces the lowest average error. When the true
  412. result lies exactly halfway between two representable machine numbers,
  413. the result is rounded to the one that ends with an even digit.
  414. The @emph{round-towards-zero} rule was common on many early computer
  415. designs, because it is the easiest to implement: it just requires
  416. silent truncation of all extra bits.
  417. The two other rules, @emph{round-up} and @emph{round-down}, are
  418. essential for implementing @dfn{interval arithmetic}, whereby
  419. each arithmetic operation produces lower and upper bounds that
  420. are guaranteed to enclose the exact result.
  421. @xref{Rounding Control}, for details on getting and setting the
  422. current rounding mode.
  423. @node Rounding Issues
  424. @section Rounding Issues
  425. @cindex rounding issues (floating point)
  426. @cindex floating-point rounding issues
  427. The default IEEE 754 rounding mode minimizes errors, and most
  428. normal computations should not suffer any serious accumulation of
  429. errors from rounding.
  430. Of course, you can contrive examples where that is not so. Here
  431. is one: iterate a square root, then attempt to recover the
  432. original value by repeated squaring.
  433. @example
#include <stdio.h>
#include <math.h>

int main (void)
@{
  double x = 100.0;
  double y;
  int k, n;

  for (n = 10; n <= 100; n += 10)
    @{
      y = x;
      for (k = 0; k < n; ++k)
        y = sqrt (y);
      for (k = 0; k < n; ++k)
        y *= y;
      printf ("n = %3d; x = %.0f\ty = %.6f\n", n, x, y);
    @}

  return 0;
@}
  449. @end example
  450. @noindent
  451. Here is the output:
  452. @example
  453. n = 10; x = 100 y = 100.000000
  454. n = 20; x = 100 y = 100.000000
  455. n = 30; x = 100 y = 99.999977
  456. n = 40; x = 100 y = 99.981025
  457. n = 50; x = 100 y = 90.017127
  458. n = 60; x = 100 y = 1.000000
  459. n = 70; x = 100 y = 1.000000
  460. n = 80; x = 100 y = 1.000000
  461. n = 90; x = 100 y = 1.000000
  462. n = 100; x = 100 y = 1.000000
  463. @end example
  464. After 50 iterations, @code{y} has barely one correct digit, and
  465. soon after, there are no correct digits.
  466. @c =====================================================================
  467. @node Significance Loss
  468. @section Significance Loss
  469. @cindex significance loss (floating point)
  470. @cindex floating-point significance loss
  471. A much more serious source of error in floating-point computation is
  472. @dfn{significance loss} from subtraction of nearly equal values. This
means that the result has fewer correct bits in its significand than
the type could hold. If the values being
  475. subtracted are close enough, but still not equal, a @emph{single
  476. subtraction} can wipe out all correct digits, possibly contaminating
  477. all future computations.
  478. Floating-point calculations can sometimes be carefully designed so
  479. that significance loss is not possible, such as summing a series where
  480. all terms have the same sign. For example, the Taylor series
  481. expansions of the trigonometric and hyperbolic sines have terms of
  482. identical magnitude, of the general form @code{@var{x}**(2*@var{n} +
  483. 1) / (2*@var{n} + 1)!}. However, those in the trigonometric sine series
  484. alternate in sign, while those in the hyperbolic sine series are all
  485. positive. Here is the output of two small programs that sum @var{k}
  486. terms of the series for @code{sin (@var{x})}, and compare the computed
  487. sums with known-to-be-accurate library functions:
  488. @example
  489. x = 10 k = 51
  490. s (x) = -0.544_021_110_889_270
  491. sin (x) = -0.544_021_110_889_370
  492. x = 20 k = 81
  493. s (x) = 0.912_945_250_749_573
  494. sin (x) = 0.912_945_250_727_628
  495. x = 30 k = 109
  496. s (x) = -0.987_813_746_058_855
  497. sin (x) = -0.988_031_624_092_862
  498. x = 40 k = 137
  499. s (x) = 0.617_400_430_980_474
  500. sin (x) = 0.745_113_160_479_349
  501. x = 50 k = 159
  502. s (x) = 57_105.187_673_745_720_532
  503. sin (x) = -0.262_374_853_703_929
  504. // sinh(x) series summation with positive signs
  505. // with k terms needed to converge to machine precision
  506. x = 10 k = 47
  507. t (x) = 1.101_323_287_470_340e+04
  508. sinh (x) = 1.101_323_287_470_339e+04
  509. x = 20 k = 69
  510. t (x) = 2.425_825_977_048_951e+08
  511. sinh (x) = 2.425_825_977_048_951e+08
  512. x = 30 k = 87
  513. t (x) = 5.343_237_290_762_229e+12
  514. sinh (x) = 5.343_237_290_762_231e+12
  515. x = 40 k = 105
  516. t (x) = 1.176_926_334_185_100e+17
  517. sinh (x) = 1.176_926_334_185_100e+17
  518. x = 50 k = 121
  519. t (x) = 2.592_352_764_293_534e+21
  520. sinh (x) = 2.592_352_764_293_536e+21
  521. @end example
  522. @noindent
  523. We have added underscores to the numbers to enhance readability.
  524. The @code{sinh (@var{x})} series with positive terms can be summed to
  525. high accuracy. By contrast, the series for @code{sin (@var{x})}
  526. suffers increasing significance loss, so that when @var{x} = 30 only
  527. two correct digits remain. Soon after, all digits are wrong, and the
  528. answers are complete nonsense.
  529. An important skill in numerical programming is to recognize when
  530. significance loss is likely to contaminate a computation, and revise
  531. the algorithm to reduce this problem. Sometimes, the only practical
  532. way to do so is to compute in higher intermediate precision, which is
  533. why the extended types like @code{long double} are important.
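As another illustration, here is a sketch that evaluates
@code{1 - cos (x)} for a small argument both directly, where the
subtraction cancels nearly all correct digits, and through the
identity @code{1 - cos (x) = 2 * sin (x/2)**2}, which involves no
subtraction; the particular argument is chosen only to make the
effect visible.
@example
#include <math.h>
#include <stdio.h>

int main (void)
@{
  double x = 1.2e-8;
  double direct = 1.0 - cos (x);                        /* catastrophic cancellation */
  double stable = 2.0 * sin (x / 2.0) * sin (x / 2.0);  /* no cancellation */

  printf ("direct = %.17g\n", direct);
  printf ("stable = %.17g\n", stable);   /* close to x*x/2 = 7.2e-17 */
  return 0;
@}
@end example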
  534. @c Formerly mentioned @code{__float128}
  535. @c =====================================================================
  536. @node Fused Multiply-Add
  537. @section Fused Multiply-Add
  538. @cindex fused multiply-add in floating-point computations
  539. @cindex floating-point fused multiply-add
  540. In 1990, when IBM introduced the POWER architecture, the CPU
  541. provided a previously unknown instruction, the @dfn{fused
  542. multiply-add} (FMA). It computes the value @code{x * y + z} with
  543. an @strong{exact} double-length product, followed by an addition with a
  544. @emph{single} rounding. Numerical computation often needs pairs of
  545. multiply and add operations, for which the FMA is well-suited.
  546. On the POWER architecture, there are two dedicated registers that
  547. hold permanent values of @code{0.0} and @code{1.0}, and the
  548. normal @emph{multiply} and @emph{add} instructions are just
  549. wrappers around the FMA that compute @code{x * y + 0.0} and
  550. @code{x * 1.0 + z}, respectively.
  551. In the early days, it appeared that the main benefit of the FMA
  552. was getting two floating-point operations for the price of one,
  553. almost doubling the performance of some algorithms. However,
  554. numerical analysts have since shown numerous uses of the FMA for
  555. significantly enhancing accuracy. We discuss one of the most
  556. important ones in the next section.
  557. A few other architectures have since included the FMA, and most
  558. provide variants for the related operations @code{x * y - z}
  559. (FMS), @code{-x * y + z} (FNMA), and @code{-x * y - z} (FNMS).
  560. @c The IEEE 754-2008 revision requires implementations to provide
  561. @c the FMA, as a sixth basic operation.
  562. The functions @code{fmaf}, @code{fma}, and @code{fmal} implement fused
  563. multiply-add for the @code{float}, @code{double}, and @code{long
  564. double} data types. Correct implementation of the FMA in software is
  565. difficult, and some systems that appear to provide those functions do
  566. not satisfy the single-rounding requirement. That situation should
  567. change as more programmers use the FMA operation, and more CPUs
  568. provide FMA in hardware.
  569. Use the @option{-ffp-contract=fast} option to allow generation of FMA
  570. instructions, or @option{-ffp-contract=off} to disallow it.
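Here is a sketch of the difference that the single rounding makes;
compile it with @option{-ffp-contract=off} so that the first
expression is not itself contracted into an FMA. The @code{volatile}
qualifier keeps the value out of compile-time folding; on IEEE 754
systems the two results differ as the comments indicate.
@example
#include <math.h>
#include <stdio.h>

int main (void)
@{
  volatile double x = 1.0 + 0x1p-27;

  double rounded = x * x - 1.0;      /* product rounded, then subtracted */
  double fused   = fma (x, x, -1.0); /* exact product, one final rounding */

  printf ("rounded = %a\n", rounded); /* 0x1p-26 */
  printf ("fused   = %a\n", fused);   /* 0x1.0000001p-26 */
  return 0;
@}
@end example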
  571. @c =====================================================================
  572. @node Error Recovery
  573. @section Error Recovery
  574. @cindex error recovery (floating point)
  575. @cindex floating-point error recovery
  576. When two numbers are combined by one of the four basic
  577. operations, the result often requires rounding to storage
  578. precision. For accurate computation, one would like to be able
  579. to recover that rounding error. With historical floating-point
  580. designs, it was difficult to do so portably, but now that IEEE
  581. 754 arithmetic is almost universal, the job is much easier.
  582. For addition with the default @emph{round-to-nearest} rounding
  583. mode, we can determine the error in a sum like this:
  584. @example
volatile double err, sum, tmp, x, y;

if (fabs (x) >= fabs (y))
  @{
    sum = x + y;
    tmp = sum - x;
    err = y - tmp;
  @}
else /* fabs (x) < fabs (y) */
  @{
    sum = x + y;
    tmp = sum - y;
    err = x - tmp;
  @}
  598. @end example
  599. @noindent
  600. @cindex twosum
  601. Now, @code{x + y} is @emph{exactly} represented by @code{sum + err}.
  602. This basic operation, which has come to be called @dfn{twosum}
  603. in the numerical-analysis literature, is the first key to tracking,
  604. and accounting for, rounding error.
  605. To determine the error in subtraction, just swap the @code{+} and
  606. @code{-} operators.
  607. We used the @code{volatile} qualifier (@pxref{volatile}) in the
  608. declaration of the variables, which forces the compiler to store and
  609. retrieve them from memory, and prevents the compiler from optimizing
  610. @code{err = y - ((x + y) - x)} into @code{err = 0}.
  611. For multiplication, we can compute the rounding error without
  612. magnitude tests with the FMA operation (@pxref{Fused Multiply-Add}),
  613. like this:
  614. @example
  615. volatile double err, prod, x, y;
  616. prod = x * y; /* @r{rounded product} */
  617. err = fma (x, y, -prod); /* @r{exact product @code{= @var{prod} + @var{err}}} */
  618. @end example
  619. For addition, subtraction, and multiplication, we can represent the
  620. exact result with the notional sum of two values. However, the exact
  621. result of division, remainder, or square root potentially requires an
  622. infinite number of digits, so we can at best approximate it.
  623. Nevertheless, we can compute an error term that is close to the true
  624. error: it is just that error value, rounded to machine precision.
  625. For division, you can approximate @code{x / y} with @code{quo + err}
  626. like this:
  627. @example
  628. volatile double err, quo, x, y;
  629. quo = x / y;
  630. err = fma (-quo, y, x) / y;
  631. @end example
  632. For square root, we can approximate @code{sqrt (x)} with @code{root +
  633. err} like this:
  634. @example
  635. volatile double err, root, x;
  636. root = sqrt (x);
  637. err = fma (-root, root, x) / (root + root);
  638. @end example
  639. With the reliable and predictable floating-point design provided
  640. by IEEE 754 arithmetic, we now have the tools we need to track
  641. errors in the five basic floating-point operations, and we can
  642. effectively simulate computing in twice working precision, which
  643. is sometimes sufficient to remove almost all traces of arithmetic
  644. errors.
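A classic application of such an error term is compensated (Kahan)
summation, in which the rounding error of each addition is carried
along and subtracted from the next term. The following sketch sums
@code{0.1} ten million times both ways; it must not be compiled with
@option{-ffast-math} or @option{-funsafe-math-optimizations}, which
would optimize the correction away.
@example
#include <stdio.h>

int main (void)
@{
  double plain = 0.0, sum = 0.0, c = 0.0;
  int i;

  for (i = 0; i < 10000000; ++i)
    @{
      double y, t;

      plain += 0.1;          /* ordinary summation: errors accumulate */

      y = 0.1 - c;           /* compensated summation */
      t = sum + y;
      c = (t - sum) - y;     /* rounding error of this addition */
      sum = t;
    @}

  printf ("plain sum = %.10f\n", plain);
  printf ("kahan sum = %.10f\n", sum);
  return 0;
@}
@end example
@noindent
The compensated result stays close to the exact sum of the rounded
terms, while the plain sum drifts visibly away from @code{1000000}.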
  645. @c =====================================================================
  646. @ignore
  647. @node Double-Rounding Problems
  648. @section Double-Rounding Problems
  649. @cindex double-rounding problems (floating point)
  650. @cindex floating-point double-rounding problems
  651. Most developers today use 64-bit x86 processors with a 64-bit
  652. operating system, with a Streaming SIMD Extensions (SSE) instruction
  653. set. In the past, using a 32-bit x87 instruction set was common,
  654. leading to issues described in this section.
  655. To offer a few more digits of precision and a wider exponent range,
  656. the IEEE 754 Standard included an optional @emph{temporary real}
  657. format, with 11 more bits in the significand, and 4 more bits in the
  658. biased exponent.
  659. Compilers are free to exploit the longer format, and most do so.
  660. That is usually a @emph{good thing}, such as in computing a
  661. lengthy sum or product, or in implementing the computation of the
  662. hypotenuse of a right-triangle as @code{sqrt (x*x + y*y)}: the
  663. wider exponent range is critical there for avoiding disastrous
  664. overflow or underflow.
  665. @findex fesetprec
  666. @findex fegetprec
  667. However, sometimes it is critical to know what the intermediate
  668. precision and rounding mode are, such as in tracking errors with
  669. the techniques of the preceding section. Some compilers provide
  670. options to prevent the use of the 80-bit format in computations
  671. with 64-bit @code{double}, but ensuring that code is always
  672. compiled that way may be difficult. The x86 architecture has the
  673. ability to force rounding of all operations in the 80-bit
  674. registers to the 64-bit storage format, and some systems provide
  675. a software interface with the functions @code{fesetprec} and
  676. @code{fegetprec}. Unfortunately, neither of those functions is
  677. defined by the ISO Standards for C and C++, and consequently,
  678. they are not standardly available on all platforms that use
  679. the x86 floating-point design.
  680. When @code{double} computations are done in the 80-bit format,
  681. results necessarily involve a @dfn{double rounding}: first to the
  682. 64-bit significand in intermediate operations in registers, and
  683. then to the 53-bit significand when the register contents are
  684. stored to memory. Here is an example in decimal arithmetic where
  685. such a double rounding results in the wrong answer: round
@code{1_234_499} from seven to five digits (giving @code{1_234_500}),
and then, resolving the tie upward, from five to four digits. The
result is @code{1_235_000}, whereas the correct rounding to four
significant digits is @code{1_234_000}.
  689. @cindex -ffloat-store
  690. One way to reduce the use of the 80-bit format is to declare variables
  691. as @code{volatile double}: that way, the compiler is required to store
  692. and load intermediates from memory, rather than keeping them in 80-bit
  693. registers over long sequences of floating-point instructions. Doing
  694. so does not, however, eliminate double rounding. The now-common
  695. x86-64 architecture has separate sets of 32-bit and 64-bit
  696. floating-point registers. The option @option{-float-store} says that
  697. floating-point computation should use only those registers, thus eliminating
  698. the possibility of double rounding.
  699. @end ignore
  700. @c =====================================================================
  701. @node Exact Floating Constants
  702. @section Exact Floating-Point Constants
  703. @cindex exact specification of floating-point constants
  704. @cindex floating-point constants, exact specification of
One of the frustrations that numerical programmers have suffered
from since the dawn of digital computers is the inability to
specify numbers precisely in their programs. On the early
  708. decimal machines, that was not an issue: you could write a
  709. constant @code{1e-30} and be confident of that exact value
  710. being used in floating-point operations. However, when the
  711. hardware works in a base other than 10, then human-specified
  712. numbers have to be converted to that base, and then converted
  713. back again at output time. The two base conversions are rarely
  714. exact, and unwanted rounding errors are introduced.
@cindex hexadecimal floating-point constants
In C, hexadecimal floating-point constants solve that problem.
For example, @code{+0x1.fffffep-1} is the IEEE 754 32-bit value
closest to, but below, @code{1.0}. The significand is represented as a
  722. hexadecimal fraction, and the @emph{power of two} is written in
  723. decimal following the exponent letter @code{p} (the traditional
  724. exponent letter @code{e} is not possible, because it is a hexadecimal
  725. digit).
  726. In @code{printf} and @code{scanf} and related functions, you can use
  727. the @samp{%a} and @samp{%A} format specifiers for writing and reading
  728. hexadecimal floating-point values. @samp{%a} writes them with lower
  729. case letters and @samp{%A} writes them with upper case letters. For
  730. instance, this code reproduces our sample number:
  731. @example
printf ("%a\n", 1.0 - pow (2.0, -24));
    @print{} 0x1.fffffep-1
  734. @end example
  735. @noindent
  736. The @code{strtod} family was similarly extended to recognize
  737. numbers in that new format.
  738. If you want to ensure exact data representation for transfer of
  739. floating-point numbers between C programs on different
  740. computers, then hexadecimal constants are an optimum choice.
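For instance, here is a sketch of reading such a constant back with
@code{strtod}, which cannot introduce any base-conversion error:
@example
#include <stdio.h>
#include <stdlib.h>

int main (void)
@{
  double x = strtod ("0x1.fffffep-1", NULL);  /* the constant shown above */

  printf ("%a\n", x);    /* prints the same value back */
  return 0;
@}
@end example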
  741. @c =====================================================================
  742. @node Handling Infinity
  743. @section Handling Infinity
  744. @cindex infinity in floating-point arithmetic
  745. @cindex floating-point infinity
  746. As we noted earlier, the IEEE 754 model of computing is not to stop
  747. the program when exceptional conditions occur. It takes note of
  748. exceptional values or conditions by setting sticky @dfn{exception
  749. flags}, or by producing results with the special values Infinity and
  750. QNaN. In this section, we discuss Infinity; @pxref{Handling NaN} for
  751. the other.
  752. In GNU C, you can create a value of negative Infinity in software like
  753. this:
  754. @verbatim
  755. double x;
  756. x = -1.0 / 0.0;
  757. @end verbatim
  758. GNU C supplies the @code{__builtin_inf}, @code{__builtin_inff}, and
@code{__builtin_infl} built-in functions, and the GNU C Library
provides the @code{INFINITY} macro; all of these are compile-time
constant expressions for positive infinity.
  762. GNU C also provides a standard function to test for an Infinity:
@code{isinf (x)} returns a nonzero value if the argument is an
infinity (of either sign), and @code{0} if not.
  765. Infinities can be compared, and all Infinities of the same sign are
  766. equal: there is no notion in IEEE 754 arithmetic of different kinds of
  767. Infinities, as there are in some areas of mathematics. Positive
  768. Infinity is larger than any finite value, and negative Infinity is
smaller than any finite value.
  770. Infinities propagate in addition, subtraction, multiplication,
  771. and square root, but in division, they disappear, because of the
  772. rule that @code{finite / Infinity} is @code{0.0}. Thus, an
  773. overflow in an intermediate computation that produces an Infinity
  774. is likely to be noticed later in the final results. The programmer can
  775. then decide whether the overflow is expected, and acceptable, or whether
  776. the code possibly has a bug, or needs to be run in higher
  777. precision, or redesigned to avoid the production of the Infinity.
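Here is a sketch of an overflow producing an Infinity that is then
detected with @code{isinf}, and of the same Infinity disappearing in
a division; the @code{volatile} qualifier only prevents compile-time
folding.
@example
#include <math.h>
#include <stdio.h>

int main (void)
@{
  volatile double big = 1.0e308;
  double x = big * 10.0;          /* overflows to +Infinity */

  if (isinf (x))
    printf ("x overflowed: %f\n", x);
  printf ("1.0 / x = %f\n", 1.0 / x);   /* the Infinity disappears: 0.0 */
  return 0;
@}
@end example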
  778. @c =====================================================================
  779. @node Handling NaN
  780. @section Handling NaN
  781. @cindex NaN in floating-point arithmetic
  782. @cindex not a number
  783. @cindex floating-point NaN
  784. NaNs are not numbers: they represent values from computations that
  785. produce undefined results. They have a distinctive property that
  786. makes them unlike any other floating-point value: they are
  787. @emph{unequal to everything, including themselves}! Thus, you can
  788. write a test for a NaN like this:
  789. @example
  790. if (x != x)
  791. printf ("x is a NaN\n");
  792. @end example
  793. @noindent
  794. This test works in GNU C, but some compilers might evaluate that test
  795. expression as false without properly checking for the NaN value.
  796. A more portable way to test for NaN is to use the @code{isnan}
  797. function declared in @code{math.h}:
  798. @example
  799. if (isnan (x))
  800. printf ("x is a NaN\n");
  801. @end example
  802. @noindent
  803. @xref{Floating Point Classes, , , libc, The GNU C Library Reference Manual}.
  804. One important use of NaNs is marking of missing data. For
  805. example, in statistics, such data must be omitted from
  806. computations. Use of any particular finite value for missing
  807. data would eventually collide with real data, whereas such data
  808. could never be a NaN, so it is an ideal marker. Functions that
  809. deal with collections of data that may have holes can be written
  810. to test for, and ignore, NaN values.
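Here is a sketch of such a function: it computes the mean of the
entries that are present, treating NaNs as holes. The function name
and the data are only illustrative.
@example
#include <math.h>
#include <stdio.h>

static double
mean_present (const double *v, int n)
@{
  double sum = 0.0;
  int count = 0, i;

  for (i = 0; i < n; ++i)
    if (!isnan (v[i]))
      @{
        sum += v[i];
        ++count;
      @}
  return count > 0 ? sum / count : NAN;
@}

int main (void)
@{
  double data[5] = @{ 1.0, NAN, 3.0, NAN, 5.0 @};

  printf ("mean = %g\n", mean_present (data, 5));   /* prints 3 */
  return 0;
@}
@end example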
  811. It is easy to generate a NaN in computations: evaluating @code{0.0 /
  812. 0.0} is the commonest way, but @code{Infinity - Infinity},
  813. @code{Infinity / Infinity}, and @code{sqrt (-1.0)} also work.
  814. Functions that receive out-of-bounds arguments can choose to return a
  815. stored NaN value, such as with the @code{NAN} macro defined in
  816. @code{math.h}, but that does not set the @emph{invalid operand}
  817. exception flag, and that can fool some programs.
  818. @cindex NaNs-always-propagate rule
  819. Like Infinity, NaNs propagate in computations, but they are even
  820. stickier, because they never disappear in division. Thus, once a
  821. NaN appears in a chain of numerical operations, it is almost
  822. certain to pop out into the final results. The programmer
  823. has to decide whether that is expected, or whether there is a
  824. coding or algorithmic error that needs repair.
In general, when a function gets a NaN argument, it returns a
  826. NaN. However, there are some exceptions in the math-library functions
  827. that you need to be aware of, because they violate the
  828. @emph{NaNs-always-propagate} rule:
  829. @itemize @bullet
  830. @item
  831. @code{pow (x, 0.0)} always returns @code{1.0}, even if @code{x} is
  832. 0.0, Infinity, or a NaN.
  833. @item
  834. @code{pow (1, y)} always returns @code{1}, even if @code{y} is a NaN.
  835. @item
  836. @code{hypot (INFINITY, y)} and @code{hypot (-INFINITY, y)} both
always return @code{INFINITY}, even if @code{y} is a NaN.
  838. @item
  839. If just one of the arguments to @code{fmax (x, y)} or
  840. @code{fmin (x, y)} is a NaN, it returns the other argument. If
  841. both arguments are NaNs, it returns a NaN, but there is no
  842. requirement about where it comes from: it could be @code{x}, or
  843. @code{y}, or some other quiet NaN.
  844. @end itemize
  845. NaNs are also used for the return values of math-library
  846. functions where the result is not representable in real
  847. arithmetic, or is mathematically undefined or uncertain, such as
  848. @code{sqrt (-1.0)} and @code{sin (Infinity)}. However, note that a
  849. result that is merely too big to represent should always produce
  850. an Infinity, such as with @code{exp (1000.0)} (too big) and
  851. @code{exp (Infinity)} (truly infinite).
  852. @c =====================================================================
  853. @node Signed Zeros
  854. @section Signed Zeros
  855. @cindex signed zeros in floating-point arithmetic
  856. @cindex floating-point signed zeros
  857. The sign of zero is significant, and important, because it records the
  858. creation of a value that is too small to represent, but came from
  859. either the negative axis, or from the positive axis. Such fine
  860. distinctions are essential for proper handling of @dfn{branch cuts}
  861. in complex arithmetic (@pxref{Complex Arithmetic}).
  862. The key point about signed zeros is that in comparisons, their sign
  863. does not matter: @code{0.0 == -0.0} must @emph{always} evaluate to
  864. @code{1} (true). However, they are not @emph{the same number}, and
  865. @code{-0.0} in C code stands for a negative zero.
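The two zeros can nevertheless be told apart, with the @code{signbit}
macro or through functions that are sensitive to the branch cut, as
this sketch shows:
@example
#include <math.h>
#include <stdio.h>

int main (void)
@{
  double pz = 0.0, nz = -0.0;

  printf ("pz == nz: %d\n", pz == nz);              /* 1: sign ignored */
  printf ("signbit:  %d %d\n",
          signbit (pz) != 0, signbit (nz) != 0);    /* 0 1: signs differ */
  printf ("atan2:    %f %f\n",
          atan2 (pz, -1.0), atan2 (nz, -1.0));      /* pi and -pi */
  return 0;
@}
@end example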
  866. @c =====================================================================
  867. @node Scaling by the Base
  868. @section Scaling by Powers of the Base
  869. @cindex scaling floating point by powers of the base
  870. @cindex floating-point scaling by powers of the base
  871. We have discussed rounding errors several times in this chapter,
  872. but it is important to remember that when results require no more
  873. bits than the exponent and significand bits can represent, those results
  874. are @emph{exact}.
  875. One particularly useful exact operation is scaling by a power of
  876. the base. While one, in principle, could do that with code like
  877. this:
  878. @example
  879. y = x * pow (2.0, (double)k); /* @r{Undesirable scaling: avoid!} */
  880. @end example
  881. @noindent
  882. that is not advisable, because it relies on the quality of the
  883. math-library power function, and that happens to be one of the
  884. most difficult functions in the C math library to make accurate.
  885. What is likely to happen on many systems is that the returned
  886. value from @code{pow} will be close to a power of two, but
  887. slightly different, so the subsequent multiplication introduces
  888. rounding error.
  889. The correct, and fastest, way to do the scaling is either via the
  890. traditional C library function, or with its C99 equivalent:
  891. @example
  892. y = ldexp (x, k); /* @r{Traditional pre-C99 style.} */
  893. y = scalbn (x, k); /* @r{C99 style.} */
  894. @end example
  895. @noindent
  896. Both functions return @code{x * 2**k}.
  897. @xref{Normalization Functions, , , libc, The GNU C Library Reference Manual}.
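A related pair of functions decomposes and rebuilds a value exactly:
@code{frexp} extracts a fraction and a power of two, and
@code{scalbn} puts them back together. A small sketch:
@example
#include <math.h>
#include <stdio.h>

int main (void)
@{
  double x = 700.0;
  int k;
  double f = frexp (x, &k);     /* x == f * 2**k, with 0.5 <= f < 1 */
  double y = scalbn (f, k);     /* rebuilds x exactly */

  printf ("f = %g, k = %d, y == x: %d\n", f, k, y == x);
  return 0;
@}
@end example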
  898. @c =====================================================================
  899. @node Rounding Control
  900. @section Rounding Control
  901. @cindex rounding control (floating point)
  902. @cindex floating-point rounding control
  903. Here we describe how to specify the rounding mode at run time. System
  904. header file @file{fenv.h} provides the prototypes for these functions.
  905. @xref{Rounding, , , libc, The GNU C Library Reference Manual}.
  906. @noindent
  907. That header file also provides constant names for the four rounding modes:
  908. @code{FE_DOWNWARD}, @code{FE_TONEAREST}, @code{FE_TOWARDZERO}, and
  909. @code{FE_UPWARD}.
  910. The function @code{fegetround} examines and returns the current
  911. rounding mode. On a platform with IEEE 754 floating point,
  912. the value will always equal one of those four constants.
  913. On other platforms, it may return a negative value. The function
  914. @code{fesetround} sets the current rounding mode.
  915. Changing the rounding mode can be slow, so it is useful to minimize
  916. the number of changes. For interval arithmetic, we seem to need three
  917. changes for each operation, but we really only need two, because we
  918. can write code like this example for interval addition of two reals:
  919. @example
@{
  struct interval_double
    @{
      double hi, lo;
    @} v;
  volatile double x, y;
  int rule;

  rule = fegetround ();

  if (fesetround (FE_UPWARD) == 0)
    @{
      v.hi = x + y;
      v.lo = -(-x - y);
    @}
  else
    fatal ("ERROR: failed to change rounding rule");

  if (fesetround (rule) != 0)
    fatal ("ERROR: failed to restore rounding rule");
@}
  938. @end example
  939. @noindent
  940. The @code{volatile} qualifier (@pxref{volatile}) is essential on x86
  941. platforms to prevent an optimizing compiler from producing the same
  942. value for both bounds.
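For a simpler illustration, this sketch computes the same quotient
under each of the four rounding modes; the @code{volatile} qualifiers
force the division to be performed at run time, after each call to
@code{fesetround}.
@example
#include <fenv.h>
#include <stdio.h>

int main (void)
@{
  static const int modes[4] =
    @{ FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO @};
  volatile double x = 1.0, y = 3.0;
  int i;

  for (i = 0; i < 4; ++i)
    @{
      fesetround (modes[i]);
      printf ("%a\n", x / y);   /* the last digit depends on the mode */
    @}
  fesetround (FE_TONEAREST);
  return 0;
@}
@end example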
  943. @ignore We no longer discuss the double rounding issue.
  944. The code also needs to be compiled with the
  945. option @option{-ffloat-store} that prevents use of higher precision
  946. for the basic operations, because that would introduce double rounding
  947. that could spoil the bounds guarantee of interval arithmetic.
  948. @end ignore
@c =====================================================================

@node Machine Epsilon
@section Machine Epsilon
@cindex machine epsilon (floating point)
@cindex floating-point machine epsilon

In any floating-point system, three attributes are particularly
important to know: @dfn{base} (the number that the exponent specifies
a power of), @dfn{precision} (number of digits in the significand),
and @dfn{range} (difference between most positive and most negative
values).  The allocation of bits between exponent and significand
decides the answers to those questions.

A measure of the precision is the answer to the question: what is
the smallest number that can be added to @code{1.0} such that the
sum differs from @code{1.0}?  That number is called the
@dfn{machine epsilon}.

We could define the needed machine-epsilon constants for @code{float},
@code{double}, and @code{long double} like this:

@example
static const float  epsf = 0x1p-23;       /* @r{about 1.192e-07} */
static const double eps  = 0x1p-52;       /* @r{about 2.220e-16} */
static const long double epsl = 0x1p-63;  /* @r{about 1.084e-19} */
@end example

@noindent
Instead of the hexadecimal constants, we could also have used the
Standard C macros, @code{FLT_EPSILON}, @code{DBL_EPSILON}, and
@code{LDBL_EPSILON}.
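As a quick check, a short sketch (assuming @file{float.h} and
@file{stdio.h}; the @code{long double} value shown applies to the
80-bit x86 format) prints those standard macros in hexadecimal so they
can be compared with the constants above:

@example
#include <float.h>   /* @r{For FLT_EPSILON, DBL_EPSILON, LDBL_EPSILON.} */
#include <stdio.h>

int
main (void)
@{ /* @r{Print the standard epsilon macros in hexadecimal.} */
  printf ("FLT_EPSILON  = %a\n",  FLT_EPSILON);   /* @r{0x1p-23} */
  printf ("DBL_EPSILON  = %a\n",  DBL_EPSILON);   /* @r{0x1p-52} */
  printf ("LDBL_EPSILON = %La\n", LDBL_EPSILON);  /* @r{0x1p-63 on x86} */
  return 0;
@}
@end example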
It is useful to be able to compute the machine epsilons at
run time, and we can easily generalize the operation by replacing
the constant @code{1.0} with a user-supplied value:

@example
double
macheps (double x)
@{ /* @r{Return machine epsilon for @var{x},} */
  /* @r{such that @var{x} + macheps (@var{x}) > @var{x}.} */
  static const double base = 2.0;
  double eps;

  if (isnan (x))
    eps = x;
  else
    @{
      eps = (x == 0.0) ? 1.0 : x;

      while ((x + eps / base) != x)
        eps /= base;            /* @r{Always exact!} */
    @}

  return (eps);
@}
@end example

@noindent
If we call that function with arguments from @code{0} to
@code{10}, as well as Infinity and NaN, and print the returned
values in hexadecimal, we get output like this:

@example
macheps (  0) = 0x1.0000000000000p-1074
macheps (  1) = 0x1.0000000000000p-52
macheps (  2) = 0x1.0000000000000p-51
macheps (  3) = 0x1.8000000000000p-52
macheps (  4) = 0x1.0000000000000p-50
macheps (  5) = 0x1.4000000000000p-51
macheps (  6) = 0x1.8000000000000p-51
macheps (  7) = 0x1.c000000000000p-51
macheps (  8) = 0x1.0000000000000p-49
macheps (  9) = 0x1.2000000000000p-50
macheps ( 10) = 0x1.4000000000000p-50
macheps (Inf) = infinity
macheps (NaN) = nan
@end example

@noindent
Notice that @code{macheps} has a special test for a NaN to prevent an
infinite loop.

@ignore
We no longer discuss double rounding.
To ensure that no expressions are evaluated with an intermediate higher
precision, we can compile with the @option{-fexcess-precision=standard}
option, which tells the compiler that all calculation results, including
intermediate results, are to be put on the stack, forcing rounding.
@end ignore

Our code made another test for a zero argument to avoid getting a
zero return.  The returned value in that case is the smallest
representable floating-point number, here the subnormal value
@code{2**(-1074)}, which is about @code{4.941e-324}.

No special test is needed for an Infinity, because the
@code{eps}-reduction loop then terminates at the first iteration.

Our @code{macheps} function here assumes binary floating point; some
architectures may differ.
The C library includes some related functions that can also be used to
determine machine epsilons at run time:

@example
#include <math.h>    /* @r{Include for these prototypes.} */

double      nextafter  (double x, double y);
float       nextafterf (float x, float y);
long double nextafterl (long double x, long double y);
@end example

@noindent
These return the machine number nearest @var{x} in the direction of
@var{y}.  For example, @code{nextafter (1.0, 2.0)} produces the same
result as @code{1.0 + macheps (1.0)} and @code{1.0 + DBL_EPSILON}.
@xref{FP Bit Twiddling, , , libc, The GNU C Library Reference Manual}.
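A short sketch, assuming the @code{macheps} function from the earlier
example is compiled into the same program, verifies that relationship
at run time:

@example
#include <math.h>
#include <float.h>
#include <assert.h>

extern double macheps (double);  /* @r{The run-time function shown above.} */

void
check_epsilon_identities (void)
@{ /* @r{Verify three equivalent ways to step just above 1.0.} */
  double above = nextafter (1.0, 2.0);

  assert (above == 1.0 + DBL_EPSILON);
  assert (above == 1.0 + macheps (1.0));
@}
@end example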
It is important to know that the machine epsilon is not symmetric
about all numbers.  At the boundaries where normalization changes the
exponent, the epsilon below @var{x} is smaller than that just above
@var{x} by a factor @code{1 / base}.  For example, @code{macheps
(1.0)} returns @code{+0x1p-52}, whereas @code{macheps (-1.0)} returns
@code{+0x1p-53}.  Some authors distinguish those cases by calling them
the @emph{positive} and @emph{negative}, or @emph{big} and
@emph{small}, machine epsilons.  You can produce their values like
this:

@example
eps_neg = 1.0 - nextafter (1.0, -1.0);
eps_pos = nextafter (1.0, +2.0) - 1.0;
@end example
If @var{x} is a variable, such that you do not know its value at
compile time, then you can substitute literal @var{y} values with
either @code{-inf ()} or @code{+inf ()}, like this:

@example
eps_neg = x - nextafter (x, -inf ());
eps_pos = nextafter (x, +inf ()) - x;
@end example

@noindent
In such cases, if @var{x} is Infinity, then @emph{the @code{nextafter}
functions return @var{y} if @var{x} equals @var{y}}.  Our two
assignments then produce @code{+0x1.fffffffffffffp+1023} (about
1.798e+308) for @var{eps_neg} and Infinity for @var{eps_pos}.  Thus,
the call @code{nextafter (INFINITY, -INFINITY)} can be used to find
the largest representable finite number, and with the call
@code{nextafter (0.0, 1.0)}, the smallest representable number (here,
@code{0x1p-1074} (about 4.941e-324), a number that we saw before as
the output from @code{macheps (0.0)}).
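A brief check of those two limiting calls, assuming @file{math.h},
@file{float.h}, and @file{stdio.h}, might look like this:

@example
#include <math.h>    /* @r{For nextafter and INFINITY.} */
#include <float.h>   /* @r{For DBL_MAX.} */
#include <stdio.h>

int
main (void)
@{ /* @r{Largest finite and smallest positive double values.} */
  double biggest  = nextafter (INFINITY, -INFINITY);
  double smallest = nextafter (0.0, 1.0);

  printf ("largest finite    = %a (equals DBL_MAX? %d)\n",
          biggest, biggest == DBL_MAX);
  printf ("smallest positive = %a\n", smallest);  /* @r{0x1p-1074} */
  return 0;
@}
@end example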
@c =====================================================================

@node Complex Arithmetic
@section Complex Arithmetic
@cindex complex arithmetic in floating-point calculations
@cindex floating-point arithmetic with complex numbers

We've already looked at defining and referring to complex numbers
(@pxref{Complex Data Types}).  What is important to discuss here are
some issues that are unlikely to be obvious to programmers without
extensive experience in both numerical computing, and in complex
arithmetic in mathematics.

The first important point is that, unlike real arithmetic, in complex
arithmetic, the danger of significance loss is @emph{pervasive}, and
affects @emph{every one} of the basic operations, and @emph{almost
all} of the math-library functions.  To understand why, recall the
rules for complex multiplication and division:

@example
a = u + I*v                /* @r{First operand.} */
b = x + I*y                /* @r{Second operand.} */

prod = a * b
     = (u + I*v) * (x + I*y)
     = (u * x - v * y) + I*(v * x + u * y)

quo  = a / b
     = (u + I*v) / (x + I*y)
     = [(u + I*v) * (x - I*y)] / [(x + I*y) * (x - I*y)]
     = [(u * x + v * y) + I*(v * x - u * y)] / (x**2 + y**2)
@end example

@noindent
There are four critical observations about those formulas (a short
numerical sketch follows the list):

@itemize @bullet
@item
the multiplications on the right-hand side introduce the
possibility of premature underflow or overflow;

@item
the products must be accurate to twice working precision;

@item
there is @emph{always} one subtraction on the right-hand sides
that is subject to catastrophic significance loss; and

@item
complex multiplication has up to @emph{six} rounding errors, and
complex division has @emph{ten} rounding errors.
@end itemize
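To make the first observation concrete, here is a sketch (not one of
the manual's own examples) that divides @code{1 + I} by
@code{1e200 + 1e200*I}.  The textbook formula overflows prematurely in
the denominator and collapses to zero, while the built-in division,
whose run-time support scales its operands, typically recovers the
true quotient of about @code{1e-200}:

@example
#include <complex.h>
#include <stdio.h>

int
main (void)
@{ /* @r{Premature overflow in the textbook division formula.} */
  double complex a = 1.0 + 1.0 * I;
  double complex b = 1.0e200 + 1.0e200 * I;
  double u = creal (a), v = cimag (a);
  double x = creal (b), y = cimag (b);

  /* @r{Textbook formula: x*x + y*y overflows to Infinity, so both} */
  /* @r{parts of the computed quotient become zero.} */
  double den = x * x + y * y;
  double complex naive = (u * x + v * y) / den
                         + ((v * x - u * y) / den) * I;

  /* @r{Built-in division (without -fcx-limited-range) scales the} */
  /* @r{operands and usually delivers about 1e-200 + 0*I.} */
  double complex good = a / b;

  printf ("naive   = %g + %g*I\n", creal (naive), cimag (naive));
  printf ("builtin = %g + %g*I\n", creal (good),  cimag (good));
  return 0;
@}
@end example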
@cindex branch cuts
Another point that needs careful study is the fact that many functions
in complex arithmetic have @dfn{branch cuts}.  You can view a
function with a complex argument, @code{f (z)}, as @code{f (x + I*y)},
and thus, it defines a relation between a point @code{(x, y)} on the
complex plane with an elevation value on a surface.  A branch cut
looks like a tear in that surface, so approaching the cut from one
side produces a particular value, and from the other side, a quite
different value.  Great care is needed to handle branch cuts properly,
and even small numerical errors can push a result from one side to the
other, radically changing the returned value.  As we reported earlier,
correct handling of the sign of zero is critically important for
computing near branch cuts.
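For example, @code{csqrt} has its branch cut along the negative real
axis, so the sign of a zero imaginary part selects the side of the cut.
A small sketch, assuming a C11 @file{complex.h} that provides the
@code{CMPLX} macro and a library with Annex G behavior:

@example
#include <complex.h>
#include <stdio.h>

int
main (void)
@{ /* @r{The sign of zero picks the side of the csqrt branch cut.} */
  double complex above = csqrt (CMPLX (-4.0, +0.0));  /* @r{0 + 2i} */
  double complex below = csqrt (CMPLX (-4.0, -0.0));  /* @r{0 - 2i} */

  printf ("csqrt (-4 + 0i) = %g + %g*I\n", creal (above), cimag (above));
  printf ("csqrt (-4 - 0i) = %g + %g*I\n", creal (below), cimag (below));
  return 0;
@}
@end example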
The best advice that we can give to programmers who need complex
arithmetic is to always use the @emph{highest precision available},
and then to carefully check the results of test calculations to gauge
the likely accuracy of the computed results.  It is easy to supply
test values of real and imaginary parts where all five basic
operations in complex arithmetic, and almost all of the complex math
functions, lose @emph{all} significance, and fail to produce even a
single correct digit.

Even though complex arithmetic makes some programming tasks
easier, it may be numerically preferable to rework the algorithm
so that it can be carried out in real arithmetic.  That is
commonly possible in matrix algebra.

GNU C can perform code optimization on complex number multiplication and
division if certain boundary checks will not be needed.  The
command-line option @option{-fcx-limited-range} tells the compiler that
a range reduction step is not needed when performing complex division,
and that there is no need to check if a complex multiplication or
division results in the value @code{NaN + I*NaN}.  By default these
checks are enabled.  You can explicitly enable them with the
@option{-fno-cx-limited-range} option.
@ignore
@c =====================================================================
@node More on Decimal Floating-Point Arithmetic
@section More on Decimal Floating-Point Arithmetic
@cindex decimal floating-point arithmetic
@cindex floating-point arithmetic, decimal

Proposed extensions to the C programming language call for the
inclusion of decimal floating-point arithmetic, which handles
floating-point numbers with a specified radix of 10, instead of the
unspecified traditional radix of 2.

The proposed new types are @code{_Decimal32}, @code{_Decimal64}, and
@code{_Decimal128}, with corresponding literal constant suffixes of
@code{df}, @code{dd}, and @code{dl}, respectively.  For example, a
32-bit decimal floating-point variable could be defined as:

@example
_Decimal32 foo = 42.123df;
@end example

We stated earlier (@pxref{decimal-significand}) that the significand
in decimal floating-point arithmetic is an integer, rather than
fractional, value.  Decimal instructions do not normally alter the
exponent by normalizing nonzero significands to remove trailing zeros.
That design feature is intentional: it allows emulation of the
fixed-point arithmetic that has commonly been used for financial
computations.

One consequence of the lack of normalization is that there are
multiple representations of any number that does not use all of the
significand digits.  Thus, in the 32-bit format, the values
@code{1.DF}, @code{1.0DF}, @code{1.00DF}, @dots{}, @code{1.000_000DF},
all have different bit patterns in storage, even though they compare
equal.  Thus, programmers need to be careful about trailing zero
digits, because they appear in the results, and affect scaling.  For
example, @code{1.0DF * 1.0DF} evaluates to @code{1.00DF}, which is
stored as @code{100 * 10**(-2)}.

In general, when you look at a decimal expression with fractional
digits, you should mentally rewrite it in integer form with suitable
powers of ten.  Thus, a multiplication like @code{1.23 * 4.56} really
means @code{123 * 10**(-2) * 456 * 10**(-2)}, which evaluates to
@code{56088 * 10**(-4)}, and would be output as @code{5.6088}.

Another consequence of the decimal significand choice is that
initializing decimal floating-point values to a pattern of
all-bits-zero does not produce the expected value @code{0.}: instead,
the result is the subnormal values @code{0.e-101}, @code{0.e-398}, and
@code{0.e-6176} in the three storage formats.

GNU C currently supports basic storage and manipulation of decimal
floating-point values on some platforms, and support is expected to
grow in future releases.

@c ??? Suggest chopping the rest of this section, at least for the time
@c ??? being.  Decimal floating-point support in GNU C is not yet complete,
@c ??? and functionality discussed appears to not be available on all
@c ??? platforms, and is not obviously documented for end users of GNU C. --TJR

The exponent in decimal arithmetic is called the @emph{quantum}, and
financial computations require that the quantum always be preserved.
If it is not, then rounding may have happened, and additional scaling
is required.

The function @code{samequantumd (x,y)} for 64-bit decimal arithmetic
returns @code{1} if the arguments have the same exponent, and @code{0}
otherwise.

The function @code{quantized (x,y)} returns a value of @var{x} that has
been adjusted to have the same quantum as @var{y}; that adjustment
could require rounding of the significand.  For example,
@code{quantized (5.dd, 1.00dd)} returns the value @code{5.00dd}, which
is stored as @code{500 * 10**(-2)}.  As another example, a sales-tax
computation might be carried out like this:

@example
decimal_long_double amount, rate, total;

amount = 0.70DL;
rate   = 1.05DL;
total  = quantizedl (amount * rate, 1.00DL);
@end example

@noindent
Without the call to @code{quantizedl}, the result would have been
@code{0.7350}, instead of the correctly rounded @code{0.74}.  That
particular example was chosen because it illustrates yet another
difference between decimal and binary arithmetic: in the latter, the
factors each require an infinite number of bits, and their product,
when converted to decimal, looks like @code{0.734_999_999@dots{}}.
Thus, rounding the product in binary format to two decimal places
always gets @code{0.73}, which is the @emph{wrong} answer for tax
laws.

In financial computations in decimal floating-point arithmetic, the
@code{quantized} function family is expected to receive wide use
whenever multiplication or division change the desired quantum of the
result.

The function call @code{normalized (x)} returns a value that is
numerically equal to @var{x}, but with trailing zeros trimmed.
Here are some examples of its operation:

@multitable @columnfractions .5 .5
@headitem Function Call @tab Result
@item normalized (+0.00100DD)   @tab +0.001DD
@item normalized (+1.00DD)      @tab +1.DD
@item normalized (+1.E2DD)      @tab +1E+2DD
@item normalized (+100.DD)      @tab +1E+2DD
@item normalized (+100.00DD)    @tab +1E+2DD
@item normalized (+NaN(0x1234)) @tab +NaN(0x1234)
@item normalized (-NaN(0x1234)) @tab -NaN(0x1234)
@item normalized (+Infinity)    @tab +Infinity
@item normalized (-Infinity)    @tab -Infinity
@end multitable

@noindent
The NaN examples show that payloads are preserved.

Because the @code{printf} and @code{scanf} families were designed long
before IEEE 754 decimal arithmetic, their format items do not support
distinguishing between numbers with identical values, but different
quanta, and yet, that distinction is likely to be needed in output.
The solution adopted by one early library for decimal arithmetic is to
provide a family of number-to-string conversion functions that
preserve quantization.  Here is a code fragment and its output that
shows how they work.

@example
decimal_float x;

x = 123.000DF;

printf ("%%He: x = %He\n", x);
printf ("%%Hf: x = %Hf\n", x);
printf ("%%Hg: x = %Hg\n", x);
printf ("ntosdf (x) = %s\n", ntosdf (x));

%He: x = 1.230000e+02
%Hf: x = 123.000000
%Hg: x = 123
ntosdf (x) = +123.000
@end example

@noindent
The format modifier letter @code{H} indicates a 32-bit decimal value,
and the modifiers @code{DD} and @code{DL} correspond to the two other
formats.

@c =====================================================================
@end ignore
@node Round-Trip Base Conversion
@section Round-Trip Base Conversion
@cindex round-trip base conversion
@cindex base conversion (floating point)
@cindex floating-point round-trip base conversion

Most numeric programs involve converting between base-2 floating-point
numbers, as represented by the computer, and base-10 floating-point
numbers, as entered and handled by the programmer.  What might not be
obvious is the number of base-2 bits vs. base-10 digits required for
each representation.  Consider the following tables showing the number of
decimal digits representable in a given number of bits, and vice versa:

@multitable @columnfractions .5 .1 .1 .1 .1 .1
@item binary in   @tab 24 @tab 53 @tab 64 @tab 113 @tab 237
@item decimal out @tab  9 @tab 17 @tab 21 @tab  36 @tab  73
@end multitable

@multitable @columnfractions .5 .1 .1 .1 .1
@item decimal in  @tab  7 @tab 16 @tab  34 @tab  70
@item binary out  @tab 25 @tab 55 @tab 114 @tab 234
@end multitable

We can compute the table numbers with these two functions:

@example
int
matula (int nbits)
@{ /* @r{Return output decimal digits needed for nbits-bits input.} */
  return ((int) ceil ((double) nbits / log2 (10.0) + 1.0));
@}

int
goldberg (int ndec)
@{ /* @r{Return output bits needed for ndec-digits input.} */
  return ((int) ceil ((double) ndec / log10 (2.0) + 1.0));
@}
@end example
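A quick driver, assuming the two functions above are compiled into the
same program and linked with @option{-lm}, reproduces the first
columns of the tables:

@example
#include <stdio.h>

extern int matula (int);    /* @r{The two functions shown above.} */
extern int goldberg (int);

int
main (void)
@{ /* @r{Reproduce the first column of each table.} */
  printf ("matula (24)  = %d\n", matula (24));   /* @r{9} */
  printf ("matula (53)  = %d\n", matula (53));   /* @r{17} */
  printf ("goldberg (7) = %d\n", goldberg (7));  /* @r{25} */
  return 0;
@}
@end example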
One significant observation from those numbers is that we cannot
achieve correct round-trip conversion between the decimal and
binary formats in the same storage size!  For example, we need 25
bits to represent a 7-digit value from the 32-bit decimal format,
but the binary format only has 24 available.  Similar
observations hold for each of the other conversion pairs.

The general input/output base-conversion problem is astonishingly
complicated, and solutions were not generally known until the
publication of two papers in 1990 that are listed later near the end
of this chapter.  For the 128-bit formats, the worst case needs more
than 11,500 decimal digits of precision to guarantee correct rounding
in a binary-to-decimal conversion!

For further details see the references for Bennett Goldberg and David
Matula.
@c =====================================================================

@node Further Reading
@section Further Reading

The subject of floating-point arithmetic is much more complex
than many programmers seem to think, and few books on programming
languages spend much time in that area.  In this chapter, we have
tried to expose the reader to some of the key ideas, and to warn
of easily overlooked pitfalls that can soon lead to nonsensical
results.  There are a few good references that we recommend
for further reading, and for finding other important material
about computer arithmetic:

@c =====================================================================
@c Each bibliography item has a sort key, so the bibliography can be
@c sorted in emacs with M-x sort-paragraphs on the region with the items.
@c =====================================================================
@itemize @bullet

@item @c sort-key: Abbott
Paul H. Abbott and 15 others, @cite{Architecture and software support
in IBM S/390 Parallel Enterprise Servers for IEEE Floating-Point
arithmetic}, IBM Journal of Research and Development @b{43}(5/6)
723--760 (1999),
@uref{https://doi.org/10.1147/rd.435.0723}.  This article gives
a good description of IBM's algorithm for exact decimal-to-binary
conversion, complementing earlier ones by Clinger and others.

@item @c sort-key: Beebe
Nelson H. F. Beebe, @cite{The Mathematical-Function Computation Handbook:
Programming Using the MathCW Portable Software Library},
Springer (2017), ISBN 3-319-64109-3 (hardcover), 3-319-64110-7 (e-book)
(xxxvi + 1114 pages),
@uref{https://doi.org/10.1007/978-3-319-64110-2}.
This book describes portable implementations of a large superset
of the mathematical functions available in many programming
languages, extended to a future 256-bit format (70 decimal
digits), for both binary and decimal floating point.  It includes
a substantial portion of the functions described in the famous
@cite{NIST Handbook of Mathematical Functions}, Cambridge (2010),
ISBN 0-521-19225-0.
See
@uref{http://www.math.utah.edu/pub/mathcw}
for compilers and libraries.
@item @c sort-key: Clinger-1990
William D. Clinger, @cite{How to Read Floating Point Numbers
Accurately}, ACM SIGPLAN Notices @b{25}(6) 92--101 (June 1990),
@uref{https://doi.org/10.1145/93548.93557}.
See also the papers by Steele & White.

@item @c sort-key: Clinger-2004
William D. Clinger, @cite{Retrospective: How to read floating
point numbers accurately}, ACM SIGPLAN Notices @b{39}(4) 360--371 (April 2004),
@uref{http://doi.acm.org/10.1145/989393.989430}.  Reprint of 1990 paper,
with additional commentary.

@item @c sort-key: Goldberg-1967
I. Bennett Goldberg, @cite{27 Bits Are Not Enough For 8-Digit Accuracy},
Communications of the ACM @b{10}(2) 105--106 (February 1967),
@uref{http://doi.acm.org/10.1145/363067.363112}.  This paper,
and its companions by David Matula, address the base-conversion
problem, and show that the naive formulas are wrong by one or
two digits.

@item @c sort-key: Goldberg-1991
David Goldberg, @cite{What Every Computer Scientist Should Know
About Floating-Point Arithmetic}, ACM Computing Surveys @b{23}(1)
5--58 (March 1991), corrigendum @b{23}(3) 413 (September 1991),
@uref{https://doi.org/10.1145/103162.103163}.
This paper has been widely distributed, and reissued in vendor
programming-language documentation.  It is well worth reading,
and then rereading from time to time.
@item @c sort-key: Juffa
Norbert Juffa and Nelson H. F. Beebe, @cite{A Bibliography of
Publications on Floating-Point Arithmetic},
@uref{http://www.math.utah.edu/pub/tex/bib/fparith.bib}.
This is the largest known bibliography of publications about
floating-point, and also integer, arithmetic.  It is actively
maintained, and as of mid-2019 contains more than 6400 references to
original research papers, reports, theses, books, and Web sites on the
subject matter.  It can be used to locate the latest research in the
field, and the historical coverage dates back to a 1726 paper on
signed-digit arithmetic, and an 1837 paper by Charles Babbage, the
intellectual father of mechanical computers.  The entries for the
Abbott, Clinger, and Steele & White papers cited earlier contain
pointers to several other important related papers on the
base-conversion problem.
@item @c sort-key: Kahan
William Kahan, @cite{Branch Cuts for Complex Elementary Functions, or
Much Ado About Nothing's Sign Bit}, (1987),
@uref{http://people.freebsd.org/~das/kahan86branch.pdf}.
This Web document about the fine points of complex arithmetic
also appears in the volume edited by A. Iserles and
M. J. D. Powell, @cite{The State of the Art in Numerical
Analysis: Proceedings of the Joint IMA/SIAM Conference on the
State of the Art in Numerical Analysis held at the University of
Birmingham, 14--18 April 1986}, Oxford University Press (1987),
ISBN 0-19-853614-3 (xiv + 719 pages).  Its author is the famous
chief architect of the IEEE 754 arithmetic system, and one of the
world's greatest experts in the field of floating-point
arithmetic.  An entire generation of his students at the
University of California, Berkeley, have gone on to careers in
academia and industry, spreading the knowledge of how to do
floating-point arithmetic right.

@item @c sort-key: Knuth
Donald E. Knuth, @cite{A Simple Program Whose Proof Isn't},
in @cite{Beauty is our business: a birthday salute to Edsger
W. Dijkstra}, W. H. J. Feijen, A. J. M. van Gasteren,
D. Gries, and J. Misra (eds.), Springer (1990), ISBN
1-4612-8792-8,
@uref{https://doi.org/10.1007/978-1-4612-4476-9}.  This book
chapter supplies a correctness proof of the decimal to
binary, and binary to decimal, conversions in fixed-point
arithmetic in the @TeX{} typesetting system.  The proof evaded
its author for a dozen years.
@item @c sort-key: Matula-1968a
David W. Matula, @cite{In-and-out conversions},
Communications of the ACM @b{11}(1) 47--50 (January 1968),
@uref{https://doi.org/10.1145/362851.362887}.

@item @c sort-key: Matula-1968b
David W. Matula, @cite{The Base Conversion Theorem},
Proceedings of the American Mathematical Society @b{19}(3)
716--723 (June 1968).  See also other papers here by this author,
and by I. Bennett Goldberg.

@item @c sort-key: Matula-1970
David W. Matula, @cite{A Formalization of Floating-Point Numeric
Base Conversion}, IEEE Transactions on Computers @b{C-19}(8)
681--692 (August 1970),
@uref{https://doi.org/10.1109/T-C.1970.223017}.
@item @c sort-key: Muller-2010
Jean-Michel Muller and eight others, @cite{Handbook of
Floating-Point Arithmetic}, Birkh@"auser-Boston (2010), ISBN
0-8176-4704-X (xxiii + 572 pages),
@uref{https://doi.org/10.1007/978-0-8176-4704-9}.  This is a
comprehensive treatise from a French team who are among the
world's greatest experts in floating-point arithmetic, and among
the most prolific writers of research papers in that field.  They
have much to teach, and their book deserves a place on the
shelves of every serious numerical programmer.

@item @c sort-key: Muller-2018
Jean-Michel Muller and eight others, @cite{Handbook of
Floating-Point Arithmetic}, Second edition, Birkh@"auser-Boston (2018), ISBN
3-319-76525-6 (xxv + 627 pages),
@uref{https://doi.org/10.1007/978-3-319-76526-6}.  This is a new
edition of the preceding entry.

@item @c sort-key: Overton
Michael Overton, @cite{Numerical Computing with IEEE Floating
Point Arithmetic, Including One Theorem, One Rule of Thumb, and
One Hundred and One Exercises}, SIAM (2001), ISBN 0-89871-482-6
(xiv + 104 pages),
@uref{http://www.ec-securehost.com/SIAM/ot76.html}.
This is a small volume that can be covered in a few hours.

@item @c sort-key: Steele-1990
Guy L. Steele Jr. and Jon L. White, @cite{How to Print
Floating-Point Numbers Accurately}, ACM SIGPLAN Notices
@b{25}(6) 112--126 (June 1990),
@uref{https://doi.org/10.1145/93548.93559}.
See also the papers by Clinger.

@item @c sort-key: Steele-2004
Guy L. Steele Jr. and Jon L. White, @cite{Retrospective: How to
Print Floating-Point Numbers Accurately}, ACM SIGPLAN Notices
@b{39}(4) 372--389 (April 2004),
@uref{http://doi.acm.org/10.1145/989393.989431}.  Reprint of 1990
paper, with additional commentary.

@item @c sort-key: Sterbenz
Pat H. Sterbenz, @cite{Floating Point Computation}, Prentice-Hall
(1974), ISBN 0-13-322495-3 (xiv + 316 pages).  This often-cited book
provides solid coverage of what floating-point arithmetic was like
@emph{before} the introduction of IEEE 754 arithmetic.
@end itemize