*
@Swordfish sagte in C- double Genauigkeit:
@JamesNguyen sagte in C- double Genauigkeit:
Er rechnet einfach falsch
Is floating point math broken?
What Every Computer Scientist Should Know About Floating-Point Arithmetic
Danke, dass Du das hier anführst, bei JamesNguyen fehlt so einiges an Basiswissen.
Ich habe vor einiger Zeit ein C++-Programm geschrieben, welches die wichtigen Zahlen (es sind in C und C++ die gleichen Werte) für einen Compiler ausgibt. Sofern auf der Plattform vorhanden, wird die Quadmath-Library berücksichtigt, wenn man das Programm mit -DQUADMATH übersetzt und mit der passenden Bibliothek bindet; unter Linux/UNIX ist das die Linker-Option -lquadmath.
#include <iostream>
#include <iomanip>
#include <cfloat>
// Characteristics of the 128-bit floating-point type, shown in the fourth
// table column. With -DQUADMATH they are taken from libquadmath's
// <quadmath.h>; without it every value is a zero placeholder so the rest of
// the program compiles unchanged.
#ifdef QUADMATH
#include <quadmath.h>
// compile with -DQUADMATH -fext-numeric-literals -lquadmath
constexpr size_t sz_f128 = sizeof(__float128);
constexpr size_t F128_MDIG = FLT128_MANT_DIG;
constexpr size_t F128_DIG = FLT128_DIG;
constexpr long F128_MEXP = FLT128_MIN_EXP;
constexpr long F128_XEXP = FLT128_MAX_EXP;
constexpr long F128_M10P = FLT128_MIN_10_EXP;
constexpr long F128_X10P = FLT128_MAX_10_EXP;
// Converted to double so it can be streamed with the ordinary operator<<.
constexpr double F128_EPS = FLT128_EPSILON;
// iostream has no operator<< for __float128, so the extreme values are
// rendered as text. "+d.ddde-4932" is 12 characters plus the terminating NUL.
char F128_MIN[13];
char F128_MAX[13];
// The buffers are filled as a side effect of initializing r and s, i.e.
// before main() starts; r and s hold quadmath_snprintf's return values.
// The conversion must not contain a '*' field width: '*' consumes an
// additional int argument, and none is supplied here.
int r = quadmath_snprintf (F128_MIN, sizeof(F128_MIN), "%+-#.3Qe", FLT128_MIN);
int s = quadmath_snprintf (F128_MAX, sizeof(F128_MAX), "%+-#.3Qe", FLT128_MAX);
#else
// No quadmath support requested: zero placeholders for every entry.
constexpr size_t sz_f128 = 0;
constexpr size_t F128_MDIG = 0;
constexpr size_t F128_DIG = 0;
constexpr long F128_MEXP = 0;
constexpr long F128_XEXP = 0;
constexpr long F128_M10P = 0;
constexpr long F128_X10P = 0;
constexpr double F128_EPS = 0.0;
char const* const F128_MIN = " 0.000e+00";
char const* const F128_MAX = " 0.000e+00";
#endif
using namespace std;

// Row terminators: reset all terminal attributes (SGR 0), then end the line.
// nl4 additionally emits padding before the reset.
constexpr char const* nll = "\x1b[0m\n";
constexpr char const* nl4 = " \x1b[0m\n";

// Styles for the leftmost column, which names the property shown in a row.
constexpr char const* dsc = "\x1b[0;107;30m";
constexpr char const* dsd = "\x1b[0;47;30m";

// Styles that open a data cell; main() alternates them from column to column.
// ts2q is the variant main() uses in front of the pre-formatted __float128 strings.
constexpr char const* ts1 = "\x1b[0;106;30m ";
constexpr char const* ts2 = " \x1b[0;103;30m ";
constexpr char const* ts2q = " \x1b[0;103;30m ";
constexpr char const* ts3 = " \x1b[0;106;30m ";

// Styles for the header cells that carry the type names.
constexpr char const* cd1 = "\x1b[0;46;30m";
constexpr char const* cd2 = "\x1b[0;43;30m";

// long double epsilon, stored as a double.
const double EPS = LDBL_EPSILON;
int main () {
cout << scientific;
cout << dsc << "FLT_RADIX " << ts1 << setw(68) << FLT_RADIX << nl4;
cout << dsd << " " << cd1 << " float " << cd2 << " double " << cd1 << " long double " << cd2 << " __float128 " << nll;
cout << dsc << " sizeof " << ts1 << setw(11) << sizeof(float) << ts2 << setw(11) << sizeof(double) << ts3 << setw(11) << sizeof(long double) << ts2 << setw(11) << sz_f128 << nl4;
cout << dsc << " binary digits mant " << ts1 << setw(11) << FLT_MANT_DIG << ts2 << setw(11) << DBL_MANT_DIG << ts3 << setw(11) << LDBL_MANT_DIG << ts2 << setw(11) << F128_MDIG << nl4;
cout << dsc << "decimal digits mant " << ts1 << setw(11) << FLT_DIG << ts2 << setw(11) << DBL_DIG << ts3 << setw(11) << LDBL_DIG << ts2 << setw(11) << F128_DIG << nl4;
cout << dsc << " minimum exponent " << ts1 << setw(11) << FLT_MIN_EXP << ts2 << setw(11) << DBL_MIN_EXP << ts3 << setw(11) << LDBL_MIN_EXP << ts2 << setw(11) << F128_MEXP << nl4;
cout << dsc << " maximum exponent " << ts1 << setw(11) << FLT_MAX_EXP << ts2 << setw(11) << DBL_MAX_EXP << ts3 << setw(11) << LDBL_MAX_EXP << ts2 << setw(11) << F128_XEXP << nl4;
cout << dsc << "minimum 10 exponent " << ts1 << setw(11) << FLT_MIN_10_EXP << ts2 << setw(11) << DBL_MIN_10_EXP << ts3 << setw(11) << LDBL_MIN_10_EXP << ts2 << setw(11) << F128_M10P << nl4;
cout << dsc << "maximum 10 exponent " << ts1 << setw(11) << FLT_MAX_10_EXP << ts2 << setw(11) << DBL_MAX_10_EXP << ts3 << setw(11) << LDBL_MAX_10_EXP << ts2 << setw(11) << F128_X10P << nl4;
cout << dsc << " minimum " << ts1 << setw(11) << setprecision(3) << FLT_MIN << ts2 << setw(11) << setprecision(3) << DBL_MIN << ts3 << setw(11) << LDBL_MIN << ts2q << setw(11) << F128_MIN << nl4;
cout << dsc << " maximum " << ts1 << setw(11) << setprecision(3) << FLT_MAX << ts2 << setw(11) << setprecision(3) << DBL_MAX << ts3 << setw(11) << LDBL_MAX << ts2q << setw(11) << F128_MAX << nl4;
cout << dsc << " epsilon " << ts1 << setw(11) << setprecision(3) << FLT_EPSILON << ts2 << setw(11) << setprecision(3) << DBL_EPSILON << ts3 << setw(11) << EPS << ts2 << setw(11) << F128_EPS << nl4;
}
Das führt bei meinem g++ mit Quadmath Library zur folgenden Ausgabe.
FLT_RADIX 2
float double long double __float128
sizeof 4 8 16 16
binary digits mant 24 53 64 113
decimal digits mant 6 15 18 33
minimum exponent -125 -1021 -16381 -16381
maximum exponent 128 1024 16384 16384
minimum 10 exponent -37 -307 -4931 -4931
maximum 10 exponent 38 308 4932 4932
minimum 1.175e-38 2.225e-308 3.362e-4932 +3.362e-4932
maximum 3.403e+38 1.798e+308 1.190e+4932 +1.190e+4932
epsilon 1.192e-07 2.220e-16 1.084e-19 1.926e-34
Wichtig ist in diesem Kontext, dass man versteht, dass die Gleitkommazahlen dieser Datentypen binärkodiert und keine Dezimalzahlen sind. Meines Wissens ist IBMs POWER-Plattform die zurzeit einzige Hardwareplattform, die in der Lage ist, Dezimalgleitkommaarithmetik in Hardware auszuführen.