ch3-FloatLimits (float, double, long double limits in C/C++)

Chapter_3     Exercise_3-22     FloatBinary Exercise_3-25     Exercise_3-23







CONTENTS:     floatlimits.c     FloatLimits.cpp




floatlimits.c         download


#include <stdio.h> // for printf()
#include <float.h>
#include <math.h> // for pow() - double, powl() - long double

/* Helpers that write raw object bytes to stdout as '0'/'1' characters. */
void printBinary(const unsigned char val);    // one byte, most significant bit first
void printData(unsigned char *cp, int size);  // size bytes, cp[size-1] first, on one line
void printData2(unsigned char *cp, int size); // size bytes, split over two lines

/*
 * Print the <float.h> limits of float, double and long double, then, for each
 * type, print the value and the bit pattern of the smallest normal, smallest
 * subnormal and largest finite number obtained in three ways:
 *   1. from the <float.h> constant,
 *   2. computed with pow() / powl(),
 *   3. assembled directly from integer bits ("shifted").
 *
 * The "shifted" part converts integer bit patterns to floating-point values
 * through unions (well defined in C), instead of dereferencing a cast pointer,
 * which breaks the strict-aliasing rule and may read a misaligned long double.
 * 64-bit patterns are held in unsigned long long, which is at least 64 bits
 * wide everywhere (unsigned long is only 32 bits on some platforms).
 *
 * NOTE: the bit layouts used below assume sizeof(float) == sizeof(unsigned),
 * sizeof(double) == sizeof(unsigned long long), and a long double stored in
 * two little-endian 64-bit words (80-bit extended precision plus padding), as
 * on the author's x86-64 machine - results are meaningless elsewhere.
 *
 * Returns 0.
 */
int main(void)
{
    /* Overlays used to reinterpret integer bit patterns as floating point. */
    union floatBits { unsigned bits; float value; };
    union doubleBits { unsigned long long bits; double value; };
    union longDoubleBits { unsigned long long word[2]; long double value; };

    /* Decimal digits of precision. */
    printf("FLT_DIG: %d, DBL_DIG: %d, LDBL_DIG: %d\n",
           FLT_DIG, DBL_DIG, LDBL_DIG);
    printf("FLT_DECIMAL_DIG: %d, DBL_DECIMAL_DIG: %d, LDBL_DECIMAL_DIG: %d\n",
           FLT_DECIMAL_DIG, DBL_DECIMAL_DIG, LDBL_DECIMAL_DIG);
    printf("DECIMAL_DIG: %d\n", DECIMAL_DIG);

    /* Decimal exponent range. */
    printf("FLT_MIN_10_EXP: %d, DBL_MIN_10_EXP: %d, LDBL_MIN_10_EXP: %d\n",
           FLT_MIN_10_EXP, DBL_MIN_10_EXP, LDBL_MIN_10_EXP);
    printf("FLT_MAX_10_EXP: %d, DBL_MAX_10_EXP: %d, LDBL_MAX_10_EXP: %d\n",
           FLT_MAX_10_EXP, DBL_MAX_10_EXP, LDBL_MAX_10_EXP);

    /* Binary (radix) exponent range. */
    printf("FLT_MIN_EXP: %d, DBL_MIN_EXP: %d, LDBL_MIN_EXP: %d\n",
           FLT_MIN_EXP, DBL_MIN_EXP, LDBL_MIN_EXP);
    printf("FLT_MAX_EXP: %d, DBL_MAX_EXP: %d, LDBL_MAX_EXP: %d\n",
           FLT_MAX_EXP, DBL_MAX_EXP, LDBL_MAX_EXP);

    /* Epsilon and smallest subnormal. */
    printf("FLT_EPSILON: %g, DBL_EPSILON: %g, LDBL_EPSILON: %Lg\n",
           FLT_EPSILON, DBL_EPSILON, LDBL_EPSILON);
    printf("FLT_TRUE_MIN: %g, DBL_TRUE_MIN: %g, LDBL_TRUE_MIN: %Lg\n",
           FLT_TRUE_MIN, DBL_TRUE_MIN, LDBL_TRUE_MIN);

    /* ---------------------------- float ---------------------------- */
    printf("FLT_RADIX (Radix of exponent representation): %d\n", FLT_RADIX);
    printf("float: %d mantissa digits", FLT_MANT_DIG);
    printf("\t[%g, %g]\n", FLT_MIN, FLT_MAX);
    printf("FLT_MIN: %g\t\t", FLT_MIN);
    float f = FLT_MIN;
    printData((unsigned char*)(&f), sizeof(float));
    printf("FLT_TRUE_MIN: %g\t", FLT_TRUE_MIN);
    f = FLT_TRUE_MIN;
    printData((unsigned char*)(&f), sizeof(float));
    printf("FLT_MAX: %g\t\t", FLT_MAX);
    f = FLT_MAX;
    printData((unsigned char*)(&f), sizeof(float));

    /* Same three values computed from powers of two. */
    float fmin, ftruemin, fmax;
    fmin = pow(2, -126);
    float fmantissa = 1.0;
    for (int i = 1; i <= 23; i++) // 1 + 2^-1 + ... + 2^-23: all mantissa bits set
    {
        fmantissa += pow(2, -i);
    }
    fmax = fmantissa * pow(2, 127);
    printf("float range (computed):\t\t[%g, %g]\n", fmin, fmax);
    printf("fmin (computed): %g\t", fmin);
    printData((unsigned char*)(&fmin), sizeof(float));
    ftruemin = pow(2, -126) * pow(2, -23); // pow(2, -149); // 2^(-150) ~ 0
    printf("ftruemin (comp): %g\t", ftruemin);
    printData((unsigned char*)(&ftruemin), sizeof(float));
    printf("fmax (computed): %g\t", fmax);
    printData((unsigned char*)(&fmax), sizeof(float));

    /* Same three values assembled from bits. */
    union floatBits u = { 1u }; // 00000000000000000000000000000001
    u.bits <<= 23;              // 00000000100000000000000000000000
    fmin = u.value;
    printf("fmin shifted: %g\t", fmin);
    printData((unsigned char*)(&fmin), sizeof(float));
    union floatBits v = { 1u }; // 00000000000000000000000000000001
    ftruemin = v.value;
    printf("ftruemin shift: %g\t", ftruemin);
    printData((unsigned char*)(&ftruemin), sizeof(float));
    v.bits = ~0u;      // 11111111111111111111111111111111
    v.bits >>= 1;      // 01111111111111111111111111111111
    v.bits &= ~u.bits; // 01111111011111111111111111111111
    fmax = v.value;
    printf("fmax shifted: %g\t", fmax);
    printData((unsigned char*)(&fmax), sizeof(float));
    putchar('\n');

    /* ---------------------------- double --------------------------- */
    printf("double: %d mantissa digits", DBL_MANT_DIG);
    printf("\t[%g, %g]\n", DBL_MIN, DBL_MAX);
    printf("DBL_MIN: %g\n", DBL_MIN);
    double d = DBL_MIN;
    printData((unsigned char*)(&d), sizeof(double));
    printf("DBL_TRUE_MIN: %g\n", DBL_TRUE_MIN);
    d = DBL_TRUE_MIN;
    printData((unsigned char*)(&d), sizeof(double));
    printf("DBL_MAX: %g\n", DBL_MAX);
    d = DBL_MAX;
    printData((unsigned char*)(&d), sizeof(double));

    double dmin, dtruemin, dmax;
    dmin = pow(2, -1022);
    double dmantissa = 1.0;
    for (int i = 1; i <= 52; i++) // 1 + 2^-1 + ... + 2^-52: all mantissa bits set
    {
        dmantissa += pow(2, -i);
    }
    dmax = dmantissa * pow(2, 1023);
    printf("double range (computed):\t[%g, %g]\n", dmin, dmax);
    printf("dmin (computed): %g\n", dmin);
    printData((unsigned char*)(&dmin), sizeof(double));
    dtruemin = pow(2, -1022) * pow(2, -52); // pow(2, -1074); // 2^(-1075) ~ 0
    printf("dtruemin (comp): %g\n", dtruemin);
    printData((unsigned char*)(&dtruemin), sizeof(double));
    printf("dmax (computed): %g\n", dmax);
    printData((unsigned char*)(&dmax), sizeof(double));

    union doubleBits lu = { 1ULL }; // 0...01
    lu.bits <<= 52;                 // 0000000000010...0
    dmin = lu.value;
    printf("dmin shifted: %g\n", dmin);
    printData((unsigned char*)(&dmin), sizeof(double));
    union doubleBits lv = { 1ULL }; // 0...01
    dtruemin = lv.value;
    printf("dtruemin shift: %g\n", dtruemin);
    printData((unsigned char*)(&dtruemin), sizeof(double));
    lv.bits = ~0ULL;     // 1...1
    lv.bits >>= 1;       // 011...11
    lv.bits &= ~lu.bits; // 01111111111011...11
    dmax = lv.value;
    printf("dmax shifted: %g\n", dmax);
    printData((unsigned char*)(&dmax), sizeof(double));
    putchar('\n');

    /* ------------------------- long double ------------------------- */
    /*
    On the author's computer,
    sizeof(double) = sizeof(long) = sizeof(long long) = 8
    sizeof(long double) = 2 * sizeof(double) = 16
    `long double' stored on 128 bits =
    48 zeros (or garbage) + 80 bits (extended precision)
    */
    printf("long double: %d mantissa digits", LDBL_MANT_DIG);
    printf("\t[%Lg, %Lg]\n", LDBL_MIN, LDBL_MAX);
    printf("LDBL_MIN: %Lg\n", LDBL_MIN);
    long double ld = LDBL_MIN;
    printData2((unsigned char*)(&ld), sizeof(long double));
    printf("LDBL_TRUE_MIN: %Lg\n", LDBL_TRUE_MIN);
    ld = LDBL_TRUE_MIN;
    printData2((unsigned char*)(&ld), sizeof(long double));
    printf("LDBL_MAX: %Lg\n", LDBL_MAX);
    ld = LDBL_MAX;
    printData2((unsigned char*)(&ld), sizeof(long double));

    long double ldmin, ldtruemin, ldmax;
    ldmin = powl(2, -16382);
    long double ldmantissa = 1.0;
    for (int i = 1; i <= 63; i++) // 1 + 2^-1 + ... + 2^-63: all mantissa bits set
    {
        ldmantissa += powl(2, -i);
    }
    ldmax = ldmantissa * powl(2, 16383);
    printf("long double range (computed):\t[%Lg, %Lg]\n", ldmin, ldmax);
    printf("ldmin (computed): %Lg\n", ldmin);
    printData2((unsigned char*)(&ldmin), sizeof(long double));
    ldtruemin = powl(2, -16382) * powl(2, -63); // powl(2, -16445); // 2^(-16446) ~ 0
    printf("ldtruemin (comp): %Lg\n", ldtruemin);
    printData2((unsigned char*)(&ldtruemin), sizeof(long double));
    printf("ldmax (computed): %Lg\n", ldmax);
    printData2((unsigned char*)(&ldmax), sizeof(long double));

    /* word[0] holds the low 64 bits (mantissa), word[1] the exponent bits. */
    union longDoubleBits uarr = { {1, 1} }; // 0...01, 0...01
    uarr.word[0] <<= 63;                    // 10...0 (explicit integer bit)
    ldmin = uarr.value;
    printf("ldmin shifted: %Lg\n", ldmin);
    printData2((unsigned char*)(&ldmin), sizeof(long double));
    union longDoubleBits varr = { {1, 0} }; // 0...01, 0...0
    ldtruemin = varr.value;                 // 0...01
    printf("ldtruemin shift: %Lg\n", ldtruemin); // label fixed: was "dtruemin shift"
    printData2((unsigned char*)(&ldtruemin), sizeof(long double));
    varr.word[0] = varr.word[1] = ~0ULL; // 1...1
    varr.word[1] >>= 49;                 // 0...0111111111111111 (49 zeros)
    varr.word[1] &= ~uarr.word[1];       // 0..0111111111111110 (50 zeros)
    /*
    Alternative for varr.word[1]:
    shift right by 50, then left by 1 - gives the same 0..0111111111111110.
    */
    ldmax = varr.value;
    printf("ldmax shifted: %Lg\n", ldmax);
    printData2((unsigned char*)(&ldmax), sizeof(long double));

    return 0;
}

/*
 * Write the low eight bits of val to stdout as '0'/'1' characters,
 * most significant bit first. No newline is written.
 */
void printBinary(const unsigned char val)
{
    unsigned mask;

    /* Walk a single-bit mask from bit 7 down to bit 0. */
    for (mask = 0x80u; mask != 0; mask >>= 1)
    {
        putchar((val & mask) ? '1' : '0');
    }
}

/*
 * Write the size bytes starting at cp to stdout in binary on one line,
 * beginning with cp[size-1] and ending with cp[0], followed by a newline.
 * E.g. for sizeof(float) = 4 the order is cp[3], cp[2], cp[1], cp[0].
 */
void printData(unsigned char* cp, int size)
{
    int remaining = size;

    while (remaining > 0)
    {
        remaining--;
        printBinary(cp[remaining]);
    }
    putchar('\n');
}
/*
 * Write the size bytes starting at cp to stdout in binary on two lines
 * (used for long double): first the upper half, cp[size-1] down to
 * cp[size/2], then the lower half, cp[size/2 - 1] down to cp[0].
 * Each line ends with a newline. size is expected to be even,
 * e.g. 16 bytes printed as two groups of 8.
 */
void printData2(unsigned char* cp, int size)
{
    const int half = size / 2;
    int pos = size;

    while (pos > half) // upper half: cp[15], ..., cp[8]
    {
        pos--;
        printBinary(cp[pos]);
    }
    putchar('\n');

    while (pos > 0) // lower half: cp[7], ..., cp[0]
    {
        pos--;
        printBinary(cp[pos]);
    }
    putchar('\n');
}
/*
gcc -E floatlimits.c // preprocess to show the contents of header files
// On disk: /usr/lib/gcc/x86_64-linux-gnu/9/include/float.h
gcc -E floatlimits.c > headers.txt // save to file
// Compile and run:
gcc floatlimits.c -o floatlimits -lm // link math library
./floatlimits
FLT_DIG: 6, DBL_DIG: 15, LDBL_DIG: 18
FLT_DECIMAL_DIG: 9, DBL_DECIMAL_DIG: 17, LDBL_DECIMAL_DIG: 21
DECIMAL_DIG: 21
FLT_MIN_10_EXP: -37, DBL_MIN_10_EXP: -307, LDBL_MIN_10_EXP: -4931
FLT_MAX_10_EXP: 38, DBL_MAX_10_EXP: 308, LDBL_MAX_10_EXP: 4932
FLT_MIN_EXP: -125, DBL_MIN_EXP: -1021, LDBL_MIN_EXP: -16381
FLT_MAX_EXP: 128, DBL_MAX_EXP: 1024, LDBL_MAX_EXP: 16384
FLT_EPSILON: 1.19209e-07, DBL_EPSILON: 2.22045e-16, LDBL_EPSILON: 1.0842e-19
FLT_TRUE_MIN: 1.4013e-45, DBL_TRUE_MIN: 4.94066e-324, LDBL_TRUE_MIN: 3.6452e-4951
FLT_RADIX (Radix of exponent representation): 2 // base 2 (binary)
float: 24 mantissa digits [1.17549e-38, 3.40282e+38]
FLT_MIN: 1.17549e-38 00000000100000000000000000000000
FLT_TRUE_MIN: 1.4013e-45 00000000000000000000000000000001
FLT_MAX: 3.40282e+38 01111111011111111111111111111111
float range (computed): [1.17549e-38, 3.40282e+38]
fmin (computed): 1.17549e-38 00000000100000000000000000000000
ftruemin (comp): 1.4013e-45 00000000000000000000000000000001
fmax (computed): 3.40282e+38 01111111011111111111111111111111
fmin shifted: 1.17549e-38 00000000100000000000000000000000
ftruemin shift: 1.4013e-45 00000000000000000000000000000001
fmax shifted: 3.40282e+38 01111111011111111111111111111111

double: 53 mantissa digits [2.22507e-308, 1.79769e+308]
DBL_MIN: 2.22507e-308
0000000000010000000000000000000000000000000000000000000000000000
DBL_TRUE_MIN: 4.94066e-324
0000000000000000000000000000000000000000000000000000000000000001
DBL_MAX: 1.79769e+308
0111111111101111111111111111111111111111111111111111111111111111
double range (computed): [2.22507e-308, 1.79769e+308]
dmin (computed): 2.22507e-308
0000000000010000000000000000000000000000000000000000000000000000
dtruemin (comp): 4.94066e-324
0000000000000000000000000000000000000000000000000000000000000001
dmax (computed): 1.79769e+308
0111111111101111111111111111111111111111111111111111111111111111
dmin shifted: 2.22507e-308
0000000000010000000000000000000000000000000000000000000000000000
dtruemin shift: 4.94066e-324
0000000000000000000000000000000000000000000000000000000000000001
dmax shifted: 1.79769e+308
0111111111101111111111111111111111111111111111111111111111111111

long double: 64 mantissa digits [3.3621e-4932, 1.18973e+4932]
LDBL_MIN: 3.3621e-4932
0000000000000000000000000000000000000000000000000000000000000001
1000000000000000000000000000000000000000000000000000000000000000
LDBL_TRUE_MIN: 3.6452e-4951
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000001
LDBL_MAX: 1.18973e+4932
0000000000000000000000000000000000000000000000000111111111111110
1111111111111111111111111111111111111111111111111111111111111111
long double range (computed): [3.3621e-4932, 1.18973e+4932]
ldmin (computed): 3.3621e-4932
0000000000000000000000000000000000000000000000000000000000000001
1000000000000000000000000000000000000000000000000000000000000000
ldtruemin (comp): 3.6452e-4951
0000000000000000000000000000000000000000111100000000000000000000
0000000000000000000000000000000000000000000000000000000000000001
ldmax (computed): 1.18973e+4932
0000000000000000011111111111110001100111100011100111111111111110
1111111111111111111111111111111111111111111111111111111111111111
ldmin shifted: 3.3621e-4932
0000000000000000000000000000000000000000000000000000000000000001
1000000000000000000000000000000000000000000000000000000000000000
dtruemin shift: 3.6452e-4951
0000000000000000000000000000000000000000111100000000000000000000
0000000000000000000000000000000000000000000000000000000000000001
ldmax shifted: 1.18973e+4932
0000000000000000011111111111110000011011011001000111111111111110
1111111111111111111111111111111111111111111111111111111111111111
*/





Notes:  See Exercise_2-1 on the blog Kernighan_and_Ritchie, Chapter_2, Sec. 2.2, as well as Exercise_2-1 on clc-wiki-kr (solutions to exercises from "The C Programming Language", 2nd edition, by Kernighan and Ritchie).
See also the Single-precision, Double-precision, and Extended_precision (for long_double) floating-point formats on Wikipedia, as well as Exercise_3-29 (for printing the floats in binary in groups of bits).
Note how the preprocessor replaces the constants in the generated file (headers.txt, last part).











FloatLimits.cpp         download


#include <cfloat>
#include <cmath> // for pow() - double, powl() - long double
#include <cstring> // for std::memcpy()
#include <iostream>
using std::cout;
using std::endl;

// Helpers that write raw object bytes to cout as '0'/'1' characters.
void printBinary(const unsigned char val);    // one byte, most significant bit first
void printData(unsigned char* cp, int size);  // size bytes, cp[size-1] first, on one line
void printData2(unsigned char* cp, int size); // size bytes, split over two lines

/*
 * Print the <cfloat> limits of float, double and long double, then, for each
 * type, print the value and the bit pattern of the smallest normal, smallest
 * subnormal and largest finite number obtained in three ways:
 *   1. from the <cfloat> constant,
 *   2. computed with pow() / powl(),
 *   3. assembled directly from integer bits ("shifted").
 *
 * The "shifted" part copies integer bit patterns into floating-point objects
 * with std::memcpy instead of dereferencing a cast pointer, which breaks the
 * strict-aliasing rule and may read a misaligned long double. 64-bit patterns
 * are held in unsigned long long, which is at least 64 bits wide everywhere
 * (unsigned long is only 32 bits on some platforms).
 *
 * NOTE: the bit layouts used below assume a long double stored in two
 * little-endian 64-bit words (80-bit extended precision plus padding), as on
 * the author's x86-64 machine - results are meaningless elsewhere.
 *
 * Returns 0.
 */
int main()
{
    // The byte copies below rely on these size relations.
    static_assert(sizeof(float) == sizeof(unsigned),
                  "float bit patterns are built in an unsigned");
    static_assert(sizeof(double) == sizeof(unsigned long long),
                  "double bit patterns are built in an unsigned long long");
    static_assert(sizeof(long double) <= 2 * sizeof(unsigned long long),
                  "long double bit patterns are built in two unsigned long long");

    // Decimal digits of precision.
    cout << "FLT_DIG: " << FLT_DIG << ", DBL_DIG: " << DBL_DIG
         << ", LDBL_DIG: " << LDBL_DIG << endl;
    cout << "FLT_DECIMAL_DIG: " << FLT_DECIMAL_DIG << ", DBL_DECIMAL_DIG: " <<
        DBL_DECIMAL_DIG << ", LDBL_DECIMAL_DIG: " << LDBL_DECIMAL_DIG << endl;
    cout << "DECIMAL_DIG: " << DECIMAL_DIG << endl;

    // Decimal exponent range.
    cout << "FLT_MIN_10_EXP: " << FLT_MIN_10_EXP << ", DBL_MIN_10_EXP: " <<
        DBL_MIN_10_EXP << ", LDBL_MIN_10_EXP: " << LDBL_MIN_10_EXP << endl;
    cout << "FLT_MAX_10_EXP: " << FLT_MAX_10_EXP << ", DBL_MAX_10_EXP: " <<
        DBL_MAX_10_EXP << ", LDBL_MAX_10_EXP: " << LDBL_MAX_10_EXP << endl;

    // Binary (radix) exponent range.
    cout << "FLT_MIN_EXP: " << FLT_MIN_EXP << ", DBL_MIN_EXP: " <<
        DBL_MIN_EXP << ", LDBL_MIN_EXP: " << LDBL_MIN_EXP << endl;
    cout << "FLT_MAX_EXP: " << FLT_MAX_EXP << ", DBL_MAX_EXP: " <<
        DBL_MAX_EXP << ", LDBL_MAX_EXP: " << LDBL_MAX_EXP << endl;

    // Epsilon and smallest subnormal.
    cout << "FLT_EPSILON: " << FLT_EPSILON << ", DBL_EPSILON: " <<
        DBL_EPSILON << ", LDBL_EPSILON: " << LDBL_EPSILON << endl;
    cout << "FLT_TRUE_MIN: " << FLT_TRUE_MIN << ", DBL_TRUE_MIN: " <<
        DBL_TRUE_MIN << ", LDBL_TRUE_MIN: " << LDBL_TRUE_MIN << endl;

    // ---------------------------- float ----------------------------
    cout << "FLT_RADIX (Radix of exponent representation): "
         << FLT_RADIX << endl;
    cout << "float: " << FLT_MANT_DIG << " mantissa digits";
    cout << "\t[" << FLT_MIN << ", " << FLT_MAX << "]" << endl;
    cout << "FLT_MIN: " << FLT_MIN << "\t\t";
    float f = FLT_MIN;
    printData(reinterpret_cast<unsigned char*>(&f), sizeof(float));
    cout << "FLT_TRUE_MIN: " << FLT_TRUE_MIN << "\t";
    f = FLT_TRUE_MIN;
    printData(reinterpret_cast<unsigned char*>(&f), sizeof(float));
    cout << "FLT_MAX: " << FLT_MAX << "\t\t";
    f = FLT_MAX;
    printData(reinterpret_cast<unsigned char*>(&f), sizeof(float));

    // Same three values computed from powers of two.
    float fmin, ftruemin, fmax;
    fmin = pow(2, -126);
    float fmantissa = 1.0;
    for (int i = 1; i <= 23; i++) // 1 + 2^-1 + ... + 2^-23: all mantissa bits set
    {
        fmantissa += pow(2, -i);
    }
    fmax = fmantissa * pow(2, 127);
    cout << "float range (computed):\t\t[" << fmin << ", " << fmax << "]\n";
    cout << "fmin (computed): " << fmin << "\t";
    printData(reinterpret_cast<unsigned char*>(&fmin), sizeof(float));
    ftruemin = pow(2, -126) * pow(2, -23); // pow(2, -149); // 2^(-150) ~ 0
    cout << "ftruemin (comp): " << ftruemin << "\t";
    printData(reinterpret_cast<unsigned char*>(&ftruemin), sizeof(float));
    cout << "fmax (computed): " << fmax << "\t";
    printData(reinterpret_cast<unsigned char*>(&fmax), sizeof(float));

    // Same three values assembled from bits.
    unsigned u = 1; // 00000000000000000000000000000001
    u <<= 23;       // 00000000100000000000000000000000
    std::memcpy(&fmin, &u, sizeof fmin);
    cout << "fmin shifted: " << fmin << "\t";
    printData(reinterpret_cast<unsigned char*>(&fmin), sizeof(float));
    unsigned v = 1; // 00000000000000000000000000000001
    std::memcpy(&ftruemin, &v, sizeof ftruemin);
    cout << "ftruemin shift: " << ftruemin << "\t";
    printData(reinterpret_cast<unsigned char*>(&ftruemin), sizeof(float));
    v = ~0u; // 11111111111111111111111111111111
    v >>= 1; // 01111111111111111111111111111111
    v &= ~u; // 01111111011111111111111111111111
    std::memcpy(&fmax, &v, sizeof fmax);
    cout << "fmax shifted: " << fmax << "\t";
    printData(reinterpret_cast<unsigned char*>(&fmax), sizeof(float));
    cout << endl;

    // ---------------------------- double ---------------------------
    cout << "double: " << DBL_MANT_DIG << " mantissa digits";
    cout << "\t[" << DBL_MIN << ", " << DBL_MAX << "]" << endl;
    cout << "DBL_MIN: " << DBL_MIN << endl;
    double d = DBL_MIN;
    printData(reinterpret_cast<unsigned char*>(&d), sizeof(double));
    cout << "DBL_TRUE_MIN: " << DBL_TRUE_MIN << endl;
    d = DBL_TRUE_MIN;
    printData(reinterpret_cast<unsigned char*>(&d), sizeof(double));
    cout << "DBL_MAX: " << DBL_MAX << endl;
    d = DBL_MAX;
    printData(reinterpret_cast<unsigned char*>(&d), sizeof(double));

    double dmin, dtruemin, dmax;
    dmin = pow(2, -1022);
    double dmantissa = 1.0;
    for (int i = 1; i <= 52; i++) // 1 + 2^-1 + ... + 2^-52: all mantissa bits set
    {
        dmantissa += pow(2, -i);
    }
    dmax = dmantissa * pow(2, 1023);
    cout << "double range (computed):\t[" << dmin << ", " << dmax << "]\n";
    cout << "dmin (computed): " << dmin << endl;
    printData(reinterpret_cast<unsigned char*>(&dmin), sizeof(double));
    dtruemin = pow(2, -1022) * pow(2, -52); // pow(2, -1074); // 2^(-1075) ~ 0
    cout << "dtruemin (comp): " << dtruemin << endl;
    printData(reinterpret_cast<unsigned char*>(&dtruemin), sizeof(double));
    cout << "dmax (computed): " << dmax << endl;
    printData(reinterpret_cast<unsigned char*>(&dmax), sizeof(double));

    unsigned long long lu = 1; // 0...01
    lu <<= 52;                 // 0000000000010...0
    std::memcpy(&dmin, &lu, sizeof dmin);
    cout << "dmin shifted: " << dmin << endl;
    printData(reinterpret_cast<unsigned char*>(&dmin), sizeof(double));
    unsigned long long lv = 1; // 0...01
    std::memcpy(&dtruemin, &lv, sizeof dtruemin);
    cout << "dtruemin shift: " << dtruemin << endl;
    printData(reinterpret_cast<unsigned char*>(&dtruemin), sizeof(double));
    lv = ~0ULL; // 1...1
    lv >>= 1;   // 011...11
    lv &= ~lu;  // 01111111111011...11
    std::memcpy(&dmax, &lv, sizeof dmax);
    cout << "dmax shifted: " << dmax << endl;
    printData(reinterpret_cast<unsigned char*>(&dmax), sizeof(double));
    cout << endl;

    // ------------------------- long double -------------------------
    /*
    On the author's computer,
    sizeof(double) = sizeof(long) = sizeof(long long) = 8
    sizeof(long double) = 2 * sizeof(double) = 16
    `long double' stored on 128 bits =
    48 zeros (or garbage) + 80 bits (extended precision)
    */
    cout << "long double: " << LDBL_MANT_DIG << " mantissa digits";
    cout << "\t[" << LDBL_MIN << ", " << LDBL_MAX << "]" << endl;
    cout << "LDBL_MIN: " << LDBL_MIN << endl;
    long double ld = LDBL_MIN;
    printData2(reinterpret_cast<unsigned char*>(&ld), sizeof(long double));
    cout << "LDBL_TRUE_MIN: " << LDBL_TRUE_MIN << endl;
    ld = LDBL_TRUE_MIN;
    printData2(reinterpret_cast<unsigned char*>(&ld), sizeof(long double));
    cout << "LDBL_MAX: " << LDBL_MAX << endl;
    ld = LDBL_MAX;
    printData2(reinterpret_cast<unsigned char*>(&ld), sizeof(long double));

    long double ldmin, ldtruemin, ldmax;
    ldmin = powl(2, -16382);
    long double ldmantissa = 1.0;
    for (int i = 1; i <= 63; i++) // 1 + 2^-1 + ... + 2^-63: all mantissa bits set
    {
        ldmantissa += powl(2, -i);
    }
    ldmax = ldmantissa * powl(2, 16383);
    cout << "long double range (computed):\t[" << ldmin << ", " << ldmax << "]\n";
    cout << "ldmin (computed): " << ldmin << endl;
    printData2(reinterpret_cast<unsigned char*>(&ldmin), sizeof(long double));
    ldtruemin = powl(2, -16382) * powl(2, -63); // powl(2, -16445); // 2^(-16446) ~ 0
    cout << "ldtruemin (comp): " << ldtruemin << endl;
    printData2(reinterpret_cast<unsigned char*>(&ldtruemin), sizeof(long double));
    cout << "ldmax (computed): " << ldmax << endl;
    printData2(reinterpret_cast<unsigned char*>(&ldmax), sizeof(long double));

    // uarr[0] holds the low 64 bits (mantissa), uarr[1] the exponent bits.
    unsigned long long uarr[2] = {1, 1}; // 0...01, 0...01
    uarr[0] <<= 63;                      // 10...0 (explicit integer bit)
    std::memcpy(&ldmin, uarr, sizeof ldmin);
    cout << "ldmin shifted: " << ldmin << endl;
    printData2(reinterpret_cast<unsigned char*>(&ldmin), sizeof(long double));
    unsigned long long varr[2] = {1, 0}; // 0...01, 0...0
    std::memcpy(&ldtruemin, varr, sizeof ldtruemin); // 0...01
    cout << "ldtruemin shift: " << ldtruemin << endl; // label fixed: was "dtruemin shift"
    printData2(reinterpret_cast<unsigned char*>(&ldtruemin), sizeof(long double));
    varr[0] = varr[1] = ~0ULL; // 1...1
    varr[1] >>= 49;            // 0...0111111111111111 (49 zeros)
    varr[1] &= ~uarr[1];       // 0..0111111111111110 (50 zeros)
    /*
    Alternative for varr[1]:
    shift right by 50, then left by 1 - gives the same 0..0111111111111110.
    */
    std::memcpy(&ldmax, varr, sizeof ldmax);
    cout << "ldmax shifted: " << ldmax << endl;
    printData2(reinterpret_cast<unsigned char*>(&ldmax), sizeof(long double));

    return 0;
}

/*
 * Write the low eight bits of val to cout as '0'/'1' characters,
 * most significant bit first. No newline is written.
 */
void printBinary(const unsigned char val)
{
    // Walk a single-bit mask from bit 7 down to bit 0.
    for (unsigned mask = 0x80u; mask != 0; mask >>= 1)
    {
        const char digit = (val & mask) ? '1' : '0';
        cout << digit;
    }
}

/*
 * Write the size bytes starting at cp to cout in binary on one line,
 * beginning with cp[size-1] and ending with cp[0], followed by endl.
 * E.g. for sizeof(float) = 4 the order is cp[3], cp[2], cp[1], cp[0].
 */
void printData(unsigned char* cp, int size)
{
    int remaining = size;

    while (remaining > 0)
    {
        remaining--;
        printBinary(cp[remaining]);
    }
    cout << endl;
}
/*
 * Write the size bytes starting at cp to cout in binary on two lines
 * (used for long double): first the upper half, cp[size-1] down to
 * cp[size/2], then the lower half, cp[size/2 - 1] down to cp[0].
 * Each line ends with endl. size is expected to be even,
 * e.g. 16 bytes printed as two groups of 8.
 */
void printData2(unsigned char* cp, int size)
{
    const int half = size / 2;
    int pos = size;

    while (pos > half) // upper half: cp[15], ..., cp[8]
    {
        pos--;
        printBinary(cp[pos]);
    }
    cout << endl;

    while (pos > 0) // lower half: cp[7], ..., cp[0]
    {
        pos--;
        printBinary(cp[pos]);
    }
    cout << endl;
}
/*
g++ -std=c++17 -E FloatLimits.cpp // preprocess to show contents of headers
// c++17 for FLT_DECIMAL_DIG, DBL_DECIMAL_DIG, LDBL_DECIMAL_DIG,
// FLT_TRUE_MIN, DBL_TRUE_MIN, LDBL_TRUE_MIN
// On disk: /usr/lib/gcc/x86_64-linux-gnu/9/include/float.h
// /usr/include/c++/9/cfloat
g++ -std=c++17 -E FloatLimits.cpp > Headers.txt // save to file
// Compile and run:
g++ -std=c++17 FloatLimits.cpp -o FloatLimits
./FloatLimits
FLT_DIG: 6, DBL_DIG: 15, LDBL_DIG: 18
FLT_DECIMAL_DIG: 9, DBL_DECIMAL_DIG: 17, LDBL_DECIMAL_DIG: 21
DECIMAL_DIG: 21
FLT_MIN_10_EXP: -37, DBL_MIN_10_EXP: -307, LDBL_MIN_10_EXP: -4931
FLT_MAX_10_EXP: 38, DBL_MAX_10_EXP: 308, LDBL_MAX_10_EXP: 4932
FLT_MIN_EXP: -125, DBL_MIN_EXP: -1021, LDBL_MIN_EXP: -16381
FLT_MAX_EXP: 128, DBL_MAX_EXP: 1024, LDBL_MAX_EXP: 16384
FLT_EPSILON: 1.19209e-07, DBL_EPSILON: 2.22045e-16, LDBL_EPSILON: 1.0842e-19
FLT_TRUE_MIN: 1.4013e-45, DBL_TRUE_MIN: 4.94066e-324, LDBL_TRUE_MIN: 3.6452e-4951
FLT_RADIX (Radix of exponent representation): 2 // base 2 (binary)
float: 24 mantissa digits [1.17549e-38, 3.40282e+38]
FLT_MIN: 1.17549e-38 00000000100000000000000000000000
FLT_TRUE_MIN: 1.4013e-45 00000000000000000000000000000001
FLT_MAX: 3.40282e+38 01111111011111111111111111111111
float range (computed): [1.17549e-38, 3.40282e+38]
fmin (computed): 1.17549e-38 00000000100000000000000000000000
ftruemin (comp): 1.4013e-45 00000000000000000000000000000001
fmax (computed): 3.40282e+38 01111111011111111111111111111111
fmin shifted: 1.17549e-38 00000000100000000000000000000000
ftruemin shift: 1.4013e-45 00000000000000000000000000000001
fmax shifted: 3.40282e+38 01111111011111111111111111111111

double: 53 mantissa digits [2.22507e-308, 1.79769e+308]
DBL_MIN: 2.22507e-308
0000000000010000000000000000000000000000000000000000000000000000
DBL_TRUE_MIN: 4.94066e-324
0000000000000000000000000000000000000000000000000000000000000001
DBL_MAX: 1.79769e+308
0111111111101111111111111111111111111111111111111111111111111111
double range (computed): [2.22507e-308, 1.79769e+308]
dmin (computed): 2.22507e-308
0000000000010000000000000000000000000000000000000000000000000000
dtruemin (comp): 4.94066e-324
0000000000000000000000000000000000000000000000000000000000000001
dmax (computed): 1.79769e+308
0111111111101111111111111111111111111111111111111111111111111111
dmin shifted: 2.22507e-308
0000000000010000000000000000000000000000000000000000000000000000
dtruemin shift: 4.94066e-324
0000000000000000000000000000000000000000000000000000000000000001
dmax shifted: 1.79769e+308
0111111111101111111111111111111111111111111111111111111111111111

long double: 64 mantissa digits [3.3621e-4932, 1.18973e+4932]
LDBL_MIN: 3.3621e-4932
0000000000000000000000000000000000000000000000000000000000000001
1000000000000000000000000000000000000000000000000000000000000000
LDBL_TRUE_MIN: 3.6452e-4951
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000001
LDBL_MAX: 1.18973e+4932
0000000000000000000000000000000000000000000000000111111111111110
1111111111111111111111111111111111111111111111111111111111111111
long double range (computed): [3.3621e-4932, 1.18973e+4932]
ldmin (computed): 3.3621e-4932
0000000000000000010101011001100000010111000011110000000000000001
1000000000000000000000000000000000000000000000000000000000000000
ldtruemin (comp): 3.6452e-4951
0000000000000000000000000000000100000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000001
ldmax (computed): 1.18973e+4932
0000000000000000010101011001100000010111000011110111111111111110
1111111111111111111111111111111111111111111111111111111111111111
ldmin shifted: 3.3621e-4932
0000000000000000010101011001100000010111000011110000000000000001
1000000000000000000000000000000000000000000000000000000000000000
dtruemin shift: 3.6452e-4951
0000000000000000000000000000000100000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000001
ldmax shifted: 1.18973e+4932
0000000000000000010101011001100000010111000011110111111111111110
1111111111111111111111111111111111111111111111111111111111111111
*/





Notes:  We have to preprocess and compile with the option for C++ 2017 (-std=c++17) for the constants FLT_DECIMAL_DIG, DBL_DECIMAL_DIG, LDBL_DECIMAL_DIG, FLT_TRUE_MIN, DBL_TRUE_MIN, LDBL_TRUE_MIN.
Note how the preprocessor replaces these constants (and others) in the generated file (Headers.txt, last part).

Note the difference in syntax for casting in C and C++:
(long double*)uarr
reinterpret_cast<long double*>(uarr)
reinterpret_cast<long double*>uarr // compile error









Chapter_3     Exercise_3-22     FloatBinary BACK_TO_TOP Exercise_3-25     Exercise_3-23



Comments

Popular posts from this blog

Contents