Investigating floats

float variables are, on most current hardware, instrumented as as single precision IEEE 754 value. We know that floating point values only approximate real values, but it is often hard to visualise. When we have the binary decimal 0.12345 we are really saying something like 1*10-1 + 2*10-2 + 3*10-3 .... Thus a binary decimal 0.10110 describes 1*2-1 + 0*2-2 + 1*2-3 ...

So clearly, with one decimal digit we can describe the range 0.0 - 0.9, but with one binary digit we can describe only 0 and 1/2 = 0.5. The fractions we can represent exactly are the dyadic rationals. Other rationals repeat (just like in base 10) and others are irrational. So for example, 0.1 can not be repesented exactly without infinite precision.

The program below illustrates how a single precision floating point value works by showing what bits are set and adding them up.

For example

$ ./float .5
0.500000 = 1 * (1/2^0) * 2^-1
0.500000 = 1 * (1/1) * 2^-1
0.500000 = 1 * 1 * 0.500000
0.500000 = 0.5
$ ./float .1
0.100000 = 1 * (1/2^0 + 1/2^1 + 1/2^4 + 1/2^5 + 1/2^8 + 1/2^9 + 1/2^12 + 1/2^13 + 1/2^16 + 1/2^17 + 1/2^20 + 1/2^21 + 1/2^23) * 2^-4
0.100000 = 1 * (1/1 + 1/2 + 1/16 + 1/32 + 1/256 + 1/512 + 1/4096 + 1/8192 + 1/65536 + 1/131072 + 1/1048576 + 1/2097152 + 1/8388608) * 2^-4
0.100000 = 1 * 1.60000002384 * 0.062500
0.100000 = 0.10000000149

I use doubles to show how the errors creep in.

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* return 2^n */
int two_to_pos(int n)
{
    if (n == 0)
        return 1;
    return 2 * two_to_pos(n - 1);
}

double two_to_neg(int n)
{
    if (n == 0)
        return 1;
    return 1.0 / (two_to_pos(abs(n)));
}

double two_to(int n)
{
    if (n >= 0)
        return two_to_pos(n);
    if (n < 0)
        return two_to_neg(n);
    return 0;
}

/* Go through some memory "m" which is the 24 bit significand of a
   floating point number.  We work "backwards" from the bits
   furthest on the right, for no particular reason. */
double calc_float(int m, int bit)
{
    /* 23 bits; this terminates recursion */
    if (bit > 23)
        return 0;

    /* if the bit is set, it represents the value 1/2^bit */
    if ((m >> bit) & 1)
        return 1.0L/two_to(23 - bit) + calc_float(m, bit + 1);

    /* otherwise go to the next bit */
    return calc_float(m, bit + 1);
}

int main(int argc, char *argv[])
{
    float f;
    int m,i,sign,exponent,significand;

    if (argc != 2)
    {
        printf("usage: float 123.456\n");
        exit(1);
    }

    if (sscanf(argv[1], "%f", &f) != 1)
    {
        printf("invalid input\n");
        exit(1);
    }

    /* We need to "fool" the compiler, as if we start to use casts
       (e.g. (int)f) it will actually do a conversion for us.  We
       want access to the raw bits, so we just copy it into a same
       sized variable. */
    memcpy(&m, &f, 4);

    /* The sign bit is the first bit */
    sign = (m >> 31) & 0x1;

    /* Exponent is 8 bits following the sign bit */
    exponent = ((m >> 23) & 0xFF) - 127;

    /* Significand fills out the float, the first bit is implied
       to be 1, hence the 24 bit OR value below. */
    significand = (m & 0x7FFFFF) | 0x800000;

    /* print out a power representation */
    printf("%f = %d * (", f, sign ? -1 : 1);
    for(i = 23 ; i >= 0 ; i--)
    {
        if ((significand >> i) & 1)
            printf("%s1/2^%d", (i == 23) ? "" : " + ",
                   23-i);
    }
    printf(") * 2^%d\n", exponent);

    /* print out a fractional representation */
    printf("%f = %d * (", f, sign ? -1 : 1);
    for(i = 23 ; i >= 0 ; i--)
    {
        if ((significand >> i) & 1)
            printf("%s1/%d", (i == 23) ? "" : " + ",
                   (int)two_to(23-i));
    }
    printf(") * 2^%d\n", exponent);

    /* convert this into decimal and print it out */
    printf("%f = %d * %.12g * %f\n",
           f,
           (sign ? -1 : 1),
           calc_float(significand, 0),
           two_to(exponent));

    /* do the math this time */
    printf("%f = %.12g\n",
           f,
           (sign ? -1 : 1) *
           calc_float(significand, 0) *
           two_to(exponent)
        );

    return 0;
}