C compose unsigned 32-bit integer from four 8-bit integers

153 views Asked by At

On an 8-bit platform, I am composing an unsigned 32-bit integer from 4 8-bit integers like this:

// Assemble a 32-bit value from four bytes; buf[0] ends up in bits 31..24
// and buf[3] in bits 7..0.
uint8_t buf[4];
uint32_t large = 0;
// The casts widen the byte to 32 bits before shifting.
large |= ((uint32_t)buf[0]) << 24;
large |= ((uint32_t)buf[1]) << 16;
// No cast here: the shift is performed in the promoted type of buf[2].
// NOTE(review): the shift-count warning for "<< 16" indicates that the promoted
// type is only 16 bits wide on this target, so a byte >= 0x80 shifted left by 8
// may not fit - confirm whether a (uint32_t) cast is needed here as well.
large |= buf[2] << 8;
large |= buf[3] << 0;

Without the casts the compiler understandably complains:

bmp.c:100:23: warning: left shift count >= width of type [-Wshift-count-overflow]
  100 |     large |= (buf[1]) << 16;
      |                       ^~

Are these casts expensive (I would guess yes) and can this be done more efficiently?

Here is what I think is the relevant disassembly from avr-gcc (GCC) 13.2.0:

000060ee <.L29>:
        large |= ((uint32_t)buf[1]) << 16;
    60ee:       91 2c           mov     r9, r1
    60f0:       a1 2c           mov     r10, r1         
    60f2:       b1 2c           mov     r11, r1

000060f4 <.Loc.91>:
        large |= buf[3] << 0;   
    60f4:       a9 2a           or      r10, r25
    
000060f6 <.Loc.92>:
        large |= buf[2] << 8;
    60f6:       50 e0           ldi     r21, 0x00       ; 0
    
000060f8 <.Loc.93>:
    60f8:       54 2f           mov     r21, r20
    60fa:       44 27           eor     r20, r20
    60fc:       05 2e           mov     r0, r21 
    60fe:       00 0c           add     r0, r0
    6100:       66 0b           sbc     r22, r22
    6102:       77 0b           sbc     r23, r23

00006104 <.Loc.94>:
        large |= buf[3] << 0;
    6104:       84 2a           or      r8, r20 
    6106:       95 2a           or      r9, r21 
    6108:       a6 2a           or      r10, r22
    610a:       b7 2a           or      r11, r23
    610c:       b8 2a           or      r11, r24
    610e:       80 92 04 01     sts     0x0104, r8      ; 0x800104 <large>
    6112:       90 92 05 01     sts     0x0105, r9      ; 0x800105 <large+0x1>
    6116:       a0 92 06 01     sts     0x0106, r10     ; 0x800106 <large+0x2>
    611a:       b0 92 07 01     sts     0x0107, r11     ; 0x800107 <large+0x3>
3

There are 3 answers

7
gulpr On BEST ANSWER

The single expression you suggest yields 15 instead of 20 instructions if I am interpreting the disassembly correctly - nice!

No - the problem is the undefined/implementation-defined behaviour in the code; when it is written correctly, it does not matter. I would also suggest using pointer notation in parameters (as C passes arrays as pointers) and declaring parameters as const if the function does not change them. It helps the compiler with optimizations (even setting const correctness aside).

// Builds a 32-bit value from four bytes, most significant byte first:
// buf[0] -> bits 31..24, buf[1] -> bits 23..16, buf[2] -> bits 15..8,
// buf[3] -> bits 7..0. Each byte that is shifted is widened to uint32_t
// first so the shift is carried out in a 32-bit type.
uint32_t foo(const uint8_t *buf)
{
    uint32_t word = (uint32_t)buf[0] << 24;

    word |= (uint32_t)buf[1] << 16;
    word |= (uint32_t)buf[2] << 8;
    word |= buf[3];

    return word;
}


// Single-expression form of the same composition: the four bytes of buf are
// combined most significant byte first (buf[0] in bits 31..24, buf[3] in
// bits 7..0). Every shifted byte is cast to uint32_t before the shift.
uint32_t bar(const uint8_t *buf)
{
    return ((uint32_t)buf[0] << 24)
         | ((uint32_t)buf[1] << 16)
         | ((uint32_t)buf[2] << 8)
         | buf[3];
}

Both generate the same machine code:

; Compiler output for the stepwise shift-and-OR foo(): four byte loads followed
; by a call to the byte-swap helper.
; NOTE(review): r25:r24 presumably carries the buf pointer on entry and r25..r22
; the uint32_t result on return (avr-gcc calling convention) - confirm.
foo:
.L__stack_usage = 0
        mov r30,r24             ; Z (r31:r30) low byte  <- r24
        mov r31,r25             ; Z (r31:r30) high byte <- r25
        ld r22,Z                ; r22 <- byte at Z
        ldd r23,Z+1             ; r23 <- byte at Z+1
        ldd r24,Z+2             ; r24 <- byte at Z+2
        ldd r25,Z+3             ; r25 <- byte at Z+3
        rcall __bswapsi2        ; libgcc helper; presumably reverses the four bytes in r25..r22
        ret
; Compiler output for the single-expression bar(): the same instruction
; sequence as the listing for foo.
; NOTE(review): r25:r24 presumably carries the buf pointer on entry and r25..r22
; the uint32_t result on return (avr-gcc calling convention) - confirm.
bar:
.L__stack_usage = 0
        mov r30,r24             ; Z (r31:r30) low byte  <- r24
        mov r31,r25             ; Z (r31:r30) high byte <- r25
        ld r22,Z                ; r22 <- byte at Z
        ldd r23,Z+1             ; r23 <- byte at Z+1
        ldd r24,Z+2             ; r24 <- byte at Z+2
        ldd r25,Z+3             ; r25 <- byte at Z+3
        rcall __bswapsi2        ; libgcc helper; presumably reverses the four bytes in r25..r22
        ret

https://godbolt.org/z/b7o4114EP

Also, the AVR compiler assumes little-endian, and you are "composing" the uint32_t number from a big-endian representation.

If the endianness matches, then I would suggest using memcpy:

memcpy(&large, buff, sizeof(large));

Optimizing compilers will not actually call memcpy:

// Returns the four bytes at buf reinterpreted as a uint32_t in the target's
// native byte order. memcpy copies the object representation as-is, so no
// byte reordering takes place; this is only the wanted result when the byte
// order of buf matches that of the target.
uint32_t bar(uint8_t buf[4])
{
    uint32_t large;
    memcpy(&large, buf, sizeof(large));
    return large;
}

; Compiler output for the memcpy-based bar(): four byte loads in ascending
; address order and no call to memcpy or to a byte-swap helper.
; NOTE(review): r25:r24 presumably carries the buf pointer on entry and r25..r22
; the uint32_t result on return (avr-gcc calling convention) - confirm.
bar:
.L__stack_usage = 0
        mov r30,r24             ; Z (r31:r30) low byte  <- r24
        mov r31,r25             ; Z (r31:r30) high byte <- r25
        ld r22,Z                ; r22 <- byte at Z
        ldd r23,Z+1             ; r23 <- byte at Z+1
        ldd r24,Z+2             ; r24 <- byte at Z+2
        ldd r25,Z+3             ; r25 <- byte at Z+3
        ret

But, more interestingly, using unions makes the code much more efficient if the buf data is big-endian:

// Copies buf[0..3] into the byte view of a union in the same order and reads
// the result back through the 32-bit member, i.e. the bytes keep the order
// they have in buf.
uint32_t foo(const uint8_t *buf)
{
    union
    {
        uint32_t word;
        uint8_t bytes[4];
    } view;

    view.bytes[0] = buf[0];
    view.bytes[1] = buf[1];
    view.bytes[2] = buf[2];
    view.bytes[3] = buf[3];

    return view.word;
}

// Copies buf[0..3] into the byte view of a union in reversed order
// (bytes[0] = buf[3] ... bytes[3] = buf[0]) and reads the result back
// through the 32-bit member.
uint32_t bar(const uint8_t *buf)
{
    union
    {
        uint32_t word;
        uint8_t bytes[4];
    } view;

    view.bytes[3] = buf[0];
    view.bytes[2] = buf[1];
    view.bytes[1] = buf[2];
    view.bytes[0] = buf[3];

    return view.word;
}

and the resulting code:

; Compiler output for the union-based foo(): the four bytes are loaded into
; r22..r25 from ascending addresses (Z, Z+1, Z+2, Z+3); no helper is called.
; NOTE(review): r25:r24 presumably carries the buf pointer on entry and r25..r22
; the uint32_t result on return (avr-gcc calling convention) - confirm.
foo:
.L__stack_usage = 0
        mov r30,r24             ; Z (r31:r30) low byte  <- r24
        mov r31,r25             ; Z (r31:r30) high byte <- r25
        ldd r23,Z+1             ; r23 <- byte at Z+1
        ld r22,Z                ; r22 <- byte at Z
        ldd r24,Z+2             ; r24 <- byte at Z+2
        ldd r25,Z+3             ; r25 <- byte at Z+3
ret
; Compiler output for the union-based bar(): the four bytes are loaded into
; r22..r25 from descending addresses (Z+3, Z+2, Z+1, Z), so the reversal is
; done by the loads themselves and no byte-swap helper is called.
; NOTE(review): r25:r24 presumably carries the buf pointer on entry and r25..r22
; the uint32_t result on return (avr-gcc calling convention) - confirm.
bar:
.L__stack_usage = 0
        mov r30,r24             ; Z (r31:r30) low byte  <- r24
        mov r31,r25             ; Z (r31:r30) high byte <- r25
        ldd r23,Z+2             ; r23 <- byte at Z+2
        ldd r22,Z+3             ; r22 <- byte at Z+3
        ldd r24,Z+1             ; r24 <- byte at Z+1
        ld r25,Z                ; r25 <- byte at Z
ret
3
Eric Postpischil On

Are these casts expensive (I would guess yes)…

No.

… and can this be done more efficiently?

Knowledge of idioms for merging bytes into a multiple-byte word using shifts and ORs has long been built into GCC. However, because int is 16 bits in the C implementation you are using, you need a cast on buf[2] as in:

// Merges four bytes into one 32-bit word with shifts and ORs, buf[0] being
// the most significant byte. The three shifted bytes are cast to uint32_t
// first; buf[3] is not shifted and needs no cast.
uint32_t bar(uint8_t buf[4])
{
    return buf[3]
         | ((uint32_t)buf[2] << 8)
         | ((uint32_t)buf[1] << 16)
         | ((uint32_t)buf[0] << 24);
}
0
emacs drives me nuts On

Just for completeness, one can use vector types, which are a GNU C extension:

#include <stdint.h>

// A vector of 4 x uint8_t, declared with the GNU C vector_size attribute
// (total size 4 bytes, the same size as a uint32_t).
typedef uint8_t __attribute__((vector_size(4))) v4u8_t;

// Places four bytes of buf into a 4-byte vector and returns that vector
// converted to uint32_t by a cast between two same-sized types.
// NOTE(review): the elements are listed as buf[1], buf[0], buf[3], buf[2],
// i.e. the bytes are swapped within each 16-bit half. This is not the same
// order as buf[0] << 24 | buf[1] << 16 | buf[2] << 8 | buf[3] - confirm that
// this element order is intended.
uint32_t func1 (const uint8_t *buf) {
    v4u8_t v4 = { buf[1], buf[0], buf[3], buf[2] };
    
    // Presumably a bit-for-bit reinterpretation of the vector (GNU C permits
    // casts between a vector and another type of the same size) - confirm.
    return (uint32_t) v4;
}

avr-gcc v13 generates the following code with -Os -mmcu=atmega8:

    movw r30,r24
    ldd r22,Z+1
    ld r23,Z
    ldd r24,Z+3
    ldd r25,Z+2

Sadly, with v14 there are no more four 1-byte loads but one 32-bit load with additional code:

    movw r30,r24
    ld r22,Z
    ldd r23,Z+1
    ldd r24,Z+2
    ldd r25,Z+3
    rcall __bswapsi2
    movw r18,r22
    movw r22,r24
    movw r24,r18