ARM support: provide compiler a better popcount function

Just the 32-bit one for now. The default uses lookup tables and is ungainly and bloated. Change-Id: I4a2eb31defb1f4d6f6853b65fe6dacc380d6ffc0
2025-11-09 21:22:39 -05:00 · 2017-09-07 15:41:52 -04:00 · 2017-09-07 15:41:52 -04:00 · c6d5cd74a8
commit c6d5cd74a8
parent 28591f2e92
1 changed files with 31 additions and 0 deletions
--- a/lib/arm_support/support-arm.S
+++ b/lib/arm_support/support-arm.S
@ -701,3 +701,34 @@ __aeabi_idivmod:
    .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
    .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
 #endif
 /*
 * int __popcountsi2(unsigned int x)
 * int __popcountdi2(unsigned long long x)
 *
 * Population count (number of set bits) helpers that the compiler calls
 * when it does not expand popcount inline. Both use a branch-free
 * bit-slice reduction instead of lookup tables.
 *
 * __popcountsi2:  in r0 = x                  out r0 = 0..32
 *                 clobbers r1, r2
 * __popcountdi2:  in r0, r1 = the two 32-bit halves of x (their order does
 *                 not matter for a bit count)  out r0 = 0..64
 *                 clobbers r1, r2, r3, r12
 *
 * __popcountdi2 must not be an alias of __popcountsi2: it receives a
 * 64-bit argument in r0:r1, and the 32-bit routine would both ignore the
 * bits held in r1 and overwrite r1 as scratch.
 *
 * The masks are synthesised with mov/orr so no literal pool is needed.
 */
    .section    .text.__popcountdi2, "ax", %progbits
    .global     __popcountdi2
    .type       __popcountdi2, %function
 __popcountdi2:
    mov r2, #0x33                   @ r2 = 0x33333333
    orr r2, r2, r2, lsl #8          @ ...
    orr r2, r2, r2, lsl #16         @ ...
    eor r3, r2, r2, lsl #1          @ r3 = 0x33333333 ^ 0x66666666 = 0x55555555
    and r12, r3, r0, lsr #1         @ r12 = (lo >> 1) & 0x55555555
    sub r0, r0, r12                 @ lo: every 2-bit field holds 0..2
    and r12, r3, r1, lsr #1         @ r12 = (hi >> 1) & 0x55555555
    sub r1, r1, r12                 @ hi: every 2-bit field holds 0..2
    and r3, r2, r0                  @ r3 = lo & 0x33333333
    and r0, r2, r0, lsr #2          @ r0 = (lo >> 2) & 0x33333333
    add r0, r0, r3                  @ lo: every nibble holds 0..4
    and r3, r2, r1                  @ r3 = hi & 0x33333333
    and r1, r2, r1, lsr #2          @ r1 = (hi >> 2) & 0x33333333
    add r1, r1, r3                  @ hi: every nibble holds 0..4
    add r0, r0, r1                  @ x = lo + hi, every nibble holds 0..8
    mov r1, #0x0f                   @ r1 = 0x0f0f0f0f
    orr r1, r1, r1, lsl #8          @ ...
    orr r1, r1, r1, lsl #16         @ ...
    and r2, r1, r0, lsr #4          @ r2 = (x >> 4) & 0x0f0f0f0f
    and r0, r0, r1                  @ r0 = x & 0x0f0f0f0f
    add r0, r0, r2                  @ mask before adding: 8 + 8 would
                                    @ overflow a nibble; bytes hold 0..16
    add r0, r0, r0, lsr #16         @ bytes 0/1 = b0 + b2, b1 + b3 (0..32)
    add r0, r0, r0, lsr #8          @ byte 0 = b0 + b1 + b2 + b3 (0..64)
    and r0, r0, #0x7f               @ keep the 7-bit total, drop partial sums
    bx  lr                          @ return x
    .size    __popcountdi2, .-__popcountdi2

    .section    .text.__popcountsi2, "ax", %progbits
    .global     __popcountsi2
    .type       __popcountsi2, %function
 __popcountsi2:
    mov r1, #0x33                   @ r1 = 0x33333333
    orr r1, r1, r1, lsl #8          @ ...
    orr r1, r1, r1, lsl #16         @ ...
    eor r2, r1, r1, lsl #1          @ r2 = 0x33333333 ^ 0x66666666 = 0x55555555
    and r2, r2, r0, lsr #1          @ r2 = (x >> 1) & 0x55555555
    sub r0, r0, r2                  @ x = x - ((x >> 1) & 0x55555555)
                                    @ every 2-bit field now holds 0..2
    and r2, r1, r0                  @ r2 = x & 0x33333333
    and r1, r1, r0, lsr #2          @ r1 = (x >> 2) & 0x33333333
    add r0, r2, r1                  @ every nibble now holds 0..4
    mov r1, #0x0f                   @ r1 = 0x0f0f0f0f
    orr r1, r1, r1, lsl #8          @ ...
    orr r1, r1, r1, lsl #16         @ ...
    add r0, r0, r0, lsr #4          @ x = x + (x >> 4); 4 + 4 fits a nibble
    and r0, r0, r1                  @ x &= 0x0f0f0f0f, bytes hold 0..8
    add r0, r0, r0, lsr #16         @ bytes 0/1 = b0 + b2, b1 + b3 (0..16)
    add r0, r0, r0, lsr #8          @ byte 0 = b0 + b1 + b2 + b3 (0..32)
    and r0, r0, #0x3f               @ keep the 6-bit total, drop partial sums
    bx  lr                          @ return x
    .size    __popcountsi2, .-__popcountsi2