ARM support: provide the compiler with a better popcount function

Just the 32-bit one for now. The default uses lookup tables and is
ungainly and bloated.

Change-Id: I4a2eb31defb1f4d6f6853b65fe6dacc380d6ffc0
This commit is contained in:
Michael Sevakis 2017-09-07 15:41:52 -04:00
parent 28591f2e92
commit c6d5cd74a8

View file

@ -701,3 +701,34 @@ __aeabi_idivmod:
.byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89 .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
.byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81 .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
#endif #endif
/*
 * int __popcountsi2(unsigned int x)
 * int __popcountdi2(unsigned long long x)
 *
 * Population count (number of set bits) helpers that the compiler calls
 * for __builtin_popcount() and friends instead of using the table-driven
 * libgcc defaults.
 *
 * ABI:      AAPCS, ARM mode
 * In:       __popcountsi2: r0    = x (32 bits)
 *           __popcountdi2: r0:r1 = x (64 bits; word order is irrelevant
 *                                  for a bit count)
 * Out:      r0 = number of set bits (0..32 or 0..64)
 * Clobbers: __popcountsi2: r1, r2
 *           __popcountdi2: r1, r2, r3, r12
 *           No stack usage; flags are left untouched.
 *
 * Both routines use the usual parallel bit-summing scheme: form 2-bit
 * sums, then 4-bit sums, then 8-bit sums, and finally fold the four byte
 * sums into the low byte.
 *
 * __popcountdi2 must NOT be an alias of __popcountsi2: on this 32-bit
 * target the "di" variant receives a double-word operand in r0:r1, and
 * the 32-bit routine would ignore (and overwrite) the second word.
 */
    .section    .text.__popcountsi2, "ax", %progbits
    .global     __popcountsi2
    .type       __popcountsi2, %function
__popcountsi2:
    mov     r1, #0x33                   @ r1 = 0x33333333
    orr     r1, r1, r1, lsl #8          @ ...
    orr     r1, r1, r1, lsl #16         @ ...
    eor     r2, r1, r1, lsl #1          @ r2 = 0x33333333 ^ 0x66666666 = 0x55555555
    and     r2, r2, r0, lsr #1          @ r2 = (x >> 1) & 0x55555555
    sub     r0, r0, r2                  @ x = 2-bit sums
    and     r2, r1, r0                  @ r2 = x & 0x33333333
    and     r1, r1, r0, lsr #2          @ r1 = (x >> 2) & 0x33333333
    add     r0, r2, r1                  @ x = 4-bit sums (each <= 4)
    mov     r1, #0x0f                   @ r1 = 0x0f0f0f0f
    orr     r1, r1, r1, lsl #8          @ ...
    orr     r1, r1, r1, lsl #16         @ ...
    add     r0, r0, r0, lsr #4          @ x = x + (x >> 4), nibble sums <= 8 so no carry out
    and     r0, r0, r1                  @ x = 8-bit sums (each <= 8)
    add     r0, r0, r0, lsr #16         @ x = x + (x >> 16)
    add     r0, r0, r0, lsr #8          @ low byte = sum of all four byte sums
    and     r0, r0, #0x3f               @ result <= 32 fits in 6 bits, drop upper junk
    bx      lr                          @ return x
    .size   __popcountsi2, .-__popcountsi2

    .section    .text.__popcountdi2, "ax", %progbits
    .global     __popcountdi2
    .type       __popcountdi2, %function
__popcountdi2:
    mov     r2, #0x33                   @ r2 = 0x33333333
    orr     r2, r2, r2, lsl #8          @ ...
    orr     r2, r2, r2, lsl #16         @ ...
    eor     r3, r2, r2, lsl #1          @ r3 = 0x55555555
    and     r12, r3, r0, lsr #1         @ word 0: (w0 >> 1) & 0x55555555
    sub     r0, r0, r12                 @ word 0: 2-bit sums
    and     r12, r3, r1, lsr #1         @ word 1: (w1 >> 1) & 0x55555555
    sub     r1, r1, r12                 @ word 1: 2-bit sums
    and     r3, r2, r0                  @ word 0: even 2-bit fields (0x55555555 no longer needed)
    and     r0, r2, r0, lsr #2          @ word 0: odd 2-bit fields
    add     r0, r0, r3                  @ word 0: 4-bit sums (each <= 4)
    and     r3, r2, r1                  @ word 1: even 2-bit fields
    and     r1, r2, r1, lsr #2          @ word 1: odd 2-bit fields
    add     r1, r1, r3                  @ word 1: 4-bit sums (each <= 4)
    add     r0, r0, r1                  @ merge words: 4-bit sums (each <= 8, no carry)
    mov     r2, #0x0f                   @ r2 = 0x0f0f0f0f
    orr     r2, r2, r2, lsl #8          @ ...
    orr     r2, r2, r2, lsl #16         @ ...
    and     r1, r2, r0, lsr #4          @ high nibbles; masked before adding because
    and     r0, r0, r2                  @ a pair may sum to 16 and overflow a nibble
    add     r0, r0, r1                  @ x = 8-bit sums (each <= 16)
    add     r0, r0, r0, lsr #16         @ x = x + (x >> 16)
    add     r0, r0, r0, lsr #8          @ low byte = sum of all four byte sums
    and     r0, r0, #0x7f               @ result <= 64 fits in 7 bits, drop upper junk
    bx      lr                          @ return x
    .size   __popcountdi2, .-__popcountdi2