mirror of
https://github.com/Rockbox/rockbox.git
synced 2026-04-11 16:37:45 -04:00
arm: add optimized ARMv7-M memcpy implementation
This assembly implementation is marginally faster than the non-size-optimized C version for large copies, but is around half the code size. Unaligned loads/stores will be used on platforms that support it: though slower than aligned accesses, this is still faster than copying byte-by-byte and has the advantage of simplicity and small code size. Change-Id: Ieee73d7557318d510601583f190ef3aa018c9121
This commit is contained in:
parent
01f96e40a7
commit
7adb9cd1b4
2 changed files with 81 additions and 2 deletions
|
|
@ -21,8 +21,7 @@ arm/memmove.S
|
|||
arm/memset.S
|
||||
strlen.c
|
||||
#elif defined(CPU_ARM_MICRO)
|
||||
memcpy.c
|
||||
mempcpy.c
|
||||
arm/memcpy-armv7m.S
|
||||
memmove.c
|
||||
arm/memset.S
|
||||
strlen.c
|
||||
|
|
|
|||
80
firmware/asm/arm/memcpy-armv7m.S
Normal file
80
firmware/asm/arm/memcpy-armv7m.S
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
#include "config.h"

#if (CONFIG_CPU == STM32H743)
# define CPU_SUPPORTS_UNALIGNED_ACCESS
#endif

        .syntax  unified
        .section .icode,"ax",%progbits

        .global  memcpy
        .type    memcpy,%function
        .global  mempcpy
        .type    mempcpy,%function

/* Prototype: void *memcpy(void *dest, const void *src, size_t n);  */
/* Prototype: void *mempcpy(void *dest, const void *src, size_t n); */

/*
 * Register roles:
 *   r0 = dest pointer (advances through the copy)
 *   r1 = src pointer  (advances through the copy)
 *   r2 = bytes remaining
 *   r3 = scratch word
 *   ip = saved return value (dest for memcpy, dest + n for mempcpy)
 */

mempcpy:
        add     ip, r0, r2              /* mempcpy returns dest + n */
        b       .Lcommon

memcpy:
        movs    ip, r0                  /* memcpy returns the original dest */

.Lcommon:
#if !defined(CPU_SUPPORTS_UNALIGNED_ACCESS)
        /* No hardware unaligned-access support: if either operand is
         * misaligned, fall back to the byte-by-byte loop with the full
         * count still in r2. */
        ands    r3, r0, #3
        bne     .Lbyte_loop
        ands    r3, r1, #3
        bne     .Lbyte_loop
#endif

        /* Anything left to do in 16-byte blocks? */
        subs    r2, r2, #16
        blt     .Lunder16

.Lblock_loop:                           /* copy 16-byte blocks */
        ldr     r3, [r1, #0]
        str     r3, [r0, #0]
        ldr     r3, [r1, #4]
        str     r3, [r0, #4]
        ldr     r3, [r1, #8]
        str     r3, [r0, #8]
        ldr     r3, [r1, #12]
        str     r3, [r0, #12]
        adds    r0, #16
        adds    r1, #16
        subs    r2, r2, #16
        bge     .Lblock_loop

.Lunder16:
        /* r2 holds (remaining - 16), remaining < 16; masking with 15
         * recovers the true remaining count (0-15). */
        and     r2, r2, #15

.Lword_loop:                            /* copy 4 bytes at a time */
        subs    r2, r2, #4
        ldrge   r3, [r1], #4
        strge   r3, [r0], #4
        bgt     .Lword_loop

        /* r2 holds (remaining - 4), remaining < 4; masking with 3
         * recovers the true remaining count (0-3). */
        and     r2, #3

.Lbyte_loop:                            /* copy 1 byte at a time */
        subs    r2, r2, #1
        ldrbge  r3, [r1], #1
        strbge  r3, [r0], #1
        bgt     .Lbyte_loop

        /* Both entry points return the value saved in ip. */
        movs    r0, ip
        bx      lr
|
||||
Loading…
Add table
Add a link
Reference in a new issue