--- /dev/null 2004-02-02 20:32:13.000000000 +0000
+++ sysdeps/arm/memcpy.S 2004-03-20 13:25:27.000000000 +0000
@@ -0,0 +1,241 @@
+/*
+ * Optimized memcpy implementation for ARM processors
+ *
+ * Author: Nicolas Pitre
+ * Created: Dec 23, 2003
+ * Copyright: (C) MontaVista Software, Inc.
+ *
+ * This file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <sysdep.h>
+
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull lsr
+#define push lsl
+#else
+#define pull lsl
+#define push lsr
+#endif
+
+/*
+ * Enable data preload for architectures that support it (ARMv5 and above)
+ */
+#if defined(__ARM_ARCH_5__) || \
+    defined(__ARM_ARCH_5T__) || \
+    defined(__ARM_ARCH_5TE__)
+#define PLD(code...) code
+#else
+#define PLD(code...)
+#endif
+
+
+/* void *memcpy (void *dst, const void *src, size_t n): r0 = dst, r1 = src, r2 = n; the original dst is returned in r0 */
+
+ENTRY(memcpy)
+        subs    r2, r2, #4  /* r2 = n - 4; negative when fewer than 4 bytes */
+        stmfd   sp!, {r0, r4, lr}  /* save dst (popped back into r0 on exit), r4 and lr */
+        blt     7f  /* fewer than 4 bytes: byte tail only */
+        ands    ip, r0, #3  /* ip = dst & 3 */
+        PLD(    pld     [r1, #0] )
+        bne     8f  /* dst is not word aligned */
+        ands    ip, r1, #3  /* ip = src & 3 */
+        bne     9f  /* src is not word aligned: shifted copy */
+
+1:      subs    r2, r2, #4  /* dst and src are both word aligned here */
+        blt     6f  /* fewer than 8 bytes left */
+        subs    r2, r2, #8
+        blt     5f  /* fewer than 16 bytes left */
+        subs    r2, r2, #16
+        blt     4f  /* fewer than 32 bytes left */
+
+        PLD(    subs    r2, r2, #65 )  /* extra bias, removed again after the loop */
+        stmfd   sp!, {r5 - r8}  /* more scratch registers for the 32-byte loop */
+        PLD(    blt     3f )
+        PLD(    pld     [r1, #32] )
+
+        PLD(    @ cache alignment )
+        PLD(    ands    ip, r1, #31 )  /* ip = src offset inside a 32-byte block */
+        PLD(    pld     [r1, #64] )
+        PLD(    beq     2f )
+        PLD(    rsb     ip, ip, #32 )  /* ip = bytes up to the next 32-byte boundary of src */
+        PLD(    cmp     r2, ip )
+        PLD(    pld     [r1, #96] )
+        PLD(    blt     2f )
+        PLD(    cmp     ip, #16 )
+        PLD(    sub     r2, r2, ip )
+        PLD(    ldmgeia r1!, {r3 - r6} )  /* 16 bytes when ip >= 16 */
+        PLD(    stmgeia r0!, {r3 - r6} )
+        PLD(    beq     2f )
+        PLD(    and     ip, ip, #15 )
+        PLD(    cmp     ip, #8 )
+        PLD(    ldr     r3, [r1], #4 )  /* then one word, a second if ip >= 8, a third if ip > 8 */
+        PLD(    ldrge   r4, [r1], #4 )
+        PLD(    ldrgt   r5, [r1], #4 )
+        PLD(    str     r3, [r0], #4 )
+        PLD(    strge   r4, [r0], #4 )
+        PLD(    strgt   r5, [r0], #4 )
+
+2:      PLD(    pld     [r1, #96] )
+3:      ldmia   r1!, {r3 - r8, ip, lr}  /* main loop: 32 bytes per iteration */
+        subs    r2, r2, #32
+        stmia   r0!, {r3 - r8, ip, lr}
+        bge     2b
+        PLD(    cmn     r2, #65 )
+        PLD(    bge     3b )
+        PLD(    add     r2, r2, #65 )
+        tst     r2, #31  /* any of the low five count bits set? */
+        ldmfd   sp!, {r5 - r8}
+        ldmeqfd sp!, {r0, r4, pc}  /* nothing left: return with r0 = original dst */
+
+        tst     r2, #16
+4:      ldmneia r1!, {r3, r4, ip, lr}  /* 16 bytes if NE (bit 4 set, or entered through the blt above) */
+        stmneia r0!, {r3, r4, ip, lr}
+
+        tst     r2, #8
+5:      ldmneia r1!, {r3, r4}  /* 8 bytes if NE */
+        stmneia r0!, {r3, r4}
+
+        tst     r2, #4
+6:      ldrne   r3, [r1], #4  /* 4 bytes if NE */
+        strne   r3, [r0], #4
+
+7:      ands    r2, r2, #3  /* r2 = number of trailing bytes, 0 to 3 */
+        ldmeqfd sp!, {r0, r4, pc}  /* none: return with r0 = original dst */
+
+        cmp     r2, #2
+        ldrb    r3, [r1], #1  /* first byte always */
+        ldrgeb  r4, [r1], #1  /* second byte when r2 >= 2 */
+        ldrgtb  ip, [r1]  /* third byte when r2 == 3 */
+        strb    r3, [r0], #1
+        strgeb  r4, [r0], #1
+        strgtb  ip, [r0]
+        ldmfd   sp!, {r0, r4, pc}
+
+8:      rsb     ip, ip, #4  /* ip = bytes needed to word-align dst (1 to 3) */
+        cmp     ip, #2
+        ldrb    r3, [r1], #1
+        ldrgeb  r4, [r1], #1
+        ldrgtb  lr, [r1], #1
+        strb    r3, [r0], #1
+        strgeb  r4, [r0], #1
+        strgtb  lr, [r0], #1
+        subs    r2, r2, ip  /* take the alignment bytes off the count */
+        blt     7b  /* fewer than 4 bytes left */
+        ands    ip, r1, #3  /* ip = src & 3 */
+        beq     1b  /* src is word aligned as well */
+
+9:      bic     r1, r1, #3  /* round src down to a word boundary */
+        cmp     ip, #2
+        ldr     lr, [r1], #4  /* lr = aligned word holding the first source bytes */
+        beq     17f  /* src & 3 == 2 */
+        bgt     18f  /* src & 3 == 3; fall through when src & 3 == 1 */
+
+
+        .macro  forward_copy_shift pull push  /* word copy with src misaligned: each stored word merges two adjacent source words */
+
+        cmp     r2, #12
+        PLD(    pld     [r1, #0] )
+        blt     15f  /* fewer than 16 bytes left: one word per iteration */
+        subs    r2, r2, #28
+        stmfd   sp!, {r5 - r9}
+        blt     13f  /* fewer than 32 bytes left */
+
+        PLD(    subs    r2, r2, #97 )  /* extra bias, removed again after the loop */
+        PLD(    blt     12f )
+        PLD(    pld     [r1, #32] )
+
+        PLD(    @ cache alignment )
+        PLD(    rsb     ip, r1, #36 )
+        PLD(    pld     [r1, #64] )
+        PLD(    ands    ip, ip, #31 )
+        PLD(    pld     [r1, #96] )
+        PLD(    beq     11f )
+        PLD(    cmp     r2, ip )
+        PLD(    pld     [r1, #128] )
+        PLD(    blt     11f )
+        PLD(    sub     r2, r2, ip )
+10:     PLD(    mov     r3, lr, pull #\pull )
+        PLD(    ldr     lr, [r1], #4 )
+        PLD(    subs    ip, ip, #4 )
+        PLD(    orr     r3, r3, lr, push #\push )
+        PLD(    str     r3, [r0], #4 )
+        PLD(    bgt     10b )
+
+11:     PLD(    pld     [r1, #128] )
+12:     mov     r3, lr, pull #\pull  /* start from the leftover part of the previous word */
+        ldmia   r1!, {r4 - r9, ip, lr}  /* 32 bytes per iteration; lr carries over to the next one */
+        subs    r2, r2, #32
+        orr     r3, r3, r4, push #\push
+        mov     r4, r4, pull #\pull
+        orr     r4, r4, r5, push #\push
+        mov     r5, r5, pull #\pull
+        orr     r5, r5, r6, push #\push
+        mov     r6, r6, pull #\pull
+        orr     r6, r6, r7, push #\push
+        mov     r7, r7, pull #\pull
+        orr     r7, r7, r8, push #\push
+        mov     r8, r8, pull #\pull
+        orr     r8, r8, r9, push #\push
+        mov     r9, r9, pull #\pull
+        orr     r9, r9, ip, push #\push
+        mov     ip, ip, pull #\pull
+        orr     ip, ip, lr, push #\push
+        stmia   r0!, {r3 - r9, ip}
+        bge     11b
+        PLD(    cmn     r2, #97 )
+        PLD(    bge     12b )
+        PLD(    add     r2, r2, #97 )
+        cmn     r2, #16
+        blt     14f  /* fewer than 16 bytes left: skip the 16-byte step */
+13:     mov     r3, lr, pull #\pull  /* one 16-byte step */
+        ldmia   r1!, {r4 - r6, lr}
+        sub     r2, r2, #16
+        orr     r3, r3, r4, push #\push
+        mov     r4, r4, pull #\pull
+        orr     r4, r4, r5, push #\push
+        mov     r5, r5, pull #\pull
+        orr     r5, r5, r6, push #\push
+        mov     r6, r6, pull #\pull
+        orr     r6, r6, lr, push #\push
+        stmia   r0!, {r3 - r6}
+14:     adds    r2, r2, #28  /* undo the subs of 28 above */
+        ldmfd   sp!, {r5 - r9}
+        blt     16f
+15:     mov     r3, lr, pull #\pull  /* one word per iteration */
+        ldr     lr, [r1], #4
+        subs    r2, r2, #4
+        orr     r3, r3, lr, push #\push
+        str     r3, [r0], #4
+        bge     15b
+16:
+        .endm
+
+
+        forward_copy_shift      pull=8  push=24  /* src & 3 == 1 */
+        sub     r1, r1, #3  /* step src back over the 3 bytes of lr not yet stored */
+        b       7b
+
+17:     forward_copy_shift      pull=16 push=16  /* src & 3 == 2 */
+        sub     r1, r1, #2  /* 2 bytes of lr not yet stored */
+        b       7b
+
+18:     forward_copy_shift      pull=24 push=8  /* src & 3 == 3 */
+        sub     r1, r1, #1  /* 1 byte of lr not yet stored */
+        b       7b
+
+        .size   memcpy, . - memcpy
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
--- /dev/null 2004-02-02 20:32:13.000000000 +0000
+++ sysdeps/arm/memmove.S 2004-03-20 18:37:23.000000000 +0000
@@ -0,0 +1,251 @@
+/*
+ * Optimized memmove implementation for ARM processors
+ *
+ * Author: Nicolas Pitre
+ * Created: Dec 23, 2003
+ * Copyright: (C) MontaVista Software, Inc.
+ *
+ * This file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <sysdep.h>
+
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull lsr
+#define push lsl
+#else
+#define pull lsl
+#define push lsr
+#endif
+
+/*
+ * Enable data preload for architectures that support it (ARMv5 and above)
+ */
+#if defined(__ARM_ARCH_5__) || \
+    defined(__ARM_ARCH_5T__) || \
+    defined(__ARM_ARCH_5TE__)
+#define PLD(code...) code
+#else
+#define PLD(code...)
+#endif
+
+
+/* void *memmove (void *dst, const void *src, size_t n): r0 = dst, r1 = src, r2 = n; the original dst is returned in r0 */
+ENTRY(memmove)
+        subs    ip, r0, r1  /* ip = dst - src */
+        cmphi   r2, ip  /* only when dst > src: compare n with dst - src */
+        bls     memcpy(PLT)  /* dst <= src, or n <= dst - src: hand the whole job to memcpy */
+
+        stmfd   sp!, {r0, r4, lr}  /* save dst (popped back into r0 on exit), r4 and lr */
+        add     r1, r1, r2  /* r1 = one past the end of src */
+        add     r0, r0, r2  /* r0 = one past the end of dst; the copy runs downwards */
+        subs    r2, r2, #4  /* r2 = n - 4; negative when fewer than 4 bytes */
+        blt     25f  /* fewer than 4 bytes: byte tail only */
+        ands    ip, r0, #3  /* ip = (end of dst) & 3 */
+        PLD(    pld     [r1, #-4] )
+        bne     26f  /* end of dst is not word aligned */
+        ands    ip, r1, #3  /* ip = (end of src) & 3 */
+        bne     27f  /* end of src is not word aligned: shifted copy */
+
+19:     subs    r2, r2, #4  /* both end pointers are word aligned here */
+        blt     24f  /* fewer than 8 bytes left */
+        subs    r2, r2, #8
+        blt     23f  /* fewer than 16 bytes left */
+        subs    r2, r2, #16
+        blt     22f  /* fewer than 32 bytes left */
+
+        PLD(    pld     [r1, #-32] )
+        PLD(    subs    r2, r2, #96 )  /* extra bias, removed again after the loop */
+        stmfd   sp!, {r5 - r8}
+        PLD(    blt     21f )
+
+        PLD(    @ cache alignment )
+        PLD(    ands    ip, r1, #31 )  /* ip = bytes above the previous 32-byte boundary of src */
+        PLD(    pld     [r1, #-64] )
+        PLD(    beq     20f )
+        PLD(    cmp     r2, ip )
+        PLD(    pld     [r1, #-96] )
+        PLD(    blt     20f )
+        PLD(    cmp     ip, #16 )
+        PLD(    sub     r2, r2, ip )
+        PLD(    ldmgedb r1!, {r3 - r6} )  /* 16 bytes when ip >= 16 */
+        PLD(    stmgedb r0!, {r3 - r6} )
+        PLD(    beq     20f )
+        PLD(    and     ip, ip, #15 )
+        PLD(    cmp     ip, #8 )
+        PLD(    ldr     r3, [r1, #-4]! )  /* then one word, a second if ip >= 8, a third if ip > 8 */
+        PLD(    ldrge   r4, [r1, #-4]! )
+        PLD(    ldrgt   r5, [r1, #-4]! )
+        PLD(    str     r3, [r0, #-4]! )
+        PLD(    strge   r4, [r0, #-4]! )
+        PLD(    strgt   r5, [r0, #-4]! )
+
+20:     PLD(    pld     [r1, #-96] )
+        PLD(    pld     [r1, #-128] )
+21:     ldmdb   r1!, {r3, r4, ip, lr}  /* main loop, 16 bytes per load/store pair */
+        subs    r2, r2, #32
+        stmdb   r0!, {r3, r4, ip, lr}
+        ldmdb   r1!, {r3, r4, ip, lr}
+        stmgedb r0!, {r3, r4, ip, lr}  /* GE after the subs: copy a further 32 bytes in this pass */
+        ldmgedb r1!, {r3, r4, ip, lr}
+        stmgedb r0!, {r3, r4, ip, lr}
+        ldmgedb r1!, {r3, r4, ip, lr}
+        subges  r2, r2, #32
+        stmdb   r0!, {r3, r4, ip, lr}  /* always stores the most recently loaded 16 bytes */
+        bge     20b
+        PLD(    cmn     r2, #96 )
+        PLD(    bge     21b )
+        PLD(    add     r2, r2, #96 )
+        tst     r2, #31  /* any of the low five count bits set? */
+        ldmfd   sp!, {r5 - r8}
+        ldmeqfd sp!, {r0, r4, pc}  /* nothing left: return with r0 = original dst */
+
+        tst     r2, #16
+22:     ldmnedb r1!, {r3, r4, ip, lr}  /* 16 bytes if NE (bit 4 set, or entered through the blt above) */
+        stmnedb r0!, {r3, r4, ip, lr}
+
+        tst     r2, #8
+23:     ldmnedb r1!, {r3, r4}  /* 8 bytes if NE */
+        stmnedb r0!, {r3, r4}
+
+        tst     r2, #4
+24:     ldrne   r3, [r1, #-4]!  /* 4 bytes if NE */
+        strne   r3, [r0, #-4]!
+
+25:     ands    r2, r2, #3  /* r2 = number of remaining bytes, 0 to 3 */
+        ldmeqfd sp!, {r0, r4, pc}  /* none: return with r0 = original dst */
+
+        cmp     r2, #2
+        ldrb    r3, [r1, #-1]  /* one byte always */
+        ldrgeb  r4, [r1, #-2]  /* a second when r2 >= 2 */
+        ldrgtb  ip, [r1, #-3]  /* a third when r2 == 3 */
+        strb    r3, [r0, #-1]
+        strgeb  r4, [r0, #-2]
+        strgtb  ip, [r0, #-3]
+        ldmfd   sp!, {r0, r4, pc}
+
+26:     cmp     ip, #2  /* ip = (end of dst) & 3 = bytes to copy before dst is word aligned */
+        ldrb    r3, [r1, #-1]!
+        ldrgeb  r4, [r1, #-1]!
+        ldrgtb  lr, [r1, #-1]!
+        strb    r3, [r0, #-1]!
+        strgeb  r4, [r0, #-1]!
+        strgtb  lr, [r0, #-1]!
+        subs    r2, r2, ip  /* take the alignment bytes off the count */
+        blt     25b  /* fewer than 4 bytes left */
+        ands    ip, r1, #3  /* ip = (end of src) & 3 */
+        beq     19b  /* src is word aligned as well */
+
+27:     bic     r1, r1, #3  /* round src down to a word boundary */
+        cmp     ip, #2
+        ldr     r3, [r1]  /* r3 = aligned word holding the last source bytes */
+        beq     35f  /* (end of src) & 3 == 2 */
+        blt     36f  /* (end of src) & 3 == 1; fall through when it is 3 */
+
+
+        .macro  backward_copy_shift push pull  /* downward word copy with src misaligned: each stored word merges two adjacent source words */
+
+        cmp     r2, #12
+        PLD(    pld     [r1, #-4] )
+        blt     33f  /* fewer than 16 bytes left: one word per iteration */
+        subs    r2, r2, #28
+        stmfd   sp!, {r5 - r9}
+        blt     31f  /* fewer than 32 bytes left */
+
+        PLD(    subs    r2, r2, #96 )  /* extra bias, removed again after the loop */
+        PLD(    pld     [r1, #-32] )
+        PLD(    blt     30f )
+        PLD(    pld     [r1, #-64] )
+
+        PLD(    @ cache alignment )
+        PLD(    ands    ip, r1, #31 )
+        PLD(    pld     [r1, #-96] )
+        PLD(    beq     29f )
+        PLD(    cmp     r2, ip )
+        PLD(    pld     [r1, #-128] )
+        PLD(    blt     29f )
+        PLD(    sub     r2, r2, ip )
+28:     PLD(    mov     r4, r3, push #\push )
+        PLD(    ldr     r3, [r1, #-4]! )
+        PLD(    subs    ip, ip, #4 )
+        PLD(    orr     r4, r4, r3, pull #\pull )
+        PLD(    str     r4, [r0, #-4]! )
+        PLD(    bgt     28b )
+
+29:     PLD(    pld     [r1, #-128] )
+30:     mov     lr, r3, push #\push  /* start from the leftover part of the previous word */
+        ldmdb   r1!, {r3 - r9, ip}  /* 32 bytes per iteration; r3 carries over to the next one */
+        subs    r2, r2, #32
+        orr     lr, lr, ip, pull #\pull
+        mov     ip, ip, push #\push
+        orr     ip, ip, r9, pull #\pull
+        mov     r9, r9, push #\push
+        orr     r9, r9, r8, pull #\pull
+        mov     r8, r8, push #\push
+        orr     r8, r8, r7, pull #\pull
+        mov     r7, r7, push #\push
+        orr     r7, r7, r6, pull #\pull
+        mov     r6, r6, push #\push
+        orr     r6, r6, r5, pull #\pull
+        mov     r5, r5, push #\push
+        orr     r5, r5, r4, pull #\pull
+        mov     r4, r4, push #\push
+        orr     r4, r4, r3, pull #\pull
+        stmdb   r0!, {r4 - r9, ip, lr}
+        bge     29b
+        PLD(    cmn     r2, #96 )
+        PLD(    bge     30b )
+        PLD(    add     r2, r2, #96 )
+        cmn     r2, #16
+        blt     32f  /* fewer than 16 bytes left: skip the 16-byte step */
+31:     mov     r7, r3, push #\push  /* one 16-byte step */
+        ldmdb   r1!, {r3 - r6}
+        sub     r2, r2, #16
+        orr     r7, r7, r6, pull #\pull
+        mov     r6, r6, push #\push
+        orr     r6, r6, r5, pull #\pull
+        mov     r5, r5, push #\push
+        orr     r5, r5, r4, pull #\pull
+        mov     r4, r4, push #\push
+        orr     r4, r4, r3, pull #\pull
+        stmdb   r0!, {r4 - r7}
+32:     adds    r2, r2, #28  /* undo the subs of 28 above */
+        ldmfd   sp!, {r5 - r9}
+        blt     34f
+33:     mov     r4, r3, push #\push  /* one word per iteration */
+        ldr     r3, [r1, #-4]!
+        subs    r2, r2, #4
+        orr     r4, r4, r3, pull #\pull
+        str     r4, [r0, #-4]!
+        bge     33b
+34:
+        .endm
+
+
+        backward_copy_shift     push=8  pull=24  /* (end of src) & 3 == 3 */
+        add     r1, r1, #3  /* step src forward over the 3 bytes of r3 not yet stored */
+        b       25b
+
+35:     backward_copy_shift     push=16 pull=16  /* (end of src) & 3 == 2 */
+        add     r1, r1, #2  /* 2 bytes of r3 not yet stored */
+        b       25b
+
+36:     backward_copy_shift     push=24 pull=8  /* (end of src) & 3 == 1 */
+        add     r1, r1, #1  /* 1 byte of r3 not yet stored */
+        b       25b
+
+        .size   memmove, . - memmove
+END(memmove)
+libc_hidden_builtin_def (memmove)
--- /dev/null 2004-02-02 20:32:13.000000000 +0000
+++ sysdeps/arm/bcopy.S 2004-03-20 18:37:48.000000000 +0000
@@ -0,0 +1,255 @@
+/*
+ * Optimized bcopy implementation for ARM processors
+ *
+ * Author: Nicolas Pitre
+ * Created: Dec 23, 2003
+ * Copyright: (C) MontaVista Software, Inc.
+ *
+ * This file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <sysdep.h>
+
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull lsr
+#define push lsl
+#else
+#define pull lsl
+#define push lsr
+#endif
+
+/*
+ * Enable data preload for architectures that support it (ARMv5 and above)
+ */
+#if defined(__ARM_ARCH_5__) || \
+    defined(__ARM_ARCH_5T__) || \
+    defined(__ARM_ARCH_5TE__)
+#define PLD(code...) code
+#else
+#define PLD(code...)
+#endif
+
+dst     .req    r1
+src     .req    r0
+
+/* void bcopy (const void *src, void *dst, size_t size): r0 = src, r1 = dst, r2 = size; nothing is returned */
+ENTRY(bcopy)
+        subs    ip, dst, src  /* ip = dst - src */
+        cmphi   r2, ip  /* only when dst > src: compare size with dst - src */
+        movls   r3, r0  /* memcpy case: swap r0 and r1 into memcpy (dst, src) order */
+        movls   r0, r1
+        movls   r1, r3
+        bls     memcpy(PLT)  /* dst <= src, or size <= dst - src: hand the whole job to memcpy */
+
+        stmfd   sp!, {r4, lr}  /* two words only: every exit below must pop exactly {r4, pc} */
+        add     src, src, r2  /* src = one past the end of the source */
+        add     dst, dst, r2  /* dst = one past the end of the destination; the copy runs downwards */
+        subs    r2, r2, #4  /* r2 = size - 4; negative when fewer than 4 bytes */
+        blt     25f  /* fewer than 4 bytes: byte tail only */
+        ands    ip, dst, #3  /* ip = (end of dst) & 3 */
+        PLD(    pld     [src, #-4] )
+        bne     26f  /* end of dst is not word aligned */
+        ands    ip, src, #3  /* ip = (end of src) & 3 */
+        bne     27f  /* end of src is not word aligned: shifted copy */
+
+19:     subs    r2, r2, #4  /* both end pointers are word aligned here */
+        blt     24f  /* fewer than 8 bytes left */
+        subs    r2, r2, #8
+        blt     23f  /* fewer than 16 bytes left */
+        subs    r2, r2, #16
+        blt     22f  /* fewer than 32 bytes left */
+
+        PLD(    pld     [src, #-32] )
+        PLD(    subs    r2, r2, #96 )  /* extra bias, removed again after the loop */
+        stmfd   sp!, {r5 - r8}
+        PLD(    blt     21f )
+
+        PLD(    @ cache alignment )
+        PLD(    ands    ip, src, #31 )  /* ip = bytes above the previous 32-byte boundary of src */
+        PLD(    pld     [src, #-64] )
+        PLD(    beq     20f )
+        PLD(    cmp     r2, ip )
+        PLD(    pld     [src, #-96] )
+        PLD(    blt     20f )
+        PLD(    cmp     ip, #16 )
+        PLD(    sub     r2, r2, ip )
+        PLD(    ldmgedb src!, {r3 - r6} )  /* 16 bytes when ip >= 16 */
+        PLD(    stmgedb dst!, {r3 - r6} )
+        PLD(    beq     20f )
+        PLD(    and     ip, ip, #15 )
+        PLD(    cmp     ip, #8 )
+        PLD(    ldr     r3, [src, #-4]! )  /* then one word, a second if ip >= 8, a third if ip > 8 */
+        PLD(    ldrge   r4, [src, #-4]! )
+        PLD(    ldrgt   r5, [src, #-4]! )
+        PLD(    str     r3, [dst, #-4]! )
+        PLD(    strge   r4, [dst, #-4]! )
+        PLD(    strgt   r5, [dst, #-4]! )
+
+20:     PLD(    pld     [src, #-96] )
+        PLD(    pld     [src, #-128] )
+21:     ldmdb   src!, {r3, r4, ip, lr}  /* main loop, 16 bytes per load/store pair */
+        subs    r2, r2, #32
+        stmdb   dst!, {r3, r4, ip, lr}
+        ldmdb   src!, {r3, r4, ip, lr}
+        stmgedb dst!, {r3, r4, ip, lr}  /* GE after the subs: copy a further 32 bytes in this pass */
+        ldmgedb src!, {r3, r4, ip, lr}
+        stmgedb dst!, {r3, r4, ip, lr}
+        ldmgedb src!, {r3, r4, ip, lr}
+        subges  r2, r2, #32
+        stmdb   dst!, {r3, r4, ip, lr}  /* always stores the most recently loaded 16 bytes */
+        bge     20b
+        PLD(    cmn     r2, #96 )
+        PLD(    bge     21b )
+        PLD(    add     r2, r2, #96 )
+        tst     r2, #31  /* any of the low five count bits set? */
+        ldmfd   sp!, {r5 - r8}
+        ldmeqfd sp!, {r4, pc}  /* nothing left: return */
+
+        tst     r2, #16
+22:     ldmnedb src!, {r3, r4, ip, lr}  /* 16 bytes if NE (bit 4 set, or entered through the blt above) */
+        stmnedb dst!, {r3, r4, ip, lr}
+
+        tst     r2, #8
+23:     ldmnedb src!, {r3, r4}  /* 8 bytes if NE */
+        stmnedb dst!, {r3, r4}
+
+        tst     r2, #4
+24:     ldrne   r3, [src, #-4]!  /* 4 bytes if NE */
+        strne   r3, [dst, #-4]!
+
+25:     ands    r2, r2, #3  /* r2 = number of remaining bytes, 0 to 3 */
+        ldmeqfd sp!, {r4, pc}  /* none: return; pops the same two words the prologue pushed */
+
+        cmp     r2, #2
+        ldrb    r3, [src, #-1]  /* one byte always */
+        ldrgeb  r4, [src, #-2]  /* a second when r2 >= 2 */
+        ldrgtb  ip, [src, #-3]  /* a third when r2 == 3 */
+        strb    r3, [dst, #-1]
+        strgeb  r4, [dst, #-2]
+        strgtb  ip, [dst, #-3]
+        ldmfd   sp!, {r4, pc}  /* return; a three-register pop here would load pc from the caller's stack */
+
+26:     cmp     ip, #2  /* ip = (end of dst) & 3 = bytes to copy before dst is word aligned */
+        ldrb    r3, [src, #-1]!
+        ldrgeb  r4, [src, #-1]!
+        ldrgtb  lr, [src, #-1]!
+        strb    r3, [dst, #-1]!
+        strgeb  r4, [dst, #-1]!
+        strgtb  lr, [dst, #-1]!
+        subs    r2, r2, ip  /* take the alignment bytes off the count */
+        blt     25b  /* fewer than 4 bytes left */
+        ands    ip, src, #3  /* ip = (end of src) & 3 */
+        beq     19b  /* src is word aligned as well */
+
+27:     bic     src, src, #3  /* round src down to a word boundary */
+        cmp     ip, #2
+        ldr     r3, [src]  /* r3 = aligned word holding the last source bytes */
+        beq     35f  /* (end of src) & 3 == 2 */
+        blt     36f  /* (end of src) & 3 == 1; fall through when it is 3 */
+
+
+        .macro  backward_copy_shift push pull  /* downward word copy with src misaligned: each stored word merges two adjacent source words */
+
+        cmp     r2, #12
+        PLD(    pld     [src, #-4] )
+        blt     33f  /* fewer than 16 bytes left: one word per iteration */
+        subs    r2, r2, #28
+        stmfd   sp!, {r5 - r9}
+        blt     31f  /* fewer than 32 bytes left */
+
+        PLD(    subs    r2, r2, #96 )  /* extra bias, removed again after the loop */
+        PLD(    pld     [src, #-32] )
+        PLD(    blt     30f )
+        PLD(    pld     [src, #-64] )
+
+        PLD(    @ cache alignment )
+        PLD(    ands    ip, src, #31 )
+        PLD(    pld     [src, #-96] )
+        PLD(    beq     29f )
+        PLD(    cmp     r2, ip )
+        PLD(    pld     [src, #-128] )
+        PLD(    blt     29f )
+        PLD(    sub     r2, r2, ip )
+28:     PLD(    mov     r4, r3, push #\push )
+        PLD(    ldr     r3, [src, #-4]! )
+        PLD(    subs    ip, ip, #4 )
+        PLD(    orr     r4, r4, r3, pull #\pull )
+        PLD(    str     r4, [dst, #-4]! )
+        PLD(    bgt     28b )
+
+29:     PLD(    pld     [src, #-128] )
+30:     mov     lr, r3, push #\push  /* start from the leftover part of the previous word */
+        ldmdb   src!, {r3 - r9, ip}  /* 32 bytes per iteration; r3 carries over to the next one */
+        subs    r2, r2, #32
+        orr     lr, lr, ip, pull #\pull
+        mov     ip, ip, push #\push
+        orr     ip, ip, r9, pull #\pull
+        mov     r9, r9, push #\push
+        orr     r9, r9, r8, pull #\pull
+        mov     r8, r8, push #\push
+        orr     r8, r8, r7, pull #\pull
+        mov     r7, r7, push #\push
+        orr     r7, r7, r6, pull #\pull
+        mov     r6, r6, push #\push
+        orr     r6, r6, r5, pull #\pull
+        mov     r5, r5, push #\push
+        orr     r5, r5, r4, pull #\pull
+        mov     r4, r4, push #\push
+        orr     r4, r4, r3, pull #\pull
+        stmdb   dst!, {r4 - r9, ip, lr}
+        bge     29b
+        PLD(    cmn     r2, #96 )
+        PLD(    bge     30b )
+        PLD(    add     r2, r2, #96 )
+        cmn     r2, #16
+        blt     32f  /* fewer than 16 bytes left: skip the 16-byte step */
+31:     mov     r7, r3, push #\push  /* one 16-byte step */
+        ldmdb   src!, {r3 - r6}
+        sub     r2, r2, #16
+        orr     r7, r7, r6, pull #\pull
+        mov     r6, r6, push #\push
+        orr     r6, r6, r5, pull #\pull
+        mov     r5, r5, push #\push
+        orr     r5, r5, r4, pull #\pull
+        mov     r4, r4, push #\push
+        orr     r4, r4, r3, pull #\pull
+        stmdb   dst!, {r4 - r7}
+32:     adds    r2, r2, #28  /* undo the subs of 28 above */
+        ldmfd   sp!, {r5 - r9}
+        blt     34f
+33:     mov     r4, r3, push #\push  /* one word per iteration */
+        ldr     r3, [src, #-4]!
+        subs    r2, r2, #4
+        orr     r4, r4, r3, pull #\pull
+        str     r4, [dst, #-4]!
+        bge     33b
+34:
+        .endm
+
+
+        backward_copy_shift     push=8  pull=24  /* (end of src) & 3 == 3 */
+        add     src, src, #3  /* step src forward over the 3 bytes of r3 not yet stored */
+        b       25b
+
+35:     backward_copy_shift     push=16 pull=16  /* (end of src) & 3 == 2 */
+        add     src, src, #2  /* 2 bytes of r3 not yet stored */
+        b       25b
+
+36:     backward_copy_shift     push=24 pull=8  /* (end of src) & 3 == 1 */
+        add     src, src, #1  /* 1 byte of r3 not yet stored */
+        b       25b
+
+        .size   bcopy, . - bcopy
+END(bcopy)