summaryrefslogtreecommitdiff
path: root/glibc/glibc-cvs/arm-memcpy.patch
diff options
context:
space:
mode:
authorGerald Britton <gbritton@doomcom.org>2004-04-27 02:53:54 +0000
committerGerald Britton <gbritton@doomcom.org>2004-04-27 02:53:54 +0000
commit5c76c320af6f82794823215c081469d3de8e64e4 (patch)
tree324384db2b3a349459fb4fa28fc910448ec52ae7 /glibc/glibc-cvs/arm-memcpy.patch
parentf63c99c34855fff1cbb932457b08bdd29b3777ff (diff)
- binutils 2.15.90.0.3 / gcc 3.4.0 toolchain
- support in glibc and uclibc for gcc 3.4.0 - new target setup for uclibc, TARGET_OS=linux-uclibc to match the uclibc project's setup and modern config.sub already has support for it. BKrev: 408dcb42JOaGKxGg3PSn6IwU4Kimfw
Diffstat (limited to 'glibc/glibc-cvs/arm-memcpy.patch')
-rw-r--r--glibc/glibc-cvs/arm-memcpy.patch756
1 files changed, 756 insertions, 0 deletions
diff --git a/glibc/glibc-cvs/arm-memcpy.patch b/glibc/glibc-cvs/arm-memcpy.patch
index e69de29bb2..7fe0040bea 100644
--- a/glibc/glibc-cvs/arm-memcpy.patch
+++ b/glibc/glibc-cvs/arm-memcpy.patch
@@ -0,0 +1,756 @@
+--- /dev/null 2004-02-02 20:32:13.000000000 +0000
++++ sysdeps/arm/memcpy.S 2004-03-20 13:25:27.000000000 +0000
+@@ -0,0 +1,241 @@
++/*
++ * Optimized memcpy implementation for ARM processors
++ *
++ * Author: Nicolas Pitre
++ * Created: Dec 23, 2003
++ * Copyright: (C) MontaVista Software, Inc.
++ *
++ * This file is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * This file is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ */
++
++#include <sysdep.h>
++
++
++/*
++ * Endian independent macros for shifting bytes within registers.
++ */
++#ifndef __ARMEB__
++#define pull lsr
++#define push lsl
++#else
++#define pull lsl
++#define push lsr
++#endif
++
++/*
++ * Enable data preload for architectures that support it (ARMv5 and above)
++ */
++#if defined(__ARM_ARCH_5__) || \
++ defined(__ARM_ARCH_5T__) || \
++ defined(__ARM_ARCH_5TE__)
++#define PLD(code...) code
++#else
++#define PLD(code...)
++#endif
++
++
++/* char * memcpy (char *dst, const char *src) */
++
++ENTRY(memcpy)
++ subs r2, r2, #4
++ stmfd sp!, {r0, r4, lr}
++ blt 7f
++ ands ip, r0, #3
++ PLD( pld [r1, #0] )
++ bne 8f
++ ands ip, r1, #3
++ bne 9f
++
++1: subs r2, r2, #4
++ blt 6f
++ subs r2, r2, #8
++ blt 5f
++ subs r2, r2, #16
++ blt 4f
++
++ PLD( subs r2, r2, #65 )
++ stmfd sp!, {r5 - r8}
++ PLD( blt 3f )
++ PLD( pld [r1, #32] )
++
++ PLD( @ cache alignment )
++ PLD( ands ip, r1, #31 )
++ PLD( pld [r1, #64] )
++ PLD( beq 2f )
++ PLD( rsb ip, ip, #32 )
++ PLD( cmp r2, ip )
++ PLD( pld [r1, #96] )
++ PLD( blt 2f )
++ PLD( cmp ip, #16 )
++ PLD( sub r2, r2, ip )
++ PLD( ldmgeia r1!, {r3 - r6} )
++ PLD( stmgeia r0!, {r3 - r6} )
++ PLD( beq 2f )
++ PLD( and ip, ip, #15 )
++ PLD( cmp ip, #8 )
++ PLD( ldr r3, [r1], #4 )
++ PLD( ldrge r4, [r1], #4 )
++ PLD( ldrgt r5, [r1], #4 )
++ PLD( str r3, [r0], #4 )
++ PLD( strge r4, [r0], #4 )
++ PLD( strgt r5, [r0], #4 )
++
++2: PLD( pld [r1, #96] )
++3: ldmia r1!, {r3 - r8, ip, lr}
++ subs r2, r2, #32
++ stmia r0!, {r3 - r8, ip, lr}
++ bge 2b
++ PLD( cmn r2, #65 )
++ PLD( bge 3b )
++ PLD( add r2, r2, #65 )
++ tst r2, #31
++ ldmfd sp!, {r5 - r8}
++ ldmeqfd sp!, {r0, r4, pc}
++
++ tst r2, #16
++4: ldmneia r1!, {r3, r4, ip, lr}
++ stmneia r0!, {r3, r4, ip, lr}
++
++ tst r2, #8
++5: ldmneia r1!, {r3, r4}
++ stmneia r0!, {r3, r4}
++
++ tst r2, #4
++6: ldrne r3, [r1], #4
++ strne r3, [r0], #4
++
++7: ands r2, r2, #3
++ ldmeqfd sp!, {r0, r4, pc}
++
++ cmp r2, #2
++ ldrb r3, [r1], #1
++ ldrgeb r4, [r1], #1
++ ldrgtb ip, [r1]
++ strb r3, [r0], #1
++ strgeb r4, [r0], #1
++ strgtb ip, [r0]
++ ldmfd sp!, {r0, r4, pc}
++
++8: rsb ip, ip, #4
++ cmp ip, #2
++ ldrb r3, [r1], #1
++ ldrgeb r4, [r1], #1
++ ldrgtb lr, [r1], #1
++ strb r3, [r0], #1
++ strgeb r4, [r0], #1
++ strgtb lr, [r0], #1
++ subs r2, r2, ip
++ blt 7b
++ ands ip, r1, #3
++ beq 1b
++
++9: bic r1, r1, #3
++ cmp ip, #2
++ ldr lr, [r1], #4
++ beq 17f
++ bgt 18f
++
++
++ .macro forward_copy_shift pull push
++
++ cmp r2, #12
++ PLD( pld [r1, #0] )
++ blt 15f
++ subs r2, r2, #28
++ stmfd sp!, {r5 - r9}
++ blt 13f
++
++ PLD( subs r2, r2, #97 )
++ PLD( blt 12f )
++ PLD( pld [r1, #32] )
++
++ PLD( @ cache alignment )
++ PLD( rsb ip, r1, #36 )
++ PLD( pld [r1, #64] )
++ PLD( ands ip, ip, #31 )
++ PLD( pld [r1, #96] )
++ PLD( beq 11f )
++ PLD( cmp r2, ip )
++ PLD( pld [r1, #128] )
++ PLD( blt 11f )
++ PLD( sub r2, r2, ip )
++10: PLD( mov r3, lr, pull #\pull )
++ PLD( ldr lr, [r1], #4 )
++ PLD( subs ip, ip, #4 )
++ PLD( orr r3, r3, lr, push #\push )
++ PLD( str r3, [r0], #4 )
++ PLD( bgt 10b )
++
++11: PLD( pld [r1, #128] )
++12: mov r3, lr, pull #\pull
++ ldmia r1!, {r4 - r9, ip, lr}
++ subs r2, r2, #32
++ orr r3, r3, r4, push #\push
++ mov r4, r4, pull #\pull
++ orr r4, r4, r5, push #\push
++ mov r5, r5, pull #\pull
++ orr r5, r5, r6, push #\push
++ mov r6, r6, pull #\pull
++ orr r6, r6, r7, push #\push
++ mov r7, r7, pull #\pull
++ orr r7, r7, r8, push #\push
++ mov r8, r8, pull #\pull
++ orr r8, r8, r9, push #\push
++ mov r9, r9, pull #\pull
++ orr r9, r9, ip, push #\push
++ mov ip, ip, pull #\pull
++ orr ip, ip, lr, push #\push
++ stmia r0!, {r3 - r9, ip}
++ bge 11b
++ PLD( cmn r2, #97 )
++ PLD( bge 12b )
++ PLD( add r2, r2, #97 )
++ cmn r2, #16
++ blt 14f
++13: mov r3, lr, pull #\pull
++ ldmia r1!, {r4 - r6, lr}
++ sub r2, r2, #16
++ orr r3, r3, r4, push #\push
++ mov r4, r4, pull #\pull
++ orr r4, r4, r5, push #\push
++ mov r5, r5, pull #\pull
++ orr r5, r5, r6, push #\push
++ mov r6, r6, pull #\pull
++ orr r6, r6, lr, push #\push
++ stmia r0!, {r3 - r6}
++14: adds r2, r2, #28
++ ldmfd sp!, {r5 - r9}
++ blt 16f
++15: mov r3, lr, pull #\pull
++ ldr lr, [r1], #4
++ subs r2, r2, #4
++ orr r3, r3, lr, push #\push
++ str r3, [r0], #4
++ bge 15b
++16:
++ .endm
++
++
++ forward_copy_shift pull=8 push=24
++ sub r1, r1, #3
++ b 7b
++
++17: forward_copy_shift pull=16 push=16
++ sub r1, r1, #2
++ b 7b
++
++18: forward_copy_shift pull=24 push=8
++ sub r1, r1, #1
++ b 7b
++
++ .size memcpy, . - memcpy
++END(memcpy)
++libc_hidden_builtin_def (memcpy)
+--- /dev/null 2004-02-02 20:32:13.000000000 +0000
++++ sysdeps/arm/memmove.S 2004-03-20 18:37:23.000000000 +0000
+@@ -0,0 +1,251 @@
++/*
++ * Optimized memmove implementation for ARM processors
++ *
++ * Author: Nicolas Pitre
++ * Created: Dec 23, 2003
++ * Copyright: (C) MontaVista Software, Inc.
++ *
++ * This file is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * This file is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ */
++
++#include <sysdep.h>
++
++
++/*
++ * Endian independent macros for shifting bytes within registers.
++ */
++#ifndef __ARMEB__
++#define pull lsr
++#define push lsl
++#else
++#define pull lsl
++#define push lsr
++#endif
++
++/*
++ * Enable data preload for architectures that support it (ARMv5 and above)
++ */
++#if defined(__ARM_ARCH_5__) || \
++ defined(__ARM_ARCH_5T__) || \
++ defined(__ARM_ARCH_5TE__)
++#define PLD(code...) code
++#else
++#define PLD(code...)
++#endif
++
++
++/* char * memmove (char *dst, const char *src) */
++ENTRY(memmove)
++ subs ip, r0, r1
++ cmphi r2, ip
++ bls memcpy(PLT)
++
++ stmfd sp!, {r0, r4, lr}
++ add r1, r1, r2
++ add r0, r0, r2
++ subs r2, r2, #4
++ blt 25f
++ ands ip, r0, #3
++ PLD( pld [r1, #-4] )
++ bne 26f
++ ands ip, r1, #3
++ bne 27f
++
++19: subs r2, r2, #4
++ blt 24f
++ subs r2, r2, #8
++ blt 23f
++ subs r2, r2, #16
++ blt 22f
++
++ PLD( pld [r1, #-32] )
++ PLD( subs r2, r2, #96 )
++ stmfd sp!, {r5 - r8}
++ PLD( blt 21f )
++
++ PLD( @ cache alignment )
++ PLD( ands ip, r1, #31 )
++ PLD( pld [r1, #-64] )
++ PLD( beq 20f )
++ PLD( cmp r2, ip )
++ PLD( pld [r1, #-96] )
++ PLD( blt 20f )
++ PLD( cmp ip, #16 )
++ PLD( sub r2, r2, ip )
++ PLD( ldmgedb r1!, {r3 - r6} )
++ PLD( stmgedb r0!, {r3 - r6} )
++ PLD( beq 20f )
++ PLD( and ip, ip, #15 )
++ PLD( cmp ip, #8 )
++ PLD( ldr r3, [r1, #-4]! )
++ PLD( ldrge r4, [r1, #-4]! )
++ PLD( ldrgt r5, [r1, #-4]! )
++ PLD( str r3, [r0, #-4]! )
++ PLD( strge r4, [r0, #-4]! )
++ PLD( strgt r5, [r0, #-4]! )
++
++20: PLD( pld [r1, #-96] )
++ PLD( pld [r1, #-128] )
++21: ldmdb r1!, {r3, r4, ip, lr}
++ subs r2, r2, #32
++ stmdb r0!, {r3, r4, ip, lr}
++ ldmdb r1!, {r3, r4, ip, lr}
++ stmgedb r0!, {r3, r4, ip, lr}
++ ldmgedb r1!, {r3, r4, ip, lr}
++ stmgedb r0!, {r3, r4, ip, lr}
++ ldmgedb r1!, {r3, r4, ip, lr}
++ subges r2, r2, #32
++ stmdb r0!, {r3, r4, ip, lr}
++ bge 20b
++ PLD( cmn r2, #96 )
++ PLD( bge 21b )
++ PLD( add r2, r2, #96 )
++ tst r2, #31
++ ldmfd sp!, {r5 - r8}
++ ldmeqfd sp!, {r0, r4, pc}
++
++ tst r2, #16
++22: ldmnedb r1!, {r3, r4, ip, lr}
++ stmnedb r0!, {r3, r4, ip, lr}
++
++ tst r2, #8
++23: ldmnedb r1!, {r3, r4}
++ stmnedb r0!, {r3, r4}
++
++ tst r2, #4
++24: ldrne r3, [r1, #-4]!
++ strne r3, [r0, #-4]!
++
++25: ands r2, r2, #3
++ ldmeqfd sp!, {r0, r4, pc}
++
++ cmp r2, #2
++ ldrb r3, [r1, #-1]
++ ldrgeb r4, [r1, #-2]
++ ldrgtb ip, [r1, #-3]
++ strb r3, [r0, #-1]
++ strgeb r4, [r0, #-2]
++ strgtb ip, [r0, #-3]
++ ldmfd sp!, {r0, r4, pc}
++
++26: cmp ip, #2
++ ldrb r3, [r1, #-1]!
++ ldrgeb r4, [r1, #-1]!
++ ldrgtb lr, [r1, #-1]!
++ strb r3, [r0, #-1]!
++ strgeb r4, [r0, #-1]!
++ strgtb lr, [r0, #-1]!
++ subs r2, r2, ip
++ blt 25b
++ ands ip, r1, #3
++ beq 19b
++
++27: bic r1, r1, #3
++ cmp ip, #2
++ ldr r3, [r1]
++ beq 35f
++ blt 36f
++
++
++ .macro backward_copy_shift push pull
++
++ cmp r2, #12
++ PLD( pld [r1, #-4] )
++ blt 33f
++ subs r2, r2, #28
++ stmfd sp!, {r5 - r9}
++ blt 31f
++
++ PLD( subs r2, r2, #96 )
++ PLD( pld [r1, #-32] )
++ PLD( blt 30f )
++ PLD( pld [r1, #-64] )
++
++ PLD( @ cache alignment )
++ PLD( ands ip, r1, #31 )
++ PLD( pld [r1, #-96] )
++ PLD( beq 29f )
++ PLD( cmp r2, ip )
++ PLD( pld [r1, #-128] )
++ PLD( blt 29f )
++ PLD( sub r2, r2, ip )
++28: PLD( mov r4, r3, push #\push )
++ PLD( ldr r3, [r1, #-4]! )
++ PLD( subs ip, ip, #4 )
++ PLD( orr r4, r4, r3, pull #\pull )
++ PLD( str r4, [r0, #-4]! )
++ PLD( bgt 28b )
++
++29: PLD( pld [r1, #-128] )
++30: mov lr, r3, push #\push
++ ldmdb r1!, {r3 - r9, ip}
++ subs r2, r2, #32
++ orr lr, lr, ip, pull #\pull
++ mov ip, ip, push #\push
++ orr ip, ip, r9, pull #\pull
++ mov r9, r9, push #\push
++ orr r9, r9, r8, pull #\pull
++ mov r8, r8, push #\push
++ orr r8, r8, r7, pull #\pull
++ mov r7, r7, push #\push
++ orr r7, r7, r6, pull #\pull
++ mov r6, r6, push #\push
++ orr r6, r6, r5, pull #\pull
++ mov r5, r5, push #\push
++ orr r5, r5, r4, pull #\pull
++ mov r4, r4, push #\push
++ orr r4, r4, r3, pull #\pull
++ stmdb r0!, {r4 - r9, ip, lr}
++ bge 29b
++ PLD( cmn r2, #96 )
++ PLD( bge 30b )
++ PLD( add r2, r2, #96 )
++ cmn r2, #16
++ blt 32f
++31: mov r7, r3, push #\push
++ ldmdb r1!, {r3 - r6}
++ sub r2, r2, #16
++ orr r7, r7, r6, pull #\pull
++ mov r6, r6, push #\push
++ orr r6, r6, r5, pull #\pull
++ mov r5, r5, push #\push
++ orr r5, r5, r4, pull #\pull
++ mov r4, r4, push #\push
++ orr r4, r4, r3, pull #\pull
++ stmdb r0!, {r4 - r7}
++32: adds r2, r2, #28
++ ldmfd sp!, {r5 - r9}
++ blt 34f
++33: mov r4, r3, push #\push
++ ldr r3, [r1, #-4]!
++ subs r2, r2, #4
++ orr r4, r4, r3, pull #\pull
++ str r4, [r0, #-4]!
++ bge 33b
++34:
++ .endm
++
++
++ backward_copy_shift push=8 pull=24
++ add r1, r1, #3
++ b 25b
++
++35: backward_copy_shift push=16 pull=16
++ add r1, r1, #2
++ b 25b
++
++36: backward_copy_shift push=24 pull=8
++ add r1, r1, #1
++ b 25b
++
++ .size memmove, . - memmove
++END(memmove)
++libc_hidden_builtin_def (memmove)
+--- /dev/null 2004-02-02 20:32:13.000000000 +0000
++++ sysdeps/arm/bcopy.S 2004-03-20 18:37:48.000000000 +0000
+@@ -0,0 +1,255 @@
++/*
++ * Optimized memmove implementation for ARM processors
++ *
++ * Author: Nicolas Pitre
++ * Created: Dec 23, 2003
++ * Copyright: (C) MontaVista Software, Inc.
++ *
++ * This file is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * This file is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ */
++
++#include <sysdep.h>
++
++
++/*
++ * Endian independent macros for shifting bytes within registers.
++ */
++#ifndef __ARMEB__
++#define pull lsr
++#define push lsl
++#else
++#define pull lsl
++#define push lsr
++#endif
++
++/*
++ * Enable data preload for architectures that support it (ARMv5 and above)
++ */
++#if defined(__ARM_ARCH_5__) || \
++ defined(__ARM_ARCH_5T__) || \
++ defined(__ARM_ARCH_5TE__)
++#define PLD(code...) code
++#else
++#define PLD(code...)
++#endif
++
++dst .req r1
++src .req r0
++
++/* void *bcopy (const char *src, char *dst, size_t size) */
++ENTRY(bcopy)
++ subs ip, dst, src
++ cmphi r2, ip
++ movls r3, r0
++ movls r0, r1
++ movls r1, r3
++ bls memcpy(PLT)
++
++ stmfd sp!, {r4, lr}
++ add src, src, r2
++ add dst, dst, r2
++ subs r2, r2, #4
++ blt 25f
++ ands ip, dst, #3
++ PLD( pld [src, #-4] )
++ bne 26f
++ ands ip, src, #3
++ bne 27f
++
++19: subs r2, r2, #4
++ blt 24f
++ subs r2, r2, #8
++ blt 23f
++ subs r2, r2, #16
++ blt 22f
++
++ PLD( pld [src, #-32] )
++ PLD( subs r2, r2, #96 )
++ stmfd sp!, {r5 - r8}
++ PLD( blt 21f )
++
++ PLD( @ cache alignment )
++ PLD( ands ip, src, #31 )
++ PLD( pld [src, #-64] )
++ PLD( beq 20f )
++ PLD( cmp r2, ip )
++ PLD( pld [src, #-96] )
++ PLD( blt 20f )
++ PLD( cmp ip, #16 )
++ PLD( sub r2, r2, ip )
++ PLD( ldmgedb src!, {r3 - r6} )
++ PLD( stmgedb dst!, {r3 - r6} )
++ PLD( beq 20f )
++ PLD( and ip, ip, #15 )
++ PLD( cmp ip, #8 )
++ PLD( ldr r3, [src, #-4]! )
++ PLD( ldrge r4, [src, #-4]! )
++ PLD( ldrgt r5, [src, #-4]! )
++ PLD( str r3, [dst, #-4]! )
++ PLD( strge r4, [dst, #-4]! )
++ PLD( strgt r5, [dst, #-4]! )
++
++20: PLD( pld [src, #-96] )
++ PLD( pld [src, #-128] )
++21: ldmdb src!, {r3, r4, ip, lr}
++ subs r2, r2, #32
++ stmdb dst!, {r3, r4, ip, lr}
++ ldmdb src!, {r3, r4, ip, lr}
++ stmgedb dst!, {r3, r4, ip, lr}
++ ldmgedb src!, {r3, r4, ip, lr}
++ stmgedb dst!, {r3, r4, ip, lr}
++ ldmgedb src!, {r3, r4, ip, lr}
++ subges r2, r2, #32
++ stmdb dst!, {r3, r4, ip, lr}
++ bge 20b
++ PLD( cmn r2, #96 )
++ PLD( bge 21b )
++ PLD( add r2, r2, #96 )
++ tst r2, #31
++ ldmfd sp!, {r5 - r8}
++ ldmeqfd sp!, {r4, pc}
++
++ tst r2, #16
++22: ldmnedb src!, {r3, r4, ip, lr}
++ stmnedb dst!, {r3, r4, ip, lr}
++
++ tst r2, #8
++23: ldmnedb src!, {r3, r4}
++ stmnedb dst!, {r3, r4}
++
++ tst r2, #4
++24: ldrne r3, [src, #-4]!
++ strne r3, [dst, #-4]!
++
++25: ands r2, r2, #3
++ ldmeqfd sp!, {dst, r4, pc}
++
++ cmp r2, #2
++ ldrb r3, [src, #-1]
++ ldrgeb r4, [src, #-2]
++ ldrgtb ip, [src, #-3]
++ strb r3, [dst, #-1]
++ strgeb r4, [dst, #-2]
++ strgtb ip, [dst, #-3]
++ ldmfd sp!, {dst, r4, pc}
++
++26: cmp ip, #2
++ ldrb r3, [src, #-1]!
++ ldrgeb r4, [src, #-1]!
++ ldrgtb lr, [src, #-1]!
++ strb r3, [dst, #-1]!
++ strgeb r4, [dst, #-1]!
++ strgtb lr, [dst, #-1]!
++ subs r2, r2, ip
++ blt 25b
++ ands ip, src, #3
++ beq 19b
++
++27: bic src, src, #3
++ cmp ip, #2
++ ldr r3, [src]
++ beq 35f
++ blt 36f
++
++
++ .macro backward_copy_shift push pull
++
++ cmp r2, #12
++ PLD( pld [src, #-4] )
++ blt 33f
++ subs r2, r2, #28
++ stmfd sp!, {r5 - r9}
++ blt 31f
++
++ PLD( subs r2, r2, #96 )
++ PLD( pld [src, #-32] )
++ PLD( blt 30f )
++ PLD( pld [src, #-64] )
++
++ PLD( @ cache alignment )
++ PLD( ands ip, src, #31 )
++ PLD( pld [src, #-96] )
++ PLD( beq 29f )
++ PLD( cmp r2, ip )
++ PLD( pld [src, #-128] )
++ PLD( blt 29f )
++ PLD( sub r2, r2, ip )
++28: PLD( mov r4, r3, push #\push )
++ PLD( ldr r3, [src, #-4]! )
++ PLD( subs ip, ip, #4 )
++ PLD( orr r4, r4, r3, pull #\pull )
++ PLD( str r4, [dst, #-4]! )
++ PLD( bgt 28b )
++
++29: PLD( pld [src, #-128] )
++30: mov lr, r3, push #\push
++ ldmdb src!, {r3 - r9, ip}
++ subs r2, r2, #32
++ orr lr, lr, ip, pull #\pull
++ mov ip, ip, push #\push
++ orr ip, ip, r9, pull #\pull
++ mov r9, r9, push #\push
++ orr r9, r9, r8, pull #\pull
++ mov r8, r8, push #\push
++ orr r8, r8, r7, pull #\pull
++ mov r7, r7, push #\push
++ orr r7, r7, r6, pull #\pull
++ mov r6, r6, push #\push
++ orr r6, r6, r5, pull #\pull
++ mov r5, r5, push #\push
++ orr r5, r5, r4, pull #\pull
++ mov r4, r4, push #\push
++ orr r4, r4, r3, pull #\pull
++ stmdb dst!, {r4 - r9, ip, lr}
++ bge 29b
++ PLD( cmn r2, #96 )
++ PLD( bge 30b )
++ PLD( add r2, r2, #96 )
++ cmn r2, #16
++ blt 32f
++31: mov r7, r3, push #\push
++ ldmdb src!, {r3 - r6}
++ sub r2, r2, #16
++ orr r7, r7, r6, pull #\pull
++ mov r6, r6, push #\push
++ orr r6, r6, r5, pull #\pull
++ mov r5, r5, push #\push
++ orr r5, r5, r4, pull #\pull
++ mov r4, r4, push #\push
++ orr r4, r4, r3, pull #\pull
++ stmdb dst!, {r4 - r7}
++32: adds r2, r2, #28
++ ldmfd sp!, {r5 - r9}
++ blt 34f
++33: mov r4, r3, push #\push
++ ldr r3, [src, #-4]!
++ subs r2, r2, #4
++ orr r4, r4, r3, pull #\pull
++ str r4, [dst, #-4]!
++ bge 33b
++34:
++ .endm
++
++
++ backward_copy_shift push=8 pull=24
++ add src, src, #3
++ b 25b
++
++35: backward_copy_shift push=16 pull=16
++ add src, src, #2
++ b 25b
++
++36: backward_copy_shift push=24 pull=8
++ add src, src, #1
++ b 25b
++
++ .size bcopy, . - bcopy
++END(bcopy)