--- /dev/null	2004-02-02 20:32:13.000000000 +0000
+++ sysdeps/arm/memcpy.S	2004-03-20 13:25:27.000000000 +0000
@@ -0,0 +1,241 @@
+/*
+ *   Optimized memcpy implementation for ARM processors
+ *
+ *	Author: 	Nicolas Pitre
+ *	Created:	Dec 23, 2003
+ *	Copyright:	(C) MontaVista Software, Inc.
+ *
+ *   This file is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU Lesser General Public
+ *   License as published by the Free Software Foundation; either
+ *   version 2.1 of the License, or (at your option) any later version.
+ *
+ *   This file is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   Lesser General Public License for more details.
+ */
+
+#include <sysdep.h>
+
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#else
+#define pull            lsl
+#define push            lsr
+#endif
+
+/*
+ * Enable data preload for architectures that support it (ARMv5 and above)
+ */
+#if defined(__ARM_ARCH_5__) || \
+    defined(__ARM_ARCH_5T__) || \
+    defined(__ARM_ARCH_5TE__)
+#define PLD(code...)	code
+#else
+#define PLD(code...)
+#endif
+
+
+/* char * memcpy (char *dst, const char *src) */
+
+ENTRY(memcpy)
+		subs	r2, r2, #4
+		stmfd	sp!, {r0, r4, lr}
+		blt	7f
+		ands	ip, r0, #3
+	PLD(	pld	[r1, #0]		)
+		bne	8f
+		ands	ip, r1, #3
+		bne	9f
+
+1:		subs	r2, r2, #4
+		blt	6f
+		subs	r2, r2, #8
+		blt	5f
+		subs	r2, r2, #16
+		blt	4f
+
+	PLD(	subs	r2, r2, #65		)
+		stmfd	sp!, {r5 - r8}
+	PLD(	blt	3f			)
+	PLD(	pld	[r1, #32]		)
+
+	PLD(	@ cache alignment		)
+	PLD(	ands	ip, r1, #31		)
+	PLD(	pld	[r1, #64]		)
+	PLD(	beq	2f			)
+	PLD(	rsb	ip, ip, #32		)
+	PLD(	cmp	r2, ip			)
+	PLD(	pld	[r1, #96]		)
+	PLD(	blt	2f			)
+	PLD(	cmp	ip, #16			)
+	PLD(	sub	r2, r2, ip		)
+	PLD(	ldmgeia	r1!, {r3 - r6}		)
+	PLD(	stmgeia	r0!, {r3 - r6}		)
+	PLD(	beq	2f			)
+	PLD(	and	ip, ip, #15		)
+	PLD(	cmp	ip, #8			)
+	PLD(	ldr	r3, [r1], #4		)
+	PLD(	ldrge	r4, [r1], #4		)
+	PLD(	ldrgt	r5, [r1], #4		)
+	PLD(	str	r3, [r0], #4		)
+	PLD(	strge	r4, [r0], #4		)
+	PLD(	strgt	r5, [r0], #4		)
+
+2:	PLD(	pld	[r1, #96]		)
+3:		ldmia	r1!, {r3 - r8, ip, lr}
+		subs	r2, r2, #32
+		stmia	r0!, {r3 - r8, ip, lr}
+		bge	2b
+	PLD(	cmn	r2, #65			)
+	PLD(	bge	3b			)
+	PLD(	add	r2, r2, #65		)
+		tst	r2, #31
+		ldmfd	sp!, {r5 - r8}
+		ldmeqfd	sp!, {r0, r4, pc}
+
+		tst	r2, #16
+4:		ldmneia	r1!, {r3, r4, ip, lr}
+		stmneia	r0!, {r3, r4, ip, lr}
+
+		tst	r2, #8
+5:		ldmneia	r1!, {r3, r4}
+		stmneia	r0!, {r3, r4}
+
+		tst	r2, #4
+6:		ldrne	r3, [r1], #4
+		strne	r3, [r0], #4
+
+7:		ands	r2, r2, #3
+		ldmeqfd	sp!, {r0, r4, pc}
+
+		cmp	r2, #2
+		ldrb	r3, [r1], #1
+		ldrgeb	r4, [r1], #1
+		ldrgtb	ip, [r1]
+		strb	r3, [r0], #1
+		strgeb	r4, [r0], #1
+		strgtb	ip, [r0]
+		ldmfd	sp!, {r0, r4, pc}
+
+8:		rsb	ip, ip, #4
+		cmp	ip, #2
+		ldrb	r3, [r1], #1
+		ldrgeb	r4, [r1], #1
+		ldrgtb	lr, [r1], #1
+		strb	r3, [r0], #1
+		strgeb	r4, [r0], #1
+		strgtb	lr, [r0], #1
+		subs	r2, r2, ip
+		blt	7b
+		ands	ip, r1, #3
+		beq	1b
+
+9:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr	lr, [r1], #4
+		beq	17f
+		bgt	18f
+
+
+		.macro	forward_copy_shift pull push
+
+		cmp	r2, #12
+	PLD(	pld	[r1, #0]		)
+		blt	15f
+		subs	r2, r2, #28
+		stmfd	sp!, {r5 - r9}
+		blt	13f
+
+	PLD(	subs	r2, r2, #97		)
+	PLD(	blt	12f			)
+	PLD(	pld	[r1, #32]		)
+
+	PLD(	@ cache alignment		)
+	PLD(	rsb	ip, r1, #36		)
+	PLD(	pld	[r1, #64]		)
+	PLD(	ands	ip, ip, #31		)
+	PLD(	pld	[r1, #96]		)
+	PLD(	beq	11f			)
+	PLD(	cmp	r2, ip			)
+	PLD(	pld	[r1, #128]		)
+	PLD(	blt	11f			)
+	PLD(	sub	r2, r2, ip		)
+10:	PLD(	mov	r3, lr, pull #\pull	)
+	PLD(	ldr	lr, [r1], #4		)
+	PLD(	subs	ip, ip, #4		)
+	PLD(	orr	r3, r3, lr, push #\push	)
+	PLD(	str	r3, [r0], #4		)
+	PLD(	bgt	10b			)
+
+11:	PLD(	pld	[r1, #128]		)
+12:		mov	r3, lr, pull #\pull
+		ldmia	r1!, {r4 - r9, ip, lr}
+		subs	r2, r2, #32
+		orr	r3, r3, r4, push #\push
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, r7, push #\push
+		mov	r7, r7, pull #\pull
+		orr	r7, r7, r8, push #\push
+		mov	r8, r8, pull #\pull
+		orr	r8, r8, r9, push #\push
+		mov	r9, r9, pull #\pull
+		orr	r9, r9, ip, push #\push
+		mov	ip, ip, pull #\pull
+		orr	ip, ip, lr, push #\push
+		stmia	r0!, {r3 - r9, ip}
+		bge	11b
+	PLD(	cmn	r2, #97			)
+	PLD(	bge	12b			)
+	PLD(	add	r2, r2, #97		)
+		cmn	r2, #16
+		blt	14f
+13:		mov	r3, lr, pull #\pull
+		ldmia	r1!, {r4 - r6, lr}
+		sub	r2, r2, #16
+		orr	r3, r3, r4, push #\push
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, lr, push #\push
+		stmia	r0!, {r3 - r6}
+14:		adds	r2, r2, #28
+		ldmfd	sp!, {r5 - r9}
+		blt	16f
+15:		mov	r3, lr, pull #\pull
+		ldr	lr, [r1], #4
+		subs	r2, r2, #4
+		orr	r3, r3, lr, push #\push
+		str	r3, [r0], #4
+		bge	15b
+16:
+		.endm
+
+
+		forward_copy_shift	pull=8	push=24
+		sub	r1, r1, #3
+		b	7b
+
+17:		forward_copy_shift	pull=16	push=16
+		sub	r1, r1, #2
+		b	7b
+
+18:		forward_copy_shift	pull=24	push=8
+		sub	r1, r1, #1
+		b	7b
+
+		.size	memcpy, . - memcpy
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
--- /dev/null	2004-02-02 20:32:13.000000000 +0000
+++ sysdeps/arm/memmove.S	2004-03-20 18:37:23.000000000 +0000
@@ -0,0 +1,251 @@
+/*
+ *   Optimized memmove implementation for ARM processors
+ *
+ *	Author: 	Nicolas Pitre
+ *	Created:	Dec 23, 2003
+ *	Copyright:	(C) MontaVista Software, Inc.
+ *
+ *   This file is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU Lesser General Public
+ *   License as published by the Free Software Foundation; either
+ *   version 2.1 of the License, or (at your option) any later version.
+ *
+ *   This file is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   Lesser General Public License for more details.
+ */
+
+#include <sysdep.h>
+
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#else
+#define pull            lsl
+#define push            lsr
+#endif
+
+/*
+ * Enable data preload for architectures that support it (ARMv5 and above)
+ */
+#if defined(__ARM_ARCH_5__) || \
+    defined(__ARM_ARCH_5T__) || \
+    defined(__ARM_ARCH_5TE__)
+#define PLD(code...)	code
+#else
+#define PLD(code...)
+#endif
+
+
+/* char * memmove (char *dst, const char *src) */
+ENTRY(memmove)
+		subs	ip, r0, r1
+		cmphi	r2, ip
+		bls	memcpy(PLT)
+
+		stmfd	sp!, {r0, r4, lr}
+		add	r1, r1, r2
+		add	r0, r0, r2
+		subs	r2, r2, #4
+		blt	25f
+		ands	ip, r0, #3
+	PLD(	pld	[r1, #-4]		)
+		bne	26f
+		ands	ip, r1, #3
+		bne	27f
+
+19:		subs	r2, r2, #4
+		blt	24f
+		subs	r2, r2, #8
+		blt	23f
+		subs	r2, r2, #16
+		blt	22f
+
+	PLD(	pld	[r1, #-32]		)
+	PLD(	subs	r2, r2, #96		)
+		stmfd	sp!, {r5 - r8}
+	PLD(	blt	21f			)
+
+	PLD(	@ cache alignment		)
+	PLD(	ands	ip, r1, #31		)
+	PLD(	pld	[r1, #-64]		)
+	PLD(	beq	20f			)
+	PLD(	cmp	r2, ip			)
+	PLD(	pld	[r1, #-96]		)
+	PLD(	blt	20f			)
+	PLD(	cmp	ip, #16			)
+	PLD(	sub	r2, r2, ip		)
+	PLD(	ldmgedb	r1!, {r3 - r6}		)
+	PLD(	stmgedb	r0!, {r3 - r6}		)
+	PLD(	beq	20f			)
+	PLD(	and	ip, ip, #15		)
+	PLD(	cmp	ip, #8			)
+	PLD(	ldr	r3, [r1, #-4]!		)
+	PLD(	ldrge	r4, [r1, #-4]!		)
+	PLD(	ldrgt	r5, [r1, #-4]!		)
+	PLD(	str	r3, [r0, #-4]!		)
+	PLD(	strge	r4, [r0, #-4]!		)
+	PLD(	strgt	r5, [r0, #-4]!		)
+
+20:	PLD(	pld	[r1, #-96]		)
+	PLD(	pld	[r1, #-128]		)
+21:		ldmdb	r1!, {r3, r4, ip, lr}
+		subs	r2, r2, #32
+		stmdb	r0!, {r3, r4, ip, lr}
+		ldmdb	r1!, {r3, r4, ip, lr}
+		stmgedb	r0!, {r3, r4, ip, lr}
+		ldmgedb	r1!, {r3, r4, ip, lr}
+		stmgedb	r0!, {r3, r4, ip, lr}
+		ldmgedb	r1!, {r3, r4, ip, lr}
+		subges	r2, r2, #32
+		stmdb	r0!, {r3, r4, ip, lr}
+		bge	20b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	21b			)
+	PLD(	add	r2, r2, #96		)
+		tst	r2, #31
+		ldmfd	sp!, {r5 - r8}
+		ldmeqfd	sp!, {r0, r4, pc}
+
+		tst	r2, #16
+22:		ldmnedb	r1!, {r3, r4, ip, lr}
+		stmnedb	r0!, {r3, r4, ip, lr}
+
+		tst	r2, #8
+23:		ldmnedb	r1!, {r3, r4}
+		stmnedb	r0!, {r3, r4}
+
+		tst	r2, #4
+24:		ldrne	r3, [r1, #-4]!
+		strne	r3, [r0, #-4]!
+
+25:		ands	r2, r2, #3
+		ldmeqfd	sp!, {r0, r4, pc}
+
+		cmp	r2, #2
+		ldrb	r3, [r1, #-1]
+		ldrgeb	r4, [r1, #-2]
+		ldrgtb	ip, [r1, #-3]
+		strb	r3, [r0, #-1]
+		strgeb	r4, [r0, #-2]
+		strgtb	ip, [r0, #-3]
+		ldmfd	sp!, {r0, r4, pc}
+
+26:		cmp	ip, #2
+		ldrb	r3, [r1, #-1]!
+		ldrgeb	r4, [r1, #-1]!
+		ldrgtb	lr, [r1, #-1]!
+		strb	r3, [r0, #-1]!
+		strgeb	r4, [r0, #-1]!
+		strgtb	lr, [r0, #-1]!
+		subs	r2, r2, ip
+		blt	25b
+		ands	ip, r1, #3
+		beq	19b
+
+27:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr	r3, [r1]
+		beq	35f
+		blt	36f
+
+
+		.macro	backward_copy_shift push pull
+
+		cmp	r2, #12
+	PLD(	pld	[r1, #-4]		)
+		blt	33f
+		subs	r2, r2, #28
+		stmfd	sp!, {r5 - r9}
+		blt	31f
+
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #-32]		)
+	PLD(	blt	30f			)
+	PLD(	pld	[r1, #-64]		)
+
+	PLD(	@ cache alignment		)
+	PLD(	ands	ip, r1, #31		)
+	PLD(	pld	[r1, #-96]		)
+	PLD(	beq	29f			)
+	PLD(	cmp	r2, ip			)
+	PLD(	pld	[r1, #-128]		)
+	PLD(	blt	29f			)
+	PLD(	sub	r2, r2, ip		)
+28:	PLD(	mov	r4, r3, push #\push	)
+	PLD(	ldr	r3, [r1, #-4]!		)
+	PLD(	subs	ip, ip, #4		)
+	PLD(	orr	r4, r4, r3, pull #\pull	)
+	PLD(	str	r4, [r0, #-4]!		)
+	PLD(	bgt	28b			)
+
+29:	PLD(	pld	[r1, #-128]		)
+30:		mov	lr, r3, push #\push
+		ldmdb	r1!, {r3 - r9, ip}
+		subs	r2, r2, #32
+		orr	lr, lr, ip, pull #\pull
+		mov	ip, ip, push #\push
+		orr	ip, ip, r9, pull #\pull
+		mov	r9, r9, push #\push
+		orr	r9, r9, r8, pull #\pull
+		mov	r8, r8, push #\push
+		orr	r8, r8, r7, pull #\pull
+		mov	r7, r7, push #\push
+		orr	r7, r7, r6, pull #\pull
+		mov	r6, r6, push #\push
+		orr	r6, r6, r5, pull #\pull
+		mov	r5, r5, push #\push
+		orr	r5, r5, r4, pull #\pull
+		mov	r4, r4, push #\push
+		orr	r4, r4, r3, pull #\pull
+		stmdb	r0!, {r4 - r9, ip, lr}
+		bge	29b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	30b			)
+	PLD(	add	r2, r2, #96		)
+		cmn	r2, #16
+		blt	32f
+31:		mov	r7, r3, push #\push
+		ldmdb	r1!, {r3 - r6}
+		sub	r2, r2, #16
+		orr	r7, r7, r6, pull #\pull
+		mov	r6, r6, push #\push
+		orr	r6, r6, r5, pull #\pull
+		mov	r5, r5, push #\push
+		orr	r5, r5, r4, pull #\pull
+		mov	r4, r4, push #\push
+		orr	r4, r4, r3, pull #\pull
+		stmdb	r0!, {r4 - r7}
+32:		adds	r2, r2, #28
+		ldmfd	sp!, {r5 - r9}
+		blt	34f
+33:		mov	r4, r3, push #\push
+		ldr	r3, [r1, #-4]!
+		subs	r2, r2, #4
+		orr	r4, r4, r3, pull #\pull
+		str	r4, [r0, #-4]!
+		bge	33b
+34:
+		.endm
+
+
+		backward_copy_shift	push=8	pull=24
+		add	r1, r1, #3
+		b	25b
+
+35:		backward_copy_shift	push=16	pull=16
+		add	r1, r1, #2
+		b	25b
+
+36:		backward_copy_shift	push=24	pull=8
+		add	r1, r1, #1
+		b	25b
+
+		.size	memmove, . - memmove
+END(memmove)
+libc_hidden_builtin_def (memmove)
--- /dev/null	2004-02-02 20:32:13.000000000 +0000
+++ sysdeps/arm/bcopy.S	2004-03-20 18:37:48.000000000 +0000
@@ -0,0 +1,255 @@
+/*
+ *   Optimized memmove implementation for ARM processors
+ *
+ *	Author: 	Nicolas Pitre
+ *	Created:	Dec 23, 2003
+ *	Copyright:	(C) MontaVista Software, Inc.
+ *
+ *   This file is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU Lesser General Public
+ *   License as published by the Free Software Foundation; either
+ *   version 2.1 of the License, or (at your option) any later version.
+ *
+ *   This file is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   Lesser General Public License for more details.
+ */
+
+#include <sysdep.h>
+
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#else
+#define pull            lsl
+#define push            lsr
+#endif
+
+/*
+ * Enable data preload for architectures that support it (ARMv5 and above)
+ */
+#if defined(__ARM_ARCH_5__) || \
+    defined(__ARM_ARCH_5T__) || \
+    defined(__ARM_ARCH_5TE__)
+#define PLD(code...)	code
+#else
+#define PLD(code...)
+#endif
+
+dst		.req	r1
+src		.req	r0
+
+/* void *bcopy (const char *src, char *dst, size_t size) */
+ENTRY(bcopy)
+		subs	ip, dst, src
+		cmphi	r2, ip
+		movls	r3, r0
+		movls	r0, r1
+		movls	r1, r3
+		bls	memcpy(PLT)
+
+		stmfd	sp!, {r4, lr}
+		add	src, src, r2
+		add	dst, dst, r2
+		subs	r2, r2, #4
+		blt	25f
+		ands	ip, dst, #3
+	PLD(	pld	[src, #-4]		)
+		bne	26f
+		ands	ip, src, #3
+		bne	27f
+
+19:		subs	r2, r2, #4
+		blt	24f
+		subs	r2, r2, #8
+		blt	23f
+		subs	r2, r2, #16
+		blt	22f
+
+	PLD(	pld	[src, #-32]		)
+	PLD(	subs	r2, r2, #96		)
+		stmfd	sp!, {r5 - r8}
+	PLD(	blt	21f			)
+
+	PLD(	@ cache alignment		)
+	PLD(	ands	ip, src, #31		)
+	PLD(	pld	[src, #-64]		)
+	PLD(	beq	20f			)
+	PLD(	cmp	r2, ip			)
+	PLD(	pld	[src, #-96]		)
+	PLD(	blt	20f			)
+	PLD(	cmp	ip, #16			)
+	PLD(	sub	r2, r2, ip		)
+	PLD(	ldmgedb	src!, {r3 - r6}		)
+	PLD(	stmgedb	dst!, {r3 - r6}		)
+	PLD(	beq	20f			)
+	PLD(	and	ip, ip, #15		)
+	PLD(	cmp	ip, #8			)
+	PLD(	ldr	r3, [src, #-4]!		)
+	PLD(	ldrge	r4, [src, #-4]!		)
+	PLD(	ldrgt	r5, [src, #-4]!		)
+	PLD(	str	r3, [dst, #-4]!		)
+	PLD(	strge	r4, [dst, #-4]!		)
+	PLD(	strgt	r5, [dst, #-4]!		)
+
+20:	PLD(	pld	[src, #-96]		)
+	PLD(	pld	[src, #-128]		)
+21:		ldmdb	src!, {r3, r4, ip, lr}
+		subs	r2, r2, #32
+		stmdb	dst!, {r3, r4, ip, lr}
+		ldmdb	src!, {r3, r4, ip, lr}
+		stmgedb	dst!, {r3, r4, ip, lr}
+		ldmgedb	src!, {r3, r4, ip, lr}
+		stmgedb	dst!, {r3, r4, ip, lr}
+		ldmgedb	src!, {r3, r4, ip, lr}
+		subges	r2, r2, #32
+		stmdb	dst!, {r3, r4, ip, lr}
+		bge	20b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	21b			)
+	PLD(	add	r2, r2, #96		)
+		tst	r2, #31
+		ldmfd	sp!, {r5 - r8}
+		ldmeqfd	sp!, {r4, pc}
+
+		tst	r2, #16
+22:		ldmnedb	src!, {r3, r4, ip, lr}
+		stmnedb	dst!, {r3, r4, ip, lr}
+
+		tst	r2, #8
+23:		ldmnedb	src!, {r3, r4}
+		stmnedb	dst!, {r3, r4}
+
+		tst	r2, #4
+24:		ldrne	r3, [src, #-4]!
+		strne	r3, [dst, #-4]!
+
+25:		ands	r2, r2, #3
+		ldmeqfd	sp!, {dst, r4, pc}
+
+		cmp	r2, #2
+		ldrb	r3, [src, #-1]
+		ldrgeb	r4, [src, #-2]
+		ldrgtb	ip, [src, #-3]
+		strb	r3, [dst, #-1]
+		strgeb	r4, [dst, #-2]
+		strgtb	ip, [dst, #-3]
+		ldmfd	sp!, {dst, r4, pc}
+
+26:		cmp	ip, #2
+		ldrb	r3, [src, #-1]!
+		ldrgeb	r4, [src, #-1]!
+		ldrgtb	lr, [src, #-1]!
+		strb	r3, [dst, #-1]!
+		strgeb	r4, [dst, #-1]!
+		strgtb	lr, [dst, #-1]!
+		subs	r2, r2, ip
+		blt	25b
+		ands	ip, src, #3
+		beq	19b
+
+27:		bic	src, src, #3
+		cmp	ip, #2
+		ldr	r3, [src]
+		beq	35f
+		blt	36f
+
+
+		.macro	backward_copy_shift push pull
+
+		cmp	r2, #12
+	PLD(	pld	[src, #-4]		)
+		blt	33f
+		subs	r2, r2, #28
+		stmfd	sp!, {r5 - r9}
+		blt	31f
+
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[src, #-32]		)
+	PLD(	blt	30f			)
+	PLD(	pld	[src, #-64]		)
+
+	PLD(	@ cache alignment		)
+	PLD(	ands	ip, src, #31		)
+	PLD(	pld	[src, #-96]		)
+	PLD(	beq	29f			)
+	PLD(	cmp	r2, ip			)
+	PLD(	pld	[src, #-128]		)
+	PLD(	blt	29f			)
+	PLD(	sub	r2, r2, ip		)
+28:	PLD(	mov	r4, r3, push #\push	)
+	PLD(	ldr	r3, [src, #-4]!		)
+	PLD(	subs	ip, ip, #4		)
+	PLD(	orr	r4, r4, r3, pull #\pull	)
+	PLD(	str	r4, [dst, #-4]!		)
+	PLD(	bgt	28b			)
+
+29:	PLD(	pld	[src, #-128]		)
+30:		mov	lr, r3, push #\push
+		ldmdb	src!, {r3 - r9, ip}
+		subs	r2, r2, #32
+		orr	lr, lr, ip, pull #\pull
+		mov	ip, ip, push #\push
+		orr	ip, ip, r9, pull #\pull
+		mov	r9, r9, push #\push
+		orr	r9, r9, r8, pull #\pull
+		mov	r8, r8, push #\push
+		orr	r8, r8, r7, pull #\pull
+		mov	r7, r7, push #\push
+		orr	r7, r7, r6, pull #\pull
+		mov	r6, r6, push #\push
+		orr	r6, r6, r5, pull #\pull
+		mov	r5, r5, push #\push
+		orr	r5, r5, r4, pull #\pull
+		mov	r4, r4, push #\push
+		orr	r4, r4, r3, pull #\pull
+		stmdb	dst!, {r4 - r9, ip, lr}
+		bge	29b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	30b			)
+	PLD(	add	r2, r2, #96		)
+		cmn	r2, #16
+		blt	32f
+31:		mov	r7, r3, push #\push
+		ldmdb	src!, {r3 - r6}
+		sub	r2, r2, #16
+		orr	r7, r7, r6, pull #\pull
+		mov	r6, r6, push #\push
+		orr	r6, r6, r5, pull #\pull
+		mov	r5, r5, push #\push
+		orr	r5, r5, r4, pull #\pull
+		mov	r4, r4, push #\push
+		orr	r4, r4, r3, pull #\pull
+		stmdb	dst!, {r4 - r7}
+32:		adds	r2, r2, #28
+		ldmfd	sp!, {r5 - r9}
+		blt	34f
+33:		mov	r4, r3, push #\push
+		ldr	r3, [src, #-4]!
+		subs	r2, r2, #4
+		orr	r4, r4, r3, pull #\pull
+		str	r4, [dst, #-4]!
+		bge	33b
+34:
+		.endm
+
+
+		backward_copy_shift	push=8	pull=24
+		add	src, src, #3
+		b	25b
+
+35:		backward_copy_shift	push=16	pull=16
+		add	src, src, #2
+		b	25b
+
+36:		backward_copy_shift	push=24	pull=8
+		add	src, src, #1
+		b	25b
+
+		.size	bcopy, . - bcopy
+END(bcopy)