--- linux-2.6.orig/arch/arm/kernel/entry-armv.S
+++ linux-2.6/arch/arm/kernel/entry-armv.S
@@ -269,6 +269,12 @@
 	add	r5, sp, #S_PC
 	ldmia	r7, {r2 - r4}			@ Get USR pc, cpsr
 
+#if __LINUX_ARM_ARCH__ < 6
+	@ make sure our user space atomic helper is aborted
+	cmp	r2, #VIRT_OFFSET
+	bichs	r3, r3, #PSR_Z_BIT
+#endif
+
 	@
 	@ We are now ready to fill in the remaining blanks on the stack:
 	@
@@ -499,8 +505,12 @@
 	mra	r4, r5, acc0
 	stmia   ip, {r4, r5}
 #endif
+#ifdef CONFIG_HAS_TLS_REG
+	mcr	p15, 0, r3, c13, c0, 3		@ set TLS register
+#else
 	mov	r4, #0xffff0fff
-	str	r3, [r4, #-3]			@ Set TLS ptr
+	str	r3, [r4, #-15]			@ TLS val at 0xffff0ff0
+#endif
 	mcr	p15, 0, r6, c3, c0, 0		@ Set domain register
 #ifdef CONFIG_VFP
 	@ Always disable VFP so we can lazily save/restore the old
@@ -519,6 +529,196 @@
 	ldmib	r2, {r4 - sl, fp, sp, pc}	@ Load all regs saved previously
 
 	__INIT
+
+/*
+ * User helpers.
+ *
+ * These are segments of kernel-provided user code reachable from user space
+ * at a fixed address in kernel memory.  They are used to provide user space
+ * with some operations which require kernel help because of unimplemented
+ * native features and/or instructions in many ARM CPUs. The idea is for
+ * this code to be executed directly in user mode for best efficiency, but
+ * it is too intimate with the kernel counterpart to be left to user
+ * libraries.  In fact this code might even differ from one CPU to another
+ * depending on the available instruction set and restrictions such as on
+ * SMP systems.  In other words, the kernel reserves the right to change
+ * this code as needed without warning. Only the entry points and their
+ * results are guaranteed to be stable.
+ *
+ * Each segment is 32-byte aligned and will be moved to the top of the high
+ * vector page.  New segments (if ever needed) must be added in front of
+ * existing ones.  This mechanism should be used only for things that are
+ * really small and justified, and not be abused freely.
+ *
+ * User space is expected to implement these operations inline when optimizing
+ * for a processor that has the necessary native support, but only if the
+ * resulting binaries are already going to be incompatible with earlier ARM
+ * processors due to the use of unsupported instructions other than what
+ * is provided here.  In other words, don't make binaries unable to run on
+ * earlier processors just for the sake of not using these kernel helpers
+ * if your compiled code is not going to use the new instructions for any
+ * other purpose.
+ */
+
+	.align	5
+__kuser_helper_start:
+
+/*
+ * Reference prototype:
+ *
+ *	int __kernel_cmpxchg(int oldval, int newval, int *ptr)
+ *
+ * Input:
+ *
+ *	r0 = oldval
+ *	r1 = newval
+ *	r2 = ptr
+ *	lr = return address
+ *
+ * Output:
+ *
+ *	r0 = returned value (zero or non-zero)
+ *	C flag = set if r0 == 0, clear if r0 != 0
+ *
+ * Clobbered:
+ *
+ *	r3, ip, flags
+ *
+ * Definition and user space usage example:
+ *
+ *	typedef int (__kernel_cmpxchg_t)(int oldval, int newval, int *ptr);
+ *	#define __kernel_cmpxchg (*(__kernel_cmpxchg_t *)0xffff0fc0)
+ *
+ * Atomically store newval in *ptr if *ptr is equal to oldval for user space.
+ * Return zero if *ptr was changed or non-zero if no exchange happened.
+ * The C flag is also set if *ptr was changed to allow for assembly
+ * optimization in the calling code.
+ *
+ * For example, a user space atomic_add implementation could look like this:
+ *
+ * #define atomic_add(ptr, val) \
+ *	({ register unsigned int *__ptr asm("r2") = (ptr); \
+ *	   register unsigned int __result asm("r1"); \
+ *	   asm volatile ( \
+ *	       "1: @ atomic_add\n\t" \
+ *	       "ldr	r0, [r2]\n\t" \
+ *	       "mov	r3, #0xffff0fff\n\t" \
+ *	       "add	lr, pc, #4\n\t" \
+ *	       "add	r1, r0, %2\n\t" \
+ *	       "add	pc, r3, #(0xffff0fc0 - 0xffff0fff)\n\t" \
+ *	       "bcc	1b" \
+ *	       : "=&r" (__result) \
+ *	       : "r" (__ptr), "rIL" (val) \
+ *	       : "r0","r3","ip","lr","cc","memory" ); \
+ *	   __result; })
+ */
+
+__kuser_cmpxchg:				@ 0xffff0fc0
+
+#if __LINUX_ARM_ARCH__ < 6
+
+	/*
+	 * Theory of operation:
+	 *
+	 * We set the Z flag before loading oldval. If ever an exception
+	 * occurs we cannot be sure the loaded value will still be the same
+	 * when the exception returns, therefore the user exception handler
+	 * will clear the Z flag whenever the interrupted user code was
+	 * actually from the kernel address space (see the usr_entry macro).
+	 *
+	 * The base register write-back on the str is used to prevent a race
+	 * with an exception happening just after the str instruction which
+	 * would clear the Z flag although the exchange was done.
+	 */
+	teq	ip, ip			@ set Z flag
+	ldr	ip, [r2]		@ load current val
+	add	r3, r2, #1		@ prepare store ptr
+	teqeq	ip, r0			@ compare with oldval if still allowed
+	streq	r1, [r3, #-1]!		@ store newval if still allowed
+	subs	r0, r2, r3		@ if r2 == r3 the str occurred
+	mov	pc, lr
+
+#else
+
+	ldrex	r3, [r2]
+	subs	r3, r3, r0
+	strexeq	r3, r1, [r2]
+	rsbs	r0, r3, #0
+	mov	pc, lr
+
+#endif
+
+	.align	5
+
+/*
+ * Reference prototype:
+ *
+ *	int __kernel_get_tls(void)
+ *
+ * Input:
+ *
+ *	lr = return address
+ *
+ * Output:
+ *
+ *	r0 = TLS value
+ *
+ * Clobbered:
+ *
+ *	the Z flag might be lost
+ *
+ * Definition and user space usage example:
+ *
+ *	typedef int (__kernel_get_tls_t)(void);
+ *	#define __kernel_get_tls (*(__kernel_get_tls_t *)0xffff0fe0)
+ *
+ * Get the TLS value as previously set via the __ARM_NR_set_tls syscall.
+ *
+ * This could be used as follows:
+ *
+ * #define __kernel_get_tls() \
+ *	({ register unsigned int __val asm("r0"); \
+ *         asm( "mov r0, #0xffff0fff; mov lr, pc; sub pc, r0, #31" \
+ *	        : "=r" (__val) : : "lr","cc" ); \
+ *	   __val; })
+ */
+
+__kuser_get_tls:				@ 0xffff0fe0
+
+#ifndef CONFIG_HAS_TLS_REG
+
+	ldr	r0, [pc, #(16 - 8)]		@ TLS stored at 0xffff0ff0
+	mov	pc, lr
+
+#else
+
+	mrc	p15, 0, r0, c13, c0, 3		@ read TLS register
+	mov	pc, lr
+
+#endif
+
+	.rep	5
+	.word	0			@ pad up to __kuser_helper_version
+	.endr
+
+/*
+ * Reference declaration:
+ *
+ *	extern unsigned int __kernel_helper_version;
+ *
+ * Definition and user space usage example:
+ *
+ *	#define __kernel_helper_version (*(unsigned int *)0xffff0ffc)
+ *
+ * User space may read this to determine the current number of helpers
+ * available.
+ */
+
+__kuser_helper_version:				@ 0xffff0ffc
+	.word	((__kuser_helper_end - __kuser_helper_start) >> 5)
+__kuser_helper_end:
+
+
 /*
  * Vector stubs.
  *
@@ -710,12 +910,21 @@
 	stmia	r0, {r1, r2, r3, r4, r5, r6, ip, lr}
 
 	add	r2, r0, #0x200
-	adr	r0, __stubs_start		@ copy stubs to 0x200
-	adr	r1, __stubs_end
-1:	ldr	r3, [r0], #4
+	adr	r1, __stubs_start		@ copy stubs to 0x200
+	adr	r4, __stubs_end
+1:	ldr	r3, [r1], #4
 	str	r3, [r2], #4
-	cmp	r0, r1
-	blt	1b
+	cmp	r1, r4
+	blo	1b
+
+	add	r2, r0, #0x1000		@ top of high vector page
+	adr	r4, __kuser_helper_end		@ user helpers to top of page
+	adr	r1, __kuser_helper_start	@ going downwards.
+1:	ldr	r3, [r4, #-4]!
+	str	r3, [r2, #-4]!
+	cmp	r4, r1
+	bhi	1b
+
 	LOADREGS(fd, sp!, {r4 - r6, pc})
 
 	.data
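
For illustration only, not part of the patch: the interface documented in the
comments above can be consumed from plain C roughly as follows.  The wrapper
names are made up for this example; the addresses and return-value semantics
are exactly the ones documented in entry-armv.S.

	/* Helper entry points as documented above.  The cmpxchg helper is the
	 * second 32-byte segment counting down from the top of the vector
	 * page, so it is only present when the version word reports >= 2. */
	typedef int (__kernel_cmpxchg_t)(int oldval, int newval, int *ptr);
	#define __kernel_cmpxchg (*(__kernel_cmpxchg_t *)0xffff0fc0)
	#define __kernel_helper_version (*(unsigned int *)0xffff0ffc)

	/* Hypothetical atomic increment built on the helper: retry until no
	 * concurrent update slipped in between the load and the cmpxchg. */
	static inline void atomic_inc(int *ptr)
	{
		int old;

		do {
			old = *ptr;
		} while (__kernel_cmpxchg(old, old + 1, ptr) != 0);
	}
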
Index: linux-2.6/arch/arm/kernel/traps.c
===================================================================
--- linux-2.6.orig/arch/arm/kernel/traps.c
+++ linux-2.6/arch/arm/kernel/traps.c
@@ -454,13 +454,17 @@
 
 	case NR(set_tls):
 		thread->tp_value = regs->ARM_r0;
+#ifdef CONFIG_HAS_TLS_REG
+		asm ("mcr p15, 0, %0, c13, c0, 3" : : "r" (regs->ARM_r0) );
+#else
 		/*
-		 * Our user accessible TLS ptr is located at 0xffff0ffc.
-		 * On SMP read access to this address must raise a fault
-		 * and be emulated from the data abort handler.
-		 * m
+		 * User space must never try to access this directly.
+		 * Expect your app to break eventually if you do so.
+		 * The user helper at 0xffff0fe0 must be used instead.
+		 * (see entry-armv.S for details)
 		 */
-		*((unsigned long *)0xffff0ffc) = thread->tp_value;
+		*((unsigned int *)0xffff0ff0) = regs->ARM_r0;
+#endif
 		return 0;
 
 	default:
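
Again purely illustrative, not part of the patch: whatever the value of
CONFIG_HAS_TLS_REG, user space reads the value set above through the helper
at 0xffff0fe0 rather than by dereferencing 0xffff0ff0 directly.  A C
rendition of the asm example already given in entry-armv.S could be:

	/* On kernels with CONFIG_HAS_TLS_REG the helper reads the CP15 TLS
	 * register; on others it loads the word the kernel stored at
	 * 0xffff0ff0 on the thread's behalf.  Either way this entry point
	 * stays valid. */
	typedef int (__kernel_get_tls_t)(void);
	#define __kernel_get_tls (*(__kernel_get_tls_t *)0xffff0fe0)

	static inline void *get_thread_pointer(void)
	{
		return (void *)__kernel_get_tls();
	}
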
--- kernel26/include/asm-arm/unistd.h.old	2005-04-16 05:17:08.344899152 +0100
+++ kernel26/include/asm-arm/unistd.h	2005-04-16 05:17:54.027954272 +0100
@@ -315,10 +315,9 @@
 #define __ARM_NR_cacheflush		(__ARM_NR_BASE+2)
 #define __ARM_NR_usr26			(__ARM_NR_BASE+3)
 #define __ARM_NR_usr32			(__ARM_NR_BASE+4)
+#define __ARM_NR_set_tls		(__ARM_NR_BASE+5)
 #define __ARM_NR_lbl			(__ARM_NR_BASE+9)
 
-#define __ARM_NR_set_tls		(__ARM_NR_BASE+0x800)
-
 #define __sys2(x) #x
 #define __sys1(x) __sys2(x)
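
For completeness, here is a sketch of how a C library might invoke the
renumbered __ARM_NR_set_tls call.  The inline asm follows the usual
OABI swi-with-immediate convention; the exact constraints and clobbers are
an assumption of this example, not something specified by the patch.

	#include <asm/unistd.h>

	/* Illustrative only: hand the thread pointer to the kernel, which
	 * then exposes it via the CP15 TLS register or the 0xffff0fe0
	 * helper depending on the hardware.  Returns 0 on success. */
	static inline int set_tls(void *tp)
	{
		register void *__r0 asm("r0") = tp;

		asm volatile ("swi	%1	@ __ARM_NR_set_tls"
			      : "+r" (__r0)
			      : "i" (__ARM_NR_set_tls)
			      : "memory");
		return (int)__r0;
	}
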