commit d8910bf773fbecf7cdea359d4b530a3672e27180
Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
Date:   Wed Feb 10 16:18:39 2010 +0100

    Removed NEON because it is not available in our kernel
    and was therefore causing trouble (Illegal instruction).

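    The fill call sites now go through plain libc memset. As an
    illustrative sketch only (not part of this change), a portable
    plain-C fallback that keeps the 16/32-bit fill semantics of the
    removed NEON helpers could look roughly like the hypothetical
    memset16_c/memset32_c below; plain memset replicates a single
    byte and takes a byte count, so its semantics differ from the
    old memset16/memset32.

        #include <stdint.h>

        /* Fill count 16-bit pixels with value (plain C, no NEON). */
        static void memset16_c(uint16_t *dst, uint16_t value, int count)
        {
            while (count-- > 0)
                *dst++ = value;
        }

        /* Fill count 32-bit pixels with value (plain C, no NEON). */
        static void memset32_c(uint32_t *dst, uint32_t value, int count)
        {
            while (count-- > 0)
                *dst++ = value;
        }
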
diff --git git/src/msm-swblits.h git/src/msm-swblits.h
index f89f00e..a40b24b 100755
--- git/src/msm-swblits.h
+++ git/src/msm-swblits.h
@@ -38,16 +38,6 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-/* Neon intrinsics are part of the ARM or GCC compiler used.                                                              */
-/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */
-#include <arm_neon.h>
-
-/* These are NEON-optimized functions linked to by various tests.  */
-extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes);
-extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes);
-extern void memset16(uint16_t *dst, uint16_t value, int count);
-extern void memset32(uint32_t *dst, uint32_t value, int count);
-
 /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */
 #define BITS_PER_BYTE (8)
 #define BYTES_PER_16BPP_PIXEL (2)
diff --git git/src/msm-swfill.c git/src/msm-swfill.c
index 108fd94..3dd1ef2 100755
--- git/src/msm-swfill.c
+++ git/src/msm-swfill.c
@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou
    }
 }
 
-
+/*
 static inline void
 memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
 {
@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
    // Quickly fill remaining pixels (up to 7).
    memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
 }
-
+*/
 
 static inline void
 memset16_Test(uint16_t *dst, uint16_t src, int count)
@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count)
 
       // Copy remaining pixels using Neon and non-Neon instructions.
       // NOTE: This assumes that dst is aligned optimally for Neon instructions.
-      memset16_AssumesNeonAlignment((void *) dst, src, count);
+      //memset16_AssumesNeonAlignment((void *) dst, src, count);
+      memset((void *) dst, src, count);
    }
 }
 
@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp
    if (w < 32) {
       // For narrow rectangles, block signals only once for the entire rectangles.
       BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-      DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
+      //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
+      DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
       UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
    }
    else {
       // For wider rectangles, block and unblock signals for every row.
-      DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+      //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+      DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
    }
 }
 
diff --git git/src/msm-swrender.c git/src/msm-swrender.c
index a7a9abc..835dc03 100755
--- git/src/msm-swrender.c
+++ git/src/msm-swrender.c
@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src
                   }
                }
                break;
-      case  7: if (xdir >= 0) {
-                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
-                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
-                  return TRUE;
-               } else {
-                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
-                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
-                  return TRUE;
-               }
-               break;
-      case  8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-                  return TRUE;
-               }
-               else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
-                  uint64_t src2  = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
-                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
-                  *(uint64_t *) (dst+1*BYTES_PER_UINT64_T)  = src2;
-                  return TRUE;
-               }
-               else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-                  uint32_t src1  = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
-                  uint32_t src2  = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
-                  uint32_t src3  = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
-                  uint32_t src4  = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
-                  *(uint32_t *) (dst+0*BYTES_PER_UINT32_T)  = src1;
-                  *(uint32_t *) (dst+1*BYTES_PER_UINT32_T)  = src2;
-                  *(uint32_t *) (dst+2*BYTES_PER_UINT32_T)  = src3;
-                  *(uint32_t *) (dst+3*BYTES_PER_UINT32_T)  = src4;
-                  return TRUE;
-               }
-               else {
-                  uint16_t src1  = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
-                  uint16_t src2  = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
-                  uint16_t src3  = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
-                  uint16_t src4  = *(uint16_t *) (src+3*BYTES_PER_UINT16_T);
-                  uint16_t src5  = *(uint16_t *) (src+4*BYTES_PER_UINT16_T);
-                  uint16_t src6  = *(uint16_t *) (src+5*BYTES_PER_UINT16_T);
-                  uint16_t src7  = *(uint16_t *) (src+6*BYTES_PER_UINT16_T);
-                  uint16_t src8  = *(uint16_t *) (src+7*BYTES_PER_UINT16_T);
-                  *(uint16_t *) (dst+0*BYTES_PER_UINT16_T)  = src1;
-                  *(uint16_t *) (dst+1*BYTES_PER_UINT16_T)  = src2;
-                  *(uint16_t *) (dst+2*BYTES_PER_UINT16_T)  = src3;
-                  *(uint16_t *) (dst+3*BYTES_PER_UINT16_T)  = src4;
-                  *(uint16_t *) (dst+4*BYTES_PER_UINT16_T)  = src5;
-                  *(uint16_t *) (dst+5*BYTES_PER_UINT16_T)  = src6;
-                  *(uint16_t *) (dst+6*BYTES_PER_UINT16_T)  = src7;
-                  *(uint16_t *) (dst+7*BYTES_PER_UINT16_T)  = src8;
-                  return TRUE;
-               }
-               break;
-      case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-                  vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
-                  return TRUE;
-               }
-               else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
-                  uint64_t src2  = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
-                  uint64_t src3  = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
-                  uint64_t src4  = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
-                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
-                  *(uint64_t *) (dst+1*BYTES_PER_UINT64_T)  = src2;
-                  *(uint64_t *) (dst+2*BYTES_PER_UINT64_T)  = src3;
-                  *(uint64_t *) (dst+3*BYTES_PER_UINT64_T)  = src4;
-                  return TRUE;
-               }
-               else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-                  uint32_t src1  = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
-                  uint32_t src2  = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
-                  uint32_t src3  = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
-                  uint32_t src4  = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
-                  uint32_t src5  = *(uint32_t *) (src+4*BYTES_PER_UINT32_T);
-                  uint32_t src6  = *(uint32_t *) (src+5*BYTES_PER_UINT32_T);
-                  uint32_t src7  = *(uint32_t *) (src+6*BYTES_PER_UINT32_T);
-                  uint32_t src8  = *(uint32_t *) (src+7*BYTES_PER_UINT32_T);
-                  *(uint32_t *) (dst+0*BYTES_PER_UINT32_T)  = src1;
-                  *(uint32_t *) (dst+1*BYTES_PER_UINT32_T)  = src2;
-                  *(uint32_t *) (dst+2*BYTES_PER_UINT32_T)  = src3;
-                  *(uint32_t *) (dst+3*BYTES_PER_UINT32_T)  = src4;
-                  *(uint32_t *) (dst+4*BYTES_PER_UINT32_T)  = src5;
-                  *(uint32_t *) (dst+5*BYTES_PER_UINT32_T)  = src6;
-                  *(uint32_t *) (dst+6*BYTES_PER_UINT32_T)  = src7;
-                  *(uint32_t *) (dst+7*BYTES_PER_UINT32_T)  = src8;
-                  return TRUE;
-               }
-               else {
-                  // Don't bother unrolling loops here, since that won't help for more than around 8 operations.
-                  // Instead, just call multiple fixed functions.
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
-                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
-                  } else {
-                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
-                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
-                  }
-                  return TRUE;
-               }
-               break;
-      case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-                  vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
-                  vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
-                  vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
-                  return TRUE;
-               }
-               else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
-                  uint64_t src2  = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
-                  uint64_t src3  = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
-                  uint64_t src4  = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
-                  uint64_t src5  = *(uint64_t *) (src+4*BYTES_PER_UINT64_T);
-                  uint64_t src6  = *(uint64_t *) (src+5*BYTES_PER_UINT64_T);
-                  uint64_t src7  = *(uint64_t *) (src+6*BYTES_PER_UINT64_T);
-                  uint64_t src8  = *(uint64_t *) (src+7*BYTES_PER_UINT64_T);
-                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
-                  *(uint64_t *) (dst+1*BYTES_PER_UINT64_T)  = src2;
-                  *(uint64_t *) (dst+2*BYTES_PER_UINT64_T)  = src3;
-                  *(uint64_t *) (dst+3*BYTES_PER_UINT64_T)  = src4;
-                  *(uint64_t *) (dst+4*BYTES_PER_UINT64_T)  = src5;
-                  *(uint64_t *) (dst+5*BYTES_PER_UINT64_T)  = src6;
-                  *(uint64_t *) (dst+6*BYTES_PER_UINT64_T)  = src7;
-                  *(uint64_t *) (dst+7*BYTES_PER_UINT64_T)  = src8;
-                  return TRUE;
-               }
-               break;
-      case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
-                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
-                  vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
-                  vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
-                  vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
-                  vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5);
-                  vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6);
-                  vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7);
-                  vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8);
-                  return TRUE;
-               }
-               break;
    }
 
    return FALSE;
@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr
                }
                return TRUE;
                break;
-      case  7: if (xdir >= 0) {
-                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-               } else {
-                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-               }
-               return TRUE;
-               break;
-      case  8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
-                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
-                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
-                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
-                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
-                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
-                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
-                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  *(uint16_t *) (dst+0*dpitch+0)                                        = src1a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4a;
-                  *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5a;
-                  *(uint16_t *) (dst+1*dpitch+0)                                        = src1b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4b;
-                  *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5b;
-                  return TRUE;
-               }
-               else {
-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
-                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
-                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
-                  uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
-                  uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
-                  uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
-                  uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
-                  uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
-                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
-                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
-                  uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
-                  uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
-                  uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
-                  uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
-                  uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
-                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
-                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
-                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
-                  *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T)  = src4a;
-                  *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T)  = src5a;
-                  *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T)  = src6a;
-                  *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T)  = src7a;
-                  *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T)  = src8a;
-                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
-                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
-                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
-                  *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T)  = src4b;
-                  *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T)  = src5b;
-                  *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T)  = src6b;
-                  *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T)  = src7b;
-                  *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T)  = src8b;
-                  return TRUE;
-               }
-               break;
-      case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
-                  *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T)  = src3a;
-                  *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T)  = src4a;
-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
-                  *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T)  = src3b;
-                  *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T)  = src4b;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) {
-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-                  *(uint32_t *) (dst+0*dpitch+0)                                        = src1a;
-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2a;
-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3a;
-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5a;
-                  *(uint32_t *) (dst+1*dpitch+0)                                        = src1b;
-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2b;
-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3b;
-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5b;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
-                  uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
-                  uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
-                  uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
-                  uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
-                  uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
-                  uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
-                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
-                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
-                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
-                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
-                  *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T)  = src5a;
-                  *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T)  = src6a;
-                  *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T)  = src7a;
-                  *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T)  = src8a;
-                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
-                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
-                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
-                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
-                  *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T)  = src5b;
-                  *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T)  = src6b;
-                  *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T)  = src7b;
-                  *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T)  = src8b;
-                  return TRUE;
-               }
-               else {
-                  // Don't bother unrolling loops, since that won't help for more than around 8 operations.
-                  // Instead, just call multiple fixed functions.
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                  } else {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-                  }
-                  return TRUE;
-               }
-               break;
-      case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                           src + 0,                            4, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T,      src + (4)*BYTES_PER_UINT16_T,      16, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T,   src + (16+4)*BYTES_PER_UINT16_T,    8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T,  4, xdir, dpitch, spitch);
-                  } else {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T,  4, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T,   src + (16+4)*BYTES_PER_UINT16_T,    8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T,      src + (4)*BYTES_PER_UINT16_T,      16, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                           src + 0,                            4, xdir, dpitch, spitch);
-                   }
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
-                  uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T);
-                  uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T);
-                  uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T);
-                  uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T);
-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
-                  uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T);
-                  uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T);
-                  uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T);
-                  uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T);
-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
-                  *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T)  = src3a;
-                  *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T)  = src4a;
-                  *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T)  = src5a;
-                  *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T)  = src6a;
-                  *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T)  = src7a;
-                  *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T)  = src8a;
-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
-                  *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T)  = src3b;
-                  *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T)  = src4b;
-                  *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T)  = src5b;
-                  *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T)  = src6b;
-                  *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T)  = src7b;
-                  *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T)  = src8b;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-                  } else {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
-                  }
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-                  } else {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
-                  }
-                  return TRUE;
-               }
-               else {
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                  } else {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                  }
-                  return TRUE;
-               }
-               break;
-      case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b);
-                  return TRUE;
-               }//HERE
-               else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                             src + 0,                             4,    xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T,   src + (0*16+4)*BYTES_PER_UINT16_T,   2*16, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T,   src + (2*16+4)*BYTES_PER_UINT16_T,   16,   xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T,   src + (3*16+4)*BYTES_PER_UINT16_T,   8,    xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4,    xdir, dpitch, spitch);
-                  } else {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4,    xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T,   src + (3*16+4)*BYTES_PER_UINT16_T,   8,    xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T,   src + (2*16+4)*BYTES_PER_UINT16_T,   16,   xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T,   src + (0*16+4)*BYTES_PER_UINT16_T,   2*16, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                             src + 0,                             4,    xdir, dpitch, spitch);
-                  }
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-                  } else {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
-                  }
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-                  } else {
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
-                  }
-                  return TRUE;
-               }
-               break;
    }
 
    return FALSE;
@@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr
                }
                return TRUE;
                break;
-      case  7: if (xdir >= 0) {
-                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-               } else {
-                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
-                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
-               }
-               return TRUE;
-               break;
-      // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons?
-      //       For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases...
-      case  8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-                  vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
-                  vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
-                  *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T)  = src1c;
-                  *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T)  = src2c;
-                  *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T)  = src1d;
-                  *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T)  = src2d;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
-                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
-                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
-                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
-                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
-                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
-                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
-                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
-                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
-                  *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T)  = src1c;
-                  *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T)  = src2c;
-                  *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T)  = src3c;
-                  *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T)  = src4c;
-                  *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T)  = src1d;
-                  *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T)  = src2d;
-                  *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T)  = src3d;
-                  *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T)  = src4d;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
-                  uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
-                  uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  *(uint16_t *) (dst+0*dpitch+0)                                        = src1a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4a;
-                  *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5a;
-                  *(uint16_t *) (dst+1*dpitch+0)                                        = src1b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4b;
-                  *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5b;
-                  *(uint16_t *) (dst+2*dpitch+0)                                        = src1c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4c;
-                  *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5c;
-                  *(uint16_t *) (dst+3*dpitch+0)                                        = src1d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4d;
-                  *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5d;
-                  return TRUE;
-               }
-               else {
-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
-                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
-                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
-                  uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
-                  uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
-                  uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
-                  uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
-                  uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
-                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
-                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
-                  uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
-                  uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
-                  uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
-                  uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
-                  uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
-                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
-                  uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
-                  uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T);
-                  uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T);
-                  uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T);
-                  uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T);
-                  uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T);
-                  uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T);
-                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
-                  uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
-                  uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T);
-                  uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T);
-                  uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T);
-                  uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T);
-                  uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T);
-                  uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T);
-                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
-                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
-                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
-                  *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T)  = src4a;
-                  *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T)  = src5a;
-                  *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T)  = src6a;
-                  *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T)  = src7a;
-                  *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T)  = src8a;
-                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
-                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
-                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
-                  *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T)  = src4b;
-                  *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T)  = src5b;
-                  *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T)  = src6b;
-                  *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T)  = src7b;
-                  *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T)  = src8b;
-                  *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T)  = src1c;
-                  *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T)  = src2c;
-                  *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T)  = src3c;
-                  *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T)  = src4c;
-                  *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T)  = src5c;
-                  *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T)  = src6c;
-                  *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T)  = src7c;
-                  *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T)  = src8c;
-                  *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T)  = src1d;
-                  *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T)  = src2d;
-                  *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T)  = src3d;
-                  *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T)  = src4d;
-                  *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T)  = src5d;
-                  *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T)  = src6d;
-                  *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T)  = src7d;
-                  *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T)  = src8d;
-                  return TRUE;
-               }
-               break;
-      case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-                  vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
-                  vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
-                  vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
-                  vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
-                  uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T);
-                  uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T);
-                  uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
-                  uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
-                  uint64_t src3d = *(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T);
-                  uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T);
-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
-                  *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T)  = src3a;
-                  *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T)  = src4a;
-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
-                  *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T)  = src3b;
-                  *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T)  = src4b;
-                  *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T)  = src1c;
-                  *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T)  = src2c;
-                  *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T)  = src3c;
-                  *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T)  = src4c;
-                  *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T)  = src1d;
-                  *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T)  = src2d;
-                  *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T)  = src3d;
-                  *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T)  = src4d;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0);
-                  uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-                  uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-                  uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-                  uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0);
-                  uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
-                  uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
-                  uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
-                  uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
-                  *(uint32_t *) (dst+0*dpitch+0)                                        = src1a;
-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2a;
-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3a;
-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5a;
-                  *(uint32_t *) (dst+1*dpitch+0)                                        = src1b;
-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2b;
-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3b;
-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5b;
-                  *(uint32_t *) (dst+2*dpitch+0)                                        = src1c;
-                  *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2c;
-                  *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3c;
-                  *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5c;
-                  *(uint32_t *) (dst+3*dpitch+0)                                        = src1d;
-                  *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2d;
-                  *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3d;
-                  *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5d;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
-                  uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
-                  uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
-                  uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
-                  uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
-                  uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
-                  uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
-                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T);
-                  uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T);
-                  uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T);
-                  uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T);
-                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
-                  uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
-                  uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
-                  uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
-                  uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T);
-                  uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T);
-                  uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T);
-                  uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T);
-                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
-                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
-                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
-                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
-                  *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T)  = src5a;
-                  *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T)  = src6a;
-                  *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T)  = src7a;
-                  *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T)  = src8a;
-                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
-                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
-                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
-                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
-                  *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T)  = src5b;
-                  *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T)  = src6b;
-                  *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T)  = src7b;
-                  *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T)  = src8b;
-                  *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T)  = src1c;
-                  *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T)  = src2c;
-                  *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T)  = src3c;
-                  *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T)  = src4c;
-                  *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T)  = src5c;
-                  *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T)  = src6c;
-                  *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T)  = src7c;
-                  *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T)  = src8c;
-                  *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T)  = src1d;
-                  *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T)  = src2d;
-                  *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T)  = src3d;
-                  *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T)  = src4d;
-                  *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T)  = src5d;
-                  *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T)  = src6d;
-                  *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T)  = src7d;
-                  *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T)  = src8d;
-                  return TRUE;
-               }
-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-                  uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-                  uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-                  uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-                  uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-                  uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-                  uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
-                  uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-                  uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-                  uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-                  uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
-                  uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
-                  uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
-                  uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
-                  uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
-                  uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
-                  uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
-                  uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
-                  uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
-                  *(uint16_t *) (dst+0*dpitch+0)                                        = src1a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7a;
-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8a;
-                  *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9a;
-                  *(uint16_t *) (dst+1*dpitch+0)                                        = src1b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7b;
-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8b;
-                  *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9b;
-                  *(uint16_t *) (dst+2*dpitch+0)                                        = src1c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7c;
-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8c;
-                  *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9c;
-                  *(uint16_t *) (dst+3*dpitch+0)                                        = src1d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7d;
-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8d;
-                  *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9d;
-                  return TRUE;
-               }
-               else {
-                  // Don't bother unrolling loops, since that won't help for more than around 8 operations.
-                  // Instead, just call multiple fixed functions.
-                  if (xdir >= 0) {
-                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                  } else {
-                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
-                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
-                  }
-                  return TRUE;
-               }
-               break;
-      // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons?
-      //       For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here...
-      case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T));
-                  uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T));
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
-                  vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
-                  vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
-                  vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
-                  vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
-                  vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c);
-                  vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c);
-                  vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
-                  vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
-                  vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d);
-                  vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d);
-                  return TRUE;
-               }
-               break;
-   }
+         }
 
    return FALSE;
 }
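
The deleted cases above were the unrolled NEON-intrinsic and scalar fast paths for fixed widths such as 7, 8, 16 and 32 pixels; with them gone, copies of those widths fall through to the driver's generic handling. As a rough illustration only, a plain-C stand-in for one of the four-row cases could look like the sketch below. The function name and signature are hypothetical, and memmove is used so each row stays safe if source and destination overlap horizontally (the real code additionally orders its copies by xdir for that case).

#include <string.h>

/* Hypothetical sketch, not driver code: copy four rows of 16bpp
 * pixels without NEON, one memmove per row. */
static void
copy16bpp_4rows_plain(unsigned char *dst, const unsigned char *src,
                      int w, int dpitch, int spitch)
{
   int row;
   for (row = 0; row < 4; row++)
      memmove(dst + row * dpitch, src + row * spitch,
              (size_t) w * 2 /* bytes per 16bpp pixel */);
}
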
@@ -1924,10 +872,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
       if (rowsOverlap)
       {
          if (w > 64) {
-            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+            //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
          }
          else if (w == 64) {
-            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+            //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
          }
          else {
             DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
@@ -1936,10 +886,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
       else
       {
          if (w > 64) {
-            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+            //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
          }
          else if (w == 64) {
-            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+            //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
          }
          else {
             DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
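
This hunk swaps neon_memmove/neon_memcpy for the libc memmove/memcpy inside the per-row copy macros, keyed on the width and on whether the source and destination rows overlap. The macro bodies are not shown in this patch, so the following is only an assumed sketch of the per-row logic they expand to; copy_rows_16bpp and rows_overlap are illustrative names, and the real code also handles copy direction, which is omitted here.

#include <string.h>

/* Assumed outline of the per-row copy once the neon_* routines are
 * replaced by the libc versions. */
static void
copy_rows_16bpp(unsigned char *dst, const unsigned char *src,
                int w, int h, int dpitch, int spitch, int rows_overlap)
{
   size_t bytes = (size_t) w * 2;    /* 2 bytes per 16bpp pixel */
   int row;
   for (row = 0; row < h; row++) {
      if (rows_overlap)
         memmove(dst, src, bytes);   /* overlapping rows need memmove */
      else
         memcpy(dst, src, bytes);    /* disjoint rows can use memcpy */
      dst += dpitch;
      src += spitch;
   }
}
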
@@ -1973,7 +925,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
       if (xdir >= 0 || !rowsOverlap) {
          if (w >= 128) {
             BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-            neon_memcpy(dst, src, w);
+            //neon_memcpy(dst, src, w);
+            memcpy(dst, src, w);
             UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
          }
          else
@@ -1982,7 +935,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
       else {
          if (w >= 128) {
             BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-            neon_memmove(dst, src, w);
+            //neon_memmove(dst, src, w);
+            memmove(dst, src, w);
             UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
          }
          else
@@ -2029,7 +983,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
       if (xdir >= 0 || !rowsOverlap) {
          if (w >= 42) {
             BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-            neon_memcpy(dst, src,  w * BYTES_PER_24BPP_PIXEL);
+            //neon_memcpy(dst, src,  w * BYTES_PER_24BPP_PIXEL);
+            memcpy(dst, src,  w * BYTES_PER_24BPP_PIXEL);
             UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
          }
          else
@@ -2038,7 +993,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
       else {
          if (w >= 42) {
             BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
-            neon_memmove(dst, src,  w * BYTES_PER_24BPP_PIXEL);
+            //neon_memmove(dst, src,  w * BYTES_PER_24BPP_PIXEL);
+            memmove(dst, src,  w * BYTES_PER_24BPP_PIXEL);
             UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
          }
          else
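
The 8bpp and 24bpp paths above keep their width thresholds (w >= 128 and w >= 42) even though the copies now go through plain memcpy/memmove; both cutoffs work out to roughly 128 bytes per row, as the quick check below shows. Whether these thresholds, originally tuned for the NEON routines, are still optimal for the libc calls is an open question this patch does not address.

/* Illustrative arithmetic only:
 *   8bpp:  w >= 128 pixels * 1 byte/pixel  = 128 bytes per row
 *   24bpp: w >= 42  pixels * 3 bytes/pixel = 126 bytes per row
 * i.e. both paths switch to the bulk copy at roughly 128 bytes. */
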
diff --git git/src/neon_memcpy.S git/src/neon_memcpy.S
deleted file mode 100644
index 5ecc5ce..0000000
--- git/src/neon_memcpy.S
+++ /dev/null
@@ -1,549 +0,0 @@
-/***************************************************************************
- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright
-       notice, this list of conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright
-       notice, this list of conditions and the following disclaimer in the
-       documentation and/or other materials provided with the distribution.
-     * Neither the name of Code Aurora nor the names of its contributors may
-       be used to endorse or promote products derived from this software
-       without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
- ***************************************************************************/
-
-/***************************************************************************
-  Neon memcpy: Attempts to do a memcpy with Neon registers if possible,
-     Inputs:
-        dest: The destination buffer
-        src: The source buffer
-        n: The size of the buffer to transfer
-     Outputs:
-
-***************************************************************************/
-
-/*
- * General note:
- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
- * the correct object code for VPOP, resulting in horrific stack crashes.
- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
- * and VPOP->VLDMIA.  We can revert this back once we update our toolchain.
- *
- * Also, VSHL swaps the source register and the shift-amount register
- * around in 2006-q3.  I've coded this incorrectly so it turns out correct
- * in the object code, but we'll need to undo that later...
- */
-
-	.code 32
-	.align 4
-	.globl neon_memcpy
-	.func
-
-neon_memcpy:
-	/* 
-	 * First, make sure we're not copying < 4 bytes.  If so, we'll
-         * just handle it here.
-	 */
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	stmdb		sp!, {r0}
-#else
-	push		{r0}
-#endif
-	cmp		r2, #4
-	bgt		neon_gt_4
-	/* Copy 0-4 bytes, if needed, and return.*/
-	cmp		r2, #0
-neon_smallcopy_loop:
-	beq		neon_smallcopy_done
-	ldrb		r12, [r1], #1
-	subs		r2, r2, #1
-	strb		r12, [r0], #1
-	b		neon_smallcopy_loop
-neon_smallcopy_done:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	ldmia		sp!, {r0}
-#else
-	pop		{r0}
-#endif
-	bx		lr
-
-	/* Copy 4 or more bytes*/
-neon_gt_4:
-	/* Preload what we can...*/
-	pld		[r0,#0]
-	pld		[r1,#0]
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	stmdb		sp!, {r4-r5}
-#else
-	push		{r4-r5}
-#endif
-
-neon_check_align:
-	/* Check normal word alignment for target. */
-	ands		r12, r0, #0x3
-	beq		source_alignment_check
-	
-	/*
-	 * Target is not aligned.  Step through until we get that
-	 * word-aligned.  This works better than a loop, according
-	 * to our pipeline modeler.
-	 */
-	cmp		r12, #2
-	ldrb		r3, [r1], #1
-	ldrleb		r4, [r1], #1
-	ldrltb		r5, [r1], #1
-	rsb		r12, r12, #4
-	sub		r2, r2, r12
-	strb		r3, [r0], #1
-	strleb		r4, [r0], #1
-	strltb		r5, [r0], #1
-	
-source_alignment_check:
-	ands		r12, r1, #0x3
-	bne		neon_memcpy_nonaligned	/* Source is not word aligned.*/
-neon_try_16_align:
-	cmp		r2, #64
-	blt		neon_align_route
-	/* This is where we try 16-byte alignment. */
-	ands		r12, r0, #0xf
-	beq		neon_align_route
-	rsb		r12, r12, #16
-neon_16_start:
-	sub		r2, r2, r12
-	lsrs		r3, r12, #2
-neon_align_16_4:
-	ldr		r4, [r1], #4
-	subs		r3, r3, #1
-	str		r4, [r0], #4
-	bne		neon_align_16_4
-neon_align_route:
-	/* In this case, both source and target are word-aligned. */
-	cmp		r2, #32768
-	bge		neon_copy_128p_a
-	cmp		r2, #256
-	bge		neon_copy_128_a
-	cmp		r2, #64
-	bge		neon_copy_32_a
-	b		neon_copy_finish_a
-	nop
-neon_copy_128p_a:
-	/* We'll copy blocks 128-bytes at a time, but try to call pld to
-	 * load in the next page, if possible.
-	 */
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4-q7}
-#else
-	vpush		{q4-q7}
-#endif
-	mov		r12, r2, lsr #7
-neon_copy_128p_loop_a:
-	vld1.32		{q0, q1}, [r1]!
-	vld1.32		{q2, q3}, [r1]!
-	vld1.32		{q4, q5}, [r1]!
-	vld1.32		{q6, q7}, [r1]!
-	pld		[r1, #0]
-	pld		[r1, #1024]
-	vst1.32		{q0, q1}, [r0]!
-	vst1.32		{q2, q3}, [r0]!
-	vst1.32		{q4, q5}, [r0]!
-	vst1.32		{q6, q7}, [r0]!
-	subs		r12, r12, #1
-	bne		neon_copy_128p_loop_a
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q4-q7}
-#else
-	vpop		{q4-q7}
-#endif
-	ands		r2, r2, #0x7f
-	beq		neon_end
-	cmp		r2, #32
-	blt		neon_copy_finish_a
-	b		neon_copy_32_a
-	/* Copy blocks of 128-bytes (word-aligned) at a time*/
-neon_copy_128_a:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4-q7}
-#else
-	vpush		{q4-q7}
-#endif
-	/*
-	 * Move to a 1-s based countdown to determine when to loop.  That
-	 * allows the subs to set the Z flag without having to explicitly
-	 * call cmp to a value.
-	 */
-	mov		r12, r2, lsr #7
-neon_copy_128_loop_a:
-	vld1.32		{q0, q1}, [r1]!
-	vld1.32		{q2, q3}, [r1]!
-	vld1.32		{q4, q5}, [r1]!
-	vld1.32		{q6, q7}, [r1]!
-	pld		[r1, #0]
-	pld		[r1, #128]
-	vst1.32		{q0, q1}, [r0]!
-	vst1.32		{q2, q3}, [r0]!
-	vst1.32		{q4, q5}, [r0]!
-	vst1.32		{q6, q7}, [r0]!
-	subs		r12, r12, #1
-	pld		[r0, #0]
-	pld		[r0, #128]
-	bne		neon_copy_128_loop_a
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q4-q7}
-#else
-	vpop		{q4-q7}
-#endif
-	ands		r2, r2, #0x7f
-	beq		neon_end
-	cmp		r2, #32
-	blt		neon_copy_finish_a
-	/* Copy blocks of 32-bytes (word aligned) at a time*/
-neon_copy_32_a:
-	mov		r12, r2, lsr #5
-neon_copy_32_loop_a:
-	vld1.32		{q0,q1}, [r1]!
-	subs		r12, r12, #1
-	pld		[r1,#0]
-	vst1.32		{q0,q1}, [r0]!
-	bne		neon_copy_32_loop_a
-	ands		r2, r2, #0x1f
-	beq		neon_end
-neon_copy_finish_a:
-neon_copy_16_a:
-	movs		r12, r2, lsr #4
-	beq		neon_copy_8_a
-neon_copy_16_a_loop:
-	vld1.32		{q0}, [r1]!
-	subs		r12, r12, #1
-	vst1.32		{q0}, [r0]!
-	bne		neon_copy_16_a_loop
-	ands		r2, r2, #0xf
-	beq		neon_end
-neon_copy_8_a:
-	cmp		r2, #8
-	blt		neon_copy_4_a
-	ldm		r1!, {r4-r5}
-	subs		r2, r2, #8
-	stm		r0!, {r4-r5}
-	/* Copy 4-bytes of word-aligned data at a time*/
-neon_copy_4_a:
-	cmp		r2, #4
-	blt		neon_copy_finish
-	ldr		r4, [r1], #4
-	subs		r2, r2, #4
-	str		r4, [r0], #4
-	b		neon_copy_finish
-
-	/*
-	 * Handle unaligned data.  The basic concept here is that we'll
-	 * try to pull out enough data from the source to get that word-
-	 * aligned, then do our writes word-aligned, storing the difference
-	 * in a register, and shifting the data as needed.
-	 */
-neon_memcpy_nonaligned:
-	/*
-	 * If this is <8 bytes, it makes more sense to just copy it
-	 * quickly instead of incurring all kinds of overhead.
-	 */
-	cmp		r2, #8 /* Let's try this...*/
-	ble		neon_copy_finish
-	/*
-	 * This is where we'll pull out either 1, 2, or 3 bytes of data
-	 * from the source as needed to align it, then store off those
-	 * bytes in r4.  When we read in the (now) aligned data from the
-	 * source, we'll shift the bytes and AND in the r4 data, then write
-	 * to the target aligned.
-	 *
-	 * The conditional ldr calls work slightly faster than the
-	 * previous method, confirmed by our pipeline modeler.
-	 */
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	stmdb		sp!, {r6-r9}
-#else
-	push		{r6-r9}
-#endif
-	cmp		r12, #2
-	ldrb		r4, [r1], #1
-	ldrleb		r5, [r1], #1
-	ldrltb		r6, [r1], #1
-	rsb		r8, r12, #4
-	sub		r2, r2, r8
-	lsl		r8, r8, #3
-	orrle		r4, r4, r5, lsl #8
-	orrlt		r4, r4, r6, lsl #16
-	rsb		r9, r8, #32
-	
-	cmp		r2, #64
-	blt		neon_unaligned_route
-	ands		r12, r0, #0xf
-	beq		neon_unaligned_route
-	rsb		r12, r12, #16
-neon_16_start_u:
-	sub		r2, r2, r12
-	lsrs		r6, r12, #2
-neon_align_16_4_u:
-	ldr		r5, [r1], #4
-	subs		r6, r6, #1
-	orr		r4, r4, r5, lsl r8
-	str		r4, [r0], #4
-	mov		r4, r5, lsr r9
-	bne		neon_align_16_4_u
-neon_unaligned_route:
-	/* Decide which loop block to branch to.*/
-	cmp		r2, #256
-	bge		neon_copy_64_u
-	cmp		r2, #64
-	bge		neon_copy_32_u
-	b		neon_copy_finish_u
-	/* Copy data in 64-byte blocks.*/
-neon_copy_64_u:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4}
-	vstmdb		sp!, {q5-q8}
-#else
-	vpush		{q4}
-	vpush		{q5-q8}
-#endif
-	/* We'll need this for the q register shift later.*/
-	vdup.u32	q8, r8
-	/*
-	 * As above, we determine how many times we can go through the
-	 * 64-byte copy loop, then countdown.
-	 */
-	mov		r12, r2, lsr #6
-	and		r2, r2, #0x3f
-neon_copy_64_u_loop:
-	/* Load 64-bytes into q4-q7.*/
-	vld1.32		{q4, q5}, [r1]!
-	vld1.32		{q6, q7}, [r1]!
-	/*
-	 * Shift q0-q3 right so everything but the data we need due to the
-	 * alignment falls off the right-hand side.  The branching
-	 * is needed, since vshr requires the shift to be an immediate
-	 * value.
-	 */
-	lsls		r5, r8, #28
-	bcc		neon_copy_64_u_b8
-	bpl		neon_copy_64_u_b16
-	vshr.u64	q0, q4, #40
-	vshr.u64	q1, q5, #40
-	vshr.u64	q2, q6, #40
-	vshr.u64	q3, q7, #40
-	b		neon_copy_64_unify
-neon_copy_64_u_b8:
-	vshr.u64	q0, q4, #56
-	vshr.u64	q1, q5, #56
-	vshr.u64	q2, q6, #56
-	vshr.u64	q3, q7, #56
-	b		neon_copy_64_unify
-neon_copy_64_u_b16:
-	vshr.u64	q0, q4, #48
-	vshr.u64	q1, q5, #48
-	vshr.u64	q2, q6, #48
-	vshr.u64	q3, q7, #48
-neon_copy_64_unify:
-	/*
-	 * Shift q4-q7 left by r8 bits to take the alignment into
-	 * account.
-	 */
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vshl.u64	q4, q8, q4
-	vshl.u64	q5, q8, q5
-	vshl.u64	q6, q8, q6
-	vshl.u64	q7, q8, q7
-#else
-	vshl.u64	q4, q4, q8
-	vshl.u64	q5, q5, q8
-	vshl.u64	q6, q6, q8
-	vshl.u64	q7, q7, q8
-#endif
-	/*
-	 * The data in s14 will be needed for the next loop iteration.  Move
-	 * that to r5.
-	 */
-	vmov		r5, s14
-	/* We'll vorr the shifted data with the data that needs to move back.*/
-	vorr		d9, d9, d0
-	/* Copy the data from the previous loop into s14.*/
-	vmov		s14, r4
-	vorr		d10, d10, d1
-	vorr		d11, d11, d2
-	vorr		d12, d12, d3
-	vorr		d13, d13, d4
-	vorr		d14, d14, d5
-	vorr		d15, d15, d6
-	vorr		d8, d8, d7
-	subs		r12, r12, #1
-	pld		[r1, #0]
-	pld		[r1, #128]
-	/* Save off the r5 data into r4 for the next iteration.*/
-	mov		r4, r5
-	vst1.32		{q4, q5}, [r0]!
-	vst1.32		{q6, q7}, [r0]!
-	pld		[r0, #0]
-	pld		[r0, #128]
-	bne		neon_copy_64_u_loop
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q5-q8}
-	vldmia		sp!, {q4}
-#else
-	vpop		{q5-q8}
-	vpop		{q4}
-#endif
-	cmp		r2, #32
-	bge		neon_copy_32_u
-	b		neon_copy_finish_u
-neon_copy_32_u:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4}
-#else
-	vpush		{q4}
-#endif
-	vdup.u32	q4, r8
-	mov		r12, r2, lsr #5
-	and		r2, r2, #0x1f
-neon_copy_32_u_loop:
-	vld1.32		{q0, q1}, [r1]!
-	lsls		r5, r8, #28
-	bcc		neon_copy_32_u_b8
-	bpl		neon_copy_32_u_b16
-	vshr.u64	q2, q0, #40
-	vshr.u64	q3, q1, #40
-	b		neon_copy_32_unify
-neon_copy_32_u_b8:
-	vshr.u64	q2, q0, #56
-	vshr.u64	q3, q1, #56
-	b		neon_copy_32_unify
-neon_copy_32_u_b16:
-	vshr.u64	q2, q0, #48
-	vshr.u64	q3, q1, #48
-neon_copy_32_unify:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vshl.u64	q0, q4, q0
-	vshl.u64	q1, q4, q1
-#else
-	vshl.u64	q0, q0, q4
-	vshl.u64	q1, q1, q4
-#endif
-	vmov		r5, s14
-	vorr		d1, d1, d4
-	vmov		s14, r4
-	vorr		d2, d2, d5
-	vorr		d3, d3, d6
-	vorr		d0, d0, d7
-	subs		r12, r12, #1
-	pld		[r1, #0]
-	mov		r4, r5
-	vst1.32		{q0, q1}, [r0]!
-	bne		neon_copy_32_u_loop
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q4}
-#else
-	vpop		{q4}
-#endif
-neon_copy_finish_u:
-neon_copy_16_u:
-	movs		r12, r2, lsr #4
-	beq		neon_copy_8_u
-	vdup.u32	q2, r8
-	and		r2, r2, #0xf
-neon_copy_16_u_loop:
-	vld1.32		{q0}, [r1]!
-	lsls		r5, r8, #28
-	bcc		neon_copy_16_u_b8
-	bpl		neon_copy_16_u_b16
-	vshr.u64	q1, q0, #40
-	b		neon_copy_16_unify
-neon_copy_16_u_b8:
-	vshr.u64	q1, q0, #56
-	b		neon_copy_16_unify
-neon_copy_16_u_b16:
-	vshr.u64	q1, q0, #48
-neon_copy_16_unify:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vshl.u64	q0, q2, q0
-#else
-	vshl.u64	q0, q0, q2
-#endif
-	vmov		r5, s6
-	vorr		d1, d1, d2
-	vmov		s6, r4
-	vorr		d0, d0, d3
-	subs		r12, r12, #1
-	mov		r4, r5
-	vst1.32		{q0}, [r0]!
-	bne		neon_copy_16_u_loop
-neon_copy_8_u:
-	cmp		r2, #8
-	blt		neon_copy_4_u
-	ldm		r1!, {r6-r7}
-	subs		r2, r2, #8
-	orr		r4, r4, r6, lsl r8
-	mov		r5, r6, lsr r9
-	orr		r5, r5, r7, lsl r8
-	stm		r0!, {r4-r5}
-	mov		r4, r7, lsr r9
-neon_copy_4_u:
-	cmp		r2, #4
-	blt		neon_copy_last_bits_u
-	ldr		r5, [r1], #4
-	subs		r2, r2, #4
-	orr		r4, r4, r5, lsl r8
-	str		r4, [r0], #4
-	mov		r4, r5, lsr r9
-neon_copy_last_bits_u:
-	/*
-	 * Remember, r8 contains the size of the data in r4 in bits,
-	 * so to get to bytes we'll need to shift 3 places
-	 */
-	lsr		r8, r8, #0x3
-	/* Write out the bytes stored in r4.*/
-neon_copy_last_bits_u_loop:
-	strb		r4, [r0], #1
-	subs		r8, r8, #1
-	lsrne		r4, r4, #8
-	bne		neon_copy_last_bits_u_loop
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	ldmia		sp!, {r6-r9}
-#else
-	pop		{r6-r9}
-#endif
-neon_copy_finish:
-	cmp		r2, #0
-	beq		neon_end
-	/*
-	 * This just copies the data from source to target one byte
-	 * at a time.  For some small values, this makes more sense.
-	 * Note that since this code copies data a byte at a time,
-	 * both the aligned and unaligned paths can use it.
-	 */
-neon_copy_finish_loop:
-	ldrb		r4, [r1], #1
-	subs		r2, r2, #1
-	strb		r4, [r0], #1
-	bne		neon_copy_finish_loop
-neon_end:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	ldmia		sp!, {r4-r5}
-	ldmia		sp!, {r0}
-#else
-	pop		{r4-r5}
-	pop		{r0}
-#endif
-	bx		lr
-
-	.endfunc
-	.end
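
The unaligned path above keeps leftover source bytes in r4 and folds them into each word it stores: the scalar tail at neon_copy_8_u / neon_copy_4_u computes dst_word = r4 | (src_word << r8) and then keeps src_word >> r9 (with r9 = 32 - r8) as the carry for the next word. A rough C sketch of that carry-register scheme, under hypothetical names, assuming a little-endian CPU, a word-aligned destination and a shift of 8, 16 or 24 bits (not code from the driver):

#include <stddef.h>
#include <stdint.h>

/* Illustration only -- 'carry' plays the role of r4, 'lo_bits' of r8 and
 * 'hi_bits' of r9 in the assembly above. */
static void copy_words_unaligned(uint32_t *dst, const uint32_t *src,
                                 uint32_t carry, unsigned lo_bits, size_t words)
{
    unsigned hi_bits = 32u - lo_bits;        /* lo_bits is 8, 16 or 24 here */

    while (words--) {
        uint32_t w = *src++;                 /* aligned word load           */
        *dst++ = carry | (w << lo_bits);     /* merge with the leftover     */
        carry  = w >> hi_bits;               /* spill becomes the new carry */
    }
    /* the assembly then stores the remaining carry bytes one at a time */
}
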
diff --git git/src/neon_memmove.S git/src/neon_memmove.S
deleted file mode 100644
index 1bfe597..0000000
--- git/src/neon_memmove.S
+++ /dev/null
@@ -1,939 +0,0 @@
-/***************************************************************************
- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright
-       notice, this list of conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright
-       notice, this list of conditions and the following disclaimer in the
-       documentation and/or other materials provided with the distribution.
-     * Neither the name of Code Aurora nor the names of its contributors may
-       be used to endorse or promote products derived from this software
-       without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
-  ***************************************************************************/
-
-/***************************************************************************
- *  Neon memmove: Attempts to do a memmove with Neon registers if possible,
- *     Inputs:
- *        dest: The destination buffer
- *        src: The source buffer
- *        n: The size of the buffer to transfer
- *     Outputs:
- *
- ***************************************************************************/
-
-/*
- * General note:
- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
- * the correct object code for VPOP, resulting in horrific stack crashes.
- * As a result, I've temporarily moved PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
- * and VPOP->VLDMIA.  We can revert this once we update our toolchain.
- *
- * Also, VSHL swaps the source register and the shift-amount register
- * around in 2006-q3.  I've written the operands in the swapped order so the
- * generated object code comes out correct; we'll need to undo that later...
- */
-	.code 32
-	.align 4
-	.globl neon_memmove
-	.func
-
-neon_memmove:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	stmdb		sp!, {r0}
-#else
-	push		{r0}
-#endif
-
-	/*
-	 * The requirements for memmove state that the function should
-	 * operate as if data were being copied from the source to a
-	 * buffer, then to the destination.  This is to allow a user
-	 * to copy data from a source and target that overlap.
-	 *
-	 * We can't just do byte copies front-to-back automatically, since
-	 * there's a good chance we may have an overlap (why else would someone
-	 * intentionally use memmove then?).
-	 *
-	 * We'll break this into two parts.  Front-to-back, or back-to-front
-	 * copies.
-	 */
-neon_memmove_cmf:
-	cmp		r0, r1
-	blt		neon_front_to_back_copy
-	bgt		neon_back_to_front_copy
-	b		neon_memmove_done
-
-	/* #############################################################
-	 * Front to Back copy
-	 */
-neon_front_to_back_copy:
-	/*
-	 * For small copies, just do a quick memcpy.  We can do this for
-	 * front-to-back copies, aligned or unaligned, since we're only
-	 * doing 1 byte at a time...
-	 */
-	cmp		r2, #4
-	bgt		neon_f2b_gt4
-	cmp		r2, #0
-neon_f2b_smallcopy_loop:
-	beq		neon_memmove_done
-	ldrb		r12, [r1], #1
-	subs		r2, r2, #1
-	strb		r12, [r0], #1
-	b		neon_f2b_smallcopy_loop
-neon_f2b_gt4:
-	/* Preload what we can...*/
-	pld		[r0,#0]
-	pld		[r1,#0]	
-	/* The window size is in r3. */
-	sub		r3, r1, r0
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	stmdb		sp!, {r4-r6}
-#else
-	push		{r4-r6}
-#endif
-
-neon_f2b_check_align:
-	/* Check alignment. */
-	ands		r12, r0, #0x3
-	beq		neon_f2b_source_align_check
-	cmp		r12, #2
-	ldrb		r4, [r1], #1
-	ldrleb		r5, [r1], #1
-	ldrltb		r6, [r1], #1
-	rsb		r12, r12, #4
-	sub		r2, r2, r12
-	strb		r4, [r0], #1
-	strleb		r5, [r0], #1
-	strltb		r6, [r0], #1
-	
-neon_f2b_source_align_check:
-	ands		r12, r1, #0x3
-	bne		neon_f2b_nonaligned
-
-neon_f2b_try_16_align:
-	/* If we're >64, attempt to align on 16 bytes.  Smaller amounts
-	 * don't seem to be worth handling. */
-	cmp		r2, #64
-	blt		neon_f2b_align_route
-	/* This is where we try 16-byte alignment. */
-	ands		r12, r0, #0xf
-	beq		neon_f2b_align_route
-	rsb		r12, r12, #16
-neon_f2b_16_start:
-	sub		r2, r2, r12
-	lsrs		r5, r12, #2
-neon_f2b_align_16_4:
-	ldr		r4, [r1], #4
-	subs		r5, r5, #1
-	str		r4, [r0], #4
-	bne		neon_f2b_align_16_4
-neon_f2b_align_route:
-	/* #############################################################
-	 * Front to Back copy - aligned
-	 */
-	/*
-	 * Note that we can't just route based on the size in r2.  If that's
-	 * larger than the overlap window in r3, we could potentially
-	 * (and likely!) destroy data we're copying.
-	 */
-	cmp		r2, r3
-	movle		r12, r2
-	movgt		r12, r3
-	cmp		r12, #256
-	bge		neon_f2b_copy_128_a
-	cmp		r12, #64
-	bge		neon_f2b_copy_32_a
-	cmp		r12, #16
-	bge		neon_f2b_copy_16_a
-	cmp		r12, #8
-	bge		neon_f2b_copy_8_a
-	cmp		r12, #4
-	bge		neon_f2b_copy_4_a
-	b		neon_f2b_copy_1_a
-neon_f2b_copy_128_a:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4-q7}
-#else
-	vpush		{q4-q7}
-#endif
-	mov		r12, r2, lsr #7
-neon_f2b_copy_128_a_loop:
-	vld1.32		{q0,q1}, [r1]!
-	vld1.32		{q2,q3}, [r1]!
-	vld1.32		{q4,q5}, [r1]!
-	vld1.32		{q6,q7}, [r1]!
-	pld		[r1, #0]
-	pld		[r1, #128]
-	vst1.32		{q0,q1}, [r0]!
-	vst1.32		{q2,q3}, [r0]!
-	vst1.32		{q4,q5}, [r0]!
-	vst1.32		{q6,q7}, [r0]!
-	subs		r12, r12, #1
-	pld		[r0, #0]
-	pld		[r0, #128]
-	bne		neon_f2b_copy_128_a_loop
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q4-q7}
-#else
-	vpop		{q4-q7}
-#endif
-	ands		r2, r2, #0x7f
-	beq		neon_f2b_finish
-	cmp		r2, #32
-	bge		neon_f2b_copy_32_a
-	b		neon_f2b_copy_finish_a
-neon_f2b_copy_32_a:
-	mov		r12, r2, lsr #5
-neon_f2b_copy_32_a_loop:
-	vld1.32		{q0,q1}, [r1]!
-	subs		r12, r12, #1
-	pld		[r1, #0]
-	vst1.32		{q0,q1}, [r0]!
-	bne		neon_f2b_copy_32_a_loop
-	ands		r2, r2, #0x1f
-	beq		neon_f2b_finish
-neon_f2b_copy_finish_a:
-neon_f2b_copy_16_a:
-	movs		r12, r2, lsr #4
-	beq		neon_f2b_copy_8_a
-neon_f2b_copy_16_a_loop:
-	vld1.32		{q0}, [r1]!
-	subs		r12, r12, #1
-	vst1.32		{q0}, [r0]!
-	bne		neon_f2b_copy_16_a_loop
-	ands		r2, r2, #0xf
-	beq		neon_f2b_finish
-neon_f2b_copy_8_a:
-	cmp		r2, #8
-	blt		neon_f2b_copy_4_a
-	ldm		r1!, {r4-r5}
-	subs		r2, r2, #8
-	stm		r0!, {r4-r5}
-neon_f2b_copy_4_a:
-	cmp		r2, #4
-	blt		neon_f2b_copy_1_a
-	ldr		r4, [r1], #4
-	subs		r2, r2, #4
-	str		r4, [r0], #4
-neon_f2b_copy_1_a:
-	cmp		r2, #0
-	beq		neon_f2b_finish
-neon_f2b_copy_1_a_loop:
-	ldrb		r12, [r1], #1
-	subs		r2, r2, #1
-	strb		r12, [r0], #1
-	bne		neon_f2b_copy_1_a_loop
-	b		neon_f2b_finish
-		
-	/* #############################################################
-	 * Front to Back copy - unaligned
-	 */
-neon_f2b_nonaligned:
-	/*
-	 * For sizes < 8, does it really make sense to do the whole shift
-	 * party?  Note that we DON'T want to call neon_f2b_copy_1_u,
-	 * since we'll end up trying to pop r7-r9, and we DON'T want
-	 * to do that...
-	 */
-	cmp		r2, #8
-	ble		neon_f2b_copy_1_a
-
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	stmdb		sp!, {r7-r9}
-#else
-	push		{r7-r9}
-#endif
-	cmp		r12, #2
-	ldrb		r4, [r1], #1
-	ldrleb		r5, [r1], #1
-	ldrltb		r6, [r1], #1
-	rsb		r8, r12, #4
-	sub		r2, r2, r8
-	lsl		r8, r8, #3
-	orrle		r4, r4, r5, lsl #8
-	orrlt		r4, r4, r6, lsl #16
-	rsb		r9, r8, #32
-	/*
-	 * r4  = overflow bits
-	 * r8 = # of bits we copied into the r4 register to align source.
-	 * r9 = 32 - r8
-	 * r12 = Index counter for each size, so we determine how many times
-	 *       the given size will go into r2, then count down that # of
-	 *       times in r12.
-	 */
-	cmp		r2, #64
-	blt		neon_f2b_unaligned_route
-	ands		r12, r0, #0xf
-	beq		neon_f2b_unaligned_route
-	cmp		r3, #4
-	blt		neon_f2b_unaligned_route
-	rsb		r12, r12, #16
-neon_f2b_16_start_u:
-	sub		r2, r2, r12
-	lsrs		r6, r12, #2
-neon_f2b_align_16_4_u:
-	ldr		r5, [r1], #4
-	subs		r6, r6, #1
-	orr		r4, r4, r5, lsl r8
-	str		r4, [r0], #4
-	mov		r4, r5, lsr r9
-	bne		neon_f2b_align_16_4_u
-neon_f2b_unaligned_route:
-	cmp		r2, r3
-	movle		r12, r2
-	movgt		r12, r3
-	cmp		r12, #256
-	bge		neon_f2b_copy_64_u
-	cmp		r12, #64
-	bge		neon_f2b_copy_32_u
-	cmp		r12, #16
-	bge		neon_f2b_copy_16_u
-	cmp		r12, #8
-	bge		neon_f2b_copy_8_u
-	cmp		r12, #4
-	bge		neon_f2b_copy_4_u
-	b		neon_f2b_last_bits_u
-neon_f2b_copy_64_u:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4}
-	vstmdb		sp!, {q5-q8}
-#else
-	vpush		{q4}
-	vpush		{q5-q8}
-#endif
-	vdup.u32	q8, r8
-	mov		r12, r2, lsr #6
-	and		r2, r2, #0x3f
-neon_f2b_copy_64_u_loop:
-	vld1.32		{q4, q5}, [r1]!
-	vld1.32		{q6, q7}, [r1]!
-	lsls		r5, r8, #28
-	bcc		neon_f2b_copy_64_u_b8
-	bpl		neon_f2b_copy_64_u_b16
-	vshr.u64	q0, q4, #40
-	vshr.u64	q1, q5, #40
-	vshr.u64	q2, q6, #40
-	vshr.u64	q3, q7, #40
-	b		neon_f2b_copy_64_unify
-neon_f2b_copy_64_u_b8:
-	vshr.u64	q0, q4, #56
-	vshr.u64	q1, q5, #56
-	vshr.u64	q2, q6, #56
-	vshr.u64	q3, q7, #56
-	b		neon_f2b_copy_64_unify
-neon_f2b_copy_64_u_b16:
-	vshr.u64	q0, q4, #48
-	vshr.u64	q1, q5, #48
-	vshr.u64	q2, q6, #48
-	vshr.u64	q3, q7, #48
-neon_f2b_copy_64_unify:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vshl.u64	q4, q8, q4
-	vshl.u64	q5, q8, q5
-	vshl.u64	q6, q8, q6
-	vshl.u64	q7, q8, q7
-#else
-	vshl.u64	q4, q4, q8
-	vshl.u64	q5, q5, q8
-	vshl.u64	q6, q6, q8
-	vshl.u64	q7, q7, q8
-#endif
-	vmov		r5, s14
-	vorr		d9, d9, d0
-	vmov		s14, r4
-	vorr		d10, d10, d1
-	vorr		d11, d11, d2
-	vorr		d12, d12, d3
-	vorr		d13, d13, d4
-	vorr		d14, d14, d5
-	vorr		d15, d15, d6
-	vorr		d8, d8, d7
-	subs		r12, r12, #1
-	pld		[r1, #0]
-	pld		[r1, #128]
-	mov		r4, r5
-	vst1.32		{q4, q5}, [r0]!
-	vst1.32		{q6, q7}, [r0]!
-	pld		[r0, #0]
-	pld		[r0, #128]
-	bne		neon_f2b_copy_64_u_loop
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q5-q8}
-	vldmia		sp!, {q4}
-#else
-	vpop		{q5-q8}
-	vpop		{q4}
-#endif
-	cmp		r2, #32
-	bge		neon_f2b_copy_32_u
-	b		neon_f2b_copy_finish_u
-neon_f2b_copy_32_u:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4}
-#else
-	vpush		{q4}
-#endif
-	vdup.u32	q4, r8
-	mov		r12, r2, lsr #5
-	and		r2, r2, #0x1f
-neon_f2b_copy_32_u_loop:
-	vld1.32		{q0, q1}, [r1]!
-	lsls		r5, r8, #28
-	bcc		neon_f2b_copy_32_u_b8
-	bpl		neon_f2b_copy_32_u_b16
-	vshr.u64	q2, q0, #40
-	vshr.u64	q3, q1, #40
-	b		neon_f2b_copy_32_unify
-neon_f2b_copy_32_u_b8:
-	vshr.u64	q2, q0, #56
-	vshr.u64	q3, q1, #56
-	b		neon_f2b_copy_32_unify
-neon_f2b_copy_32_u_b16:
-	vshr.u64	q2, q0, #48
-	vshr.u64	q3, q1, #48
-neon_f2b_copy_32_unify:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vshl.u64	q0, q4, q0
-	vshl.u64	q1, q4, q1
-#else
-	vshl.u64	q0, q0, q4
-	vshl.u64	q1, q1, q4
-#endif
-	vmov		r5, s14
-	vorr		d1, d1, d4
-	vmov		s14, r4
-	vorr		d2, d2, d5
-	vorr		d3, d3, d6
-	vorr		d0, d0, d7
-	subs		r12, r12, #1
-	pld		[r1, #0]
-	mov		r4, r5
-	vst1.32		{q0, q1}, [r0]!
-	bne		neon_f2b_copy_32_u_loop
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q4}
-#else
-	vpop		{q4}
-#endif
-neon_f2b_copy_finish_u:
-neon_f2b_copy_16_u:
-	movs		r12, r2, lsr #4
-	beq		neon_f2b_copy_8_u
-	vdup.u32	q2, r8
-	and		r2, r2, #0xf
-neon_f2b_copy_16_u_loop:
-	vld1.32		{q0}, [r1]!
-	lsls		r5, r8, #28
-	bcc		neon_f2b_copy_16_u_b8
-	bpl		neon_f2b_copy_16_u_b16
-	vshr.u64	q1, q0, #40
-	b		neon_f2b_copy_16_unify
-neon_f2b_copy_16_u_b8:
-	vshr.u64	q1, q0, #56
-	b		neon_f2b_copy_16_unify
-neon_f2b_copy_16_u_b16:
-	vshr.u64	q1, q0, #48
-neon_f2b_copy_16_unify:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vshl.u64	q0, q2, q0
-#else
-	vshl.u64	q0, q0, q2
-#endif
-	vmov		r5, s6
-	vorr		d1, d1, d2
-	vmov		s6, r4
-	vorr		d0, d0, d3
-	subs		r12, r12, #1
-	mov		r4, r5
-	vst1.32		{q0}, [r0]!
-	bne		neon_f2b_copy_16_u_loop
-neon_f2b_copy_8_u:
-	cmp		r2, #8
-	blt		neon_f2b_copy_4_u
-	ldm		r1!, {r6-r7}
-	subs		r2, r2, #8
-	orr		r4, r4, r6, lsl r8
-	mov		r5, r6, lsr r9
-	orr		r5, r5, r7, lsl r8
-	stm		r0!, {r4-r5}
-	mov		r4, r7, lsr r9
-neon_f2b_copy_4_u:
-	cmp		r2, #4
-	blt		neon_f2b_last_bits_u
-	ldr		r5, [r1], #4
-	subs		r2, r2, #4
-	orr		r4, r4, r5, lsl r8
-	str		r4, [r0], #4
-	mov		r4, r5, lsr r9
-neon_f2b_last_bits_u:
-	lsr		r8, r8, #0x3
-neon_f2b_last_bits_u_loop:
-	strb		r4, [r0], #1
-	subs		r8, r8, #1
-	lsr		r4, r4, #8
-	bne		neon_f2b_last_bits_u_loop
-neon_f2b_copy_1_u:
-	cmp		r2, #0
-	beq		neon_f2b_finish_u
-neon_f2b_copy_1_u_loop:
-	ldrb		r12, [r1], #1
-	subs		r2, r2, #1
-	strb		r12, [r0], #1
-	bne		neon_f2b_copy_1_u_loop
-neon_f2b_finish_u:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	ldmia		sp!, {r7-r9}
-#else
-	pop		{r7-r9}
-#endif
-	/* #############################################################
-	 * Front to Back copy - finish
-	 */
-neon_f2b_finish:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	ldmia		sp!, {r4-r6}
-#else
-	pop		{r4-r6}
-#endif
-	b		neon_memmove_done
-
-	/* #############################################################
-	 * Back to Front copy
-	 */
-neon_back_to_front_copy:
-	/*
-	 * Here, we'll want to shift to the end of the buffers.  This
-	 * actually points us one past where we need to go, but since
-	 * we'll pre-decrement throughout, this will be fine.
-	 */
-	add		r0, r0, r2
-	add		r1, r1, r2
-	cmp		r2, #4
-	bgt		neon_b2f_gt4
-	cmp		r2, #0
-neon_b2f_smallcopy_loop:
-	beq		neon_memmove_done
-	ldrb		r12, [r1, #-1]!
-	subs		r2, r2, #1
-	strb		r12, [r0, #-1]!
-	b		neon_b2f_smallcopy_loop
-neon_b2f_gt4:
-	pld		[r0, #0]
-	pld		[r1, #0]
-	/*
-	 * The overlap window size is in r3; the smaller of the window
-	 * size and the copy size picks the copy route below.
-	 */
-	sub		r3, r0, r1
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	stmdb		sp!, {r4-r5}
-#else
-	push		{r4-r5}
-#endif
-
-	/*
-	 * Check alignment.  Since we'll pre-decrement as we step through, we'll
-	 * need to make sure we're word-aligned.
-	 */
-neon_b2f_check_align:
-	ands		r12, r0, #0x3
-	beq		neon_b2f_source_align_check
-	sub		r2, r2, r12
-neon_b2f_shift_align:
-	ldrb		r4, [r1, #-1]!
-	subs		r12, r12, #1
-	strb		r4, [r0, #-1]!
-	bne		neon_b2f_shift_align
-neon_b2f_source_align_check:
-	ands		r4, r1, #0x3
-	bne		neon_b2f_nonaligned
-	
-neon_b2f_try_16_align:
-	/* If we're >64, attempt to align on 16 bytes.  Smaller amounts
-	 * don't seem to be worth handling. */
-	cmp		r2, #64
-	blt		neon_b2f_align_route
-	ands		r12, r0, #0xf
-	beq		neon_b2f_align_route
-	/* In this case, r12 has the number of bytes to roll backward. */
-neon_b2f_16_start:
-	sub		r2, r2, r12
-	lsrs		r5, r12, #2
-neon_b2f_align_16_4:
-	ldr		r4, [r1, #-4]!
-	subs		r5, r5, #1
-	str		r4, [r0, #-4]!
-	bne		neon_b2f_align_16_4
-neon_b2f_align_route:
-	/*
-	 * #############################################################
-	 * Back to Front copy - aligned
-	 */
-	cmp		r2, r3
-	movle		r12, r2
-	movgt		r12, r3
-	cmp		r12, #256
-	bge		neon_b2f_copy_128_a
-	cmp		r12, #64
-	bge		neon_b2f_copy_32_a
-	cmp		r12, #8
-	bge		neon_b2f_copy_8_a
-	cmp		r12, #4
-	bge		neon_b2f_copy_4_a
-	b		neon_b2f_copy_1_a
-neon_b2f_copy_128_a:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4-q7}
-#else
-	vpush		{q4-q7}
-#endif
-	movs		r12, r2, lsr #7
-	/*
-	 * This irks me.  There MUST be a better way to read these in and
-	 * scan the register backward instead of making it go forward.  Then
-	 * we need to do two subtractions...
-	 */
-neon_b2f_copy_128_a_loop:
-	sub		r1, r1, #128
-	sub		r0, r0, #128
-	vld1.32		{q0, q1}, [r1]!
-	vld1.32		{q2, q3}, [r1]!
-	vld1.32		{q4, q5}, [r1]!
-	vld1.32		{q6, q7}, [r1]!
-	pld		[r1, #-128]
-	pld		[r1, #-256]
-	vst1.32		{q0, q1}, [r0]!
-	vst1.32		{q2, q3}, [r0]!
-	vst1.32		{q4, q5}, [r0]!
-	vst1.32		{q6, q7}, [r0]!
-	subs		r12, r12, #1
-	pld		[r0, #-128]
-	pld		[r0, #-256]
-	sub		r1, r1, #128
-	sub		r0, r0, #128
-	bne		neon_b2f_copy_128_a_loop
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q4-q7}
-#else
-	vpop		{q4-q7}
-#endif
-	ands		r2, r2, #0x7f
-	beq		neon_b2f_finish
-	cmp		r2, #32
-	bge		neon_b2f_copy_32_a
-	b		neon_b2f_copy_finish_a
-neon_b2f_copy_32_a:
-	mov		r12, r2, lsr #5
-neon_b2f_copy_32_a_loop:
-	sub		r1, r1, #32
-	sub		r0, r0, #32
-	vld1.32		{q0,q1}, [r1]
-	subs		r12, r12, #1
-	vst1.32		{q0,q1}, [r0]
-	pld		[r1, #0]
-	bne		neon_b2f_copy_32_a_loop
-	ands		r2, r2, #0x1f
-	beq		neon_b2f_finish
-neon_b2f_copy_finish_a:
-neon_b2f_copy_8_a:
-	movs		r12, r2, lsr #0x3
-	beq		neon_b2f_copy_4_a
-neon_b2f_copy_8_a_loop:
-	ldmdb		r1!, {r4-r5}
-	subs		r12, r12, #1
-	stmdb		r0!, {r4-r5}
-	bne		neon_b2f_copy_8_a_loop
-	and		r2, r2, #0x7
-neon_b2f_copy_4_a:
-	movs		r12, r2, lsr #0x2
-	beq		neon_b2f_copy_1_a
-	and		r2, r2, #0x3
-neon_b2f_copy_4_a_loop:
-	ldr		r4, [r1, #-4]!
-	subs		r12, r12, #1
-	str		r4, [r0, #-4]!
-	bne		neon_b2f_copy_4_a_loop
-neon_b2f_copy_1_a:
-	cmp		r2, #0
-	beq		neon_b2f_finish
-neon_b2f_copy_1_a_loop:
-	ldrb		r12, [r1, #-1]!
-	subs		r2, r2, #1
-	strb		r12, [r0, #-1]!
-	bne		neon_b2f_copy_1_a_loop
-
-	/* #############################################################
-	 * Back to Front copy - unaligned
-	 */
-neon_b2f_nonaligned:
-	/*
-	 * For sizes < 8, does it really make sense to do the whole shift
-	 * party?
-	 */
-	cmp		r2, #8
-	ble		neon_b2f_copy_1_a
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	stmdb		sp!, {r6-r11}
-#else
-	push		{r6-r11}
-#endif
-	/*
-	 * r3 = max window size
-	 * r4 = overflow bytes
-	 * r5 = bytes we're reading into
-	 * r6 = # bytes we're off.
-	 * r10 = copy of r6
-	 */
-	and		r6, r1, #0x3
-	eor		r4, r4, r4
-	mov		r10, r6
-neon_b2f_realign:
-	ldrb		r5, [r1, #-1]!
-	subs		r6, r6, #1
-	orr		r4, r5, r4, lsl #8
-	bne		neon_b2f_realign
-	/*
-	 * r10 = # of bits we copied into the r4 register to align source.
-	 * r11 = 32 - r10
-	 * r12 = Index counter for each size, so we determine how many times
-	 *       the given size will go into r2, then count down that # of
-	 *       times in r12.
-	 */
-	sub		r2, r2, r10
-	lsl		r10, r10, #0x3
-	rsb		r11, r10, #32
-
-	cmp		r2, r3
-	movle		r12, r2
-	movgt		r12, r3
-	cmp		r12, #256
-	bge		neon_b2f_copy_64_u
-	cmp		r12, #64
-	bge		neon_b2f_copy_32_u
-	cmp		r12, #8
-	bge		neon_b2f_copy_8_u
-	cmp		r12, #4
-	bge		neon_b2f_copy_4_u
-	b		neon_b2f_last_bits_u
-neon_b2f_copy_64_u:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4,q5}
-	vstmdb		sp!, {q6-q8}
-#else
-	vpush		{q4,q5}
-	vpush		{q6-q8}
-#endif
-	add		r7, r11, #32
-	movs		r12, r2, lsr #6
-	vdup.u32	q8, r7
-neon_b2f_copy_64_u_loop:
-	sub		r1, r1, #64
-	sub		r0, r0, #64
-	vld1.32		{q0, q1}, [r1]!
-	vld1.32		{q2, q3}, [r1]
-	sub		r1, r1, #32
-	vmov		q4, q0
-	vmov		q5, q1
-	vmov		q6, q2
-	vmov		q7, q3
-	vmov		r5, s0
-	mov		r4, r4, lsl r11
-	lsls		r6, r10, #28
-	bcc		neon_b2f_copy_64_u_b8
-	bpl		neon_b2f_copy_64_u_b16
-	vshr.u64	q0, q0, #24
-	vshr.u64	q1, q1, #24
-	vshr.u64	q2, q2, #24
-	vshr.u64	q3, q3, #24
-	b		neon_b2f_copy_64_unify
-neon_b2f_copy_64_u_b8:
-	vshr.u64	q0, q0, #8
-	vshr.u64	q1, q1, #8
-	vshr.u64	q2, q2, #8
-	vshr.u64	q3, q3, #8
-	b		neon_b2f_copy_64_unify
-neon_b2f_copy_64_u_b16:
-	vshr.u64	q0, q0, #16
-	vshr.u64	q1, q1, #16
-	vshr.u64	q2, q2, #16
-	vshr.u64	q3, q3, #16
-neon_b2f_copy_64_unify:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vshl.u64	q4, q8, q4
-	vshl.u64	q5, q8, q5
-	vshl.u64	q6, q8, q6
-	vshl.u64	q7, q8, q7
-#else
-	vshl.u64	q4, q4, q8
-	vshl.u64	q5, q5, q8
-	vshl.u64	q6, q6, q8
-	vshl.u64	q7, q7, q8
-#endif
-	vmov		s17, r4
-	vorr		d7, d7, d8
-	vorr		d6, d6, d15
-	vorr		d5, d5, d14
-	vorr		d4, d4, d13
-	vorr		d3, d3, d12
-	vorr		d2, d2, d11
-	vorr		d1, d1, d10
-	vorr		d0, d0, d9
-	mov		r4, r5, lsl r11
-	subs		r12, r12, #1
-	lsr		r4, r4, r11
-	vst1.32		{q0, q1}, [r0]!
-	vst1.32		{q2, q3}, [r0]
-	pld		[r1, #0]
-	sub		r0, r0, #32
-	bne		neon_b2f_copy_64_u_loop
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q6-q8}
-	vldmia		sp!, {q4,q5}
-#else
-	vpop		{q6-q8}
-	vpop		{q4,q5}
-#endif
-	ands		r2, r2, #0x3f
-	cmp		r2, #32
-	bge		neon_b2f_copy_32_u
-	b		neon_b2f_copy_finish_u
-neon_b2f_copy_32_u:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vstmdb		sp!, {q4}
-#else
-	vpush		{q4}
-#endif
-	add		r7, r11, #32
-	movs		r12, r2, lsr #5
-	vdup.u32	q4, r7
-	and		r2, r2, #0x1f
-neon_b2f_copy_32_u_loop:
-	sub		r1, r1, #32
-	sub		r0, r0, #32
-	vld1.32		{q0, q1}, [r1]
-	vmov		q2, q0
-	vmov		q3, q1
-	vmov		r5, s0
-	mov		r4, r4, lsl r11
-	lsls		r6, r10, #28
-	bcc		neon_b2f_copy_32_u_b8
-	bpl		neon_b2f_copy_32_u_b16
-	vshr.u64	q0, q0, #24
-	vshr.u64	q1, q1, #24
-	b		neon_b2f_copy_32_unify
-neon_b2f_copy_32_u_b8:
-	vshr.u64	q0, q0, #8
-	vshr.u64	q1, q1, #8
-	b		neon_b2f_copy_32_unify
-neon_b2f_copy_32_u_b16:
-	vshr.u64	q0, q0, #16
-	vshr.u64	q1, q1, #16
-neon_b2f_copy_32_unify:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vshl.u64	q2, q4, q2
-	vshl.u64	q3, q4, q3
-#else
-	vshl.u64	q2, q2, q4
-	vshl.u64	q3, q3, q4
-#endif
-	vmov		s9, r4
-	vorr		d3, d3, d4
-	vorr		d2, d2, d7
-	vorr		d1, d1, d6
-	vorr		d0, d0, d5
-	mov		r4, r5, lsl r11
-	subs		r12, r12, #1
-	lsr		r4, r4, r11
-	vst1.32		{q0, q1}, [r0]
-	pld		[r1, #0]
-	bne		neon_b2f_copy_32_u_loop
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	vldmia		sp!, {q4}
-#else
-	vpop		{q4}
-#endif
-neon_b2f_copy_finish_u:
-neon_b2f_copy_8_u:
-	movs		r12, r2, lsr #0x3
-	beq		neon_b2f_copy_4_u
-	mov		r5, r4, lsl r11
-neon_b2f_copy_8_u_loop:
-	ldmdb		r1!, {r6-r7}
-	subs		r12, r12, #1
-	orr		r5, r5, r7, lsr r10
-	mov		r4, r7, lsl r11
-	orr		r4, r4, r6, lsr r10
-	stmdb		r0!, {r4-r5}
-	mov		r4, r6, lsl r11
-	lsr		r4, r4, r11
-	mov		r5, r4, lsl r11
-	bne		neon_b2f_copy_8_u_loop
-	ands		r2, r2, #0x7
-neon_b2f_copy_4_u:
-	movs		r12, r2, lsr #0x2
-	beq		neon_b2f_last_bits_u
-	mov		r5, r4, lsl r11
-neon_b2f_copy_4_u_loop:
-	ldr		r6, [r1, #-4]!
-	subs		r12, r12, #1
-	orr		r5, r5, r6, lsr r10
-	str		r5, [r0, #-4]!
-	mov		r4, r6, lsl r11
-	lsr		r4, r4, r11
-	mov		r5, r4, lsl r11
-	bne		neon_b2f_copy_4_u_loop
-	and		r2, r2, #0x3
-neon_b2f_last_bits_u:
-neon_b2f_last_bits_u_loop:
-	subs		r10, r10, #8
-	mov		r5, r4, lsr r10
-	strb		r5, [r0, #-1]!
-	bne		neon_b2f_last_bits_u_loop
-neon_b2f_copy_1_u:
-	cmp		r2, #0
-	beq		neon_b2f_finish_u
-neon_b2f_copy_1_u_loop:
-	ldrb		r12, [r1, #-1]!
-	subs		r2, r2, #1
-	strb		r12, [r0, #-1]!
-	bne		neon_b2f_copy_1_u_loop
-neon_b2f_finish_u:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	ldmia		sp!, {r6-r11}
-#else
-	pop		{r6-r11}
-#endif
-
-neon_b2f_finish:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	ldmia		sp!, {r4-r5}
-#else
-	pop		{r4-r5}
-#endif
-
-neon_memmove_done:
-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
-	ldmia		sp!, {r0}
-#else
-	pop		{r0}
-#endif
-	bx		lr
-
-	.endfunc
-	.end
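
The deleted neon_memmove dispatches on the relative position of the two buffers, exactly as its comments describe: copy front-to-back when the destination lies below the source, back-to-front when it lies above, and do nothing when they are equal. Stripped of every NEON and alignment fast path, the dispatch is roughly the following sketch (illustration only, not the driver's code):

#include <stddef.h>

/* Minimal sketch of the direction choice only; all fast paths omitted. */
static void *memmove_sketch(void *dest, const void *src, size_t n)
{
    unsigned char *d = dest;
    const unsigned char *s = src;

    if (d < s) {                 /* front-to-back is safe: dst is below src */
        while (n--)
            *d++ = *s++;
    } else if (d > s) {          /* back-to-front is safe: dst is above src */
        d += n;
        s += n;
        while (n--)
            *--d = *--s;
    }                            /* d == s: nothing to copy */
    return dest;
}
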
diff --git git/src/neon_memsets.c git/src/neon_memsets.c
deleted file mode 100755
index 740fc1e..0000000
--- git/src/neon_memsets.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/* neon_memsets.c
- *
- * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of Code Aurora nor
- *       the names of its contributors may be used to endorse or promote
- *       products derived from this software without specific prior written
- *       permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NON-INFRINGEMENT ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "msm-swblits.h"
-
-void memset16(uint16_t dst[], uint16_t value, int count)
-{
-    if (count <= 0)
-        return;
-
-    asm volatile(
-                 "       pld        [%[dst], #0]                           \n"
-                 "       cmp        %[count], #4                           \n"
-                 "       blt        6f                                     \n"
-                 "       tst        %[dst], #0x3                           \n"
-                 "       strneh     %[value], [%[dst]], #2                 \n"
-                 "       subne      %[count], %[count], #1                 \n"
-                 "       vdup.u16   q8, %[value]                           \n"
-                 "       vmov       q9, q8                                 \n"
-                 "       cmp        %[count], #64                          \n"
-                 "       bge        0f                                     \n"
-                 "       cmp        %[count], #32                          \n"
-                 "       bge        2f                                     \n"
-                 "       cmp        %[count], #16                          \n"
-                 "       bge        3f                                     \n"
-                 "       cmp        %[count], #8                           \n"
-                 "       bge        4f                                     \n"
-                 "       b          5f                                     \n"
-                 "0:                                                       \n"
-                 "       mov        r12, %[count], lsr #6                  \n"
-                 "1:                                                       \n"
-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
-                 "       subs       r12, r12, #1                           \n"
-                 "       bne        1b                                     \n"
-                 "       ands       %[count], %[count], #0x3f              \n"
-                 "       beq        7f                                     \n"
-                 "2:                                                       \n"
-                 "       cmp        %[count], #32                          \n"
-                 "       blt        3f                                     \n"
-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
-                 "       subs       %[count], %[count], #32                \n"
-                 "       beq        7f                                     \n"
-                 "3:                                                       \n"
-                 "       cmp        %[count], #16                          \n"
-                 "       blt        4f                                     \n"
-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
-                 "       subs       %[count], %[count], #16                \n"
-                 "       beq        7f                                     \n"
-                 "4:                                                       \n"
-                 "       cmp        %[count], #8                           \n"
-                 "       blt        5f                                     \n"
-                 "       vst1.16    {q8}, [%[dst]]!                        \n"
-                 "       subs       %[count], %[count], #8                 \n"
-                 "       beq        7f                                     \n"
-                 "5:                                                       \n"
-                 "       cmp        %[count], #4                           \n"
-                 "       blt        6f                                     \n"
-                 "       vst1.16    {d16}, [%[dst]]!                       \n"
-                 "       subs       %[count], %[count], #4                 \n"
-                 "       beq        7f                                     \n"
-                 "6:                                                       \n"
-                 "       cmp        %[count], #0                           \n"
-                 "       blt        7f                                     \n"
-                 "       lsls       %[count], #31                          \n"
-                 "       strmih     %[value], [%[dst]], #2                 \n"
-                 "       strcsh     %[value], [%[dst]], #2                 \n"
-                 "       strcsh     %[value], [%[dst]], #2                 \n"
-                 "7:                                                       \n"
-                 // Clobbered input registers
-                 : [dst] "+r" (dst), [count] "+r" (count)
-                 // Unclobbered input
-                 : [value] "r" (value)
-                 // Clobbered registers
-                 : "q8", "q9", "r12", "cc", "memory"
-                 );
-}
-
-void memset32(uint32_t dst[], uint32_t value, int count)
-{
-    asm volatile(
-                 "       pld        [%[dst], #0]                           \n"
-                 "       cmp        %[count], #4                           \n"
-                 "       blt        5f                                     \n"
-                 "       vdup.u32   q8, %[value]                           \n"
-                 "       vmov       q9, q8                                 \n"
-                 "       cmp        %[count], #32                          \n"
-                 "       bge        0f                                     \n"
-                 "       cmp        %[count], #16                          \n"
-                 "       bge        2f                                     \n"
-                 "       cmp        %[count], #8                           \n"
-                 "       bge        3f                                     \n"
-                 "       b          4f                                     \n"
-                 "0:                                                       \n"
-                 "       mov        r12, %[count], lsr #5                  \n"
-                 "1:                                                       \n"
-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
-                 "       pld        [%[dst], #0]                           \n"
-                 "       subs       r12, r12, #1                           \n"
-                 "       bne        1b                                     \n"
-                 "       ands       %[count], %[count], #0x1f              \n"
-                 "       beq        6f                                     \n"
-                 "2:                                                       \n"
-                 "       cmp        %[count], #16                          \n"
-                 "       blt        3f                                     \n"
-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
-                 "       subs       %[count], %[count], #16                \n"
-                 "       beq        6f                                     \n"
-                 "3:                                                       \n"
-                 "       cmp        %[count], #8                           \n"
-                 "       blt        4f                                     \n"
-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
-                 "       subs       %[count], %[count], #8                 \n"
-                 "       beq        6f                                     \n"
-                 "4:                                                       \n"
-                 "       cmp        %[count], #4                           \n"
-                 "       blt        5f                                     \n"
-                 "       vst1.32    {q8}, [%[dst]]!                        \n"
-                 "       subs       %[count], %[count], #4                 \n"
-                 "       beq        6f                                     \n"
-                 "5:                                                       \n"
-                 "       cmp        %[count], #0                           \n"
-                 "       beq        6f                                     \n"
-                 "       lsls       %[count], #31                          \n"
-                 "       strmi      %[value], [%[dst]], #4                 \n"
-                 "       strcs      %[value], [%[dst]], #4                 \n"
-                 "       strcs      %[value], [%[dst]], #4                 \n"
-                 "6: @end                                                  \n"
-                 // Clobbered input registers
-                 : [dst] "+r" (dst), [count] "+r" (count)
-                 // Unclobbered input
-                 : [value] "r" (value)
-                 // Clobbered registers
-                 : "q8", "q9", "r12", "cc", "memory"
-                 );
-}
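
With memset16() and memset32() removed, callers need plain-C equivalents of the 16- and 32-bit pattern fills; note that libc memset() fills bytes, so it only matches these functions when every byte of the value is identical. A portable sketch with the same signatures as the deleted functions (an illustration under those assumptions, not the driver's actual replacement):

#include <stdint.h>

/* Straightforward scalar fills; correct for any value, no NEON required. */
void memset16(uint16_t *dst, uint16_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;
}

void memset32(uint32_t *dst, uint32_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;
}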