summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Lanzendörfer <david.lanzendoerfer@o2s.ch>2010-03-17 21:25:38 +0100
committerLukas Gorris <lukas.gorris@gmail.com>2010-03-17 21:25:38 +0100
commitd695f337bdaa0e297ad89c6fdd99edf97bc270db (patch)
tree23080a6660c4a5e76c82ef47b587e4c304bdf43b
parent6c5f9d4325be253fb3cb9b6d3bec11f7a7a13562 (diff)
xf86-video-msm: fix build errors
-rw-r--r--recipes/xorg-driver/xf86-video-msm/no_neon.patch2901
-rw-r--r--recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch36
-rw-r--r--recipes/xorg-driver/xf86-video-msm/renaming_variables.patch116
-rw-r--r--recipes/xorg-driver/xf86-video-msm_git.bb10
4 files changed, 3061 insertions, 2 deletions
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
new file mode 100644
index 0000000000..c0aa92e76a
--- /dev/null
+++ b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
@@ -0,0 +1,2901 @@
+commit d8910bf773fbecf7cdea359d4b530a3672e27180
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date: Wed Feb 10 16:18:39 2010 +0100
+
+ Removed neon because it's not available in our kernel
+ and so it's causing trouble (Illegal instruction)
+
+diff --git git/src/msm-swblits.h git/src/msm-swblits.h
+index f89f00e..a40b24b 100755
+--- git/src/msm-swblits.h
++++ git/src/msm-swblits.h
+@@ -38,16 +38,6 @@
+ #include <stdint.h>
+ #include <stdlib.h>
+
+-/* Neon intrinsics are part of the ARM or GCC compiler used. */
+-/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */
+-#include <arm_neon.h>
+-
+-/* These are NEON-optimized functions linked to by various tests. */
+-extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes);
+-extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes);
+-extern void memset16(uint16_t *dst, uint16_t value, int count);
+-extern void memset32(uint32_t *dst, uint32_t value, int count);
+-
+ /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */
+ #define BITS_PER_BYTE (8)
+ #define BYTES_PER_16BPP_PIXEL (2)
+diff --git git/src/msm-swfill.c git/src/msm-swfill.c
+index 108fd94..3dd1ef2 100755
+--- git/src/msm-swfill.c
++++ git/src/msm-swfill.c
+@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou
+ }
+ }
+
+-
++/*
+ static inline void
+ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
+ {
+@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
+ // Quickly fill remaining pixels (up to 7).
+ memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
+ }
+-
++*/
+
+ static inline void
+ memset16_Test(uint16_t *dst, uint16_t src, int count)
+@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count)
+
+ // Copy remaining pixels using Neon and non-Neon instructions.
+ // NOTE: This assumes that dst is aligned optimally for Neon instructions.
+- memset16_AssumesNeonAlignment((void *) dst, src, count);
++ //memset16_AssumesNeonAlignment((void *) dst, src, count);
++ memset((void *) dst, src, count);
+ }
+ }
+
+@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp
+ if (w < 32) {
+ // For narrow rectangles, block signals only once for the entire rectangles.
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
+ UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+ }
+ else {
+ // For wider rectangles, block and unblock signals for every row.
+- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+ }
+ }
+
+diff --git git/src/msm-swrender.c git/src/msm-swrender.c
+index a7a9abc..835dc03 100755
+--- git/src/msm-swrender.c
++++ git/src/msm-swrender.c
+@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src
+ }
+ }
+ break;
+- case 7: if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
+- return TRUE;
+- } else {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
+- return TRUE;
+- }
+- break;
+- case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
+- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
+- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
+- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
+- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
+- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
+- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
+- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
+- return TRUE;
+- }
+- else {
+- uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
+- uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
+- uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
+- uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T);
+- uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T);
+- uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T);
+- uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T);
+- uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T);
+- *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1;
+- *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2;
+- *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3;
+- *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4;
+- *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5;
+- *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6;
+- *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7;
+- *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8;
+- return TRUE;
+- }
+- break;
+- case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
+- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
+- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
+- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
+- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
+- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
+- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
+- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
+- uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T);
+- uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T);
+- uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T);
+- uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1;
+- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2;
+- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3;
+- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4;
+- *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5;
+- *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6;
+- *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7;
+- *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8;
+- return TRUE;
+- }
+- else {
+- // Don't bother unrolling loops here, since that won't help for more than around 8 operations.
+- // Instead, just call multiple fixed functions.
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
+- } else {
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
+- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
+- }
+- return TRUE;
+- }
+- break;
+- case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
+- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
+- return TRUE;
+- }
+- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
+- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
+- uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T);
+- uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T);
+- uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T);
+- uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1;
+- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2;
+- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3;
+- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4;
+- *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5;
+- *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6;
+- *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7;
+- *(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8;
+- return TRUE;
+- }
+- break;
+- case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T));
+- uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T));
+- uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T));
+- uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
+- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
+- vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5);
+- vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6);
+- vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7);
+- vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8);
+- return TRUE;
+- }
+- break;
+ }
+
+ return FALSE;
+@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr
+ }
+ return TRUE;
+ break;
+- case 7: if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- break;
+- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+- *(uint16_t *) (dst+0*dpitch+0) = src1a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a;
+- *(uint16_t *) (dst+1*dpitch+0) = src1b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b;
+- return TRUE;
+- }
+- else {
+- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
+- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
+- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
+- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
+- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
+- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
+- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
+- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
+- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
+- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a;
+- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a;
+- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a;
+- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a;
+- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a;
+- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a;
+- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a;
+- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a;
+- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b;
+- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b;
+- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b;
+- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b;
+- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b;
+- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b;
+- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b;
+- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b;
+- return TRUE;
+- }
+- break;
+- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+- *(uint32_t *) (dst+0*dpitch+0) = src1a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a;
+- *(uint32_t *) (dst+1*dpitch+0) = src1b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
+- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
+- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
+- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
+- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
+- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a;
+- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a;
+- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a;
+- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a;
+- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a;
+- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a;
+- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a;
+- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a;
+- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b;
+- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b;
+- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b;
+- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b;
+- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b;
+- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b;
+- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b;
+- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b;
+- return TRUE;
+- }
+- else {
+- // Don't bother unrolling loops, since that won't help for more than around 8 operations.
+- // Instead, just call multiple fixed functions.
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T);
+- uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T);
+- uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T);
+- uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T);
+- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+- uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T);
+- uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T);
+- uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T);
+- uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T);
+- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a;
+- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a;
+- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a;
+- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a;
+- *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a;
+- *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a;
+- *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a;
+- *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a;
+- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b;
+- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b;
+- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b;
+- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b;
+- *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b;
+- *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b;
+- *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b;
+- *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b;
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- else {
+- if (xdir >= 0) {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- } else {
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+- }
+- return TRUE;
+- }
+- break;
+- case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T));
+- uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T));
+- uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T));
+- uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T));
+- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+- uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T));
+- uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T));
+- uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T));
+- uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T));
+- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a);
+- vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4