diff options
| author | David Lanzendörfer <david.lanzendoerfer@o2s.ch> | 2010-03-17 21:25:38 +0100 |
|---|---|---|
| committer | Lukas Gorris <lukas.gorris@gmail.com> | 2010-03-17 21:25:38 +0100 |
| commit | d695f337bdaa0e297ad89c6fdd99edf97bc270db (patch) | |
| tree | 23080a6660c4a5e76c82ef47b587e4c304bdf43b | |
| parent | 6c5f9d4325be253fb3cb9b6d3bec11f7a7a13562 (diff) | |
xf86-video-msm: fix build errors
| -rw-r--r-- | recipes/xorg-driver/xf86-video-msm/no_neon.patch | 2901 | ||||
| -rw-r--r-- | recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch | 36 | ||||
| -rw-r--r-- | recipes/xorg-driver/xf86-video-msm/renaming_variables.patch | 116 | ||||
| -rw-r--r-- | recipes/xorg-driver/xf86-video-msm_git.bb | 10 |
4 files changed, 3061 insertions, 2 deletions
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch new file mode 100644 index 0000000000..c0aa92e76a --- /dev/null +++ b/recipes/xorg-driver/xf86-video-msm/no_neon.patch @@ -0,0 +1,2901 @@ +commit d8910bf773fbecf7cdea359d4b530a3672e27180 +Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch> +Date: Wed Feb 10 16:18:39 2010 +0100 + + Removed neon because its not available in our kernel + and so its causing trubble (Illegal instruction) + +diff --git git/src/msm-swblits.h git/src/msm-swblits.h +index f89f00e..a40b24b 100755 +--- git/src/msm-swblits.h ++++ git/src/msm-swblits.h +@@ -38,16 +38,6 @@ + #include <stdint.h> + #include <stdlib.h> + +-/* Neon intrinsics are part of the ARM or GCC compiler used. */ +-/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */ +-#include <arm_neon.h> +- +-/* These are NEON-optimized functions linked to by various tests. */ +-extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes); +-extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes); +-extern void memset16(uint16_t *dst, uint16_t value, int count); +-extern void memset32(uint32_t *dst, uint32_t value, int count); +- + /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */ + #define BITS_PER_BYTE (8) + #define BYTES_PER_16BPP_PIXEL (2) +diff --git git/src/msm-swfill.c git/src/msm-swfill.c +index 108fd94..3dd1ef2 100755 +--- git/src/msm-swfill.c ++++ git/src/msm-swfill.c +@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou + } + } + +- ++/* + static inline void + memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count) + { +@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count) + // Quickly fill remaining pixels (up to 7). 
+ memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count); + } +- ++*/ + + static inline void + memset16_Test(uint16_t *dst, uint16_t src, int count) +@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count) + + // Copy remaining pixels using Neon and non-Neon instructions. + // NOTE: This assumes that dst is aligned optimally for Neon instructions. +- memset16_AssumesNeonAlignment((void *) dst, src, count); ++ //memset16_AssumesNeonAlignment((void *) dst, src, count); ++ memset((void *) dst, src, count); + } + } + +@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp + if (w < 32) { + // For narrow rectangles, block signals only once for the entire rectangles. + BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); +- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); ++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); ++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); + UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); + } + else { + // For wider rectangles, block and unblock signals for every row. 
+- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + } + } + +diff --git git/src/msm-swrender.c git/src/msm-swrender.c +index a7a9abc..835dc03 100755 +--- git/src/msm-swrender.c ++++ git/src/msm-swrender.c +@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src + } + } + break; +- case 7: if (xdir >= 0) { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); +- return TRUE; +- } else { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); +- return TRUE; +- } +- break; +- case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); +- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; +- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); +- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); +- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); +- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; 
+- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; +- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; +- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; +- return TRUE; +- } +- else { +- uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); +- uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T); +- uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T); +- uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T); +- uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T); +- uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T); +- uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T); +- uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T); +- *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; +- *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2; +- *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3; +- *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4; +- *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5; +- *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6; +- *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7; +- *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8; +- return TRUE; +- } +- break; +- case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); +- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); +- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); +- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; +- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; +- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; +- *(uint64_t *) 
(dst+3*BYTES_PER_UINT64_T) = src4; +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); +- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); +- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); +- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); +- uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T); +- uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T); +- uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T); +- uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; +- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; +- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; +- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; +- *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5; +- *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6; +- *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7; +- *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8; +- return TRUE; +- } +- else { +- // Don't bother unrolling loops here, since that won't help for more than around 8 operations. +- // Instead, just call multiple fixed functions. 
+- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); +- } else { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); +- } +- return TRUE; +- } +- break; +- case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); +- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); +- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); +- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); +- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); +- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); +- uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T); +- uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T); +- uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T); +- uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; +- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; +- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; +- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; +- *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5; +- *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6; +- *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7; +- 
*(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8; +- return TRUE; +- } +- break; +- case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T)); +- uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T)); +- uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T)); +- uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); +- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); +- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); +- vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5); +- vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6); +- vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7); +- vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8); +- return TRUE; +- } +- break; + } + + return FALSE; +@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr + } + return TRUE; + break; +- case 7: if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- break; +- case 8: if 
(SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; +- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) 
(dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; +- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- uint16_t src1a = *(uint16_t *) (src+0*spitch+0); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- *(uint16_t *) (dst+0*dpitch+0) = src1a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; +- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; +- *(uint16_t *) (dst+1*dpitch+0) = src1b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; +- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; +- return TRUE; +- } +- else { +- uint16_t src1a = *(uint16_t *) 
(src+0*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); +- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; +- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; +- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; +- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; +- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; +- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; +- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; +- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; +- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; +- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; +- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; +- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; +- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; +- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; +- *(uint16_t *) 
(dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; +- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; +- return TRUE; +- } +- break; +- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) 
(dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- *(uint32_t *) (dst+0*dpitch+0) = src1a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; +- *(uint32_t *) (dst+1*dpitch+0) = src1b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); +- 
uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; +- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; +- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; +- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; +- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; +- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; +- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; +- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; +- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; +- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; +- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; +- return TRUE; 
+- } +- else { +- // Don't bother unrolling loops, since that won't help for more than around 8 operations. +- // Instead, just call multiple fixed functions. +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- break; +- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); +- 
return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T); +- uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T); +- uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T); +- uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) 
(src+1*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T); +- uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T); +- uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T); +- uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; +- *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a; +- *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a; +- *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a; +- *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; +- *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b; +- *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b; +- *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b; +- *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- 
swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + 
(2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- break; +- case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4a = vld1q_u32((uint32_t 
*)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T)); +- uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T)); +- uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T)); +- uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T)); +- uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T)); +- uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T)); +- uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4 |
