From d695f337bdaa0e297ad89c6fdd99edf97bc270db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Lanzend=C3=B6rfer?= Date: Wed, 17 Mar 2010 21:25:38 +0100 Subject: xf86-video-msm: fix build errors --- recipes/xorg-driver/xf86-video-msm/no_neon.patch | 2901 ++++++++++++++++++++ .../xorg-driver/xf86-video-msm/no_neon_flags.patch | 36 + .../xf86-video-msm/renaming_variables.patch | 116 + recipes/xorg-driver/xf86-video-msm_git.bb | 10 +- 4 files changed, 3061 insertions(+), 2 deletions(-) create mode 100644 recipes/xorg-driver/xf86-video-msm/no_neon.patch create mode 100644 recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch create mode 100644 recipes/xorg-driver/xf86-video-msm/renaming_variables.patch diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch new file mode 100644 index 0000000000..c0aa92e76a --- /dev/null +++ b/recipes/xorg-driver/xf86-video-msm/no_neon.patch @@ -0,0 +1,2901 @@ +commit d8910bf773fbecf7cdea359d4b530a3672e27180 +Author: David Lanzendörfer +Date: Wed Feb 10 16:18:39 2010 +0100 + + Removed neon because it's not available in our kernel + and so it's causing trouble (Illegal instruction) + +diff --git git/src/msm-swblits.h git/src/msm-swblits.h +index f89f00e..a40b24b 100755 +--- git/src/msm-swblits.h ++++ git/src/msm-swblits.h +@@ -38,16 +38,6 @@ + #include + #include + +-/* Neon intrinsics are part of the ARM or GCC compiler used. */ +-/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */ +-#include +- +-/* These are NEON-optimized functions linked to by various tests. &#13;
*/ +-extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes); +-extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes); +-extern void memset16(uint16_t *dst, uint16_t value, int count); +-extern void memset32(uint32_t *dst, uint32_t value, int count); +- + /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */ + #define BITS_PER_BYTE (8) + #define BYTES_PER_16BPP_PIXEL (2) +diff --git git/src/msm-swfill.c git/src/msm-swfill.c +index 108fd94..3dd1ef2 100755 +--- git/src/msm-swfill.c ++++ git/src/msm-swfill.c +@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou + } + } + +- ++/* + static inline void + memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count) + { +@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count) + // Quickly fill remaining pixels (up to 7). + memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count); + } +- ++*/ + + static inline void + memset16_Test(uint16_t *dst, uint16_t src, int count) +@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count) + + // Copy remaining pixels using Neon and non-Neon instructions. + // NOTE: This assumes that dst is aligned optimally for Neon instructions. +- memset16_AssumesNeonAlignment((void *) dst, src, count); ++ //memset16_AssumesNeonAlignment((void *) dst, src, count); ++ memset((void *) dst, src, count); + } + } + +@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp + if (w < 32) { + // For narrow rectangles, block signals only once for the entire rectangles. 
+ BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); +- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); ++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); ++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); + UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); + } + else { + // For wider rectangles, block and unblock signals for every row. +- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + } + } + +diff --git git/src/msm-swrender.c git/src/msm-swrender.c +index a7a9abc..835dc03 100755 +--- git/src/msm-swrender.c ++++ git/src/msm-swrender.c +@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src + } + } + break; +- case 7: if (xdir >= 0) { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); +- return TRUE; +- } else { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); +- return TRUE; +- } +- break; +- case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); +- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; +- *(uint64_t *) 
(dst+1*BYTES_PER_UINT64_T) = src2; +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); +- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); +- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); +- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; +- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; +- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; +- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; +- return TRUE; +- } +- else { +- uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); +- uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T); +- uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T); +- uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T); +- uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T); +- uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T); +- uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T); +- uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T); +- *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; +- *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2; +- *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3; +- *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4; +- *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5; +- *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6; +- *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7; +- *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8; +- return TRUE; +- } +- break; +- case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- 
uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); +- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); +- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); +- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; +- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; +- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; +- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); +- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); +- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); +- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); +- uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T); +- uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T); +- uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T); +- uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; +- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; +- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; +- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; +- *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5; +- *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6; +- *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7; +- *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8; +- return TRUE; +- } +- else { +- // Don't bother unrolling loops here, since that won't help for more than around 8 operations. +- // Instead, just call multiple fixed functions. 
+- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); +- } else { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); +- } +- return TRUE; +- } +- break; +- case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); +- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); +- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); +- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); +- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); +- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); +- uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T); +- uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T); +- uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T); +- uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; +- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; +- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; +- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; +- *(uint64_t *) (dst+4*BYTES_PER_UINT64_T) = src5; +- *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6; +- *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7; +- 
*(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8; +- return TRUE; +- } +- break; +- case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T)); +- uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T)); +- uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T)); +- uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); +- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); +- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); +- vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5); +- vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6); +- vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7); +- vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8); +- return TRUE; +- } +- break; + } + + return FALSE; +@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr + } + return TRUE; + break; +- case 7: if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- break; +- case 8: if 
(SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; +- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) 
(dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; +- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- uint16_t src1a = *(uint16_t *) (src+0*spitch+0); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- *(uint16_t *) (dst+0*dpitch+0) = src1a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; +- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; +- *(uint16_t *) (dst+1*dpitch+0) = src1b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; +- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; +- return TRUE; +- } +- else { +- uint16_t src1a = *(uint16_t *) 
(src+0*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); +- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; +- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; +- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; +- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; +- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; +- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; +- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; +- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; +- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; +- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; +- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; +- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; +- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; +- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; +- *(uint16_t *) 
(dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; +- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; +- return TRUE; +- } +- break; +- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) 
(dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- *(uint32_t *) (dst+0*dpitch+0) = src1a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; +- *(uint32_t *) (dst+1*dpitch+0) = src1b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); +- 
uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; +- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; +- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; +- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; +- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; +- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; +- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; +- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; +- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; +- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; +- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; +- return TRUE; 
+- } +- else { +- // Don't bother unrolling loops, since that won't help for more than around 8 operations. +- // Instead, just call multiple fixed functions. +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- break; +- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); +- 
return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T); +- uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T); +- uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T); +- uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) 
(src+1*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T); +- uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T); +- uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T); +- uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; +- *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a; +- *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a; +- *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a; +- *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; +- *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b; +- *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b; +- *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b; +- *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- 
swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + 
(2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- break; +- case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4a = vld1q_u32((uint32_t 
*)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T)); +- uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T)); +- uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T)); +- uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T)); +- uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T)); +- uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T)); +- uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b); +- vst1q_u32((uint32_t 
*)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b); +- return TRUE; +- }//HERE +- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- 
swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- 
swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 
8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- break; + } + + return FALSE; +@@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr + } + return TRUE; + break; +- case 7: if (xdir >= 0) { +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- break; +- // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons? +- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases... 
+- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); +- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; +- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c; +- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; +- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d; +- return TRUE; +- } +- else if 
(SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; +- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; +- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; +- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; +- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; +- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c; +- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c; +- *(uint32_t *) 
(dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; +- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; +- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d; +- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- uint16_t src1a = *(uint16_t *) (src+0*spitch+0); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint16_t src1c = *(uint16_t *) (src+2*spitch+0); +- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint16_t src1d = *(uint16_t *) (src+3*spitch+0); +- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5d = *(uint16_t *) 
(src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- *(uint16_t *) (dst+0*dpitch+0) = src1a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; +- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; +- *(uint16_t *) (dst+1*dpitch+0) = src1b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; +- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; +- *(uint16_t *) (dst+2*dpitch+0) = src1c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c; +- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c; +- *(uint16_t *) (dst+3*dpitch+0) = src1d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d; +- *(uint1