author    | David Lanzendörfer <david.lanzendoerfer@o2s.ch> | 2010-03-17 21:25:38 +0100
committer | Lukas Gorris <lukas.gorris@gmail.com> | 2010-03-17 21:25:38 +0100
commit    | d695f337bdaa0e297ad89c6fdd99edf97bc270db (patch)
tree      | 23080a6660c4a5e76c82ef47b587e4c304bdf43b /recipes/xorg-driver
parent    | 6c5f9d4325be253fb3cb9b6d3bec11f7a7a13562 (diff)
xf86-video-msm: fix build errors
Diffstat (limited to 'recipes/xorg-driver')
-rw-r--r-- | recipes/xorg-driver/xf86-video-msm/no_neon.patch | 2901
-rw-r--r-- | recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch | 36
-rw-r--r-- | recipes/xorg-driver/xf86-video-msm/renaming_variables.patch | 116
-rw-r--r-- | recipes/xorg-driver/xf86-video-msm_git.bb | 10
4 files changed, 3061 insertions, 2 deletions
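The body of no_neon.patch below comments out the NEON-backed memset16()/memset32() fill helpers and substitutes plain memset() calls. Since memset() stores single bytes, that substitution only reproduces a 16- or 32-bit pixel fill when every byte of the fill value is identical; a portable scalar fallback that keeps the original semantics on CPUs without NEON could look like the sketch below. The memset16_scalar/memset32_scalar helpers are illustrative only and are not part of the patch.

```c
#include <stdint.h>

/* Illustrative scalar fallbacks (not part of the patch).  They mirror the
 * signatures of the removed NEON helpers declared in msm-swblits.h and keep
 * the 16/32-bit fill semantics without any NEON instructions. */
static void
memset16_scalar(uint16_t *dst, uint16_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;   /* store a full 16-bit pixel per iteration */
}

static void
memset32_scalar(uint32_t *dst, uint32_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;   /* store a full 32-bit pixel per iteration */
}
```

For fill values whose bytes differ (most solid colours other than black or white), the plain memset() substitution in the hunks below changes the bytes actually written, which is worth keeping in mind when reviewing the patch.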
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch new file mode 100644 index 0000000000..c0aa92e76a --- /dev/null +++ b/recipes/xorg-driver/xf86-video-msm/no_neon.patch @@ -0,0 +1,2901 @@ +commit d8910bf773fbecf7cdea359d4b530a3672e27180 +Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch> +Date: Wed Feb 10 16:18:39 2010 +0100 + + Removed neon because its not available in our kerneÃl + and so its causing trubble (Illegal instruction) + +diff --git git/src/msm-swblits.h git/src/msm-swblits.h +index f89f00e..a40b24b 100755 +--- git/src/msm-swblits.h ++++ git/src/msm-swblits.h +@@ -38,16 +38,6 @@ + #include <stdint.h> + #include <stdlib.h> + +-/* Neon intrinsics are part of the ARM or GCC compiler used. */ +-/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */ +-#include <arm_neon.h> +- +-/* These are NEON-optimized functions linked to by various tests. */ +-extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes); +-extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes); +-extern void memset16(uint16_t *dst, uint16_t value, int count); +-extern void memset32(uint32_t *dst, uint32_t value, int count); +- + /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */ + #define BITS_PER_BYTE (8) + #define BYTES_PER_16BPP_PIXEL (2) +diff --git git/src/msm-swfill.c git/src/msm-swfill.c +index 108fd94..3dd1ef2 100755 +--- git/src/msm-swfill.c ++++ git/src/msm-swfill.c +@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou + } + } + +- ++/* + static inline void + memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count) + { +@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count) + // Quickly fill remaining pixels (up to 7). + memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count); + } +- ++*/ + + static inline void + memset16_Test(uint16_t *dst, uint16_t src, int count) +@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count) + + // Copy remaining pixels using Neon and non-Neon instructions. + // NOTE: This assumes that dst is aligned optimally for Neon instructions. +- memset16_AssumesNeonAlignment((void *) dst, src, count); ++ //memset16_AssumesNeonAlignment((void *) dst, src, count); ++ memset((void *) dst, src, count); + } + } + +@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp + if (w < 32) { + // For narrow rectangles, block signals only once for the entire rectangles. + BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); +- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); ++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); ++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP); + UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); + } + else { + // For wider rectangles, block and unblock signals for every row. 
+- DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + } + } + +diff --git git/src/msm-swrender.c git/src/msm-swrender.c +index a7a9abc..835dc03 100755 +--- git/src/msm-swrender.c ++++ git/src/msm-swrender.c +@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src + } + } + break; +- case 7: if (xdir >= 0) { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); +- return TRUE; +- } else { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir); +- return TRUE; +- } +- break; +- case 8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); +- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; +- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); +- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); +- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); +- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; +- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; +- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; +- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; +- return TRUE; +- } +- else { +- uint16_t src1 = *(uint16_t *) (src+0*BYTES_PER_UINT16_T); +- uint16_t src2 = *(uint16_t *) (src+1*BYTES_PER_UINT16_T); +- uint16_t src3 = *(uint16_t *) (src+2*BYTES_PER_UINT16_T); +- uint16_t src4 = *(uint16_t *) (src+3*BYTES_PER_UINT16_T); +- uint16_t src5 = *(uint16_t *) (src+4*BYTES_PER_UINT16_T); +- uint16_t src6 = *(uint16_t *) (src+5*BYTES_PER_UINT16_T); +- uint16_t src7 = *(uint16_t *) (src+6*BYTES_PER_UINT16_T); +- uint16_t src8 = *(uint16_t *) (src+7*BYTES_PER_UINT16_T); +- *(uint16_t *) (dst+0*BYTES_PER_UINT16_T) = src1; +- *(uint16_t *) (dst+1*BYTES_PER_UINT16_T) = src2; +- *(uint16_t *) (dst+2*BYTES_PER_UINT16_T) = src3; +- *(uint16_t *) (dst+3*BYTES_PER_UINT16_T) = src4; +- *(uint16_t *) (dst+4*BYTES_PER_UINT16_T) = src5; +- *(uint16_t *) (dst+5*BYTES_PER_UINT16_T) = src6; +- *(uint16_t *) (dst+6*BYTES_PER_UINT16_T) = src7; +- *(uint16_t *) (dst+7*BYTES_PER_UINT16_T) = src8; +- return TRUE; +- } +- break; +- case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); +- return TRUE; +- } +- else if 
(SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); +- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); +- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); +- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; +- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; +- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; +- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32_t src1 = *(uint32_t *) (src+0*BYTES_PER_UINT32_T); +- uint32_t src2 = *(uint32_t *) (src+1*BYTES_PER_UINT32_T); +- uint32_t src3 = *(uint32_t *) (src+2*BYTES_PER_UINT32_T); +- uint32_t src4 = *(uint32_t *) (src+3*BYTES_PER_UINT32_T); +- uint32_t src5 = *(uint32_t *) (src+4*BYTES_PER_UINT32_T); +- uint32_t src6 = *(uint32_t *) (src+5*BYTES_PER_UINT32_T); +- uint32_t src7 = *(uint32_t *) (src+6*BYTES_PER_UINT32_T); +- uint32_t src8 = *(uint32_t *) (src+7*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*BYTES_PER_UINT32_T) = src1; +- *(uint32_t *) (dst+1*BYTES_PER_UINT32_T) = src2; +- *(uint32_t *) (dst+2*BYTES_PER_UINT32_T) = src3; +- *(uint32_t *) (dst+3*BYTES_PER_UINT32_T) = src4; +- *(uint32_t *) (dst+4*BYTES_PER_UINT32_T) = src5; +- *(uint32_t *) (dst+5*BYTES_PER_UINT32_T) = src6; +- *(uint32_t *) (dst+6*BYTES_PER_UINT32_T) = src7; +- *(uint32_t *) (dst+7*BYTES_PER_UINT32_T) = src8; +- return TRUE; +- } +- else { +- // Don't bother unrolling loops here, since that won't help for more than around 8 operations. +- // Instead, just call multiple fixed functions. +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); +- } else { +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir); +- swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); +- } +- return TRUE; +- } +- break; +- case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); +- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); +- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); +- return TRUE; +- } +- else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint64_t src1 = *(uint64_t *) (src+0*BYTES_PER_UINT64_T); +- uint64_t src2 = *(uint64_t *) (src+1*BYTES_PER_UINT64_T); +- uint64_t src3 = *(uint64_t *) (src+2*BYTES_PER_UINT64_T); +- uint64_t src4 = *(uint64_t *) (src+3*BYTES_PER_UINT64_T); +- uint64_t src5 = *(uint64_t *) (src+4*BYTES_PER_UINT64_T); +- uint64_t src6 = *(uint64_t *) (src+5*BYTES_PER_UINT64_T); +- uint64_t src7 = *(uint64_t *) (src+6*BYTES_PER_UINT64_T); +- uint64_t src8 = *(uint64_t *) (src+7*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*BYTES_PER_UINT64_T) = src1; +- *(uint64_t *) (dst+1*BYTES_PER_UINT64_T) = src2; +- *(uint64_t *) (dst+2*BYTES_PER_UINT64_T) = src3; +- *(uint64_t *) (dst+3*BYTES_PER_UINT64_T) = src4; +- *(uint64_t *) 
(dst+4*BYTES_PER_UINT64_T) = src5; +- *(uint64_t *) (dst+5*BYTES_PER_UINT64_T) = src6; +- *(uint64_t *) (dst+6*BYTES_PER_UINT64_T) = src7; +- *(uint64_t *) (dst+7*BYTES_PER_UINT64_T) = src8; +- return TRUE; +- } +- break; +- case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) { +- uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T)); +- uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T)); +- uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T)); +- uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1); +- vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2); +- vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3); +- vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4); +- vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5); +- vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6); +- vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7); +- vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8); +- return TRUE; +- } +- break; + } + + return FALSE; +@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr + } + return TRUE; + break; +- case 7: if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- break; +- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src1b = *(uint32_t 
*) (src+1*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; +- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; +- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- uint16_t src1a = *(uint16_t *) (src+0*spitch+0); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- *(uint16_t *) (dst+0*dpitch+0) = src1a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; +- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; +- *(uint16_t *) (dst+1*dpitch+0) = src1b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; +- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; +- return TRUE; +- } +- else { +- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8b = 
*(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); +- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; +- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; +- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; +- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; +- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; +- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; +- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; +- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; +- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; +- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; +- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; +- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; +- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; +- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; +- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; +- *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; +- return TRUE; +- } +- break; +- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- uint32_t src1b = 
*(uint32_t *) (src+1*spitch+0); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- *(uint32_t *) (dst+0*dpitch+0) = src1a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; +- *(uint32_t *) (dst+1*dpitch+0) = src1b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; +- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; +- *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; +- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; +- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; +- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; +- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; +- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; +- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; +- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; +- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; +- return TRUE; +- } +- else { +- // Don't bother unrolling loops, since that won't help for more than around 8 operations. +- // Instead, just call multiple fixed functions. 
+- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- break; +- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T, src + (16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T, src + (4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T); +- uint64_t src6a = *(uint64_t *) 
(src+0*spitch+5*BYTES_PER_UINT64_T); +- uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T); +- uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T); +- uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T); +- uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T); +- uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; +- *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T) = src5a; +- *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T) = src6a; +- *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T) = src7a; +- *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T) = src8a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; +- *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T) = src5b; +- *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T) = src6b; +- *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T) = src7b; +- *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T) = src8b; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + 
(0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- break; +- case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T)); +- uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T)); +- uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T)); +- uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T)); +- uint32x4_t src6b = vld1q_u32((uint32_t 
*)(src+1*spitch+5*BYTES_PER_UINT32X4_T)); +- uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T)); +- uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b); +- return TRUE; +- }//HERE +- else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T, src + (3*16+4)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T, src + (2*16+4)*BYTES_PER_UINT16_T, 16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T, src + (0*16+4)*BYTES_PER_UINT16_T, 2*16, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- 
swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 2, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, 
xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0, src + 0 , 1, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- break; + } + + return FALSE; +@@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr + } + return TRUE; + break; +- case 7: if (xdir >= 0) { +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch); +- } +- return TRUE; +- break; +- // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons? +- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases... 
+- case 8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); +- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; +- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c; +- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; +- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; +- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; +- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) 
(dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; +- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; +- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; +- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c; +- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c; +- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; +- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; +- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d; +- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- uint16_t src1a = *(uint16_t *) (src+0*spitch+0); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint16_t src1c = *(uint16_t *) (src+2*spitch+0); +- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint16_t src1d = *(uint16_t *) (src+3*spitch+0); +- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- *(uint16_t *) (dst+0*dpitch+0) = src1a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; +- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; +- *(uint16_t *) (dst+1*dpitch+0) = src1b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; +- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; +- *(uint16_t *) (dst+2*dpitch+0) = src1c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c; +- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c; +- *(uint16_t *) (dst+3*dpitch+0) = src1d; +- *(uint32_t *) 
(dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d; +- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d; +- return TRUE; +- } +- else { +- uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T); +- uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T); +- uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T); +- uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T); +- uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T); +- uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T); +- uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T); +- uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T); +- uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T); +- uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T); +- *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T) = src1a; +- *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T) = src2a; +- *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T) = src3a; +- *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T) = src4a; +- *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T) = src5a; +- *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T) = src6a; +- *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T) = src7a; +- *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T) = src8a; +- *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T) = src1b; +- *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T) = src2b; +- *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T) = src3b; +- *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T) = src4b; +- *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T) = src5b; +- *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T) = src6b; +- *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T) = src7b; +- *(uint16_t 
*) (dst+1*dpitch+7*BYTES_PER_UINT16_T) = src8b; +- *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T) = src1c; +- *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T) = src2c; +- *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T) = src3c; +- *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T) = src4c; +- *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T) = src5c; +- *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T) = src6c; +- *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T) = src7c; +- *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T) = src8c; +- *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T) = src1d; +- *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T) = src2d; +- *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T) = src3d; +- *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T) = src4d; +- *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T) = src5d; +- *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T) = src6d; +- *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T) = src7d; +- *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T) = src8d; +- return TRUE; +- } +- break; +- case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); +- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c); +- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); +- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d); +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T); +- uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T); +- uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T); +- uint64_t src3d = 
*(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T); +- uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T); +- *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T) = src1a; +- *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T) = src4a; +- *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T) = src1b; +- *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T) = src4b; +- *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T) = src1c; +- *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T) = src2c; +- *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T) = src3c; +- *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T) = src4c; +- *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T) = src1d; +- *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T) = src2d; +- *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T) = src3d; +- *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T) = src4d; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0); +- uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0); +- uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- uint32_t src1c = *(uint32_t *) (src+2*spitch+0); +- uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- uint32_t src1d = *(uint32_t *) (src+3*spitch+0); +- uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T); +- uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T); +- uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T); +- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T); +- *(uint32_t *) (dst+0*dpitch+0) = src1a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3a; +- *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5a; +- *(uint32_t *) (dst+1*dpitch+0) = src1b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3b; +- *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4b; 
+- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5b; +- *(uint32_t *) (dst+2*dpitch+0) = src1c; +- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2c; +- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3c; +- *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5c; +- *(uint32_t *) (dst+3*dpitch+0) = src1d; +- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T) = src2d; +- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T) = src3d; +- *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T) = src4d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T) = src5d; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T); +- uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T); +- uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T); +- uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T); +- uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T); +- uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T); +- uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T); +- uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T); +- uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T); +- uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T); +- uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T); +- *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T) = src1a; +- *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T) = src5a; +- *(uint32_t *) 
(dst+0*dpitch+5*BYTES_PER_UINT32_T) = src6a; +- *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T) = src7a; +- *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T) = src8a; +- *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T) = src1b; +- *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T) = src4b; +- *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T) = src5b; +- *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T) = src6b; +- *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T) = src7b; +- *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T) = src8b; +- *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T) = src1c; +- *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T) = src2c; +- *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T) = src3c; +- *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T) = src4c; +- *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T) = src5c; +- *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T) = src6c; +- *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T) = src7c; +- *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T) = src8c; +- *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T) = src1d; +- *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T) = src2d; +- *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T) = src3d; +- *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T) = src4d; +- *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T) = src5d; +- *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T) = src6d; +- *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T) = src7d; +- *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T) = src8d; +- return TRUE; +- } +- else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) { +- uint16_t src1a = *(uint16_t *) (src+0*spitch+0); +- uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); +- uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); +- uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); +- uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); +- uint16_t src1b = *(uint16_t *) (src+1*spitch+0); +- uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); +- uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); +- uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); +- uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); +- uint16_t src1c = *(uint16_t *) (src+2*spitch+0); +- uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3c = *(uint32_t *) 
(src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); +- uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); +- uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); +- uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); +- uint16_t src1d = *(uint16_t *) (src+3*spitch+0); +- uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T); +- uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T); +- uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T); +- uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T); +- uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T); +- uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T); +- uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T); +- uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T); +- *(uint16_t *) (dst+0*dpitch+0) = src1a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7a; +- *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8a; +- *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9a; +- *(uint16_t *) (dst+1*dpitch+0) = src1b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7b; +- *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8b; +- *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9b; +- *(uint16_t *) (dst+2*dpitch+0) = src1c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7c; +- *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8c; +- *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9c; +- *(uint16_t *) (dst+3*dpitch+0) = src1d; +- *(uint32_t *) 
(dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T) = src2d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T) = src3d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T) = src4d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T) = src5d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T) = src6d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T) = src7d; +- *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T) = src8d; +- *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T) = src9d; +- return TRUE; +- } +- else { +- // Don't bother unrolling loops, since that won't help for more than around 8 operations. +- // Instead, just call multiple fixed functions. +- if (xdir >= 0) { +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- } else { +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch); +- swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); +- } +- return TRUE; +- } +- break; +- // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons? +- // For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here... +- case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) { +- uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T)); +- uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T)); +- uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T)); +- uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T)); +- uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T)); +- vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a); +- vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a); +- vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b); +- vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b); +- vst1q_u32((uint32_t 
*)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b); +- vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c); +- vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c); +- vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c); +- vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c); +- vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d); +- vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d); +- vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d); +- vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d); +- return TRUE; +- } +- break; +- } ++ } + + return FALSE; + } +@@ -1924,10 +872,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int + if (rowsOverlap) + { + if (w > 64) { +- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + } + else if (w == 64) { +- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + } + else { + DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP); +@@ -1936,10 +886,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int + else + { + if (w > 64) { +- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + } + else if (w == 64) { +- DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); ++ DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS); + } + else { + DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP); +@@ -1973,7 +925,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i + if (xdir >= 0 || !rowsOverlap) { + if (w >= 128) { + BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); +- neon_memcpy(dst, src, w); ++ //neon_memcpy(dst, src, w); ++ memcpy(dst, src, w); + UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); + } + else +@@ -1982,7 +935,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i + else { + if (w >= 128) { + BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); +- neon_memmove(dst, src, w); ++ //neon_memmove(dst, src, w); ++ memmove(dst, src, w); + UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); + } + else +@@ -2029,7 +983,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, 
unsigned char *src, int w, int h, + if (xdir >= 0 || !rowsOverlap) { + if (w >= 42) { + BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); +- neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); ++ //neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); ++ memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL); + UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); + } + else +@@ -2038,7 +993,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, + else { + if (w >= 42) { + BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); +- neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); ++ //neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); ++ memmove(dst, src, w * BYTES_PER_24BPP_PIXEL); + UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); + } + else +diff --git git/src/neon_memcpy.S git/src/neon_memcpy.S +deleted file mode 100644 +index 5ecc5ce..0000000 +--- git/src/neon_memcpy.S ++++ /dev/null +@@ -1,549 +0,0 @@ +-/*************************************************************************** +- Copyright (c) 2009, Code Aurora Forum. All rights reserved. +- +- Redistribution and use in source and binary forms, with or without +- modification, are permitted provided that the following conditions are met: +- * Redistributions of source code must retain the above copyright +- notice, this list of conditions and the following disclaimer. +- * Redistributions in binary form must reproduce the above copyright +- notice, this list of conditions and the following disclaimer in the +- documentation and/or other materials provided with the distribution. +- * Neither the name of Code Aurora nor the names of its contributors may +- be used to endorse or promote products derived from this software +- without specific prior written permission. +- +- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +- POSSIBILITY OF SUCH DAMAGE. +- ***************************************************************************/ +- +-/*************************************************************************** +- Neon memcpy: Attempts to do a memcpy with Neon registers if possible, +- Inputs: +- dest: The destination buffer +- src: The source buffer +- n: The size of the buffer to transfer +- Outputs: +- +-***************************************************************************/ +- +-/* +- * General note: +- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP +- * However, it looks like the 2006 CodeSourcery Assembler has issues generating +- * the correct object code for VPOP, resulting in horrific stack crashes. +- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB, +- * and VPOP->VLDMIA. We can revert this back once we update our toolchain. +- * +- * Also, VSHL swaps the source register and the shift-amount register +- * around in 2006-q3. 
I've coded this incorrectly so it turns out correct +- * in the object code, but we'll need to undo that later... +- */ +- +- .code 32 +- .align 4 +- .globl neon_memcpy +- .func +- +-neon_memcpy: +- /* +- * First, make sure we're not copying < 4 bytes. If so, we'll +- * just handle it here. +- */ +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- stmdb sp!, {r0} +-#else +- push {r0} +-#endif +- cmp r2, #4 +- bgt neon_gt_4 +- /* Copy 0-4 bytes, if needed, and return.*/ +- cmp r2, #0 +-neon_smallcopy_loop: +- beq neon_smallcopy_done +- ldrb r12, [r1], #1 +- subs r2, r2, #1 +- strb r12, [r0], #1 +- b neon_smallcopy_loop +-neon_smallcopy_done: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- ldmia sp!, {r0} +-#else +- pop {r0} +-#endif +- bx lr +- +- /* Copy 4 or more bytes*/ +-neon_gt_4: +- /* Preload what we can...*/ +- pld [r0,#0] +- pld [r1,#0] +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- stmdb sp!, {r4-r5} +-#else +- push {r4-r5} +-#endif +- +-neon_check_align: +- /* Check normal word alignment for target. */ +- ands r12, r0, #0x3 +- beq source_alignment_check +- +- /* +- * Target is not aligned. Step through until we get that +- * word-aligned. This works better than a loop, according +- * to our pipeline modeler. +- */ +- cmp r12, #2 +- ldrb r3, [r1], #1 +- ldrleb r4, [r1], #1 +- ldrltb r5, [r1], #1 +- rsb r12, r12, #4 +- sub r2, r2, r12 +- strb r3, [r0], #1 +- strleb r4, [r0], #1 +- strltb r5, [r0], #1 +- +-source_alignment_check: +- ands r12, r1, #0x3 +- bne neon_memcpy_nonaligned /* Source is not word aligned.*/ +-neon_try_16_align: +- cmp r2, #64 +- blt neon_align_route +- /* This is where we try 16-byte alignment. */ +- ands r12, r0, #0xf +- beq neon_align_route +- rsb r12, r12, #16 +-neon_16_start: +- sub r2, r2, r12 +- lsrs r3, r12, #2 +-neon_align_16_4: +- ldr r4, [r1], #4 +- subs r3, r3, #1 +- str r4, [r0], #4 +- bne neon_align_16_4 +-neon_align_route: +- /* In this case, both source and target are word-aligned. */ +- cmp r2, #32768 +- bge neon_copy_128p_a +- cmp r2, #256 +- bge neon_copy_128_a +- cmp r2, #64 +- bge neon_copy_32_a +- b neon_copy_finish_a +- nop +-neon_copy_128p_a: +- /* We'll copy blocks 128-bytes at a time, but try to call pld to +- * load in the next page, if possible. +- */ +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4-q7} +-#else +- vpush {q4-q7} +-#endif +- mov r12, r2, lsr #7 +-neon_copy_128p_loop_a: +- vld1.32 {q0, q1}, [r1]! +- vld1.32 {q2, q3}, [r1]! +- vld1.32 {q4, q5}, [r1]! +- vld1.32 {q6, q7}, [r1]! +- pld [r1, #0] +- pld [r1, #1024] +- vst1.32 {q0, q1}, [r0]! +- vst1.32 {q2, q3}, [r0]! +- vst1.32 {q4, q5}, [r0]! +- vst1.32 {q6, q7}, [r0]! +- subs r12, r12, #1 +- bne neon_copy_128p_loop_a +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q4-q7} +-#else +- vpop {q4-q7} +-#endif +- ands r2, r2, #0x7f +- beq neon_end +- cmp r2, #32 +- blt neon_copy_finish_a +- b neon_copy_32_a +- /* Copy blocks of 128-bytes (word-aligned) at a time*/ +-neon_copy_128_a: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4-q7} +-#else +- vpush {q4-q7} +-#endif +- /* +- * Move to a 1-s based countdown to determine when to loop. That +- * allows the subs to set the Z flag without having to explicitly +- * call cmp to a value. 
+- */ +- mov r12, r2, lsr #7 +-neon_copy_128_loop_a: +- vld1.32 {q0, q1}, [r1]! +- vld1.32 {q2, q3}, [r1]! +- vld1.32 {q4, q5}, [r1]! +- vld1.32 {q6, q7}, [r1]! +- pld [r1, #0] +- pld [r1, #128] +- vst1.32 {q0, q1}, [r0]! +- vst1.32 {q2, q3}, [r0]! +- vst1.32 {q4, q5}, [r0]! +- vst1.32 {q6, q7}, [r0]! +- subs r12, r12, #1 +- pld [r0, #0] +- pld [r0, #128] +- bne neon_copy_128_loop_a +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q4-q7} +-#else +- vpop {q4-q7} +-#endif +- ands r2, r2, #0x7f +- beq neon_end +- cmp r2, #32 +- blt neon_copy_finish_a +- /* Copy blocks of 32-bytes (word aligned) at a time*/ +-neon_copy_32_a: +- mov r12, r2, lsr #5 +-neon_copy_32_loop_a: +- vld1.32 {q0,q1}, [r1]! +- subs r12, r12, #1 +- pld [r1,#0] +- vst1.32 {q0,q1}, [r0]! +- bne neon_copy_32_loop_a +- ands r2, r2, #0x1f +- beq neon_end +-neon_copy_finish_a: +-neon_copy_16_a: +- movs r12, r2, lsr #4 +- beq neon_copy_8_a +-neon_copy_16_a_loop: +- vld1.32 {q0}, [r1]! +- subs r12, r12, #1 +- vst1.32 {q0}, [r0]! +- bne neon_copy_16_a_loop +- ands r2, r2, #0xf +- beq neon_end +-neon_copy_8_a: +- cmp r2, #8 +- blt neon_copy_4_a +- ldm r1!, {r4-r5} +- subs r2, r2, #8 +- stm r0!, {r4-r5} +- /* Copy 4-bytes of word-aligned data at a time*/ +-neon_copy_4_a: +- cmp r2, #4 +- blt neon_copy_finish +- ldr r4, [r1], #4 +- subs r2, r2, #4 +- str r4, [r0], #4 +- b neon_copy_finish +- +- /* +- * Handle unaligned data. The basic concept here is that we'll +- * try to pull out enough data from the source to get that word- +- * aligned, then do our writes word-aligned, storing the difference +- * in a register, and shifting the data as needed. +- */ +-neon_memcpy_nonaligned: +- /* +- * If this is <8 bytes, it makes more sense to just copy it +- * quickly instead of incurring all kinds of overhead. +- */ +- cmp r2, #8 /* Let's try this...*/ +- ble neon_copy_finish +- /* +- * This is where we'll pull out either 1, 2, or 3 bytes of data +- * from the source as needed to align it, then store off those +- * bytes in r4. When we read in the (now) aligned data from the +- * source, we'll shift the bytes and AND in the r4 data, then write +- * to the target aligned. +- * +- * The conditional ldr calls work slightly faster than the +- * previous method, confirmed by our pipeline modeler. 
+- */ +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- stmdb sp!, {r6-r9} +-#else +- push {r6-r9} +-#endif +- cmp r12, #2 +- ldrb r4, [r1], #1 +- ldrleb r5, [r1], #1 +- ldrltb r6, [r1], #1 +- rsb r8, r12, #4 +- sub r2, r2, r8 +- lsl r8, r8, #3 +- orrle r4, r4, r5, lsl #8 +- orrlt r4, r4, r6, lsl #16 +- rsb r9, r8, #32 +- +- cmp r2, #64 +- blt neon_unaligned_route +- ands r12, r0, #0xf +- beq neon_unaligned_route +- rsb r12, r12, #16 +-neon_16_start_u: +- sub r2, r2, r12 +- lsrs r6, r12, #2 +-neon_align_16_4_u: +- ldr r5, [r1], #4 +- subs r6, r6, #1 +- orr r4, r4, r5, lsl r8 +- str r4, [r0], #4 +- mov r4, r5, lsr r9 +- bne neon_align_16_4_u +-neon_unaligned_route: +- /* Decide which loop block to branch to.*/ +- cmp r2, #256 +- bge neon_copy_64_u +- cmp r2, #64 +- bge neon_copy_32_u +- b neon_copy_finish_u +- /* Copy data in 64-byte blocks.*/ +-neon_copy_64_u: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4} +- vstmdb sp!, {q5-q8} +-#else +- vpush {q4} +- vpush {q5-q8} +-#endif +- /* We'll need this for the q register shift later.*/ +- vdup.u32 q8, r8 +- /* +- * As above, we determine how many times we can go through the +- * 64-byte copy loop, then countdown. +- */ +- mov r12, r2, lsr #6 +- and r2, r2, #0x3f +-neon_copy_64_u_loop: +- /* Load 64-bytes into q4-q7.*/ +- vld1.32 {q4, q5}, [r1]! +- vld1.32 {q6, q7}, [r1]! +- /* +- * Shift q0-q3 right so everything but the data we need due to the +- * alignment falls off the right-hand side. The branching +- * is needed, since vshr requires the shift to be an immediate +- * value. +- */ +- lsls r5, r8, #28 +- bcc neon_copy_64_u_b8 +- bpl neon_copy_64_u_b16 +- vshr.u64 q0, q4, #40 +- vshr.u64 q1, q5, #40 +- vshr.u64 q2, q6, #40 +- vshr.u64 q3, q7, #40 +- b neon_copy_64_unify +-neon_copy_64_u_b8: +- vshr.u64 q0, q4, #56 +- vshr.u64 q1, q5, #56 +- vshr.u64 q2, q6, #56 +- vshr.u64 q3, q7, #56 +- b neon_copy_64_unify +-neon_copy_64_u_b16: +- vshr.u64 q0, q4, #48 +- vshr.u64 q1, q5, #48 +- vshr.u64 q2, q6, #48 +- vshr.u64 q3, q7, #48 +-neon_copy_64_unify: +- /* +- * Shift q4-q7 left by r8 bits to take the alignment into +- * account. +- */ +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vshl.u64 q4, q8, q4 +- vshl.u64 q5, q8, q5 +- vshl.u64 q6, q8, q6 +- vshl.u64 q7, q8, q7 +-#else +- vshl.u64 q4, q4, q8 +- vshl.u64 q5, q5, q8 +- vshl.u64 q6, q6, q8 +- vshl.u64 q7, q7, q8 +-#endif +- /* +- * The data in s14 will be needed for the next loop iteration. Move +- * that to r5. +- */ +- vmov r5, s14 +- /* We'll vorr the shifted data with the data that needs to move back.*/ +- vorr d9, d9, d0 +- /* Copy the data from the previous loop into s14.*/ +- vmov s14, r4 +- vorr d10, d10, d1 +- vorr d11, d11, d2 +- vorr d12, d12, d3 +- vorr d13, d13, d4 +- vorr d14, d14, d5 +- vorr d15, d15, d6 +- vorr d8, d8, d7 +- subs r12, r12, #1 +- pld [r1, #0] +- pld [r1, #128] +- /* Save off the r5 data into r4 for the next iteration.*/ +- mov r4, r5 +- vst1.32 {q4, q5}, [r0]! +- vst1.32 {q6, q7}, [r0]! 
+- pld [r0, #0] +- pld [r0, #128] +- bne neon_copy_64_u_loop +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q5-q8} +- vldmia sp!, {q4} +-#else +- vpop {q5-q8} +- vpop {q4} +-#endif +- cmp r2, #32 +- bge neon_copy_32_u +- b neon_copy_finish_u +-neon_copy_32_u: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4} +-#else +- vpush {q4} +-#endif +- vdup.u32 q4, r8 +- mov r12, r2, lsr #5 +- and r2, r2, #0x1f +-neon_copy_32_u_loop: +- vld1.32 {q0, q1}, [r1]! +- lsls r5, r8, #28 +- bcc neon_copy_32_u_b8 +- bpl neon_copy_32_u_b16 +- vshr.u64 q2, q0, #40 +- vshr.u64 q3, q1, #40 +- b neon_copy_32_unify +-neon_copy_32_u_b8: +- vshr.u64 q2, q0, #56 +- vshr.u64 q3, q1, #56 +- b neon_copy_32_unify +-neon_copy_32_u_b16: +- vshr.u64 q2, q0, #48 +- vshr.u64 q3, q1, #48 +-neon_copy_32_unify: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vshl.u64 q0, q4, q0 +- vshl.u64 q1, q4, q1 +-#else +- vshl.u64 q0, q0, q4 +- vshl.u64 q1, q1, q4 +-#endif +- vmov r5, s14 +- vorr d1, d1, d4 +- vmov s14, r4 +- vorr d2, d2, d5 +- vorr d3, d3, d6 +- vorr d0, d0, d7 +- subs r12, r12, #1 +- pld [r1, #0] +- mov r4, r5 +- vst1.32 {q0, q1}, [r0]! +- bne neon_copy_32_u_loop +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q4} +-#else +- vpop {q4} +-#endif +-neon_copy_finish_u: +-neon_copy_16_u: +- movs r12, r2, lsr #4 +- beq neon_copy_8_u +- vdup.u32 q2, r8 +- and r2, r2, #0xf +-neon_copy_16_u_loop: +- vld1.32 {q0}, [r1]! +- lsls r5, r8, #28 +- bcc neon_copy_16_u_b8 +- bpl neon_copy_16_u_b16 +- vshr.u64 q1, q0, #40 +- b neon_copy_16_unify +-neon_copy_16_u_b8: +- vshr.u64 q1, q0, #56 +- b neon_copy_16_unify +-neon_copy_16_u_b16: +- vshr.u64 q1, q0, #48 +-neon_copy_16_unify: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vshl.u64 q0, q2, q0 +-#else +- vshl.u64 q0, q0, q2 +-#endif +- vmov r5, s6 +- vorr d1, d1, d2 +- vmov s6, r4 +- vorr d0, d0, d3 +- subs r12, r12, #1 +- mov r4, r5 +- vst1.32 {q0}, [r0]! +- bne neon_copy_16_u_loop +-neon_copy_8_u: +- cmp r2, #8 +- blt neon_copy_4_u +- ldm r1!, {r6-r7} +- subs r2, r2, #8 +- orr r4, r4, r6, lsl r8 +- mov r5, r6, lsr r9 +- orr r5, r5, r7, lsl r8 +- stm r0!, {r4-r5} +- mov r4, r7, lsr r9 +-neon_copy_4_u: +- cmp r2, #4 +- blt neon_copy_last_bits_u +- ldr r5, [r1], #4 +- subs r2, r2, #4 +- orr r4, r4, r5, lsl r8 +- str r4, [r0], #4 +- mov r4, r5, lsr r9 +-neon_copy_last_bits_u: +- /* +- * Remember, r8 contains the size of the data in r4 in bits, +- * so to get to bytes we'll need to shift 3 places +- */ +- lsr r8, r8, #0x3 +- /* Write out the bytes stored in r4.*/ +-neon_copy_last_bits_u_loop: +- strb r4, [r0], #1 +- subs r8, r8, #1 +- lsrne r4, r4, #8 +- bne neon_copy_last_bits_u_loop +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- ldmia sp!, {r6-r9} +-#else +- pop {r6-r9} +-#endif +-neon_copy_finish: +- cmp r2, #0 +- beq neon_end +- /* +- * This just copies the data from source to target one byte +- * at a time. For some small values, this makes more sense. +- * Note that since this code copies data a byte at a time, +- * both the aligned and unaligned paths can use it. 
+- */ +-neon_copy_finish_loop: +- ldrb r4, [r1], #1 +- subs r2, r2, #1 +- strb r4, [r0], #1 +- bne neon_copy_finish_loop +-neon_end: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- ldmia sp!, {r4-r5} +- ldmia sp!, {r0} +-#else +- pop {r4-r5} +- pop {r0} +-#endif +- bx lr +- +- .endfunc +- .end +diff --git git/src/neon_memmove.S git/src/neon_memmove.S +deleted file mode 100644 +index 1bfe597..0000000 +--- git/src/neon_memmove.S ++++ /dev/null +@@ -1,939 +0,0 @@ +-/*************************************************************************** +- Copyright (c) 2009, Code Aurora Forum. All rights reserved. +- +- Redistribution and use in source and binary forms, with or without +- modification, are permitted provided that the following conditions are met: +- * Redistributions of source code must retain the above copyright +- notice, this list of conditions and the following disclaimer. +- * Redistributions in binary form must reproduce the above copyright +- notice, this list of conditions and the following disclaimer in the +- documentation and/or other materials provided with the distribution. +- * Neither the name of Code Aurora nor the names of its contributors may +- be used to endorse or promote products derived from this software +- without specific prior written permission. +- +- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +- POSSIBILITY OF SUCH DAMAGE. +- ***************************************************************************/ +- +-/*************************************************************************** +- * Neon memmove: Attempts to do a memmove with Neon registers if possible, +- * Inputs: +- * dest: The destination buffer +- * src: The source buffer +- * n: The size of the buffer to transfer +- * Outputs: +- * +- ***************************************************************************/ +- +-/* +- * General note: +- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP +- * However, it looks like the 2006 CodeSourcery Assembler has issues generating +- * the correct object code for VPOP, resulting in horrific stack crashes. +- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB, +- * and VPOP->VLDMIA. We can revert this back once we update our toolchain. +- * +- * Also, VSHL swaps the source register and the shift-amount register +- * around in 2006-q3. I've coded this incorrectly so it turns out correct +- * in the object code, but we'll need to undo that later... 
+- */ +- .code 32 +- .align 4 +- .globl neon_memmove +- .func +- +-neon_memmove: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- stmdb sp!, {r0} +-#else +- push {r0} +-#endif +- +- /* +- * The requirements for memmove state that the function should +- * operate as if data were being copied from the source to a +- * buffer, then to the destination. This is to allow a user +- * to copy data from a source and target that overlap. +- * +- * We can't just do byte copies front-to-back automatically, since +- * there's a good chance we may have an overlap (why else would someone +- * intentionally use memmove then?). +- * +- * We'll break this into two parts. Front-to-back, or back-to-front +- * copies. +- */ +-neon_memmove_cmf: +- cmp r0, r1 +- blt neon_front_to_back_copy +- bgt neon_back_to_front_copy +- b neon_memmove_done +- +- /* ############################################################# +- * Front to Back copy +- */ +-neon_front_to_back_copy: +- /* +- * For small copies, just do a quick memcpy. We can do this for +- * front-to-back copies, aligned or unaligned, since we're only +- * doing 1 byte at a time... +- */ +- cmp r2, #4 +- bgt neon_f2b_gt4 +- cmp r2, #0 +-neon_f2b_smallcopy_loop: +- beq neon_memmove_done +- ldrb r12, [r1], #1 +- subs r2, r2, #1 +- strb r12, [r0], #1 +- b neon_f2b_smallcopy_loop +-neon_f2b_gt4: +- /* Preload what we can...*/ +- pld [r0,#0] +- pld [r1,#0] +- /* The window size is in r3. */ +- sub r3, r1, r0 +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- stmdb sp!, {r4-r6} +-#else +- push {r4-r6} +-#endif +- +-neon_f2b_check_align: +- /* Check alignment. */ +- ands r12, r0, #0x3 +- beq neon_f2b_source_align_check +- cmp r12, #2 +- ldrb r4, [r1], #1 +- ldrleb r5, [r1], #1 +- ldrltb r6, [r1], #1 +- rsb r12, r12, #4 +- sub r2, r2, r12 +- strb r4, [r0], #1 +- strleb r5, [r0], #1 +- strltb r6, [r0], #1 +- +-neon_f2b_source_align_check: +- ands r12, r1, #0x3 +- bne neon_f2b_nonaligned +- +-neon_f2b_try_16_align: +- /* If we're >64, attempt to align on 16-bytes. Smaller amounts +- * don't seem to be worth handling. */ +- cmp r2, #64 +- blt neon_f2b_align_route +- /* This is where we try 16-byte alignment. */ +- ands r12, r0, #0xf +- beq neon_f2b_align_route +- rsb r12, r12, #16 +-neon_f2b_16_start: +- sub r2, r2, r12 +- lsrs r5, r12, #2 +-neon_f2b_align_16_4: +- ldr r4, [r1], #4 +- subs r5, r5, #1 +- str r4, [r0], #4 +- bne neon_f2b_align_16_4 +-neon_f2b_align_route: +- /* ############################################################# +- * Front to Back copy - aligned +- */ +- /* +- * Note that we can't just route based on the size in r2. If that's +- * larger than the overlap window in r3, we could potentially +- * (and likely!) destroy data we're copying. +- */ +- cmp r2, r3 +- movle r12, r2 +- movgt r12, r3 +- cmp r12, #256 +- bge neon_f2b_copy_128_a +- cmp r12, #64 +- bge neon_f2b_copy_32_a +- cmp r12, #16 +- bge neon_f2b_copy_16_a +- cmp r12, #8 +- bge neon_f2b_copy_8_a +- cmp r12, #4 +- bge neon_f2b_copy_4_a +- b neon_f2b_copy_1_a +-neon_f2b_copy_128_a: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4-q7} +-#else +- vpush {q4-q7} +-#endif +- mov r12, r2, lsr #7 +-neon_f2b_copy_128_a_loop: +- vld1.32 {q0,q1}, [r1]! +- vld1.32 {q2,q3}, [r1]! +- vld1.32 {q4,q5}, [r1]! +- vld1.32 {q6,q7}, [r1]! +- pld [r1, #0] +- pld [r1, #128] +- vst1.32 {q0,q1}, [r0]! +- vst1.32 {q2,q3}, [r0]! +- vst1.32 {q4,q5}, [r0]! 
+- vst1.32 {q6,q7}, [r0]! +- subs r12, r12, #1 +- pld [r0, #0] +- pld [r0, #128] +- bne neon_f2b_copy_128_a_loop +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q4-q7} +-#else +- vpop {q4-q7} +-#endif +- ands r2, r2, #0x7f +- beq neon_f2b_finish +- cmp r2, #32 +- bge neon_f2b_copy_32_a +- b neon_f2b_copy_finish_a +-neon_f2b_copy_32_a: +- mov r12, r2, lsr #5 +-neon_f2b_copy_32_a_loop: +- vld1.32 {q0,q1}, [r1]! +- subs r12, r12, #1 +- pld [r1, #0] +- vst1.32 {q0,q1}, [r0]! +- bne neon_f2b_copy_32_a_loop +- ands r2, r2, #0x1f +- beq neon_f2b_finish +-neon_f2b_copy_finish_a: +-neon_f2b_copy_16_a: +- movs r12, r2, lsr #4 +- beq neon_f2b_copy_8_a +-neon_f2b_copy_16_a_loop: +- vld1.32 {q0}, [r1]! +- subs r12, r12, #1 +- vst1.32 {q0}, [r0]! +- bne neon_f2b_copy_16_a_loop +- ands r2, r2, #0xf +- beq neon_f2b_finish +-neon_f2b_copy_8_a: +- cmp r2, #8 +- blt neon_f2b_copy_4_a +- ldm r1!, {r4-r5} +- subs r2, r2, #8 +- stm r0!, {r4-r5} +-neon_f2b_copy_4_a: +- cmp r2, #4 +- blt neon_f2b_copy_1_a +- ldr r4, [r1], #4 +- subs r2, r2, #4 +- str r4, [r0], #4 +-neon_f2b_copy_1_a: +- cmp r2, #0 +- beq neon_f2b_finish +-neon_f2b_copy_1_a_loop: +- ldrb r12, [r1], #1 +- subs r2, r2, #1 +- strb r12, [r0], #1 +- bne neon_f2b_copy_1_a_loop +- b neon_f2b_finish +- +- /* ############################################################# +- * Front to Back copy - unaligned +- */ +-neon_f2b_nonaligned: +- /* +- * For sizes < 8, does it really make sense to do the whole shift +- * party? Note that we DON'T want to call neon_f2b_copy_1_u, +- * since we'll end up trying to pop r8-r11, and we DON'T want +- * to do that... +- */ +- cmp r2, #8 +- ble neon_f2b_copy_1_a +- +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- stmdb sp!, {r7-r9} +-#else +- push {r7-r9} +-#endif +- cmp r12, #2 +- ldrb r4, [r1], #1 +- ldrleb r5, [r1], #1 +- ldrltb r6, [r1], #1 +- rsb r8, r12, #4 +- sub r2, r2, r8 +- lsl r8, r8, #3 +- orrle r4, r4, r5, lsl #8 +- orrlt r4, r4, r6, lsl #16 +- rsb r9, r8, #32 +- /* +- * r4 = overflow bits +- * r8 = # of bits we copied into the r4 register to align source. +- * r9 = 32 - r8 +- * r12 = Index counter for each size, so we determine how many times +- * the given size will go into r2, then count down that # of +- * times in r12. +- */ +- cmp r2, #64 +- blt neon_f2b_unaligned_route +- ands r12, r0, #0xf +- beq neon_f2b_unaligned_route +- cmp r3, #4 +- blt neon_f2b_unaligned_route +- rsb r12, r12, #16 +-neon_f2b_16_start_u: +- sub r2, r2, r12 +- lsrs r6, r12, #2 +-neon_f2b_align_16_4_u: +- ldr r5, [r1], #4 +- subs r6, r6, #1 +- orr r4, r4, r5, lsl r8 +- str r4, [r0], #4 +- mov r4, r5, lsr r9 +- bne neon_f2b_align_16_4_u +-neon_f2b_unaligned_route: +- cmp r2, r3 +- movle r12, r2 +- movgt r12, r3 +- cmp r12, #256 +- bge neon_f2b_copy_64_u +- cmp r12, #64 +- bge neon_f2b_copy_32_u +- cmp r12, #16 +- bge neon_f2b_copy_16_u +- cmp r12, #8 +- bge neon_f2b_copy_8_u +- cmp r12, #4 +- bge neon_f2b_copy_4_u +- b neon_f2b_last_bits_u +-neon_f2b_copy_64_u: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4} +- vstmdb sp!, {q5-q8} +-#else +- vpush {q4} +- vpush {q5-q8} +-#endif +- vdup.u32 q8, r8 +- mov r12, r2, lsr #6 +- and r2, r2, #0x3f +-neon_f2b_copy_64_u_loop: +- vld1.32 {q4, q5}, [r1]! +- vld1.32 {q6, q7}, [r1]! 
+- lsls r5, r8, #28 +- bcc neon_f2b_copy_64_u_b8 +- bpl neon_f2b_copy_64_u_b16 +- vshr.u64 q0, q4, #40 +- vshr.u64 q1, q5, #40 +- vshr.u64 q2, q6, #40 +- vshr.u64 q3, q7, #40 +- b neon_f2b_copy_64_unify +-neon_f2b_copy_64_u_b8: +- vshr.u64 q0, q4, #56 +- vshr.u64 q1, q5, #56 +- vshr.u64 q2, q6, #56 +- vshr.u64 q3, q7, #56 +- b neon_f2b_copy_64_unify +-neon_f2b_copy_64_u_b16: +- vshr.u64 q0, q4, #48 +- vshr.u64 q1, q5, #48 +- vshr.u64 q2, q6, #48 +- vshr.u64 q3, q7, #48 +-neon_f2b_copy_64_unify: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vshl.u64 q4, q8, q4 +- vshl.u64 q5, q8, q5 +- vshl.u64 q6, q8, q6 +- vshl.u64 q7, q8, q7 +-#else +- vshl.u64 q4, q4, q8 +- vshl.u64 q5, q5, q8 +- vshl.u64 q6, q6, q8 +- vshl.u64 q7, q7, q8 +-#endif +- vmov r5, s14 +- vorr d9, d9, d0 +- vmov s14, r4 +- vorr d10, d10, d1 +- vorr d11, d11, d2 +- vorr d12, d12, d3 +- vorr d13, d13, d4 +- vorr d14, d14, d5 +- vorr d15, d15, d6 +- vorr d8, d8, d7 +- subs r12, r12, #1 +- pld [r1, #0] +- pld [r1, #128] +- mov r4, r5 +- vst1.32 {q4, q5}, [r0]! +- vst1.32 {q6, q7}, [r0]! +- pld [r0, #0] +- pld [r0, #128] +- bne neon_f2b_copy_64_u_loop +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q5-q8} +- vldmia sp!, {q4} +-#else +- vpop {q5-q8} +- vpop {q4} +-#endif +- cmp r2, #32 +- bge neon_f2b_copy_32_u +- b neon_f2b_copy_finish_u +-neon_f2b_copy_32_u: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4} +-#else +- vpush {q4} +-#endif +- vdup.u32 q4, r8 +- mov r12, r2, lsr #5 +- and r2, r2, #0x1f +-neon_f2b_copy_32_u_loop: +- vld1.32 {q0, q1}, [r1]! +- lsls r5, r8, #28 +- bcc neon_f2b_copy_32_u_b8 +- bpl neon_f2b_copy_32_u_b16 +- vshr.u64 q2, q0, #40 +- vshr.u64 q3, q1, #40 +- b neon_f2b_copy_32_unify +-neon_f2b_copy_32_u_b8: +- vshr.u64 q2, q0, #56 +- vshr.u64 q3, q1, #56 +- b neon_f2b_copy_32_unify +-neon_f2b_copy_32_u_b16: +- vshr.u64 q2, q0, #48 +- vshr.u64 q3, q1, #48 +-neon_f2b_copy_32_unify: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vshl.u64 q0, q4, q0 +- vshl.u64 q1, q4, q1 +-#else +- vshl.u64 q0, q0, q4 +- vshl.u64 q1, q1, q4 +-#endif +- vmov r5, s14 +- vorr d1, d1, d4 +- vmov s14, r4 +- vorr d2, d2, d5 +- vorr d3, d3, d6 +- vorr d0, d0, d7 +- subs r12, r12, #1 +- pld [r1, #0] +- mov r4, r5 +- vst1.32 {q0, q1}, [r0]! +- bne neon_f2b_copy_32_u_loop +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q4} +-#else +- vpop {q4} +-#endif +-neon_f2b_copy_finish_u: +-neon_f2b_copy_16_u: +- movs r12, r2, lsr #4 +- beq neon_f2b_copy_8_u +- vdup.u32 q2, r8 +- and r2, r2, #0xf +-neon_f2b_copy_16_u_loop: +- vld1.32 {q0}, [r1]! +- lsls r5, r8, #28 +- bcc neon_f2b_copy_16_u_b8 +- bpl neon_f2b_copy_16_u_b16 +- vshr.u64 q1, q0, #40 +- b neon_f2b_copy_16_unify +-neon_f2b_copy_16_u_b8: +- vshr.u64 q1, q0, #56 +- b neon_f2b_copy_16_unify +-neon_f2b_copy_16_u_b16: +- vshr.u64 q1, q0, #48 +-neon_f2b_copy_16_unify: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vshl.u64 q0, q2, q0 +-#else +- vshl.u64 q0, q0, q2 +-#endif +- vmov r5, s6 +- vorr d1, d1, d2 +- vmov s6, r4 +- vorr d0, d0, d3 +- subs r12, r12, #1 +- mov r4, r5 +- vst1.32 {q0}, [r0]! 
+- bne neon_f2b_copy_16_u_loop +-neon_f2b_copy_8_u: +- cmp r2, #8 +- blt neon_f2b_copy_4_u +- ldm r1!, {r6-r7} +- subs r2, r2, #8 +- orr r4, r4, r6, lsl r8 +- mov r5, r6, lsr r9 +- orr r5, r5, r7, lsl r8 +- stm r0!, {r4-r5} +- mov r4, r7, lsr r9 +-neon_f2b_copy_4_u: +- cmp r2, #4 +- blt neon_f2b_last_bits_u +- ldr r5, [r1], #4 +- subs r2, r2, #4 +- orr r4, r4, r5, lsl r8 +- str r4, [r0], #4 +- mov r4, r5, lsr r9 +-neon_f2b_last_bits_u: +- lsr r8, r8, #0x3 +-neon_f2b_last_bits_u_loop: +- strb r4, [r0], #1 +- subs r8, r8, #1 +- lsr r4, r4, #8 +- bne neon_f2b_last_bits_u_loop +-neon_f2b_copy_1_u: +- cmp r2, #0 +- beq neon_f2b_finish_u +-neon_f2b_copy_1_u_loop: +- ldrb r12, [r1], #1 +- subs r2, r2, #1 +- strb r12, [r0], #1 +- bne neon_f2b_copy_1_u_loop +-neon_f2b_finish_u: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- ldmia sp!, {r7-r9} +-#else +- pop {r7-r9} +-#endif +- /* ############################################################# +- * Front to Back copy - finish +- */ +-neon_f2b_finish: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- ldmia sp!, {r4-r6} +-#else +- pop {r4-r6} +-#endif +- b neon_memmove_done +- +- /* ############################################################# +- * Back to Front copy +- */ +-neon_back_to_front_copy: +- /* +- * Here, we'll want to shift to the end of the buffers. This +- * actually points us one past where we need to go, but since +- * we'll pre-decrement throughout, this will be fine. +- */ +- add r0, r0, r2 +- add r1, r1, r2 +- cmp r2, #4 +- bgt neon_b2f_gt4 +- cmp r2, #0 +-neon_b2f_smallcopy_loop: +- beq neon_memmove_done +- ldrb r12, [r1, #-1]! +- subs r2, r2, #1 +- strb r12, [r0, #-1]! +- b neon_b2f_smallcopy_loop +-neon_b2f_gt4: +- pld [r0, #0] +- pld [r1, #0] +- /* +- * The minimum of the overlap window size and the copy size +- * is in r3. +- */ +- sub r3, r0, r1 +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- stmdb sp!, {r4-r5} +-#else +- push {r4-r5} +-#endif +- +- /* +- * Check alignment. Since we'll pre-decrement as we step thru, we'll +- * need to make sure we're on word-alignment. +- */ +-neon_b2f_check_align: +- ands r12, r0, #0x3 +- beq neon_b2f_source_align_check +- sub r2, r2, r12 +-neon_b2f_shift_align: +- ldrb r4, [r1, #-1]! +- subs r12, r12, #1 +- strb r4, [r0, #-1]! +- bne neon_b2f_shift_align +-neon_b2f_source_align_check: +- ands r4, r1, #0x3 +- bne neon_b2f_nonaligned +- +-neon_b2f_try_16_align: +- /* If we're >64, attempt to align on 16-bytes. Smaller amounts +- * don't seem to be worth handling. */ +- cmp r2, #64 +- blt neon_b2f_align_route +- ands r12, r0, #0xf +- beq neon_b2f_align_route +- /* In this case, r12 has the number of bytes to roll backward. */ +-neon_b2f_16_start: +- sub r2, r2, r12 +- lsrs r5, r12, #2 +-neon_b2f_align_16_4: +- ldr r4, [r1, #-4]! +- subs r5, r5, #1 +- str r4, [r0, #-4]! 
+- bne neon_b2f_align_16_4 +-neon_b2f_align_route: +- /* +- * ############################################################# +- * Back to Front copy - aligned +- */ +- cmp r2, r3 +- movle r12, r2 +- movgt r12, r3 +- cmp r12, #256 +- bge neon_b2f_copy_128_a +- cmp r12, #64 +- bge neon_b2f_copy_32_a +- cmp r12, #8 +- bge neon_b2f_copy_8_a +- cmp r12, #4 +- bge neon_b2f_copy_4_a +- b neon_b2f_copy_1_a +-neon_b2f_copy_128_a: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4-q7} +-#else +- vpush {q4-q7} +-#endif +- movs r12, r2, lsr #7 +- /* +- * This irks me. There MUST be a better way to read these in and +- * scan the register backward instead of making it go forward. Then +- * we need to do two subtractions... +- */ +-neon_b2f_copy_128_a_loop: +- sub r1, r1, #128 +- sub r0, r0, #128 +- vld1.32 {q0, q1}, [r1]! +- vld1.32 {q2, q3}, [r1]! +- vld1.32 {q4, q5}, [r1]! +- vld1.32 {q6, q7}, [r1]! +- pld [r1, #-128] +- pld [r1, #-256] +- vst1.32 {q0, q1}, [r0]! +- vst1.32 {q2, q3}, [r0]! +- vst1.32 {q4, q5}, [r0]! +- vst1.32 {q6, q7}, [r0]! +- subs r12, r12, #1 +- pld [r0, #-128] +- pld [r0, #-256] +- sub r1, r1, #128 +- sub r0, r0, #128 +- bne neon_b2f_copy_128_a_loop +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q4-q7} +-#else +- vpop {q4-q7} +-#endif +- ands r2, r2, #0x7f +- beq neon_b2f_finish +- cmp r2, #32 +- bge neon_b2f_copy_32_a +- b neon_b2f_copy_finish_a +-neon_b2f_copy_32_a: +- mov r12, r2, lsr #5 +-neon_b2f_copy_32_a_loop: +- sub r1, r1, #32 +- sub r0, r0, #32 +- vld1.32 {q0,q1}, [r1] +- subs r12, r12, #1 +- vst1.32 {q0,q1}, [r0] +- pld [r1, #0] +- bne neon_b2f_copy_32_a_loop +- ands r2, r2, #0x1f +- beq neon_b2f_finish +-neon_b2f_copy_finish_a: +-neon_b2f_copy_8_a: +- movs r12, r2, lsr #0x3 +- beq neon_b2f_copy_4_a +-neon_b2f_copy_8_a_loop: +- ldmdb r1!, {r4-r5} +- subs r12, r12, #1 +- stmdb r0!, {r4-r5} +- bne neon_b2f_copy_8_a_loop +- and r2, r2, #0x7 +-neon_b2f_copy_4_a: +- movs r12, r2, lsr #0x2 +- beq neon_b2f_copy_1_a +- and r2, r2, #0x3 +-neon_b2f_copy_4_a_loop: +- ldr r4, [r1, #-4]! +- subs r12, r12, #1 +- str r4, [r0, #-4]! +- bne neon_b2f_copy_4_a_loop +-neon_b2f_copy_1_a: +- cmp r2, #0 +- beq neon_b2f_finish +-neon_b2f_copy_1_a_loop: +- ldrb r12, [r1, #-1]! +- subs r2, r2, #1 +- strb r12, [r0, #-1]! +- bne neon_b2f_copy_1_a_loop +- +- /* ############################################################# +- * Back to Front copy - unaligned +- */ +-neon_b2f_nonaligned: +- /* +- * For sizes < 8, does it really make sense to do the whole shift +- * party? +- */ +- cmp r2, #8 +- ble neon_b2f_copy_1_a +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- stmdb sp!, {r6-r11} +-#else +- push {r6-r11} +-#endif +- /* +- * r3 = max window size +- * r4 = overflow bytes +- * r5 = bytes we're reading into +- * r6 = # bytes we're off. +- * r10 = copy of r6 +- */ +- and r6, r1, #0x3 +- eor r4, r4, r4 +- mov r10, r6 +-neon_b2f_realign: +- ldrb r5, [r1, #-1]! +- subs r6, r6, #1 +- orr r4, r5, r4, lsl #8 +- bne neon_b2f_realign +- /* +- * r10 = # of bits we copied into the r4 register to align source. +- * r11 = 32 - r10 +- * r12 = Index counter for each size, so we determine how many times +- * the given size will go into r2, then count down that # of +- * times in r12. 
+- */ +- sub r2, r2, r10 +- lsl r10, r10, #0x3 +- rsb r11, r10, #32 +- +- cmp r2, r3 +- movle r12, r2 +- movgt r12, r3 +- cmp r12, #256 +- bge neon_b2f_copy_64_u +- cmp r12, #64 +- bge neon_b2f_copy_32_u +- cmp r12, #8 +- bge neon_b2f_copy_8_u +- cmp r12, #4 +- bge neon_b2f_copy_4_u +- b neon_b2f_last_bits_u +-neon_b2f_copy_64_u: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4,q5} +- vstmdb sp!, {q6-q8} +-#else +- vpush {q4,q5} +- vpush {q6-q8} +-#endif +- add r7, r11, #32 +- movs r12, r2, lsr #6 +- vdup.u32 q8, r7 +-neon_b2f_copy_64_u_loop: +- sub r1, r1, #64 +- sub r0, r0, #64 +- vld1.32 {q0, q1}, [r1]! +- vld1.32 {q2, q3}, [r1] +- sub r1, r1, #32 +- vmov q4, q0 +- vmov q5, q1 +- vmov q6, q2 +- vmov q7, q3 +- vmov r5, s0 +- mov r4, r4, lsl r11 +- lsls r6, r10, #28 +- bcc neon_b2f_copy_64_u_b8 +- bpl neon_b2f_copy_64_u_b16 +- vshr.u64 q0, q0, #24 +- vshr.u64 q1, q1, #24 +- vshr.u64 q2, q2, #24 +- vshr.u64 q3, q3, #24 +- b neon_b2f_copy_64_unify +-neon_b2f_copy_64_u_b8: +- vshr.u64 q0, q0, #8 +- vshr.u64 q1, q1, #8 +- vshr.u64 q2, q2, #8 +- vshr.u64 q3, q3, #8 +- b neon_b2f_copy_64_unify +-neon_b2f_copy_64_u_b16: +- vshr.u64 q0, q0, #16 +- vshr.u64 q1, q1, #16 +- vshr.u64 q2, q2, #16 +- vshr.u64 q3, q3, #16 +-neon_b2f_copy_64_unify: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vshl.u64 q4, q8, q4 +- vshl.u64 q5, q8, q5 +- vshl.u64 q6, q8, q6 +- vshl.u64 q7, q8, q7 +-#else +- vshl.u64 q4, q4, q8 +- vshl.u64 q5, q5, q8 +- vshl.u64 q6, q6, q8 +- vshl.u64 q7, q7, q8 +-#endif +- vmov s17, r4 +- vorr d7, d7, d8 +- vorr d6, d6, d15 +- vorr d5, d5, d14 +- vorr d4, d4, d13 +- vorr d3, d3, d12 +- vorr d2, d2, d11 +- vorr d1, d1, d10 +- vorr d0, d0, d9 +- mov r4, r5, lsl r11 +- subs r12, r12, #1 +- lsr r4, r4, r11 +- vst1.32 {q0, q1}, [r0]! 
+- vst1.32 {q2, q3}, [r0] +- pld [r1, #0] +- sub r0, r0, #32 +- bne neon_b2f_copy_64_u_loop +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q6-q8} +- vldmia sp!, {q4,q5} +-#else +- vpop {q6-q8} +- vpop {q4,q5} +-#endif +- ands r2, r2, #0x3f +- cmp r2, #32 +- bge neon_b2f_copy_32_u +- b neon_b2f_copy_finish_u +-neon_b2f_copy_32_u: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vstmdb sp!, {q4} +-#else +- vpush {q4} +-#endif +- add r7, r11, #32 +- movs r12, r2, lsr #5 +- vdup.u32 q4, r7 +- and r2, r2, #0x1f +-neon_b2f_copy_32_u_loop: +- sub r1, r1, #32 +- sub r0, r0, #32 +- vld1.32 {q0, q1}, [r1] +- vmov q2, q0 +- vmov q3, q1 +- vmov r5, s0 +- mov r4, r4, lsl r11 +- lsls r6, r10, #28 +- bcc neon_b2f_copy_32_u_b8 +- bpl neon_b2f_copy_32_u_b16 +- vshr.u64 q0, q0, #24 +- vshr.u64 q1, q1, #24 +- b neon_b2f_copy_32_unify +-neon_b2f_copy_32_u_b8: +- vshr.u64 q0, q0, #8 +- vshr.u64 q1, q1, #8 +- b neon_b2f_copy_32_unify +-neon_b2f_copy_32_u_b16: +- vshr.u64 q0, q0, #16 +- vshr.u64 q1, q1, #16 +-neon_b2f_copy_32_unify: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vshl.u64 q2, q4, q2 +- vshl.u64 q3, q4, q3 +-#else +- vshl.u64 q2, q2, q4 +- vshl.u64 q3, q3, q4 +-#endif +- vmov s9, r4 +- vorr d3, d3, d4 +- vorr d2, d2, d7 +- vorr d1, d1, d6 +- vorr d0, d0, d5 +- mov r4, r5, lsl r11 +- subs r12, r12, #1 +- lsr r4, r4, r11 +- vst1.32 {q0, q1}, [r0] +- pld [r1, #0] +- bne neon_b2f_copy_32_u_loop +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- vldmia sp!, {q4} +-#else +- vpop {q4} +-#endif +-neon_b2f_copy_finish_u: +-neon_b2f_copy_8_u: +- movs r12, r2, lsr #0x3 +- beq neon_b2f_copy_4_u +- mov r5, r4, lsl r11 +-neon_b2f_copy_8_u_loop: +- ldmdb r1!, {r6-r7} +- subs r12, r12, #1 +- orr r5, r5, r7, lsr r10 +- mov r4, r7, lsl r11 +- orr r4, r4, r6, lsr r10 +- stmdb r0!, {r4-r5} +- mov r4, r6, lsl r11 +- lsr r4, r4, r11 +- mov r5, r4, lsl r11 +- bne neon_b2f_copy_8_u_loop +- ands r2, r2, #0x7 +-neon_b2f_copy_4_u: +- movs r12, r2, lsr #0x2 +- beq neon_b2f_last_bits_u +- mov r5, r4, lsl r11 +-neon_b2f_copy_4_u_loop: +- ldr r6, [r1, #-4]! +- subs r12, r12, #1 +- orr r5, r5, r6, lsr r10 +- str r5, [r0, #-4]! +- mov r4, r6, lsl r11 +- lsr r4, r4, r11 +- mov r5, r4, lsl r11 +- bne neon_b2f_copy_4_u_loop +- and r2, r2, #0x3 +-neon_b2f_last_bits_u: +-neon_b2f_last_bits_u_loop: +- subs r10, r10, #8 +- mov r5, r4, lsr r10 +- strb r5, [r0, #-1]! +- bne neon_b2f_last_bits_u_loop +-neon_b2f_copy_1_u: +- cmp r2, #0 +- beq neon_b2f_finish_u +-neon_b2f_copy_1_u_loop: +- ldrb r12, [r1, #-1]! +- subs r2, r2, #1 +- strb r12, [r0, #-1]! +- bne neon_b2f_copy_1_u_loop +-neon_b2f_finish_u: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- ldmia sp!, {r6-r11} +-#else +- pop {r6-r11} +-#endif +- +-neon_b2f_finish: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- ldmia sp!, {r4-r5} +-#else +- pop {r4-r5} +-#endif +- +-neon_memmove_done: +-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__) +- ldmia sp!, {r0} +-#else +- pop {r0} +-#endif +- bx lr +- +- .endfunc +- .end +diff --git git/src/neon_memsets.c git/src/neon_memsets.c +deleted file mode 100755 +index 740fc1e..0000000 +--- git/src/neon_memsets.c ++++ /dev/null +@@ -1,169 +0,0 @@ +-/* neon_memsets.c +- * +- * Copyright (c) 2009, Code Aurora Forum. 
All rights reserved. +- * +- * Redistribution and use in source and binary forms, with or without +- * modification, are permitted provided that the following conditions are met: +- * * Redistributions of source code must retain the above copyright +- * notice, this list of conditions and the following disclaimer. +- * * Redistributions in binary form must reproduce the above copyright +- * notice, this list of conditions and the following disclaimer in the +- * documentation and/or other materials provided with the distribution. +- * * Neither the name of Code Aurora nor +- * the names of its contributors may be used to endorse or promote +- * products derived from this software without specific prior written +- * permission. +- * +- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +- * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +- * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +- */ +- +-#include "msm-swblits.h" +- +-void memset16(uint16_t dst[], uint16_t value, int count) +-{ +- if (count <= 0) +- return; +- +- asm volatile( +- " pld [%[dst], #0] \n" +- " cmp %[count], #4 \n" +- " blt 6f \n" +- " tst %[dst], #0x3 \n" +- " strneh %[value], [%[dst]], #2 \n" +- " subne %[count], %[count], #1 \n" +- " vdup.u16 q8, %[value] \n" +- " vmov q9, q8 \n" +- " cmp %[count], #64 \n" +- " bge 0f \n" +- " cmp %[count], #32 \n" +- " bge 2f \n" +- " cmp %[count], #16 \n" +- " bge 3f \n" +- " cmp %[count], #8 \n" +- " bge 4f \n" +- " b 5f \n" +- "0: \n" +- " mov r12, %[count], lsr #6 \n" +- "1: \n" +- " vst1.16 {q8, q9}, [%[dst]]! \n" +- " vst1.16 {q8, q9}, [%[dst]]! \n" +- " vst1.16 {q8, q9}, [%[dst]]! \n" +- " vst1.16 {q8, q9}, [%[dst]]! \n" +- " subs r12, r12, #1 \n" +- " bne 1b \n" +- " ands %[count], %[count], #0x3f \n" +- " beq 7f \n" +- "2: \n" +- " cmp %[count], #32 \n" +- " blt 3f \n" +- " vst1.16 {q8, q9}, [%[dst]]! \n" +- " vst1.16 {q8, q9}, [%[dst]]! \n" +- " subs %[count], %[count], #32 \n" +- " beq 7f \n" +- "3: \n" +- " cmp %[count], #16 \n" +- " blt 4f \n" +- " vst1.16 {q8, q9}, [%[dst]]! \n" +- " subs %[count], %[count], #16 \n" +- " beq 7f \n" +- "4: \n" +- " cmp %[count], #8 \n" +- " blt 5f \n" +- " vst1.16 {q8}, [%[dst]]! \n" +- " subs %[count], %[count], #8 \n" +- " beq 7f \n" +- "5: \n" +- " cmp %[count], #4 \n" +- " blt 6f \n" +- " vst1.16 {d16}, [%[dst]]! 
\n" +- " subs %[count], %[count], #4 \n" +- " beq 7f \n" +- "6: \n" +- " cmp %[count], #0 \n" +- " blt 7f \n" +- " lsls %[count], #31 \n" +- " strmih %[value], [%[dst]], #2 \n" +- " strcsh %[value], [%[dst]], #2 \n" +- " strcsh %[value], [%[dst]], #2 \n" +- "7: \n" +- // Clobbered input registers +- : [dst] "+r" (dst), [count] "+r" (count) +- // Unclobbered input +- : [value] "r" (value) +- // Clobbered registers +- : "q8", "q9", "r12", "cc", "memory" +- ); +-} +- +-void memset32(uint32_t dst[], uint32_t value, int count) +-{ +- asm volatile( +- " pld [%[dst], #0] \n" +- " cmp %[count], #4 \n" +- " blt 5f \n" +- " vdup.u32 q8, %[value] \n" +- " vmov q9, q8 \n" +- " cmp %[count], #32 \n" +- " bge 0f \n" +- " cmp %[count], #16 \n" +- " bge 2f \n" +- " cmp %[count], #8 \n" +- " bge 3f \n" +- " b 4f \n" +- "0: \n" +- " mov r12, %[count], lsr #5 \n" +- "1: \n" +- " vst1.32 {q8, q9}, [%[dst]]! \n" +- " vst1.32 {q8, q9}, [%[dst]]! \n" +- " vst1.32 {q8, q9}, [%[dst]]! \n" +- " vst1.32 {q8, q9}, [%[dst]]! \n" +- " pld [%[dst], #0] \n" +- " subs r12, r12, #1 \n" +- " bne 1b \n" +- " ands %[count], %[count], #0x1f \n" +- " beq 6f \n" +- "2: \n" +- " cmp %[count], #16 \n" +- " blt 3f \n" +- " vst1.32 {q8, q9}, [%[dst]]! \n" +- " vst1.32 {q8, q9}, [%[dst]]! \n" +- " subs %[count], %[count], #16 \n" +- " beq 6f \n" +- "3: \n" +- " cmp %[count], #8 \n" +- " blt 4f \n" +- " vst1.32 {q8, q9}, [%[dst]]! \n" +- " subs %[count], %[count], #8 \n" +- " beq 6f \n" +- "4: \n" +- " cmp %[count], #4 \n" +- " blt 5f \n" +- " vst1.32 {q8}, [%[dst]]! \n" +- " subs %[count], %[count], #4 \n" +- " beq 6f \n" +- "5: \n" +- " cmp %[count], #0 \n" +- " beq 6f \n" +- " lsls %[count], #31 \n" +- " strmi %[value], [%[dst]], #4 \n" +- " strcs %[value], [%[dst]], #4 \n" +- " strcs %[value], [%[dst]], #4 \n" +- "6: @end \n" +- // Clobbered input registers +- : [dst] "+r" (dst), [count] "+r" (count) +- // Unclobbered input +- : [value] "r" (value) +- // Clobbered registers +- : "q8", "q9", "r12", "cc", "memory" +- ); +-} diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch new file mode 100644 index 0000000000..97ad380e27 --- /dev/null +++ b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch @@ -0,0 +1,36 @@ +commit 18515a56822fcd9c0a71240edce97ea5623b0448 +Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch> +Date: Wed Feb 10 16:29:55 2010 +0100 + + Modify Makefile.am + Removed depencies for neon + +diff --git git/src/Makefile.am git/src/Makefile.am +index 8ab1856..08da5a5 100755 +--- a/src/Makefile.am ++++ b/src/Makefile.am +@@ -12,13 +12,7 @@ MSM_DRI_SRCS += msm-drm.c msm-dri2.c + msm_drv_la_LIBADD += $(DRI2_LIBS) + endif + +-NEON_CFLAGS=-march=armv7-a -mfpu=neon -mfloat-abi=softfp +-NEON_CCASFLAGS=$(NEON_CFLAGS) -mthumb-interwork +-NEON_ASFLAGS=-k -mcpu=cortex-a8 $(NEON_CCASFLAGS) +- +-AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ $(NEON_CFLAGS) -Wall -Werror +-AM_ASFLAGS = $(NEON_ASFLAGS) +-AM_CCASFLAGS = $(NEON_CCASFLAGS) ++AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ -Wall -Werror + + msm_drv_la_LTLIBRARIES = msm_drv.la + msm_drv_la_LDFLAGS = -module -avoid-version +@@ -37,9 +31,6 @@ msm_drv_la_SOURCES = \ + msm-swfill.c \ + msm-hwrender.c \ + msm-pixmap.c \ +- neon_memsets.c \ +- neon_memcpy.S \ +- neon_memmove.S \ + $(MSM_DRI_SRCS) + + diff --git a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch new file mode 100644 index 0000000000..90dd31f605 --- 
/dev/null +++ b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch @@ -0,0 +1,116 @@ +commit cc83ba5835d5b55347fd0c0775156493b0cf3a15 +Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch> +Date: Thu Feb 11 16:26:52 2010 +0100 + + Renaming variables for getting Xorg (xf86-video-msm) work + under linux-leviathan (htcdream): + cd src + sed 's/fixed_info/fix/' -i *.h + sed 's/fixed_info/fix/' -i *.c + +diff --git git/src/msm-dri.c git/src/msm-dri.c +index a51d3bd..a74368b 100644 +--- git/src/msm-dri.c ++++ git/src/msm-dri.c +@@ -151,10 +151,10 @@ MSMDRIScreenInit(ScreenPtr pScreen) + pDRIInfo->ddxDriverMinorVersion = 0; + pDRIInfo->ddxDriverPatchVersion = 0; + +- pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fixed_info.smem_start; ++ pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fix.smem_start; + +- pDRIInfo->frameBufferSize = pMsm->fixed_info.smem_len; +- pDRIInfo->frameBufferStride = pMsm->fixed_info.line_length; ++ pDRIInfo->frameBufferSize = pMsm->fix.smem_len; ++ pDRIInfo->frameBufferStride = pMsm->fix.line_length; + + /* FIXME: How many drawables can we do (should we do)? */ + +diff --git git/src/msm-driver.c git/src/msm-driver.c +index 803197f..15378f8 100755 +--- git/src/msm-driver.c ++++ git/src/msm-driver.c +@@ -399,7 +399,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags) + + /* Get the fixed info (par) structure */ + +- if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fixed_info)) { ++ if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fix)) { + xf86DrvMsg(pScrn->scrnIndex, X_ERROR, + "Unable to read hardware info from %s: %s\n", + dev, strerror(errno)); +@@ -410,7 +410,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags) + /* Parse the ID and figure out what version of the MDP and what + * panel ID we have */ + +- if (sscanf(pMsm->fixed_info.id, "msmfb%d_%x", &mdpver, &panelid) < 2) { ++ if (sscanf(pMsm->fix.id, "msmfb%d_%x", &mdpver, &panelid) < 2) { + + xf86DrvMsg(pScrn->scrnIndex, X_ERROR, + "Unable to determine the MDP and panel type\n"); +@@ -435,7 +435,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags) + * the fbdev driver to allocate memory. 
In the mean time, we + * just reuse the framebuffer memory */ + +- pScrn->videoRam = pMsm->fixed_info.smem_len; ++ pScrn->videoRam = pMsm->fix.smem_len; + + /* Get the current screen setting */ + if (ioctl(pMsm->fd, FBIOGET_VSCREENINFO, &pMsm->mode_info)) { +@@ -671,8 +671,8 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags) + /* The framebuffer driver should always report the line length, + * but in case it doesn't, we can calculate it ourselves */ + +- if (pMsm->fixed_info.line_length) { +- pScrn->displayWidth = pMsm->fixed_info.line_length; ++ if (pMsm->fix.line_length) { ++ pScrn->displayWidth = pMsm->fix.line_length; + } else { + pScrn->displayWidth = pMsm->mode_info.xres_virtual * + pMsm->mode_info.bits_per_pixel / 8; +@@ -811,7 +811,7 @@ MSMCloseScreen(int scrnIndex, ScreenPtr pScreen) + #endif + + /* Unmap the framebuffer memory */ +- munmap(pMsm->fbmem, pMsm->fixed_info.smem_len); ++ munmap(pMsm->fbmem, pMsm->fix.smem_len); + + pScreen->CloseScreen = pMsm->CloseScreen; + +@@ -857,7 +857,7 @@ MSMScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv) + #endif // defined (MSMFB_GET_PAGE_PROTECTION) && defined (MSMFB_SET_PAGE_PROTECTION) + + /* Map the framebuffer memory */ +- pMsm->fbmem = mmap(NULL, pMsm->fixed_info.smem_len, ++ pMsm->fbmem = mmap(NULL, pMsm->fix.smem_len, + PROT_READ | PROT_WRITE, MAP_SHARED, pMsm->fd, 0); + + /* If we can't map the memory, then this is a short trip */ +diff --git git/src/msm-exa.c git/src/msm-exa.c +index 301923f..ce16a93 100755 +--- git/src/msm-exa.c ++++ git/src/msm-exa.c +@@ -740,8 +740,8 @@ MSMSetupExa(ScreenPtr pScreen) + pExa->flags = EXA_OFFSCREEN_PIXMAPS; + + pExa->offScreenBase = +- (pMsm->fixed_info.line_length * pMsm->mode_info.yres); +- pExa->memorySize = pMsm->fixed_info.smem_len; ++ (pMsm->fix.line_length * pMsm->mode_info.yres); ++ pExa->memorySize = pMsm->fix.smem_len; + + /* Align pixmap offsets along page boundaries */ + pExa->pixmapOffsetAlign = 4096; +diff --git git/src/msm.h git/src/msm.h +index e1e2bc7..520d390 100755 +--- git/src/msm.h ++++ git/src/msm.h +@@ -85,7 +85,7 @@ typedef struct _MSMRec + int fd; + + /* Fixed and var strutures from the framebuffer */ +- struct fb_fix_screeninfo fixed_info; ++ struct fb_fix_screeninfo fix; + struct fb_var_screeninfo mode_info; + + /* Pointer to the mapped framebuffer memory */ diff --git a/recipes/xorg-driver/xf86-video-msm_git.bb b/recipes/xorg-driver/xf86-video-msm_git.bb index faccb87e35..4723b867f0 100644 --- a/recipes/xorg-driver/xf86-video-msm_git.bb +++ b/recipes/xorg-driver/xf86-video-msm_git.bb @@ -8,9 +8,15 @@ SRCREV = "5f7df59155ae301a3ebc40aec22ed16d203cb5fc" PV = "1.1.0+${PR}+gitr${SRCREV}" PE = "1" -SRC_URI = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git\ - " +SRC_URI = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git" +SRC_URI_htcdream = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git \ + file://no_neon.patch;patch=1 \ + file://no_neon_flags.patch;patch=1 \ + file://renaming_variables.patch;patch=1" S = "${WORKDIR}/git" CFLAGS += " -I${STAGING_INCDIR}/xorg " +CFLAGS += " -Wno-error " + +ARM_INSTRUCTION_SET="arm" |
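For reference, the NEON routines removed by no_neon.patch above — memset16() and memset32() from neon_memsets.c — fill a destination buffer with a 16-bit or 32-bit value, where count is the number of elements, not bytes. A minimal portable C sketch of equivalent fills (illustrative only, not part of this commit; the names memset16_c and memset32_c are hypothetical) would be:

/* Plain C fills matching the signatures of the deleted NEON versions.
 * count is the number of 16-bit or 32-bit elements to write, not bytes. */
#include <stdint.h>

static void memset16_c(uint16_t *dst, uint16_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;
}

static void memset32_c(uint32_t *dst, uint32_t value, int count)
{
    while (count-- > 0)
        *dst++ = value;
}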