diff options
| -rw-r--r-- | packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch | 6444 | ||||
| -rw-r--r-- | packages/mplayer/mplayer_0.0+1.0rc1.bb | 8 |
2 files changed, 6450 insertions, 2 deletions
diff --git a/packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch b/packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch new file mode 100644 index 0000000000..800f43e8eb --- /dev/null +++ b/packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch @@ -0,0 +1,6444 @@ + cfg-common.h | 4 + + cfg-mencoder.h | 4 + + cfg-mplayer.h | 4 + + configure | 13 +- + libaf/af_format.c | 7 + + libavcodec/Makefile | 7 + + libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++ + libavcodec/avr32/fdct.S | 541 ++++++++ + libavcodec/avr32/h264idct.S | 451 +++++++ + libavcodec/avr32/idct.S | 829 ++++++++++++ + libavcodec/avr32/mc.S | 434 ++++++ + libavcodec/avr32/pico.h | 260 ++++ + libavcodec/bitstream.h | 77 +- + libavcodec/dsputil.c | 3 + + libavcodec/h264.c | 15 + + libavutil/common.h | 16 + + libavutil/internal.h | 9 + + libfaad2/common.h | 2 +- + libmpcodecs/ad_libmad.c | 5 + + libswscale/pico-avr32.h | 137 ++ + libswscale/swscale_internal.h | 2 +- + libswscale/yuv2rgb.c | 14 + + libswscale/yuv2rgb_avr32.c | 416 ++++++ + libvo/vo_fbdev2.c | 101 ++- + version.sh | 2 +- + 25 files changed, 6011 insertions(+), 20 deletions(-) + create mode 100644 libavcodec/avr32/dsputil_avr32.c + create mode 100644 libavcodec/avr32/fdct.S + create mode 100644 libavcodec/avr32/h264idct.S + create mode 100644 libavcodec/avr32/idct.S + create mode 100644 libavcodec/avr32/mc.S + create mode 100644 libavcodec/avr32/pico.h + create mode 100644 libswscale/pico-avr32.h + create mode 100644 libswscale/yuv2rgb_avr32.c + +diff --git a/cfg-common.h b/cfg-common.h +index 780df38..7d878a8 100644 +--- a/cfg-common.h ++++ b/cfg-common.h +@@ -235,6 +235,10 @@ + {"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL}, + {"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL}, + ++#ifdef ARCH_AVR32 ++ {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL}, ++ {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL}, ++#endif + // draw by slices or whole frame (useful with libmpeg2/libavcodec) + {"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL}, + {"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL}, +diff --git a/cfg-mencoder.h b/cfg-mencoder.h +index 411b748..addf791 100644 +--- a/cfg-mencoder.h ++++ b/cfg-mencoder.h +@@ -5,6 +5,10 @@ + + #include "cfg-common.h" + ++#ifdef ARCH_AVR32 ++extern int avr32_use_pico; ++#endif ++ + #ifdef USE_FAKE_MONO + extern int fakemono; // defined in dec_audio.c + #endif +diff --git a/cfg-mplayer.h b/cfg-mplayer.h +index 62b6eac..31499c2 100644 +--- a/cfg-mplayer.h ++++ b/cfg-mplayer.h +@@ -4,6 +4,10 @@ + + #include "cfg-common.h" + ++#ifdef ARCH_AVR32 ++extern int avr32_use_pico; ++#endif ++ + extern int noconsolecontrols; + + #if defined(HAVE_FBDEV)||defined(HAVE_VESA) +diff --git a/configure b/configure +index 29002c8..56c6fe4 100755 +--- a/configure ++++ b/configure +@@ -1203,6 +1203,15 @@ EOF + _optimizing="$proc" + ;; + ++ avr32) ++ _def_arch='#define ARCH_AVR32' ++ _target_arch='TARGET_ARCH_AVR32 = yes' ++ iproc='avr32' ++ proc='' ++ _march='' ++ _mcpu='' ++ _optimizing='' ++ ;; + arm|armv4l|armv5tel) + _def_arch='#define ARCH_ARMV4L 1' + _target_arch='TARGET_ARCH_ARMV4L = yes' +@@ -1533,7 +1542,7 @@ echores $_named_asm_args + # Checking for CFLAGS + _stripbinaries=yes + if test "$_profile" != "" || test "$_debug" != "" ; then +- CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile" ++ CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile" + if test "$_cc_major" -ge "3" ; then + CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'` + fi +@@ -3794,7 +3803,7 @@ fi + + + echocheck "X11 headers presence" +- for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do ++ for I in `echo $_inc_extra | sed s/-I//g`; do + if test -f "$I/X11/Xlib.h" ; then + _inc_x11="-I$I" + _x11_headers="yes" +diff --git a/libaf/af_format.c b/libaf/af_format.c +index e5b7cc9..5d7ea6d 100644 +--- a/libaf/af_format.c ++++ b/libaf/af_format.c +@@ -20,7 +20,14 @@ + // Integer to float conversion through lrintf() + #ifdef HAVE_LRINTF + #include <math.h> ++ ++#ifdef ARCH_AVR32 ++#define lrintf(x) rint(x) ++#define llrint(x) (long long)rint(x) ++#else + long int lrintf(float); ++#endif ++ + #else + #define lrintf(x) ((int)(x)) + #endif +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 17b6c45..8e1dc96 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) += sparc/dsputil_vis.o \ + + sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc + ++# avr32 specific stuff ++ifeq ($(TARGET_ARCH_AVR32),yes) ++ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o ++OBJS += avr32/dsputil_avr32.o ++endif ++ + # sun mediaLib specific stuff + OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \ + +@@ -419,6 +425,7 @@ tests: apiexample $(TESTS) + clean:: + rm -f \ + i386/*.o i386/*~ \ ++ avr32/*.o avr32/*~ \ + armv4l/*.o armv4l/*~ \ + mlib/*.o mlib/*~ \ + alpha/*.o alpha/*~ \ +diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c +new file mode 100644 +index 0000000..200284d +--- /dev/null ++++ b/libavcodec/avr32/dsputil_avr32.c +@@ -0,0 +1,2678 @@ ++/* ++ * Copyright (c) 2007 Atmel Corporation. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials provided ++ * with the distribution. ++ * ++ * 3. The name of ATMEL may not be used to endorse or promote products ++ * derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL ++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ++ * DAMAGE. ++ */ ++ ++#include "../dsputil.h" ++#include "pico.h" ++ ++int avr32_use_pico = 1; ++ ++//#define CHECK_DSP_FUNCS_AGAINST_C ++ ++#ifdef CHECK_DSP_FUNCS_AGAINST_C ++#define DSP_FUNC_NAME(name) test_ ## name ++#else ++#define DSP_FUNC_NAME(name) name ++#endif ++ ++union doubleword { ++ int64_t doubleword; ++ struct { ++ int32_t top; ++ int32_t bottom; ++ } words; ++}; ++ ++#undef LD16 ++#undef LD32 ++#undef LD64 ++ ++#define LD16(a) (*((uint16_t*)(a))) ++#define LD32(a) (*((uint32_t*)(a))) ++#define LD64(a) (*((uint64_t*)(a))) ++#define LD64_UNALIGNED(a) \ ++ ({ union doubleword __tmp__; \ ++ __tmp__.words.top = LD32(a); \ ++ __tmp__.words.bottom = LD32(a + 4); \ ++ __tmp__.doubleword; }) ++ ++#undef ST32 ++#undef ST16 ++ ++#define ST16(a, b) *((uint16_t*)(a)) = (b) ++#define ST32(a, b) *((uint32_t*)(a)) = (b) ++ ++#undef rnd_avg32 ++#define rnd_avg32(a, b) \ ++ ({ uint32_t __tmp__;\ ++ asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\ ++ __tmp__;}) ++ ++void idct_avr32(DCTELEM *data); ++void fdct_avr32(DCTELEM *data); ++ ++void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data); ++void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data); ++ ++void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride); ++void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride); ++ ++#define extern_dspfunc(PFX, NUM) \ ++ void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ) ++ ++extern_dspfunc(put, 8); ++extern_dspfunc(put_no_rnd, 8); ++extern_dspfunc(avg, 8); ++extern_dspfunc(avg_no_rnd, 8); ++#undef extern_dspfunc ++ ++#ifdef CHECK_DSP_FUNCS_AGAINST_C ++#define extern_dspfunc(PFX, NUM) \ ++ void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ) ++ ++extern_dspfunc(put, 4); ++extern_dspfunc(put_no_rnd, 4); ++extern_dspfunc(put, 8); ++extern_dspfunc(put_no_rnd, 8); ++extern_dspfunc(put, 16); ++extern_dspfunc(put_no_rnd, 16); ++extern_dspfunc(avg, 8); ++extern_dspfunc(avg_no_rnd, 8); ++extern_dspfunc(avg, 16); ++extern_dspfunc(avg_no_rnd, 16); ++ ++ ++#undef extern_dspfunc ++#define extern_dspfunc(PFX, NUM) \ ++void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \ ++ ++extern_dspfunc(put_h264_qpel, 16); ++extern_dspfunc(put_h264_qpel, 8); ++extern_dspfunc(put_h264_qpel, 4); ++extern_dspfunc(avg_h264_qpel, 16); ++extern_dspfunc(avg_h264_qpel, 8); ++extern_dspfunc(avg_h264_qpel, 4); ++ ++#undef extern_dspfunc ++ ++void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++ ++void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++ ++ ++void dump_block8(uint8_t *block, int line_size, int h); ++void dump_block4(uint8_t *block, int line_size, int h); ++void dump_block(uint8_t *block, int line_size, int h, int w); ++ ++void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, ++ int h, char *name, int max_dev); ++void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, ++ int h, char *name, int max_dev); ++void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, ++ int h, int width, char *name, int max_dev); ++ ++#define PIXOP2( OPNAME, OP ) \ ++void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ ++ int i;\ ++ for(i=0; i<h; i++){\ ++ OP(*((uint32_t*)(block )), LD32(pixels ));\ ++ pixels+=line_size;\ ++ block +=line_size;\ ++ }\ ++}\ ++void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ ++ int src_stride1, int src_stride2, int h){\ ++ int i;\ ++ for(i=0; i<h; i++){\ ++ uint32_t a,b;\ ++ a= LD32(&src1[i*src_stride1 ]);\ ++ b= LD32(&src2[i*src_stride2 ]);\ ++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ ++ a= LD32(&src1[i*src_stride1+4]);\ ++ b= LD32(&src2[i*src_stride2+4]);\ ++ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\ ++ }\ ++}\ ++\ ++void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ ++ int src_stride1, int src_stride2, int h){\ ++ int i;\ ++ for(i=0; i<h; i++){\ ++ uint32_t a,b;\ ++ a= LD32(&src1[i*src_stride1 ]);\ ++ b= LD32(&src2[i*src_stride2 ]);\ ++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ ++ }\ ++}\ ++\ ++void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ ++ int src_stride1, int src_stride2, int h){\ ++ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ ++ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ ++}\ ++ ++#else ++#define PIXOP2( OPNAME, OP ) \ ++static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ ++ int i;\ ++ for(i=0; i<h; i++){\ ++ OP(*((uint32_t*)(block )), LD32(pixels ));\ ++ pixels+=line_size;\ ++ block +=line_size;\ ++ }\ ++}\ ++static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ ++ int i;\ ++ for(i=0; i<h; i++){\ ++ OP(*((uint32_t*)(block )), LD32(pixels ));\ ++ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\ ++ pixels+=line_size;\ ++ block +=line_size;\ ++ }\ ++}\ ++static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ ++ int i;\ ++ for(i=0; i<h; i++){\ ++ OP(*((uint32_t*)(block )), LD32(pixels ));\ ++ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\ ++ OP(*((uint32_t*)(block+8)), LD32(pixels+8));\ ++ OP(*((uint32_t*)(block+12)), LD32(pixels+12));\ ++ pixels+=line_size;\ ++ block +=line_size;\ ++ }\ ++}\ ++static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ ++ int src_stride1, int src_stride2, int h){\ ++ int i;\ ++ for(i=0; i<h; i++){\ ++ uint32_t a,b;\ ++ a= LD32(&src1[i*src_stride1 ]);\ ++ b= LD32(&src2[i*src_stride2 ]);\ ++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ ++ a= LD32(&src1[i*src_stride1+4]);\ ++ b= LD32(&src2[i*src_stride2+4]);\ ++ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\ ++ }\ ++}\ ++\ ++static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ ++ int src_stride1, int src_stride2, int h){\ ++ int i;\ ++ for(i=0; i<h; i++){\ ++ uint32_t a,b;\ ++ a= LD32(&src1[i*src_stride1 ]);\ ++ b= LD32(&src2[i*src_stride2 ]);\ ++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ ++ }\ ++}\ ++\ ++static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ ++ int src_stride1, int src_stride2, int h){\ ++ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ ++ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ ++}\ ++ ++#endif ++ ++#define op_avg(a, b) a = rnd_avg32(a, b) ++#define op_put(a, b) a = b ++ ++PIXOP2(avg, op_avg) ++PIXOP2(put, op_put) ++#undef op_avg ++#undef op_put ++ ++ ++ ++static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) ++{ ++ int i; ++ for(i=0; i<h; i++) ++ { ++ ST32(dst , LD32(src )); ++ dst+=dstStride; ++ src+=srcStride; ++ } ++} ++ ++static void clear_blocks_avr32(DCTELEM *blocks) ++{ ++ int n = 12; ++ uint64_t tmp1, tmp2; ++ blocks += 6*64; ++ asm volatile ( "mov\t%1, 0\n" ++ "mov\t%m1, 0\n" ++ "mov\t%2, 0\n" ++ "mov\t%m2, 0\n" ++ "0:\n" ++ "stm\t--%3, %1, %m1, %2, %m2\n" ++ "stm\t--%3, %1, %m1, %2, %m2\n" ++ "stm\t--%3, %1, %m1, %2, %m2\n" ++ "stm\t--%3, %1, %m1, %2, %m2\n" ++ "sub\t%0, 1\n" ++ "brne\t0b\n" ++ : "+r"(n), "=&r"(tmp1), "=&r"(tmp2), ++ "+r"(blocks)); ++} ++ ++ ++static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) ++{ ++ int i; ++ for(i=0; i<h; i++) ++ { ++ ST32(dst , LD32(src )); ++ ST32(dst+4 , LD32(src+4 )); ++ dst+=dstStride; ++ src+=srcStride; ++ } ++} ++ ++static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) ++{ ++ int i; ++ for(i=0; i<h; i++) ++ { ++ ST32(dst , LD32(src )); ++ ST32(dst+4 , LD32(src+4 )); ++ ST32(dst+8 , LD32(src+8 )); ++ ST32(dst+12, LD32(src+12)); ++ dst+=dstStride; ++ src+=srcStride; ++ } ++} ++ ++ ++static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ ++ const int A=(8-x)*(8-y); ++ const int B=( x)*(8-y); ++ const int C=(8-x)*( y); ++ const int D=( x)*( y); ++ int i; ++ ++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF0_B, 32); ++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF1_B, 0); ++ PICO_PUT_W(PICO_COEFF2_A, 0); ++ PICO_PUT_W(PICO_COEFF2_B, 0); ++ PICO_PUT_W(PICO_CONFIG, ++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE) ++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) ++ | PICO_COEFF_FRAC_BITS(6) ++ | PICO_OFFSET_FRAC_BITS(6)); ++ ++ for(i=0; i<h; i++) ++ { ++ ++ int src0 = LD32(src); ++ int src1 = LD32(src + stride); ++ ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0); ++ src += stride; ++ ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0)); ++ dst += stride; ++ } ++} ++ ++ ++static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ ++ const int A=(8-x)*(8-y);\ ++ const int B=( x)*(8-y); ++ const int C=(8-x)*( y); ++ const int D=( x)*( y); ++ int i; ++ ++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF0_B, 32); ++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF1_B, 0); ++ PICO_PUT_W(PICO_COEFF2_A, 0); ++ PICO_PUT_W(PICO_COEFF2_B, 0); ++ PICO_PUT_W(PICO_CONFIG, ++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE) ++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) ++ | PICO_COEFF_FRAC_BITS(6) ++ | PICO_OFFSET_FRAC_BITS(6)); ++ ++ for(i=0; i<h; i++) ++ { ++ /* ++ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1])); ++ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2])); ++ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3])); ++ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4])); ++ dst+= stride; ++ src+= stride; ++ */ ++ ++ int src0 = LD32(src); ++ int src1 = (((int)src[4] << 24) | (int)src[stride]); ++ int src2 = LD32(src + stride + 1); ++ ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); ++ src += stride; ++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); ++ ++ dst += stride; ++ } ++} ++ ++static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ ++ const int A=(8-x)*(8-y); ++ const int B=( x)*(8-y); ++ const int C=(8-x)*( y); ++ const int D=( x)*( y); ++ int i; ++ ++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF0_B, 32); ++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF1_B, 0); ++ PICO_PUT_W(PICO_COEFF2_A, 0); ++ PICO_PUT_W(PICO_COEFF2_B, 0); ++ PICO_PUT_W(PICO_CONFIG, ++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE) ++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) ++ | PICO_COEFF_FRAC_BITS(6) ++ | PICO_OFFSET_FRAC_BITS(6)); ++ ++ for(i=0; i<h; i++) ++ { ++ /* ++ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1])); ++ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2])); ++ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3])); ++ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4])); ++ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5])); ++ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6])); ++ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7])); ++ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8])); ++ dst+= stride; ++ src+= stride; ++ */ ++ int src0 = LD32(src); ++ int src1 = (((int)src[4] << 24) | (int)src[stride]); ++ int src2 = LD32(src + stride + 1); ++ ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); ++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); ++ ++ src0 = LD32(src + 4); ++ src1 = (src[8] << 24) | src[stride + 4]; ++ src2 = LD32(src + stride + 5); ++ ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); ++ src += stride; ++ ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0)); ++ ++ dst += stride; ++ } ++} ++ ++ ++static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ ++ const int A=(8-x)*(8-y); ++ const int B=( x)*(8-y); ++ const int C=(8-x)*( y); ++ const int D=( x)*( y); ++ int i; ++ ++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF0_B, 32); ++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF1_B, 0); ++ PICO_PUT_W(PICO_COEFF2_A, 0); ++ PICO_PUT_W(PICO_COEFF2_B, 0); ++ PICO_PUT_W(PICO_CONFIG, ++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE) ++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) ++ | PICO_COEFF_FRAC_BITS(6) ++ | PICO_OFFSET_FRAC_BITS(6)); ++ ++ for(i=0; i<h; i++) ++ { ++ int src0 = LD32(src); ++ int src1 = LD32(src + stride); ++ ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0); ++ src += stride; ++ ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0))); ++ dst += stride; ++ } ++} ++ ++ ++static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ ++ const int A=(8-x)*(8-y);\ ++ const int B=( x)*(8-y); ++ const int C=(8-x)*( y); ++ const int D=( x)*( y); ++ int i; ++ ++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF0_B, 32); ++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF1_B, 0); ++ PICO_PUT_W(PICO_COEFF2_A, 0); ++ PICO_PUT_W(PICO_COEFF2_B, 0); ++ PICO_PUT_W(PICO_CONFIG, ++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE) ++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) ++ | PICO_COEFF_FRAC_BITS(6) ++ | PICO_OFFSET_FRAC_BITS(6)); ++ ++ for(i=0; i<h; i++) ++ { ++ /* ++ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1])); ++ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2])); ++ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3])); ++ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4])); ++ dst+= stride; ++ src+= stride; ++ */ ++ ++ int src0 = *((int *)src); ++ int src1 = (int)((src[4] << 24) | src[stride]); ++ int src2 = *((int *)(src + stride + 1)); ++ ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); ++ src += stride; ++ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); ++ dst += stride; ++ } ++} ++ ++static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ ++ const int A=(8-x)*(8-y); ++ const int B=( x)*(8-y); ++ const int C=(8-x)*( y); ++ const int D=( x)*( y); ++ int i; ++ ++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF0_B, 32); ++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); ++ PICO_PUT_W(PICO_COEFF1_B, 0); ++ PICO_PUT_W(PICO_COEFF2_A, 0); ++ PICO_PUT_W(PICO_COEFF2_B, 0); ++ PICO_PUT_W(PICO_CONFIG, ++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE) ++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) ++ | PICO_COEFF_FRAC_BITS(6) ++ | PICO_OFFSET_FRAC_BITS(6)); ++ ++ for(i=0; i<h; i++) ++ { ++ /* ++ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1])); ++ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2])); ++ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3])); ++ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4])); ++ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5])); ++ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6])); ++ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7])); ++ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8])); ++ dst+= stride; ++ src+= stride; ++ */ ++ int src0 = *((int *)src); ++ int src1 = (volatile int)((src[4] << 24) | src[stride]); ++ int src2 = *((int *)(src + stride + 1)); ++ ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); ++ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); ++ ++ src0 = *((int *)(src + 4)); ++ src1 = (int)((src[8] << 24) | src[stride + 4]); ++ src2 = *((int *)(src + stride + 5)); ++ ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); ++ src += stride; ++ ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0))); ++ dst += stride; ++ } ++} ++ ++static struct pico_config_t h264_qpel4_h_lowpass_config = { ++ .input_mode = PICO_HOR_FILTER_MODE, ++ .output_mode = PICO_PLANAR_MODE, ++ .coeff_frac_bits = 5, ++ .offset_frac_bits = 5, ++ .coeff0_0 = 1, ++ .coeff0_1 = -5, ++ .coeff0_2 = 20, ++ .coeff0_3 = 16, ++ .coeff1_0 = 20, ++ .coeff1_1 = -5, ++ .coeff1_2 = 1, ++ .coeff1_3 = 0, ++ .coeff2_0 = 0, ++ .coeff2_1 = 0, ++ .coeff2_2 = 0, ++ .coeff2_3 = 0 ++}; ++ ++ ++ ++static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ const int h=4; ++ int i; ++ ++ set_pico_config(&h264_qpel4_h_lowpass_config); ++ ++ for(i=0; i<h; i++){ ++ ++ /* ++ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ ++ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ ++ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\ ++ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\ ++ dst+=dstStride;\ ++ src+=srcStride;\ */ ++ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); ++ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); ++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); ++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9); ++ src += srcStride; ++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); ++ dst += dstStride; ++ } ++} ++ ++static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ const int h=4; ++ int i; ++ ++ set_pico_config(&h264_qpel4_h_lowpass_config); ++ ++ for(i=0; i<h; i++){ ++ ++ /* ++ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ ++ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ ++ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\ ++ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\ ++ dst+=dstStride;\ ++ src+=srcStride;\ */ ++ ++ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); ++ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); ++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); ++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7); ++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8); ++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9); ++ src += srcStride; ++ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); ++ dst += dstStride; ++ } ++} ++ ++static struct pico_config_t h264_qpel4_v_lowpass_config1 = { ++ .input_mode = PICO_VERT_FILTER_MODE, ++ .output_mode = PICO_PACKED_MODE, ++ .coeff_frac_bits = 5, ++ .offset_frac_bits = 5, ++ .coeff0_0 = 1, ++ .coeff0_1 = -5, ++ .coeff0_2 = 20, ++ .coeff0_3 = 16, ++ .coeff1_0 = 1, ++ .coeff1_1 = -5, ++ .coeff1_2 = 20, ++ .coeff1_3 = 16, ++ .coeff2_0 = 1, ++ .coeff2_1 = -5, ++ .coeff2_2 = 20, ++ .coeff2_3 = 16 ++}; ++ ++ ++ ++static struct pico_config_t h264_qpel4_v_lowpass_config2 = { ++ .input_mode = PICO_VERT_FILTER_MODE, ++ .output_mode = PICO_PLANAR_MODE, ++ .coeff_frac_bits = 5, ++ .offset_frac_bits = 5, ++ .coeff0_0 = 1, ++ .coeff0_1 = -5, ++ .coeff0_2 = 20, ++ .coeff0_3 = 16, ++ .coeff1_0 = 20, ++ .coeff1_1 = -5, ++ .coeff1_2 = 1, ++ .coeff1_3 = 0, ++ .coeff2_0 = 0, ++ .coeff2_1 = 0, ++ .coeff2_2 = 0, ++ .coeff2_3 = 0 ++}; ++ ++static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ ++ /* ++ const int w=4; ++ uint8_t *cm = cropTbl + MAX_NEG_CROP; ++ int i; ++ for(i=0; i<w; i++) ++ { ++ const int srcB= src[-2*srcStride];\ ++ const int srcA= src[-1*srcStride];\ ++ const int src0= src[0 *srcStride];\ ++ const int src1= src[1 *srcStride];\ ++ const int src2= src[2 *srcStride];\ ++ const int src3= src[3 *srcStride];\ ++ const int src4= src[4 *srcStride];\ ++ const int src5= src[5 *srcStride];\ ++ const int src6= src[6 *srcStride];\ ++ OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ ++ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ ++ OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ ++ OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ ++ dst++;\ ++ src++;\ ++ */ ++ ++ set_pico_config(&h264_qpel4_v_lowpass_config1); ++ ++ { ++ int srcB= LD32(src - 2*srcStride); ++ int srcA= LD32(src - 1*srcStride); ++ int src0= LD32(src + 0 *srcStride); ++ int src1= LD32(src + 1 *srcStride); ++ int src2= LD32(src + 2 *srcStride); ++ int src3= LD32(src + 3 *srcStride); ++ int src4= LD32(src + 4 *srcStride); ++ int src5= LD32(src + 5 *srcStride); ++ int src6= LD32(src + 6 *srcStride); ++ ++ /* First compute the leftmost three colums */ ++ PICO_MVRC_W(PICO_INPIX0, srcB); ++ PICO_MVRC_W(PICO_INPIX1, srcA); ++ PICO_MVRC_W(PICO_INPIX2, src0); ++ PICO_OP(0, 0, 0, 3, 6); ++ PICO_MVRC_W(PICO_INPIX2, src1); ++ PICO_MVRC_W(PICO_INPIX1, src2); ++ PICO_MVRC_W(PICO_INPIX0, src3); ++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); ++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); ++ dst += dstStride; ++ PICO_MVRC_W(PICO_INPIX0, srcA); ++ PICO_MVRC_W(PICO_INPIX1, src0); ++ PICO_MVRC_W(PICO_INPIX2, src1); ++ PICO_OP(0, 0, 0, 3, 6); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_MVRC_W(PICO_INPIX1, src3); ++ PICO_MVRC_W(PICO_INPIX0, src4); ++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); ++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); ++ dst += dstStride; ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(0, 0, 0, 3, 6); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_MVRC_W(PICO_INPIX1, src4); ++ PICO_MVRC_W(PICO_INPIX0, src5); ++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); ++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); ++ dst += dstStride; ++ PICO_MVRC_W(PICO_INPIX0, src1); ++ PICO_MVRC_W(PICO_INPIX1, src2); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_OP(0, 0, 0, 3, 6); ++ PICO_MVRC_W(PICO_INPIX2, src4); ++ PICO_MVRC_W(PICO_INPIX1, src5); ++ PICO_MVRC_W(PICO_INPIX0, src6); ++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); ++ ST32(dst, PICO_GET_W(PICO_OUTPIX0)); ++ /* Now compute the last column */ ++ ++ union wordbytes { ++ int word; ++ struct { ++ unsigned int t:8; ++ unsigned int u:8; ++ unsigned int l:8; ++ unsigned int b:8; ++ } bytes; } tmp1, tmp2, tmp3; ++ ++ ++ tmp1.bytes.t = srcB; ++ tmp1.bytes.u = src1; ++ tmp1.bytes.l = src4; ++ ++ tmp2.bytes.t = srcA; ++ tmp2.bytes.u = src2; ++ tmp2.bytes.l = src5; ++ ++ tmp3.bytes.t = src0; ++ tmp3.bytes.u = src3; ++ tmp3.bytes.l = src6; ++ ++ PICO_MVRC_W(PICO_INPIX0, tmp1.word); ++ PICO_MVRC_W(PICO_INPIX1, tmp2.word); ++ PICO_MVRC_W(PICO_INPIX2, tmp3.word); |
