summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch6444
-rw-r--r--packages/mplayer/mplayer_0.0+1.0rc1.bb8
2 files changed, 6450 insertions, 2 deletions
diff --git a/packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch b/packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch
new file mode 100644
index 0000000000..800f43e8eb
--- /dev/null
+++ b/packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch
@@ -0,0 +1,6444 @@
+ cfg-common.h | 4 +
+ cfg-mencoder.h | 4 +
+ cfg-mplayer.h | 4 +
+ configure | 13 +-
+ libaf/af_format.c | 7 +
+ libavcodec/Makefile | 7 +
+ libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++
+ libavcodec/avr32/fdct.S | 541 ++++++++
+ libavcodec/avr32/h264idct.S | 451 +++++++
+ libavcodec/avr32/idct.S | 829 ++++++++++++
+ libavcodec/avr32/mc.S | 434 ++++++
+ libavcodec/avr32/pico.h | 260 ++++
+ libavcodec/bitstream.h | 77 +-
+ libavcodec/dsputil.c | 3 +
+ libavcodec/h264.c | 15 +
+ libavutil/common.h | 16 +
+ libavutil/internal.h | 9 +
+ libfaad2/common.h | 2 +-
+ libmpcodecs/ad_libmad.c | 5 +
+ libswscale/pico-avr32.h | 137 ++
+ libswscale/swscale_internal.h | 2 +-
+ libswscale/yuv2rgb.c | 14 +
+ libswscale/yuv2rgb_avr32.c | 416 ++++++
+ libvo/vo_fbdev2.c | 101 ++-
+ version.sh | 2 +-
+ 25 files changed, 6011 insertions(+), 20 deletions(-)
+ create mode 100644 libavcodec/avr32/dsputil_avr32.c
+ create mode 100644 libavcodec/avr32/fdct.S
+ create mode 100644 libavcodec/avr32/h264idct.S
+ create mode 100644 libavcodec/avr32/idct.S
+ create mode 100644 libavcodec/avr32/mc.S
+ create mode 100644 libavcodec/avr32/pico.h
+ create mode 100644 libswscale/pico-avr32.h
+ create mode 100644 libswscale/yuv2rgb_avr32.c
+
+diff --git a/cfg-common.h b/cfg-common.h
+index 780df38..7d878a8 100644
+--- a/cfg-common.h
++++ b/cfg-common.h
+@@ -235,6 +235,10 @@
+ {"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
+ {"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+
++#ifdef ARCH_AVR32
++ {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
++ {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
++#endif
+ // draw by slices or whole frame (useful with libmpeg2/libavcodec)
+ {"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+ {"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
+diff --git a/cfg-mencoder.h b/cfg-mencoder.h
+index 411b748..addf791 100644
+--- a/cfg-mencoder.h
++++ b/cfg-mencoder.h
+@@ -5,6 +5,10 @@
+
+ #include "cfg-common.h"
+
++#ifdef ARCH_AVR32
++extern int avr32_use_pico;
++#endif
++
+ #ifdef USE_FAKE_MONO
+ extern int fakemono; // defined in dec_audio.c
+ #endif
+diff --git a/cfg-mplayer.h b/cfg-mplayer.h
+index 62b6eac..31499c2 100644
+--- a/cfg-mplayer.h
++++ b/cfg-mplayer.h
+@@ -4,6 +4,10 @@
+
+ #include "cfg-common.h"
+
++#ifdef ARCH_AVR32
++extern int avr32_use_pico;
++#endif
++
+ extern int noconsolecontrols;
+
+ #if defined(HAVE_FBDEV)||defined(HAVE_VESA)
+diff --git a/configure b/configure
+index 29002c8..56c6fe4 100755
+--- a/configure
++++ b/configure
+@@ -1203,6 +1203,15 @@ EOF
+ _optimizing="$proc"
+ ;;
+
++ avr32)
++ _def_arch='#define ARCH_AVR32'
++ _target_arch='TARGET_ARCH_AVR32 = yes'
++ iproc='avr32'
++ proc=''
++ _march=''
++ _mcpu=''
++ _optimizing=''
++ ;;
+ arm|armv4l|armv5tel)
+ _def_arch='#define ARCH_ARMV4L 1'
+ _target_arch='TARGET_ARCH_ARMV4L = yes'
+@@ -1533,7 +1542,7 @@ echores $_named_asm_args
+ # Checking for CFLAGS
+ _stripbinaries=yes
+ if test "$_profile" != "" || test "$_debug" != "" ; then
+- CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile"
++ CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile"
+ if test "$_cc_major" -ge "3" ; then
+ CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'`
+ fi
+@@ -3794,7 +3803,7 @@ fi
+
+
+ echocheck "X11 headers presence"
+- for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do
++ for I in `echo $_inc_extra | sed s/-I//g`; do
+ if test -f "$I/X11/Xlib.h" ; then
+ _inc_x11="-I$I"
+ _x11_headers="yes"
+diff --git a/libaf/af_format.c b/libaf/af_format.c
+index e5b7cc9..5d7ea6d 100644
+--- a/libaf/af_format.c
++++ b/libaf/af_format.c
+@@ -20,7 +20,14 @@
+ // Integer to float conversion through lrintf()
+ #ifdef HAVE_LRINTF
+ #include <math.h>
++
++#ifdef ARCH_AVR32
++#define lrintf(x) rint(x)
++#define llrint(x) (long long)rint(x)
++#else
+ long int lrintf(float);
++#endif
++
+ #else
+ #define lrintf(x) ((int)(x))
+ #endif
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index 17b6c45..8e1dc96 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) += sparc/dsputil_vis.o \
+
+ sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc
+
++# avr32 specific stuff
++ifeq ($(TARGET_ARCH_AVR32),yes)
++ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o
++OBJS += avr32/dsputil_avr32.o
++endif
++
+ # sun mediaLib specific stuff
+ OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \
+
+@@ -419,6 +425,7 @@ tests: apiexample $(TESTS)
+ clean::
+ rm -f \
+ i386/*.o i386/*~ \
++ avr32/*.o avr32/*~ \
+ armv4l/*.o armv4l/*~ \
+ mlib/*.o mlib/*~ \
+ alpha/*.o alpha/*~ \
+diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c
+new file mode 100644
+index 0000000..200284d
+--- /dev/null
++++ b/libavcodec/avr32/dsputil_avr32.c
+@@ -0,0 +1,2678 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++#include "../dsputil.h"
++#include "pico.h"
++
++int avr32_use_pico = 1;
++
++//#define CHECK_DSP_FUNCS_AGAINST_C
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++#define DSP_FUNC_NAME(name) test_ ## name
++#else
++#define DSP_FUNC_NAME(name) name
++#endif
++
++union doubleword { /* overlays one 64-bit value with two 32-bit words; used by LD64_UNALIGNED to assemble a doubleword */
++ int64_t doubleword;
++ struct {
++ int32_t top; /* first word in memory; NOTE(review): presumably the most significant half on AVR32 -- confirm endianness */
++ int32_t bottom; /* second word in memory; LD64_UNALIGNED fills it from offset 4 */
++ } words;
++};
++
++#undef LD16
++#undef LD32
++#undef LD64
++/* Raw 16/32/64-bit loads through a cast pointer; the macros themselves perform no alignment handling. */
++#define LD16(a) (*((uint16_t*)(a)))
++#define LD32(a) (*((uint32_t*)(a)))
++#define LD64(a) (*((uint64_t*)(a)))
++#define LD64_UNALIGNED(a) \
++    ({ union doubleword __tmp__; \
++       __tmp__.words.top = LD32(a); \
++       __tmp__.words.bottom = LD32((a) + 4); \
++       __tmp__.doubleword; })
++/* LD64_UNALIGNED assembles the doubleword from two 32-bit loads; (a) is parenthesised so any pointer expression is safe. */
++#undef ST32
++#undef ST16
++/* Raw 16/32-bit stores through a cast pointer. */
++#define ST16(a, b) *((uint16_t*)(a)) = (b)
++#define ST32(a, b) *((uint32_t*)(a)) = (b)
++/* rnd_avg32 is redefined on top of the pavg.ub instruction; NOTE(review): presumably a per-byte rounded average -- confirm in the ISA manual. */
++#undef rnd_avg32
++#define rnd_avg32(a, b) \
++    ({ uint32_t __tmp__;\
++       asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\
++       __tmp__;})
++
++void idct_avr32(DCTELEM *data);
++void fdct_avr32(DCTELEM *data);
++
++void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
++void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
++
++void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
++void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
++
++#define extern_dspfunc(PFX, NUM) \
++ void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
++
++extern_dspfunc(put, 8);
++extern_dspfunc(put_no_rnd, 8);
++extern_dspfunc(avg, 8);
++extern_dspfunc(avg_no_rnd, 8);
++#undef extern_dspfunc
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++#define extern_dspfunc(PFX, NUM) \
++ void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++ void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
++
++extern_dspfunc(put, 4);
++extern_dspfunc(put_no_rnd, 4);
++extern_dspfunc(put, 8);
++extern_dspfunc(put_no_rnd, 8);
++extern_dspfunc(put, 16);
++extern_dspfunc(put_no_rnd, 16);
++extern_dspfunc(avg, 8);
++extern_dspfunc(avg_no_rnd, 8);
++extern_dspfunc(avg, 16);
++extern_dspfunc(avg_no_rnd, 16);
++
++
++#undef extern_dspfunc
++#define extern_dspfunc(PFX, NUM) \
++void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \
++void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \
++
++extern_dspfunc(put_h264_qpel, 16);
++extern_dspfunc(put_h264_qpel, 8);
++extern_dspfunc(put_h264_qpel, 4);
++extern_dspfunc(avg_h264_qpel, 16);
++extern_dspfunc(avg_h264_qpel, 8);
++extern_dspfunc(avg_h264_qpel, 4);
++
++#undef extern_dspfunc
++
++void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++
++void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++
++
++void dump_block8(uint8_t *block, int line_size, int h);
++void dump_block4(uint8_t *block, int line_size, int h);
++void dump_block(uint8_t *block, int line_size, int h, int w);
++
++void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++ int h, char *name, int max_dev);
++void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++ int h, char *name, int max_dev);
++void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++ int h, int width, char *name, int max_dev);
++
++#define PIXOP2( OPNAME, OP ) \
++void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ OP(*((uint32_t*)(block )), LD32(pixels ));\
++ pixels+=line_size;\
++ block +=line_size;\
++ }\
++}\
++void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ uint32_t a,b;\
++ a= LD32(&src1[i*src_stride1 ]);\
++ b= LD32(&src2[i*src_stride2 ]);\
++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
++ a= LD32(&src1[i*src_stride1+4]);\
++ b= LD32(&src2[i*src_stride2+4]);\
++ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
++ }\
++}\
++\
++void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ uint32_t a,b;\
++ a= LD32(&src1[i*src_stride1 ]);\
++ b= LD32(&src2[i*src_stride2 ]);\
++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
++ }\
++}\
++\
++void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
++ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
++}\
++
++#else
++#define PIXOP2( OPNAME, OP ) \
++static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ OP(*((uint32_t*)(block )), LD32(pixels ));\
++ pixels+=line_size;\
++ block +=line_size;\
++ }\
++}\
++static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ OP(*((uint32_t*)(block )), LD32(pixels ));\
++ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
++ pixels+=line_size;\
++ block +=line_size;\
++ }\
++}\
++static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ OP(*((uint32_t*)(block )), LD32(pixels ));\
++ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
++ OP(*((uint32_t*)(block+8)), LD32(pixels+8));\
++ OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
++ pixels+=line_size;\
++ block +=line_size;\
++ }\
++}\
++static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ uint32_t a,b;\
++ a= LD32(&src1[i*src_stride1 ]);\
++ b= LD32(&src2[i*src_stride2 ]);\
++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
++ a= LD32(&src1[i*src_stride1+4]);\
++ b= LD32(&src2[i*src_stride2+4]);\
++ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
++ }\
++}\
++\
++static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ int i;\
++ for(i=0; i<h; i++){\
++ uint32_t a,b;\
++ a= LD32(&src1[i*src_stride1 ]);\
++ b= LD32(&src2[i*src_stride2 ]);\
++ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
++ }\
++}\
++\
++static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++ int src_stride1, int src_stride2, int h){\
++ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
++ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
++}\
++
++#endif
++
++#define op_avg(a, b) a = rnd_avg32(a, b)
++#define op_put(a, b) a = b
++
++PIXOP2(avg, op_avg)
++PIXOP2(put, op_put)
++#undef op_avg
++#undef op_put
++
++
++
++static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++    /* Copy h rows of 4 bytes (one 32-bit word per row); strides are byte offsets between rows. */
++    int row;
++    for (row = 0; row < h; row++) {
++        ST32(dst, LD32(src));
++        dst += dstStride;
++        src += srcStride;
++    }
++}
++
++static void clear_blocks_avr32(DCTELEM *blocks)
++{ /* Stores zeros downwards from blocks + 6*64 using store-multiple instructions. */
++    int n = 12;          /* loop count; every pass issues four stm stores of four registers each */
++    uint64_t tmp1, tmp2; /* scratch operands; %N and %mN are both zeroed inside the asm */
++    blocks += 6*64;      /* one past the last coefficient; stm pre-decrements (--%3) from here */
++    asm volatile ( "mov\t%1, 0\n"
++                   "mov\t%m1, 0\n"
++                   "mov\t%2, 0\n"
++                   "mov\t%m2, 0\n"
++                   "0:\n"
++                   "stm\t--%3, %1, %m1, %2, %m2\n"
++                   "stm\t--%3, %1, %m1, %2, %m2\n"
++                   "stm\t--%3, %1, %m1, %2, %m2\n"
++                   "stm\t--%3, %1, %m1, %2, %m2\n"
++                   "sub\t%0, 1\n"
++                   "brne\t0b\n"
++                   : "+r"(n), "=&r"(tmp1), "=&r"(tmp2),
++                     "+r"(blocks) : : "memory", "cc"); /* the asm writes memory via blocks and alters the flags, so tell the compiler */
++}
++
++
++static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++    /* Copy h rows of 8 bytes as two 32-bit words per row; strides are byte offsets between rows. */
++    int row;
++    for (row = 0; row < h; row++) {
++        ST32(dst, LD32(src));
++        ST32(dst + 4, LD32(src + 4));
++        dst += dstStride;
++        src += srcStride;
++    }
++}
++
++static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++    /* Copy h rows of 16 bytes as four 32-bit words per row; strides are byte offsets between rows. */
++    int row;
++    for (row = 0; row < h; row++) {
++        ST32(dst, LD32(src));
++        ST32(dst + 4, LD32(src + 4));
++        ST32(dst + 8, LD32(src + 8));
++        ST32(dst + 12, LD32(src + 12));
++        dst += dstStride;
++        src += srcStride;
++    }
++}
++
++
++static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++ const int A=(8-x)*(8-y);
++ const int B=( x)*(8-y);
++ const int C=(8-x)*( y);
++ const int D=( x)*( y);
++ int i;
++ /* For each of h rows: feed the PICO one word from the current and one from the next source row, store 16 bits of its output to dst. */
++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++ PICO_PUT_W(PICO_COEFF0_B, 32); /* NOTE(review): presumably the rounding term (32 = half of 64) -- confirm in pico.h */
++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++ PICO_PUT_W(PICO_COEFF1_B, 0);
++ PICO_PUT_W(PICO_COEFF2_A, 0);
++ PICO_PUT_W(PICO_COEFF2_B, 0);
++ PICO_PUT_W(PICO_CONFIG,
++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++ | PICO_COEFF_FRAC_BITS(6)
++ | PICO_OFFSET_FRAC_BITS(6));
++ /* The weights A+B+C+D always sum to 64, which lines up with the 6 fractional bits requested above. */
++ for(i=0; i<h; i++)
++ {
++ /* One 32-bit load from the current row and one from the row below (src is only byte-aligned). */
++ int src0 = LD32(src);
++ int src1 = LD32(src + stride);
++
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
++ src += stride;
++ ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0)); /* only the low 16 bits of the output word are stored */
++ dst += stride;
++ }
++}
++
++
++static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++    const int A=(8-x)*(8-y);
++    const int B=(  x)*(8-y);
++    const int C=(8-x)*(  y);
++    const int D=(  x)*(  y);
++    int i;
++    /* Writes h rows of 4 bytes to dst; each row is produced by the PICO from the current and next source rows (reference formula below). */
++    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++    PICO_PUT_W(PICO_COEFF0_B, 32); /* NOTE(review): presumably the rounding term (32 = half of 64) -- confirm in pico.h */
++    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++    PICO_PUT_W(PICO_COEFF1_B, 0);
++    PICO_PUT_W(PICO_COEFF2_A, 0);
++    PICO_PUT_W(PICO_COEFF2_B, 0);
++    PICO_PUT_W(PICO_CONFIG,
++               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++               | PICO_COEFF_FRAC_BITS(6)
++               | PICO_OFFSET_FRAC_BITS(6));
++    /* The weights A+B+C+D always sum to 64, which lines up with the 6 fractional bits requested above. */
++    for(i=0; i<h; i++)
++    {
++        /*
++          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++          dst+= stride;
++          src+= stride;
++        */
++        /* src1 carries src[4] in its top byte and src[stride] in its low byte; shift as unsigned to stay clear of the sign bit. */
++        int src0 = LD32(src);
++        int src1 = (int)(((uint32_t)src[4] << 24) | src[stride]);
++        int src2 = LD32(src + stride + 1);
++
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        src += stride;
++        ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++
++        dst += stride;
++    }
++}
++
++static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++    const int A=(8-x)*(8-y);
++    const int B=(  x)*(8-y);
++    const int C=(8-x)*(  y);
++    const int D=(  x)*(  y);
++    int i;
++    /* Writes h rows of 8 bytes to dst as two 4-byte halves, each produced by the PICO (reference formula below). */
++    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++    PICO_PUT_W(PICO_COEFF0_B, 32); /* NOTE(review): presumably the rounding term (32 = half of 64) -- confirm in pico.h */
++    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++    PICO_PUT_W(PICO_COEFF1_B, 0);
++    PICO_PUT_W(PICO_COEFF2_A, 0);
++    PICO_PUT_W(PICO_COEFF2_B, 0);
++    PICO_PUT_W(PICO_CONFIG,
++               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++               | PICO_COEFF_FRAC_BITS(6)
++               | PICO_OFFSET_FRAC_BITS(6));
++    /* The weights A+B+C+D always sum to 64, which lines up with the 6 fractional bits requested above. */
++    for(i=0; i<h; i++)
++    {
++        /*
++          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++          OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
++          OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
++          OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
++          OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
++          dst+= stride;
++          src+= stride;
++        */
++        int src0 = LD32(src);
++        int src1 = (int)(((uint32_t)src[4] << 24) | src[stride]); /* top byte src[4], low byte src[stride]; unsigned shift avoids the sign bit */
++        int src2 = LD32(src + stride + 1);
++        /* Left half: output bytes 0..3. */
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++        /* Right half: same packing, shifted 4 bytes along both source rows. */
++        src0 = LD32(src + 4);
++        src1 = (int)(((uint32_t)src[8] << 24) | src[stride + 4]);
++        src2 = LD32(src + stride + 5);
++
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        src += stride;
++        ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
++
++        dst += stride;
++    }
++}
++
++
++static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++ const int A=(8-x)*(8-y);
++ const int B=( x)*(8-y);
++ const int C=(8-x)*( y);
++ const int D=( x)*( y);
++ int i;
++ /* For each of h rows: run the PICO on the current and next source rows, then average its output with the 16 bits already in dst. */
++ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++ PICO_PUT_W(PICO_COEFF0_B, 32); /* NOTE(review): presumably the rounding term (32 = half of 64) -- confirm in pico.h */
++ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++ PICO_PUT_W(PICO_COEFF1_B, 0);
++ PICO_PUT_W(PICO_COEFF2_A, 0);
++ PICO_PUT_W(PICO_COEFF2_B, 0);
++ PICO_PUT_W(PICO_CONFIG,
++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++ | PICO_COEFF_FRAC_BITS(6)
++ | PICO_OFFSET_FRAC_BITS(6));
++ /* The weights A+B+C+D always sum to 64, which lines up with the 6 fractional bits requested above. */
++ for(i=0; i<h; i++)
++ {
++ int src0 = LD32(src);
++ int src1 = LD32(src + stride);
++ /* src0 is the current row, src1 the row below. */
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
++ src += stride;
++ ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0))); /* blend with existing dst; only 16 bits are written back */
++ dst += stride;
++ }
++}
++
++
++static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++    const int A=(8-x)*(8-y);
++    const int B=(  x)*(8-y);
++    const int C=(8-x)*(  y);
++    const int D=(  x)*(  y);
++    int i;
++    /* For h rows of 4 bytes: PICO output (reference formula below) is averaged with what dst already holds. */
++    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++    PICO_PUT_W(PICO_COEFF0_B, 32); /* NOTE(review): presumably the rounding term (32 = half of 64) -- confirm in pico.h */
++    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++    PICO_PUT_W(PICO_COEFF1_B, 0);
++    PICO_PUT_W(PICO_COEFF2_A, 0);
++    PICO_PUT_W(PICO_COEFF2_B, 0);
++    PICO_PUT_W(PICO_CONFIG,
++               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++               | PICO_COEFF_FRAC_BITS(6)
++               | PICO_OFFSET_FRAC_BITS(6));
++    /* The weights A+B+C+D always sum to 64, which lines up with the 6 fractional bits requested above. */
++    for(i=0; i<h; i++)
++    {
++        /*
++          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++          dst+= stride;
++          src+= stride;
++        */
++        /* Loads go through LD32 like the put variant; src1 has src[4] on top and src[stride] at the bottom, shifted as unsigned. */
++        int src0 = LD32(src);
++        int src1 = (int)(((uint32_t)src[4] << 24) | src[stride]);
++        int src2 = LD32(src + stride + 1);
++
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        src += stride;
++        ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
++        dst += stride;
++    }
++}
++
++static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++    const int A=(8-x)*(8-y);
++    const int B=(  x)*(8-y);
++    const int C=(8-x)*(  y);
++    const int D=(  x)*(  y);
++    int i;
++    /* For h rows of 8 bytes, handled as two 4-byte halves: PICO output is averaged with what dst already holds. */
++    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++    PICO_PUT_W(PICO_COEFF0_B, 32); /* NOTE(review): presumably the rounding term (32 = half of 64) -- confirm in pico.h */
++    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++    PICO_PUT_W(PICO_COEFF1_B, 0);
++    PICO_PUT_W(PICO_COEFF2_A, 0);
++    PICO_PUT_W(PICO_COEFF2_B, 0);
++    PICO_PUT_W(PICO_CONFIG,
++               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++               | PICO_COEFF_FRAC_BITS(6)
++               | PICO_OFFSET_FRAC_BITS(6));
++    /* The weights A+B+C+D always sum to 64, which lines up with the 6 fractional bits requested above. */
++    for(i=0; i<h; i++)
++    {
++        /*
++          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++          OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
++          OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
++          OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
++          OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
++          dst+= stride;
++          src+= stride;
++        */
++        int src0 = LD32(src);
++        int src1 = (volatile int)(((uint32_t)src[4] << 24) | src[stride]); /* NOTE(review): volatile on a cast does not change the value; kept as found -- confirm it is not a compiler workaround */
++        int src2 = LD32(src + stride + 1);
++        /* Left half: output bytes 0..3. The byte shifts are done as unsigned to stay clear of the sign bit. */
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
++        /* Right half: same packing, shifted 4 bytes along both source rows. */
++        src0 = LD32(src + 4);
++        src1 = (int)(((uint32_t)src[8] << 24) | src[stride + 4]);
++        src2 = LD32(src + stride + 5);
++
++        PICO_MVRC_W(PICO_INPIX0, src0);
++        PICO_MVRC_W(PICO_INPIX1, src1);
++        PICO_MVRC_W(PICO_INPIX2, src2);
++        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++        src += stride;
++        ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
++        dst += stride;
++    }
++}
++
++static struct pico_config_t h264_qpel4_h_lowpass_config = { /* PICO setup loaded by put_/avg_h264_qpel4_h_lowpass_pico */
++ .input_mode = PICO_HOR_FILTER_MODE,
++ .output_mode = PICO_PLANAR_MODE,
++ .coeff_frac_bits = 5,
++ .offset_frac_bits = 5,
++ .coeff0_0 = 1, /* coeff0_0..2 then coeff1_0..2 hold the taps 1,-5,20,20,-5,1 of the reference filter */
++ .coeff0_1 = -5,
++ .coeff0_2 = 20,
++ .coeff0_3 = 16, /* NOTE(review): presumably the rounding offset (16 = half of 1<<5) -- confirm in pico.h */
++ .coeff1_0 = 20,
++ .coeff1_1 = -5,
++ .coeff1_2 = 1,
++ .coeff1_3 = 0,
++ .coeff2_0 = 0, /* third coefficient set is all zero */
++ .coeff2_1 = 0,
++ .coeff2_2 = 0,
++ .coeff2_3 = 0
++};
++
++
++
++static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ const int h=4;
++ int i;
++ /* Filters 4 rows of 4 pixels horizontally on the PICO and stores each row as one 32-bit word. */
++ set_pico_config(&h264_qpel4_h_lowpass_config);
++ /* The commented-out code below is the scalar formula this loop stands in for. */
++ for(i=0; i<h; i++){
++
++ /*
++ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
++ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
++ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
++ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
++ dst+=dstStride;\
++ src+=srcStride;\ */
++ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); /* bytes src[-2..1] */
++ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); /* bytes src[2..9], fetched as two 32-bit loads */
++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++ src += srcStride;
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++ dst += dstStride;
++ }
++}
++
++static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++ const int h=4;
++ int i;
++ /* Filters 4 rows of 4 pixels horizontally on the PICO and averages each result word with the word already in dst. */
++ set_pico_config(&h264_qpel4_h_lowpass_config);
++ /* The commented-out code below is the scalar formula this loop stands in for. */
++ for(i=0; i<h; i++){
++
++ /*
++ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
++ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
++ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
++ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
++ dst+=dstStride;\
++ src+=srcStride;\ */
++
++ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); /* bytes src[-2..1] */
++ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); /* bytes src[2..9], fetched as two 32-bit loads */
++ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
++ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++ src += srcStride;
++ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
++ dst += dstStride;
++ }
++}
++
++static struct pico_config_t h264_qpel4_v_lowpass_config1 = { /* PICO setup loaded at the start of put_h264_qpel4_v_lowpass_pico */
++ .input_mode = PICO_VERT_FILTER_MODE,
++ .output_mode = PICO_PACKED_MODE,
++ .coeff_frac_bits = 5,
++ .offset_frac_bits = 5,
++ .coeff0_0 = 1, /* all three coefficient sets below carry the same values 1, -5, 20, 16 */
++ .coeff0_1 = -5,
++ .coeff0_2 = 20,
++ .coeff0_3 = 16,
++ .coeff1_0 = 1,
++ .coeff1_1 = -5,
++ .coeff1_2 = 20,
++ .coeff1_3 = 16,
++ .coeff2_0 = 1,
++ .coeff2_1 = -5,
++ .coeff2_2 = 20,
++ .coeff2_3 = 16
++};
++
++
++
++static struct pico_config_t h264_qpel4_v_lowpass_config2 = { /* same coefficients as h264_qpel4_h_lowpass_config, but with vertical input mode */
++ .input_mode = PICO_VERT_FILTER_MODE,
++ .output_mode = PICO_PLANAR_MODE,
++ .coeff_frac_bits = 5,
++ .offset_frac_bits = 5,
++ .coeff0_0 = 1, /* coeff0_0..2 then coeff1_0..2 hold the taps 1,-5,20,20,-5,1 */
++ .coeff0_1 = -5,
++ .coeff0_2 = 20,
++ .coeff0_3 = 16,
++ .coeff1_0 = 20,
++ .coeff1_1 = -5,
++ .coeff1_2 = 1,
++ .coeff1_3 = 0,
++ .coeff2_0 = 0, /* third coefficient set is all zero */
++ .coeff2_1 = 0,
++ .coeff2_2 = 0,
++ .coeff2_3 = 0
++};
++
++static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++
++ /*
++ const int w=4;
++ uint8_t *cm = cropTbl + MAX_NEG_CROP;
++ int i;
++ for(i=0; i<w; i++)
++ {
++ const int srcB= src[-2*srcStride];\
++ const int srcA= src[-1*srcStride];\
++ const int src0= src[0 *srcStride];\
++ const int src1= src[1 *srcStride];\
++ const int src2= src[2 *srcStride];\
++ const int src3= src[3 *srcStride];\
++ const int src4= src[4 *srcStride];\
++ const int src5= src[5 *srcStride];\
++ const int src6= src[6 *srcStride];\
++ OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
++ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
++ OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
++ OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
++ dst++;\
++ src++;\
++ */
++
++ set_pico_config(&h264_qpel4_v_lowpass_config1);
++
++ {
++ int srcB= LD32(src - 2*srcStride);
++ int srcA= LD32(src - 1*srcStride);
++ int src0= LD32(src + 0 *srcStride);
++ int src1= LD32(src + 1 *srcStride);
++ int src2= LD32(src + 2 *srcStride);
++ int src3= LD32(src + 3 *srcStride);
++ int src4= LD32(src + 4 *srcStride);
++ int src5= LD32(src + 5 *srcStride);
++ int src6= LD32(src + 6 *srcStride);
++
++ /* First compute the leftmost three columns */
++ PICO_MVRC_W(PICO_INPIX0, srcB);
++ PICO_MVRC_W(PICO_INPIX1, srcA);
++ PICO_MVRC_W(PICO_INPIX2, src0);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX0, src3);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++ dst += dstStride;
++ PICO_MVRC_W(PICO_INPIX0, srcA);
++ PICO_MVRC_W(PICO_INPIX1, src0);
++ PICO_MVRC_W(PICO_INPIX2, src1);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_MVRC_W(PICO_INPIX1, src3);
++ PICO_MVRC_W(PICO_INPIX0, src4);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++ dst += dstStride;
++ PICO_MVRC_W(PICO_INPIX0, src0);
++ PICO_MVRC_W(PICO_INPIX1, src1);
++ PICO_MVRC_W(PICO_INPIX2, src2);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_MVRC_W(PICO_INPIX1, src4);
++ PICO_MVRC_W(PICO_INPIX0, src5);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++ dst += dstStride;
++ PICO_MVRC_W(PICO_INPIX0, src1);
++ PICO_MVRC_W(PICO_INPIX1, src2);
++ PICO_MVRC_W(PICO_INPIX2, src3);
++ PICO_OP(0, 0, 0, 3, 6);
++ PICO_MVRC_W(PICO_INPIX2, src4);
++ PICO_MVRC_W(PICO_INPIX1, src5);
++ PICO_MVRC_W(PICO_INPIX0, src6);
++ PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++ /* Now compute the last column */
++
++ union wordbytes {
++ int word;
++ struct {
++ unsigned int t:8;
++ unsigned int u:8;
++ unsigned int l:8;
++ unsigned int b:8;
++ } bytes; } tmp1, tmp2, tmp3;
++
++
++ tmp1.bytes.t = srcB;
++ tmp1.bytes.u = src1;
++ tmp1.bytes.l = src4;
++
++ tmp2.bytes.t = srcA;
++ tmp2.bytes.u = src2;
++ tmp2.bytes.l = src5;
++
++ tmp3.bytes.t = src0;
++ tmp3.bytes.u = src3;
++ tmp3.bytes.l = src6;
++
++ PICO_MVRC_W(PICO_INPIX0, tmp1.word);
++ PICO_MVRC_W(PICO_INPIX1, tmp2.word);
++ PICO_MVRC_W(PICO_INPIX2, tmp3.word);