2 files changed, 6450 insertions, 2 deletions
diff --git a/packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch b/packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch
new file mode 100644
index 0000000000..800f43e8eb
--- /dev/null
+++ b/packages/mplayer/files/mplayer-1.0rc1-atmel.2.patch
@@ -0,0 +1,6444 @@
+ cfg-common.h                     |    4 +
+ cfg-mencoder.h                   |    4 +
+ cfg-mplayer.h                    |    4 +
+ configure                        |   13 +-
+ libaf/af_format.c                |    7 +
+ libavcodec/Makefile              |    7 +
+ libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++
+ libavcodec/avr32/fdct.S          |  541 ++++++++
+ libavcodec/avr32/h264idct.S      |  451 +++++++
+ libavcodec/avr32/idct.S          |  829 ++++++++++++
+ libavcodec/avr32/mc.S            |  434 ++++++
+ libavcodec/avr32/pico.h          |  260 ++++
+ libavcodec/bitstream.h           |   77 +-
+ libavcodec/dsputil.c             |    3 +
+ libavcodec/h264.c                |   15 +
+ libavutil/common.h               |   16 +
+ libavutil/internal.h             |    9 +
+ libfaad2/common.h                |    2 +-
+ libmpcodecs/ad_libmad.c          |    5 +
+ libswscale/pico-avr32.h          |  137 ++
+ libswscale/swscale_internal.h    |    2 +-
+ libswscale/yuv2rgb.c             |   14 +
+ libswscale/yuv2rgb_avr32.c       |  416 ++++++
+ libvo/vo_fbdev2.c                |  101 ++-
+ version.sh                       |    2 +-
+ 25 files changed, 6011 insertions(+), 20 deletions(-)
+ create mode 100644 libavcodec/avr32/dsputil_avr32.c
+ create mode 100644 libavcodec/avr32/fdct.S
+ create mode 100644 libavcodec/avr32/h264idct.S
+ create mode 100644 libavcodec/avr32/idct.S
+ create mode 100644 libavcodec/avr32/mc.S
+ create mode 100644 libavcodec/avr32/pico.h
+ create mode 100644 libswscale/pico-avr32.h
+ create mode 100644 libswscale/yuv2rgb_avr32.c
+
+diff --git a/cfg-common.h b/cfg-common.h
+index 780df38..7d878a8 100644
+--- a/cfg-common.h
++++ b/cfg-common.h
+@@ -235,6 +235,10 @@
+ 	{"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
+ 	{"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+ 
++#ifdef ARCH_AVR32
++        {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
++        {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
++#endif
+ 	// draw by slices or whole frame (useful with libmpeg2/libavcodec)
+ 	{"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+ 	{"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
+diff --git a/cfg-mencoder.h b/cfg-mencoder.h
+index 411b748..addf791 100644
+--- a/cfg-mencoder.h
++++ b/cfg-mencoder.h
+@@ -5,6 +5,10 @@
+ 
+ #include "cfg-common.h"
+ 
++#ifdef ARCH_AVR32
++extern int avr32_use_pico;
++#endif
++
+ #ifdef USE_FAKE_MONO
+ extern int fakemono; // defined in dec_audio.c
+ #endif
+diff --git a/cfg-mplayer.h b/cfg-mplayer.h
+index 62b6eac..31499c2 100644
+--- a/cfg-mplayer.h
++++ b/cfg-mplayer.h
+@@ -4,6 +4,10 @@
+ 
+ #include "cfg-common.h"
+ 
++#ifdef ARCH_AVR32
++extern int avr32_use_pico;
++#endif
++
+ extern int noconsolecontrols;
+ 
+ #if defined(HAVE_FBDEV)||defined(HAVE_VESA)
+diff --git a/configure b/configure
+index 29002c8..56c6fe4 100755
+--- a/configure
++++ b/configure
+@@ -1203,6 +1203,15 @@ EOF
+     _optimizing="$proc"
+     ;;
+ 
++  avr32)
++    _def_arch='#define ARCH_AVR32'
++    _target_arch='TARGET_ARCH_AVR32 = yes'
++    iproc='avr32'
++    proc=''
++    _march=''
++    _mcpu=''
++    _optimizing=''
++    ;;
+   arm|armv4l|armv5tel)
+     _def_arch='#define ARCH_ARMV4L 1'
+     _target_arch='TARGET_ARCH_ARMV4L = yes'
+@@ -1533,7 +1542,7 @@ echores $_named_asm_args
+ # Checking for CFLAGS
+ _stripbinaries=yes
+ if test "$_profile" != "" || test "$_debug" != "" ; then
+-  CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile"
++  CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile"
+   if test "$_cc_major" -ge "3" ; then
+     CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'`
+   fi
+@@ -3794,7 +3803,7 @@ fi
+ 
+ 
+ echocheck "X11 headers presence"
+-  for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do
++  for I in `echo $_inc_extra | sed s/-I//g`; do
+     if test -f "$I/X11/Xlib.h" ; then
+       _inc_x11="-I$I"
+       _x11_headers="yes"
+diff --git a/libaf/af_format.c b/libaf/af_format.c
+index e5b7cc9..5d7ea6d 100644
+--- a/libaf/af_format.c
++++ b/libaf/af_format.c
+@@ -20,7 +20,14 @@
+ // Integer to float conversion through lrintf()
+ #ifdef HAVE_LRINTF
+ #include <math.h>
++
++#ifdef ARCH_AVR32
++#define lrintf(x) ((long int)rint(x))   /* same result type as the lrintf() prototype in the #else branch */
++#define llrint(x) ((long long)rint(x))  /* whole expansion parenthesised so it is safe inside larger expressions */
++#else
+ long int lrintf(float);
++#endif
++
+ #else
+ #define lrintf(x) ((int)(x))
+ #endif
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index 17b6c45..8e1dc96 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC)              += sparc/dsputil_vis.o \
+ 
+ sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc
+ 
++# avr32 specific stuff
++ifeq ($(TARGET_ARCH_AVR32),yes)
++ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o
++OBJS += avr32/dsputil_avr32.o
++endif
++
+ # sun mediaLib specific stuff
+ OBJS-$(HAVE_MLIB)                      += mlib/dsputil_mlib.o \
+ 
+@@ -419,6 +425,7 @@ tests: apiexample $(TESTS)
+ clean::
+ 	rm -f \
+ 	   i386/*.o i386/*~ \
++	   avr32/*.o avr32/*~ \
+ 	   armv4l/*.o armv4l/*~ \
+ 	   mlib/*.o mlib/*~ \
+ 	   alpha/*.o alpha/*~ \
+diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c
+new file mode 100644
+index 0000000..200284d
+--- /dev/null
++++ b/libavcodec/avr32/dsputil_avr32.c
+@@ -0,0 +1,2678 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++#include "../dsputil.h"
++#include "pico.h"
++
++int avr32_use_pico = 1;
++
++//#define CHECK_DSP_FUNCS_AGAINST_C
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++#define DSP_FUNC_NAME(name) test_ ## name
++#else
++#define DSP_FUNC_NAME(name) name
++#endif
++
++union doubleword {
++  int64_t doubleword;
++  struct {
++    int32_t top;
++    int32_t bottom;
++  } words; 
++};
++
++#undef  LD16
++#undef  LD32
++#undef  LD64
++  
++#define LD16(a) (*((uint16_t*)(a)))
++#define LD32(a) (*((uint32_t*)(a)))
++#define LD64(a) (*((uint64_t*)(a)))
++#define LD64_UNALIGNED(a) /* two 32-bit loads joined via union doubleword; evaluates (a) twice */ \
++  ({ union doubleword __tmp__; \
++   __tmp__.words.top = LD32(a); \
++   __tmp__.words.bottom = LD32((a) + 4); \
++   __tmp__.doubleword; })
++
++#undef  ST32
++#undef  ST16
++
++#define ST16(a, b) *((uint16_t*)(a)) = (b)
++#define ST32(a, b) *((uint32_t*)(a)) = (b)
++
++#undef rnd_avg32
++#define rnd_avg32(a, b) \
++  ({ uint32_t __tmp__;\
++     asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\
++     __tmp__;})
++
++void idct_avr32(DCTELEM *data);
++void fdct_avr32(DCTELEM *data);
++
++void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
++void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
++
++void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
++void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
++
++#define extern_dspfunc(PFX, NUM) \
++    void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h );     \
++    void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h );  \
++    void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h );  \
++    void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
++
++extern_dspfunc(put, 8);
++extern_dspfunc(put_no_rnd, 8);
++extern_dspfunc(avg, 8);
++extern_dspfunc(avg_no_rnd, 8);
++#undef extern_dspfunc
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++#define extern_dspfunc(PFX, NUM)                                        \
++  void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++  void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++  void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++  void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
++
++extern_dspfunc(put, 4);
++extern_dspfunc(put_no_rnd, 4);
++extern_dspfunc(put, 8);
++extern_dspfunc(put_no_rnd, 8);
++extern_dspfunc(put, 16);
++extern_dspfunc(put_no_rnd, 16);
++extern_dspfunc(avg, 8);
++extern_dspfunc(avg_no_rnd, 8);
++extern_dspfunc(avg, 16);
++extern_dspfunc(avg_no_rnd, 16);
++
++
++#undef extern_dspfunc
++#define extern_dspfunc(PFX, NUM) \
++void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride);  \
++
++extern_dspfunc(put_h264_qpel,  16);
++extern_dspfunc(put_h264_qpel,  8);
++extern_dspfunc(put_h264_qpel,  4);
++extern_dspfunc(avg_h264_qpel,  16);
++extern_dspfunc(avg_h264_qpel,  8);
++extern_dspfunc(avg_h264_qpel,  4);
++
++#undef extern_dspfunc
++
++void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++                         
++void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++
++
++void dump_block8(uint8_t *block, int line_size, int h);
++void dump_block4(uint8_t *block, int line_size, int h);
++void dump_block(uint8_t *block, int line_size, int h, int w);
++
++void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, 
++                  int h, char *name, int max_dev);
++void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, 
++                  int h, char *name, int max_dev);
++void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, 
++                 int h, int width, char *name, int max_dev);
++
++#define PIXOP2( OPNAME, OP ) \
++void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
++        pixels+=line_size;\
++        block +=line_size;\
++    }\
++}\
++void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        uint32_t a,b;\
++        a= LD32(&src1[i*src_stride1  ]);\
++        b= LD32(&src2[i*src_stride2  ]);\
++        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
++        a= LD32(&src1[i*src_stride1+4]);\
++        b= LD32(&src2[i*src_stride2+4]);\
++        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
++    }\
++}\
++\
++void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        uint32_t a,b;\
++        a= LD32(&src1[i*src_stride1  ]);\
++        b= LD32(&src2[i*src_stride2  ]);\
++        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
++    }\
++}\
++\
++void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
++    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
++}\
++
++#else
++#define PIXOP2( OPNAME, OP ) \
++static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
++        pixels+=line_size;\
++        block +=line_size;\
++    }\
++}\
++static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
++        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
++        pixels+=line_size;\
++        block +=line_size;\
++    }\
++}\
++static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
++        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
++        OP(*((uint32_t*)(block+8)), LD32(pixels+8));\
++        OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
++        pixels+=line_size;\
++        block +=line_size;\
++    }\
++}\
++static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        uint32_t a,b;\
++        a= LD32(&src1[i*src_stride1  ]);\
++        b= LD32(&src2[i*src_stride2  ]);\
++        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
++        a= LD32(&src1[i*src_stride1+4]);\
++        b= LD32(&src2[i*src_stride2+4]);\
++        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
++    }\
++}\
++\
++static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        uint32_t a,b;\
++        a= LD32(&src1[i*src_stride1  ]);\
++        b= LD32(&src2[i*src_stride2  ]);\
++        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
++    }\
++}\
++\
++static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
++    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
++}\
++
++#endif
++
++#define op_avg(a, b) a = rnd_avg32(a, b)
++#define op_put(a, b) a = b
++
++PIXOP2(avg, op_avg)
++PIXOP2(put, op_put)
++#undef op_avg
++#undef op_put
++
++
++
++static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++  int i;
++  for(i=0; i<h; i++)
++    {
++      ST32(dst   , LD32(src   ));
++      dst+=dstStride;
++      src+=srcStride;
++    }
++}
++
++static void clear_blocks_avr32(DCTELEM *blocks) /* zero-fills the 6*64 DCTELEMs starting at blocks (presumably 16-bit elements; TODO confirm) */
++{
++  int n = 12;            /* loop passes; each pass issues four stm stores */
++  uint64_t tmp1, tmp2;   /* scratch register pairs, zeroed inside the asm */
++  blocks += 6*64;        /* start 6*64 elements in; "stm --%3" works downwards from here */
++  asm volatile ( "mov\t%1, 0\n"
++                 "mov\t%m1, 0\n"
++                 "mov\t%2, 0\n"
++                 "mov\t%m2, 0\n"
++                 "0:\n"
++                 "stm\t--%3, %1, %m1, %2, %m2\n"
++                 "stm\t--%3, %1, %m1, %2, %m2\n"
++                 "stm\t--%3, %1, %m1, %2, %m2\n"
++                 "stm\t--%3, %1, %m1, %2, %m2\n"
++                 "sub\t%0, 1\n"
++                 "brne\t0b\n"
++                 : "+r"(n), "=&r"(tmp1), "=&r"(tmp2), "+r"(blocks)
++                 : /* no inputs */ : "memory", "cc"); /* asm stores through %3 and "sub" updates the flags */
++}
++
++
++static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++  int i;
++  for(i=0; i<h; i++)
++    {
++      ST32(dst   , LD32(src   ));
++      ST32(dst+4 , LD32(src+4 ));
++      dst+=dstStride;
++      src+=srcStride;
++    }
++}
++
++static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++  int i;
++  for(i=0; i<h; i++)
++    {
++      ST32(dst   , LD32(src   ));
++      ST32(dst+4 , LD32(src+4 ));
++      ST32(dst+8 , LD32(src+8 ));
++      ST32(dst+12, LD32(src+12));
++      dst+=dstStride;
++      src+=srcStride;
++    }
++}
++
++
++static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y);
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF0_B, 32);
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0);
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++  
++  for(i=0; i<h; i++)
++    {
++      
++      int src0 = LD32(src);
++      int src1 = LD32(src + stride);
++
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
++      src += stride;
++      ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0));
++      dst += stride;
++    }
++}
++
++
++static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y);   /* bilinear weights; A+B+C+D == 64 for any x,y */
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  /* Program the weights and filter setup into the PICO registers once, before the row loop. */
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding term (half of 1<<6) - TODO confirm against pico.h */
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0);
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG,
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++
++  for(i=0; i<h; i++)
++    {
++      /* Reference (plain C) form of one output row:
++        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++        dst+= stride;
++        src+= stride;
++      */
++      /* Input words: src[0..3] | src[4] (top byte) and src[stride] (low byte) | src[stride+1..stride+4]. */
++      int src0 = LD32(src);
++      int src1 = (((int)src[4] << 24) | (int)src[stride]);
++      int src2 = LD32(src + stride + 1);
++
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      src += stride;
++      ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* four result bytes written as one 32-bit word */
++
++      dst += stride;
++    }
++}
++
++static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y);
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF0_B, 32);
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0);
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++
++  for(i=0; i<h; i++)
++    {
++      /*
++        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
++        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
++        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
++        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
++        dst+= stride;
++        src+= stride;
++      */  
++      int src0 = LD32(src);
++      int src1 = (((int)src[4] << 24) | (int)src[stride]);
++      int src2 = LD32(src + stride + 1);
++      
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++      
++      src0 = LD32(src + 4);
++      src1 = (src[8] << 24) | src[stride + 4];
++      src2 = LD32(src + stride + 5);
++      
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      src += stride;
++      ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
++
++      dst += stride;
++    }
++}
++
++
++static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y);
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF0_B, 32);
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0);
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++  
++  for(i=0; i<h; i++)
++    {
++      int src0 = LD32(src);
++      int src1 = LD32(src + stride);
++      
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
++      src += stride;
++      ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0)));
++      dst += stride;
++    }
++}
++
++
++static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y);   /* bilinear weights; A+B+C+D == 64 for any x,y */
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  /* Program the weights and filter setup into the PICO registers once, before the row loop. */
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding term (half of 1<<6) - TODO confirm against pico.h */
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0);
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG,
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++
++  for(i=0; i<h; i++)
++    {
++      /* Reference (plain C) form of one output row:
++        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++        dst+= stride;
++        src+= stride;
++      */
++      /* Input words: src[0..3] | src[4] (top byte) and src[stride] (low byte) | src[stride+1..stride+4]. */
++      int src0 = LD32(src);
++      int src1 = (int)((src[4] << 24) | src[stride]);
++      int src2 = LD32(src + stride + 1);
++
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      src += stride;
++      ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); /* average the new row with what dst already holds */
++      dst += stride;
++    }
++}
++
++static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y);
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF0_B, 32);
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0);
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++
++  for(i=0; i<h; i++)
++    {
++      /*
++        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
++        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
++        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
++        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
++        dst+= stride;
++        src+= stride;
++      */  
++      int src0 = *((int *)src);
++      int src1 = (volatile int)((src[4] << 24) | src[stride]);
++      int src2 = *((int *)(src + stride + 1));
++
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
++
++      src0 = *((int *)(src + 4));
++      src1 = (int)((src[8] << 24) | src[stride + 4]);
++      src2 = *((int *)(src + stride + 5));
++      
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      src += stride;
++      ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
++      dst += stride;
++    }
++}
++
++static struct pico_config_t h264_qpel4_h_lowpass_config = { 
++  .input_mode = PICO_HOR_FILTER_MODE,
++  .output_mode = PICO_PLANAR_MODE,
++  .coeff_frac_bits = 5,
++  .offset_frac_bits = 5,
++  .coeff0_0 = 1,
++  .coeff0_1 = -5,
++  .coeff0_2 = 20,
++  .coeff0_3 = 16,
++  .coeff1_0 = 20,
++  .coeff1_1 = -5,
++  .coeff1_2 = 1,
++  .coeff1_3 = 0,
++  .coeff2_0 = 0,
++  .coeff2_1 = 0,
++  .coeff2_2 = 0,
++  .coeff2_3 = 0 
++};
++
++
++
++static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  const int h=4;
++  int i;
++    
++  set_pico_config(&h264_qpel4_h_lowpass_config);
++
++  for(i=0; i<h; i++){
++    
++    /*
++      OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
++      OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
++      OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
++      OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
++      dst+=dstStride;\
++      src+=srcStride;\ */
++    PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
++    PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
++    PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
++    PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++    PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++    PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++    src += srcStride;
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++    dst += dstStride;
++  }
++}
++
++static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  const int h=4;
++  int i;
++  /* Horizontal lowpass over h (= 4) rows using the PICO macros; each result word is merged into dst with rnd_avg32. */
++  set_pico_config(&h264_qpel4_h_lowpass_config);
++  
++  for(i=0; i<h; i++){
++    
++    /* Scalar reference for one row, kept from the original comment:
++      OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
++      OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
++      OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
++      OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
++      dst+=dstStride;\
++      src+=srcStride;\ */
++        
++    PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); /* input starts two pixels left of src, matching src[-2] in the reference */
++    PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); /* NOTE(review): presumably an 8-byte load (src[2..9]); reference needs only up to src[6] - confirm padding */
++    PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); /* four ops; every numeric argument advances by one per op */
++    PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7); /* NOTE(review): presumably one op per output pixel dst[0..3] - */
++    PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8); /* confirm the operand meaning against pico.h */
++    PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++    src += srcStride; /* advance to the next source row */
++    ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); /* combine the current dst word with the PICO output, then store */
++    dst += dstStride; /* advance to the next destination row */
++  }
++}
++
++static struct pico_config_t h264_qpel4_v_lowpass_config1 = { /* PICO setup for the vertical qpel4 lowpass: same coefficients in all three rows */
++  .input_mode = PICO_VERT_FILTER_MODE, /* vertical filter input mode */
++  .output_mode = PICO_PACKED_MODE, /* packed output mode */
++  .coeff_frac_bits = 5, /* NOTE(review): presumably 5 fractional bits, i.e. a scale of 1/32 - confirm in pico.h */
++  .offset_frac_bits = 5,
++  .coeff0_0 = 1, /* row 0: values 1, -5, 20 ... */
++  .coeff0_1 = -5,
++  .coeff0_2 = 20,
++  .coeff0_3 = 16, /* ... and 16; NOTE(review): presumably the rounding offset (half of 1<<5) - confirm */
++  .coeff1_0 = 1, /* row 1: identical to row 0 */
++  .coeff1_1 = -5,
++  .coeff1_2 = 20,
++  .coeff1_3 = 16,
++  .coeff2_0 = 1, /* row 2: identical to row 0 */
++  .coeff2_1 = -5,
++  .coeff2_2 = 20,
++  .coeff2_3 = 16 
++};
++
++
++
++static struct pico_config_t h264_qpel4_v_lowpass_config2 = { /* second PICO setup for the vertical qpel4 lowpass: mirrored rows 0/1, row 2 zeroed */
++  .input_mode = PICO_VERT_FILTER_MODE, /* vertical filter input mode */
++  .output_mode = PICO_PLANAR_MODE, /* planar output mode */
++  .coeff_frac_bits = 5, /* NOTE(review): presumably 5 fractional bits, i.e. a scale of 1/32 - confirm in pico.h */
++  .offset_frac_bits = 5,
++  .coeff0_0 = 1, /* row 0: values 1, -5, 20 ... */
++  .coeff0_1 = -5,
++  .coeff0_2 = 20,
++  .coeff0_3 = 16, /* ... and 16; NOTE(review): presumably the rounding offset (half of 1<<5) - confirm */
++  .coeff1_0 = 20, /* row 1: row 0's first three values in reverse order (20, -5, 1) ... */
++  .coeff1_1 = -5,
++  .coeff1_2 = 1,
++  .coeff1_3 = 0, /* ... with a zero fourth entry */
++  .coeff2_0 = 0, /* row 2: all zero */
++  .coeff2_1 = 0,
++  .coeff2_2 = 0,
++  .coeff2_3 = 0 
++};
++
++static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++
++  /*
++    const int w=4;
++    uint8_t *cm = cropTbl + MAX_NEG_CROP;
++    int i;
++    for(i=0; i<w; i++)
++    {
++    const int srcB= src[-2*srcStride];\
++    const int srcA= src[-1*srcStride];\
++    const int src0= src[0 *srcStride];\
++    const int src1= src[1 *srcStride];\
++    const int src2= src[2 *srcStride];\
++    const int src3= src[3 *srcStride];\
++    const int src4= src[4 *srcStride];\
++    const int src5= src[5 *srcStride];\
++    const int src6= src[6 *srcStride];\
++    OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
++    OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
++    OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
++    OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
++    dst++;\
++    src++;\
++  */  
++  
++  set_pico_config(&h264_qpel4_v_lowpass_config1);
++  
++  {
++    int srcB= LD32(src - 2*srcStride);
++    int srcA= LD32(src - 1*srcStride);
++    int src0= LD32(src + 0 *srcStride);
++    int src1= LD32(src + 1 *srcStride);
++    int src2= LD32(src + 2 *srcStride);
++    int src3= LD32(src + 3 *srcStride);
++    int src4= LD32(src + 4 *srcStride);
++    int src5= LD32(src + 5 *srcStride);
++    int src6= LD32(src + 6 *srcStride);
++    
++    /* First compute the leftmost three colums */
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX0, src3);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++    dst += dstStride;
++    PICO_MVRC_W(PICO_INPIX0, srcA);
++    PICO_MVRC_W(PICO_INPIX1, src0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_MVRC_W(PICO_INPIX1, src3);
++    PICO_MVRC_W(PICO_INPIX0, src4);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++    dst += dstStride;
++    PICO_MVRC_W(PICO_INPIX0, src0);
++    PICO_MVRC_W(PICO_INPIX1, src1);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_MVRC_W(PICO_INPIX1, src4);
++    PICO_MVRC_W(PICO_INPIX0, src5);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++    dst += dstStride;
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src4);
++    PICO_MVRC_W(PICO_INPIX1, src5);
++    PICO_MVRC_W(PICO_INPIX0, src6);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++    /* Now compute the last column */
++ 
++    union wordbytes {
++      int word;
++      struct  {
++        unsigned int t:8;
++        unsigned int u:8;
++        unsigned int l:8;
++        unsigned int b:8; 
++      } bytes; } tmp1, tmp2, tmp3;
++    
++    
++    tmp1.bytes.t = srcB;
++    tmp1.bytes.u = src1;
++    tmp1.bytes.l = src4;
++    
++    tmp2.bytes.t = srcA;
++    tmp2.bytes.u = src2;
++    tmp2.bytes.l = src5;
++
++    tmp3.bytes.t = src0;
++    tmp3.bytes.u = src3;
++    tmp3.bytes.l = src6;
++    
++    PICO_MVRC_W(PICO_INPIX0, tmp1.word);
++    PICO_MVRC_W(PICO_INPIX1, tmp2.word);
++    PICO_MVRC_W(PICO_INPIX2, tmp3.word);