38 files changed, 16265 insertions, 0 deletions
diff --git a/recipes/mplayer/files/Makefile-codec-cfg.patch b/recipes/mplayer/files/Makefile-codec-cfg.patch
new file mode 100644
index 0000000000..9ce22a8839
--- /dev/null
+++ b/recipes/mplayer/files/Makefile-codec-cfg.patch
@@ -0,0 +1,11 @@
+--- /tmp/Makefile	2008-09-24 19:24:26.000000000 +0200
++++ trunk/Makefile	2008-09-24 19:25:01.683198000 +0200
+@@ -752,7 +752,7 @@
+ 	$(CC) -o $@ $^ $(LDFLAGS_MPLAYER)
+ 
+ codec-cfg$(EXESUF): codec-cfg.c codec-cfg.h help_mp.h
+-	$(HOST_CC) -O -DCODECS2HTML $(EXTRA_INC) -o $@ $<
++	$(BUILD_CC) -O -DCODECS2HTML $(EXTRA_INC) -o $@ $<
+ 
+ codecs.conf.h: codec-cfg$(EXESUF) etc/codecs.conf
+ 	./$^ > $@
diff --git a/recipes/mplayer/files/Makefile.patch b/recipes/mplayer/files/Makefile.patch
new file mode 100644
index 0000000000..f0464b9176
--- /dev/null
+++ b/recipes/mplayer/files/Makefile.patch
@@ -0,0 +1,13 @@
+Index: MPlayer-1.0rc1/Makefile
+===================================================================
+--- MPlayer-1.0rc1.orig/Makefile
++++ MPlayer-1.0rc1/Makefile
+@@ -399,7 +399,7 @@ osdep/mplayer-rc.o: osdep/mplayer.rc
+ 	windres -o $@ osdep/mplayer.rc
+ 
+ codec-cfg: codec-cfg.c codec-cfg.h help_mp.h
+-	$(HOST_CC) -I. -DCODECS2HTML codec-cfg.c -o $@
++	$(BUILD_CC) -I. -DCODECS2HTML codec-cfg.c -o $@
+ 
+ codecs.conf.h: codec-cfg etc/codecs.conf
+ 	./codec-cfg ./etc/codecs.conf > $@
diff --git a/recipes/mplayer/files/armv5te/configh b/recipes/mplayer/files/armv5te/configh
new file mode 100644
index 0000000000..46c647e2d5
--- /dev/null
+++ b/recipes/mplayer/files/armv5te/configh
@@ -0,0 +1,6 @@
+#define HAVE_LLRINT 1
+#define HAVE_ROUNDF 1
+#define ARCH_ARMV4L 1
+#define ENABLE_ARMV4L 1
+#define HAVE_ARMV5TE 1
+#define ENABLE_ARMV5TE 1
diff --git a/recipes/mplayer/files/armv5te/configmak b/recipes/mplayer/files/armv5te/configmak
new file mode 100644
index 0000000000..aa9978515d
--- /dev/null
+++ b/recipes/mplayer/files/armv5te/configmak
@@ -0,0 +1,3 @@
+ARCH_ARMV4L=yes
+HAVE_ARMV5TE=yes
+
diff --git a/recipes/mplayer/files/armv6/configh b/recipes/mplayer/files/armv6/configh
new file mode 100644
index 0000000000..2301e723d6
--- /dev/null
+++ b/recipes/mplayer/files/armv6/configh
@@ -0,0 +1,8 @@
+#define HAVE_LLRINT 1
+#define HAVE_ROUNDF 1
+#define ARCH_ARMV4L 1
+#define ENABLE_ARMV4L 1
+#define HAVE_ARMV5TE 1
+#define ENABLE_ARMV5TE 1
+#define HAVE_ARMV6 1
+#define ENABLE_ARMV6 1
diff --git a/recipes/mplayer/files/armv6/configmak b/recipes/mplayer/files/armv6/configmak
new file mode 100644
index 0000000000..4db5dc0dfd
--- /dev/null
+++ b/recipes/mplayer/files/armv6/configmak
@@ -0,0 +1,3 @@
+ARCH_ARMV4L=yes
+HAVE_ARMV5TE=yes
+HAVE_ARMV6=yes
diff --git a/recipes/mplayer/files/armv7a/configh b/recipes/mplayer/files/armv7a/configh
new file mode 100644
index 0000000000..245e40f56a
--- /dev/null
+++ b/recipes/mplayer/files/armv7a/configh
@@ -0,0 +1,14 @@
+#define HAVE_LLRINT 1
+#define HAVE_ROUNDF 1
+#define ARCH_ARMV4L 1
+#define ENABLE_ARMV4L 1
+#define HAVE_ARMV5TE 1
+#define ENABLE_ARMV5TE 1
+#define HAVE_ARMV6 1
+#define ENABLE_ARMV6 1
+#define HAVE_ARMV6T2 1
+#define ENABLE_ARMV6T2 1
+#define HAVE_ARMVFP 1
+#define ENABLE_ARMVFP 1
+#define HAVE_NEON 1
+#define ENABLE_NEON 1
diff --git a/recipes/mplayer/files/armv7a/configmak b/recipes/mplayer/files/armv7a/configmak
new file mode 100644
index 0000000000..50d549f794
--- /dev/null
+++ b/recipes/mplayer/files/armv7a/configmak
@@ -0,0 +1,6 @@
+ARCH_ARMV4L=yes
+HAVE_ARMV5TE=yes
+HAVE_ARMV6=yes
+HAVE_ARMV6T2=yes
+HAVE_ARMVFP=yes
+HAVE_NEON=yes
diff --git a/recipes/mplayer/files/configh b/recipes/mplayer/files/configh
new file mode 100644
index 0000000000..2fe7658383
--- /dev/null
+++ b/recipes/mplayer/files/configh
@@ -0,0 +1,2 @@
+#define HAVE_LLRINT 1
+#define HAVE_ROUNDF 1
diff --git a/recipes/mplayer/files/configmak b/recipes/mplayer/files/configmak
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/recipes/mplayer/files/configmak
diff --git a/recipes/mplayer/files/disable-executable-stack-test.patch b/recipes/mplayer/files/disable-executable-stack-test.patch
new file mode 100644
index 0000000000..dc8871b6ae
--- /dev/null
+++ b/recipes/mplayer/files/disable-executable-stack-test.patch
@@ -0,0 +1,30 @@
+Removes the "noexecstack" check from configure so we don't end up with:
+
+     mplayer: error while loading shared libraries: libmad.so.0: cannot 
+     enable executable stack as shared object requires: Error 14
+
+at runtime.
+
+#
+# Patch managed by http://www.holgerschurig.de/patcher.html
+#
+
+--- MPlayer-1.0pre8/configure~disable-executable-stack-test
++++ MPlayer-1.0pre8/configure
+@@ -7193,15 +7193,7 @@
+ fi
+ 
+ echocheck "compiler support for noexecstack"
+-cat > $TMPC <<EOF
+-int main(void) { return 0; }
+-EOF
+-if cc_check -Wl,-z,noexecstack ; then
+-  _ld_extra="-Wl,-z,noexecstack $_ld_extra"
+-  echores "yes"
+-else
+-  echores "no"
+-fi
++echores "no"
+ 
+ echocheck "ftello()"
+ # if we don't have ftello use the osdep/ compatibility module
diff --git a/recipes/mplayer/files/imageon-video_out.patch b/recipes/mplayer/files/imageon-video_out.patch
new file mode 100644
index 0000000000..fcb7953c1b
--- /dev/null
+++ b/recipes/mplayer/files/imageon-video_out.patch
@@ -0,0 +1,20 @@
+--- mplayer/libvo/video_out.c.orig	2006-11-27 12:49:51.000000000 -0800
++++ mplayer/libvo/video_out.c	2006-11-27 12:41:59.000000000 -0800
+@@ -87,6 +87,7 @@ extern vo_functions_t video_out_syncfb;
+ extern vo_functions_t video_out_fbdev;
+ extern vo_functions_t video_out_fbdev2;
+ extern vo_functions_t video_out_w100;
++extern vo_functions_t video_out_imageon;
+ extern vo_functions_t video_out_svga;
+ extern vo_functions_t video_out_png;
+ extern vo_functions_t video_out_ggi;
+@@ -200,6 +201,9 @@ vo_functions_t* video_out_drivers[] =
+ #ifdef HAVE_W100
+ 	&video_out_w100,
+ #endif
++#ifdef HAVE_IMAGEON
++	&video_out_imageon,
++#endif
+ #ifdef HAVE_SVGALIB
+ 	&video_out_svga,
+ #endif
diff --git a/recipes/mplayer/files/libmpdemux-ogg-include-svn.patch b/recipes/mplayer/files/libmpdemux-ogg-include-svn.patch
new file mode 100644
index 0000000000..52b7029bc5
--- /dev/null
+++ b/recipes/mplayer/files/libmpdemux-ogg-include-svn.patch
@@ -0,0 +1,11 @@
+--- trunk/libmpdemux/Makefile.orig	2006-07-24 10:11:06.000000000 +0100
++++ trunk/libmpdemux/Makefile	2006-07-24 10:12:02.000000000 +0100
+@@ -208,7 +208,7 @@
+ OBJS	= $(SRCS:.c=.o)
+ OBJS   += $(CPLUSPLUSSRCS:.cpp=.o)
+ INCLUDE = -I.. -I../loader $(LIBAV_INC)
+-CFLAGS  = $(OPTFLAGS) $(INCLUDE) $(XMMS_CFLAGS) $(CDPARANOIA_INC) $(DVB_INC)
++CFLAGS  = $(INCLUDE) $(OPTFLAGS) $(XMMS_CFLAGS) $(CDPARANOIA_INC) $(DVB_INC)
+ CPLUSPLUSFLAGS  = $(CFLAGS) $(CPLUSPLUSINCLUDE) -D__STDC_LIMIT_MACROS
+ CPLUSPLUS = $(CC)
+ 
diff --git a/recipes/mplayer/files/makefile-nostrip-rc2.patch b/recipes/mplayer/files/makefile-nostrip-rc2.patch
new file mode 100644
index 0000000000..d7c147565f
--- /dev/null
+++ b/recipes/mplayer/files/makefile-nostrip-rc2.patch
@@ -0,0 +1,24 @@
+upstream: not applicable, I think (unless somebody claims there is a more portable way of stripping)
+
+Index: MPlayer-1.0rc2/Makefile
+===================================================================
+--- MPlayer-1.0rc2.orig/Makefile	2008-04-19 10:31:18.000000000 +0200
++++ MPlayer-1.0rc2/Makefile	2008-04-19 10:31:55.000000000 +0200
+@@ -253,7 +253,7 @@
+ 	if test -f $(CONFDIR)/codecs.conf ; then mv -f $(CONFDIR)/codecs.conf $(CONFDIR)/codecs.conf.old ; fi
+ 
+ install-mplayer: mplayer$(EXESUF)
+-	$(INSTALL) -m 755 $(INSTALLSTRIP) mplayer$(EXESUF) $(BINDIR)
++	$(INSTALL) -m 755 mplayer$(EXESUF) $(BINDIR)
+ 
+ install-mplayer-man:
+ 	for i in $(MAN_LANG); do \
+@@ -266,7 +266,7 @@
+ 	done
+ 
+ install-mencoder: mencoder$(EXESUF)
+-	$(INSTALL) -m 755 $(INSTALLSTRIP) mencoder$(EXESUF) $(BINDIR)
++	$(INSTALL) -m 755 mencoder$(EXESUF) $(BINDIR)
+ 	for i in $(MAN_LANG); do \
+ 		if test "$$i" = en ; then \
+ 			cd $(MANDIR)/man1 && ln -sf mplayer.1 mencoder.1 ; \
diff --git a/recipes/mplayer/files/makefile-nostrip-svn.patch b/recipes/mplayer/files/makefile-nostrip-svn.patch
new file mode 100644
index 0000000000..c684001792
--- /dev/null
+++ b/recipes/mplayer/files/makefile-nostrip-svn.patch
@@ -0,0 +1,11 @@
+--- /tmp/Makefile	2008-06-10 21:05:55.613676241 +0200
++++ trunk/Makefile	2008-06-10 21:06:30.701172833 +0200
+@@ -797,7 +797,7 @@
+ 	$(INSTALL) -d $(BINDIR) $(CONFDIR)
+ 
+ install-%: %$(EXESUF) install-dirs
+-	$(INSTALL) -m 755 $(INSTALLSTRIP) $< $(BINDIR)
++	$(INSTALL) -m 755 $< $(BINDIR)
+ 
+ install-mplayer-man: $(foreach lang,$(MAN_LANG_ALL),install-mplayer-man-$(lang))
+ install-mencoder-man: $(foreach lang,$(MAN_LANG_ALL),install-mencoder-man-$(lang))
diff --git a/recipes/mplayer/files/makefile-nostrip.patch b/recipes/mplayer/files/makefile-nostrip.patch
new file mode 100644
index 0000000000..604433938b
--- /dev/null
+++ b/recipes/mplayer/files/makefile-nostrip.patch
@@ -0,0 +1,24 @@
+upstream: not applicable, I think (unless somebody claims there is a more portable way of stripping)
+
+Index: MPlayer-1.0rc1/Makefile
+===================================================================
+--- MPlayer-1.0rc1.orig/Makefile
++++ MPlayer-1.0rc1/Makefile
+@@ -416,7 +416,7 @@ ifeq ($(VIDIX),yes)
+ 	$(MAKE) -C vidix install
+ endif
+ 	$(INSTALL) -d $(BINDIR)
+-	$(INSTALL) -m 755 $(INSTALLSTRIP) $(PRG) $(BINDIR)/$(PRG)
++	$(INSTALL) -m 755 $(PRG) $(BINDIR)/$(PRG)
+ ifeq ($(GUI),yes)
+ 	-ln -sf $(PRG) $(BINDIR)/gmplayer
+ endif
+@@ -430,7 +430,7 @@ endif
+ 		fi ; \
+ 	done
+ ifeq ($(MENCODER),yes)
+-	$(INSTALL) -m 755 $(INSTALLSTRIP) $(PRG_MENCODER) $(BINDIR)/$(PRG_MENCODER)
++	$(INSTALL) -m 755 $(PRG_MENCODER) $(BINDIR)/$(PRG_MENCODER)
+ 	for i in $(MAN_LANG); do \
+ 		if test "$$i" = en ; then \
+ 			cd $(MANDIR)/man1 && ln -sf mplayer.1 mencoder.1 ; \
diff --git a/recipes/mplayer/files/motion-comp-pld.patch b/recipes/mplayer/files/motion-comp-pld.patch
new file mode 100644
index 0000000000..6d4160c7bb
--- /dev/null
+++ b/recipes/mplayer/files/motion-comp-pld.patch
@@ -0,0 +1,163 @@
+Index: MPlayer-1.0rc2/libmpeg2/motion_comp_arm_s.S
+===================================================================
+--- MPlayer-1.0rc2.orig/libmpeg2/motion_comp_arm_s.S	2007-11-02 14:16:50.000000000 +0000
++++ MPlayer-1.0rc2/libmpeg2/motion_comp_arm_s.S	2007-11-02 14:23:53.000000000 +0000
+@@ -18,6 +18,14 @@
+ @ along with this program; if not, write to the Free Software
+ @ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ 
++#if !defined(__ARM_ARCH_2__) && !defined(__ARM_ARCH_3__) && !defined(__ARM_ARCH_3M__) && \
++    !defined(__ARM_ARCH_4__) && !defined(__ARM_ARCH_4T__) && \
++    !defined(__ARM_ARCH_5__) && !defined(__ARM_ARCH_5T__)
++#define PLD(code...)   code
++#else /* architectures older than ARMv5TE have no pld instruction: drop the prefetch */
++#define PLD(code...)
++#endif /* PLD: emits its argument only where the pld instruction is available */
++
+ 	.text
+ 	
+ @ ----------------------------------------------------------------
+@@ -25,7 +33,7 @@
+ 	.global MC_put_o_16_arm
+ MC_put_o_16_arm:
+ 	@@ void func(uint8_t * dest, const uint8_t * ref, int stride, int height)
+-	pld [r1]
++	PLD ( pld [r1] )
+         stmfd sp!, {r4-r11, lr} @ R14 is also called LR
+ 	and r4, r1, #3
+ 	adr r5, MC_put_o_16_arm_align_jt
+@@ -35,7 +43,7 @@
+ MC_put_o_16_arm_align0:
+ 	ldmia r1, {r4-r7}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	stmia r0, {r4-r7}
+ 	subs r3, r3, #1
+ 	add r0, r0, r2
+@@ -46,7 +54,7 @@
+ 	ldmia r1, {r4-r8}
+ 	add r1, r1, r2
+ 	mov r9, r4, lsr #(\shift)
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	mov r10, r5, lsr #(\shift)
+ 	orr r9, r9, r5, lsl #(32-\shift)
+ 	mov r11, r6, lsr #(\shift)
+@@ -85,7 +93,7 @@
+ 	.global MC_put_o_8_arm
+ MC_put_o_8_arm:
+ 	@@ void func(uint8_t * dest, const uint8_t * ref, int stride, int height)
+-	pld [r1]
++	PLD ( pld [r1] )
+         stmfd sp!, {r4-r10, lr} @ R14 is also called LR
+ 	and r4, r1, #3
+ 	adr r5, MC_put_o_8_arm_align_jt
+@@ -94,7 +102,7 @@
+ MC_put_o_8_arm_align0:
+ 	ldmia r1, {r4-r5}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	stmia r0, {r4-r5}
+ 	add r0, r0, r2
+ 	subs r3, r3, #1
+@@ -105,7 +113,7 @@
+ 	ldmia r1, {r4-r6}
+ 	add r1, r1, r2
+ 	mov r9, r4, lsr #(\shift)
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	mov r10, r5, lsr #(\shift)
+ 	orr r9, r9, r5, lsl #(32-\shift)
+ 	orr r10, r10, r6, lsl #(32-\shift)
+@@ -154,7 +162,7 @@
+ 	.global MC_put_x_16_arm
+ MC_put_x_16_arm:
+ 	@@ void func(uint8_t * dest, const uint8_t * ref, int stride, int height)
+-	pld [r1]
++	PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ 	and r4, r1, #3
+ 	adr r5, MC_put_x_16_arm_align_jt
+@@ -179,7 +187,7 @@
+ MC_put_x_16_arm_align0:
+ 	ldmia r1, {r4-r8}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	AVG_PW r7, r8
+ 	AVG_PW r6, r7
+ 	AVG_PW r5, r6
+@@ -193,7 +201,7 @@
+ 	and r1, r1, #0xFFFFFFFC
+ 1:	ldmia r1, {r4-r8}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	ADJ_ALIGN_QW 8, r4, r5, r6, r7, r8
+ 	AVG_PW r7, r8
+ 	AVG_PW r6, r7
+@@ -208,7 +216,7 @@
+ 	and r1, r1, #0xFFFFFFFC
+ 1:	ldmia r1, {r4-r8}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	ADJ_ALIGN_QW 16, r4, r5, r6, r7, r8
+ 	AVG_PW r7, r8
+ 	AVG_PW r6, r7
+@@ -223,7 +231,7 @@
+ 	and r1, r1, #0xFFFFFFFC
+ 1:	ldmia r1, {r4-r8}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	ADJ_ALIGN_QW 24, r4, r5, r6, r7, r8
+ 	AVG_PW r7, r8
+ 	AVG_PW r6, r7
+@@ -246,7 +254,7 @@
+ 	.global MC_put_x_8_arm
+ MC_put_x_8_arm:
+ 	@@ void func(uint8_t * dest, const uint8_t * ref, int stride, int height)
+-	pld [r1]
++	PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ 	and r4, r1, #3
+ 	adr r5, MC_put_x_8_arm_align_jt
+@@ -267,7 +275,7 @@
+ MC_put_x_8_arm_align0:
+ 	ldmia r1, {r4-r6}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	AVG_PW r5, r6
+ 	AVG_PW r4, r5
+ 	stmia r0, {r5-r6}
+@@ -279,7 +287,7 @@
+ 	and r1, r1, #0xFFFFFFFC
+ 1:	ldmia r1, {r4-r6}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	ADJ_ALIGN_DW 8, r4, r5, r6
+ 	AVG_PW r5, r6
+ 	AVG_PW r4, r5
+@@ -292,7 +300,7 @@
+ 	and r1, r1, #0xFFFFFFFC
+ 1:	ldmia r1, {r4-r6}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	ADJ_ALIGN_DW 16, r4, r5, r6
+ 	AVG_PW r5, r6
+ 	AVG_PW r4, r5
+@@ -305,7 +313,7 @@
+ 	and r1, r1, #0xFFFFFFFC
+ 1:	ldmia r1, {r4-r6}
+ 	add r1, r1, r2
+-	pld [r1]
++	PLD ( pld [r1] )
+ 	ADJ_ALIGN_DW 24, r4, r5, r6
+ 	AVG_PW r5, r6
+ 	AVG_PW r4, r5
diff --git a/recipes/mplayer/files/mplayer-1.0rc1-atmel.2.patch b/recipes/mplayer/files/mplayer-1.0rc1-atmel.2.patch
new file mode 100644
index 0000000000..800f43e8eb
--- /dev/null
+++ b/recipes/mplayer/files/mplayer-1.0rc1-atmel.2.patch
@@ -0,0 +1,6444 @@
+ cfg-common.h                     |    4 +
+ cfg-mencoder.h                   |    4 +
+ cfg-mplayer.h                    |    4 +
+ configure                        |   13 +-
+ libaf/af_format.c                |    7 +
+ libavcodec/Makefile              |    7 +
+ libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++
+ libavcodec/avr32/fdct.S          |  541 ++++++++
+ libavcodec/avr32/h264idct.S      |  451 +++++++
+ libavcodec/avr32/idct.S          |  829 ++++++++++++
+ libavcodec/avr32/mc.S            |  434 ++++++
+ libavcodec/avr32/pico.h          |  260 ++++
+ libavcodec/bitstream.h           |   77 +-
+ libavcodec/dsputil.c             |    3 +
+ libavcodec/h264.c                |   15 +
+ libavutil/common.h               |   16 +
+ libavutil/internal.h             |    9 +
+ libfaad2/common.h                |    2 +-
+ libmpcodecs/ad_libmad.c          |    5 +
+ libswscale/pico-avr32.h          |  137 ++
+ libswscale/swscale_internal.h    |    2 +-
+ libswscale/yuv2rgb.c             |   14 +
+ libswscale/yuv2rgb_avr32.c       |  416 ++++++
+ libvo/vo_fbdev2.c                |  101 ++-
+ version.sh                       |    2 +-
+ 25 files changed, 6011 insertions(+), 20 deletions(-)
+ create mode 100644 libavcodec/avr32/dsputil_avr32.c
+ create mode 100644 libavcodec/avr32/fdct.S
+ create mode 100644 libavcodec/avr32/h264idct.S
+ create mode 100644 libavcodec/avr32/idct.S
+ create mode 100644 libavcodec/avr32/mc.S
+ create mode 100644 libavcodec/avr32/pico.h
+ create mode 100644 libswscale/pico-avr32.h
+ create mode 100644 libswscale/yuv2rgb_avr32.c
+
+diff --git a/cfg-common.h b/cfg-common.h
+index 780df38..7d878a8 100644
+--- a/cfg-common.h
++++ b/cfg-common.h
+@@ -235,6 +235,10 @@
+ 	{"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
+ 	{"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+ 
++#ifdef ARCH_AVR32
++        {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
++        {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
++#endif
+ 	// draw by slices or whole frame (useful with libmpeg2/libavcodec)
+ 	{"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+ 	{"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
+diff --git a/cfg-mencoder.h b/cfg-mencoder.h
+index 411b748..addf791 100644
+--- a/cfg-mencoder.h
++++ b/cfg-mencoder.h
+@@ -5,6 +5,10 @@
+ 
+ #include "cfg-common.h"
+ 
++#ifdef ARCH_AVR32
++extern int avr32_use_pico;
++#endif
++
+ #ifdef USE_FAKE_MONO
+ extern int fakemono; // defined in dec_audio.c
+ #endif
+diff --git a/cfg-mplayer.h b/cfg-mplayer.h
+index 62b6eac..31499c2 100644
+--- a/cfg-mplayer.h
++++ b/cfg-mplayer.h
+@@ -4,6 +4,10 @@
+ 
+ #include "cfg-common.h"
+ 
++#ifdef ARCH_AVR32
++extern int avr32_use_pico;
++#endif
++
+ extern int noconsolecontrols;
+ 
+ #if defined(HAVE_FBDEV)||defined(HAVE_VESA)
+diff --git a/configure b/configure
+index 29002c8..56c6fe4 100755
+--- a/configure
++++ b/configure
+@@ -1203,6 +1203,15 @@ EOF
+     _optimizing="$proc"
+     ;;
+ 
++  avr32)
++    _def_arch='#define ARCH_AVR32'
++    _target_arch='TARGET_ARCH_AVR32 = yes'
++    iproc='avr32'
++    proc=''
++    _march=''
++    _mcpu=''
++    _optimizing=''
++    ;;
+   arm|armv4l|armv5tel)
+     _def_arch='#define ARCH_ARMV4L 1'
+     _target_arch='TARGET_ARCH_ARMV4L = yes'
+@@ -1533,7 +1542,7 @@ echores $_named_asm_args
+ # Checking for CFLAGS
+ _stripbinaries=yes
+ if test "$_profile" != "" || test "$_debug" != "" ; then
+-  CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile"
++  CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile"
+   if test "$_cc_major" -ge "3" ; then
+     CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'`
+   fi
+@@ -3794,7 +3803,7 @@ fi
+ 
+ 
+ echocheck "X11 headers presence"
+-  for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do
++  for I in `echo $_inc_extra | sed s/-I//g`; do
+     if test -f "$I/X11/Xlib.h" ; then
+       _inc_x11="-I$I"
+       _x11_headers="yes"
+diff --git a/libaf/af_format.c b/libaf/af_format.c
+index e5b7cc9..5d7ea6d 100644
+--- a/libaf/af_format.c
++++ b/libaf/af_format.c
+@@ -20,7 +20,14 @@
+ // Integer to float conversion through lrintf()
+ #ifdef HAVE_LRINTF
+ #include <math.h>
++
++#ifdef ARCH_AVR32
++#define lrintf(x) rint(x)
++#define llrint(x) (long long)rint(x) 
++#else
+ long int lrintf(float);
++#endif
++
+ #else
+ #define lrintf(x) ((int)(x))
+ #endif
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index 17b6c45..8e1dc96 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC)              += sparc/dsputil_vis.o \
+ 
+ sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc
+ 
++# avr32 specific stuff
++ifeq ($(TARGET_ARCH_AVR32),yes)
++ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o
++OBJS += avr32/dsputil_avr32.o
++endif
++
+ # sun mediaLib specific stuff
+ OBJS-$(HAVE_MLIB)                      += mlib/dsputil_mlib.o \
+ 
+@@ -419,6 +425,7 @@ tests: apiexample $(TESTS)
+ clean::
+ 	rm -f \
+ 	   i386/*.o i386/*~ \
++	   avr32/*.o avr32/*~ \
+ 	   armv4l/*.o armv4l/*~ \
+ 	   mlib/*.o mlib/*~ \
+ 	   alpha/*.o alpha/*~ \
+diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c
+new file mode 100644
+index 0000000..200284d
+--- /dev/null
++++ b/libavcodec/avr32/dsputil_avr32.c
+@@ -0,0 +1,2678 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++#include "../dsputil.h"
++#include "pico.h"
++
++int avr32_use_pico = 1;
++
++//#define CHECK_DSP_FUNCS_AGAINST_C
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++#define DSP_FUNC_NAME(name) test_ ## name
++#else
++#define DSP_FUNC_NAME(name) name
++#endif
++
++union doubleword {
++  int64_t doubleword;
++  struct {
++    int32_t top;
++    int32_t bottom;
++  } words; 
++};
++
++#undef  LD16
++#undef  LD32
++#undef  LD64
++  
++#define LD16(a) (*((uint16_t*)(a)))
++#define LD32(a) (*((uint32_t*)(a)))
++#define LD64(a) (*((uint64_t*)(a)))
++#define LD64_UNALIGNED(a) \
++  ({ union doubleword __tmp__; \
++   __tmp__.words.top = LD32(a); \
++   __tmp__.words.bottom = LD32(a + 4); \
++   __tmp__.doubleword; }) 
++
++#undef  ST32
++#undef  ST16
++
++#define ST16(a, b) *((uint16_t*)(a)) = (b)
++#define ST32(a, b) *((uint32_t*)(a)) = (b)
++
++#undef rnd_avg32
++#define rnd_avg32(a, b) \
++  ({ uint32_t __tmp__;\
++     asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\
++     __tmp__;})
++
++void idct_avr32(DCTELEM *data);
++void fdct_avr32(DCTELEM *data);
++
++void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
++void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
++
++void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
++void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
++
++#define extern_dspfunc(PFX, NUM) \
++    void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h );     \
++    void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h );  \
++    void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h );  \
++    void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
++
++extern_dspfunc(put, 8);
++extern_dspfunc(put_no_rnd, 8);
++extern_dspfunc(avg, 8);
++extern_dspfunc(avg_no_rnd, 8);
++#undef extern_dspfunc
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++#define extern_dspfunc(PFX, NUM)                                        \
++  void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++  void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++  void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
++  void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
++
++extern_dspfunc(put, 4);
++extern_dspfunc(put_no_rnd, 4);
++extern_dspfunc(put, 8);
++extern_dspfunc(put_no_rnd, 8);
++extern_dspfunc(put, 16);
++extern_dspfunc(put_no_rnd, 16);
++extern_dspfunc(avg, 8);
++extern_dspfunc(avg_no_rnd, 8);
++extern_dspfunc(avg, 16);
++extern_dspfunc(avg_no_rnd, 16);
++
++
++#undef extern_dspfunc
++#define extern_dspfunc(PFX, NUM) \
++void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride);  \
++void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride);  \
++
++extern_dspfunc(put_h264_qpel,  16);
++extern_dspfunc(put_h264_qpel,  8);
++extern_dspfunc(put_h264_qpel,  4);
++extern_dspfunc(avg_h264_qpel,  16);
++extern_dspfunc(avg_h264_qpel,  8);
++extern_dspfunc(avg_h264_qpel,  4);
++
++#undef extern_dspfunc
++
++void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++                         
++void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
++
++
++void dump_block8(uint8_t *block, int line_size, int h);
++void dump_block4(uint8_t *block, int line_size, int h);
++void dump_block(uint8_t *block, int line_size, int h, int w);
++
++void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, 
++                  int h, char *name, int max_dev);
++void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, 
++                  int h, char *name, int max_dev);
++void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, 
++                 int h, int width, char *name, int max_dev);
++
++#define PIXOP2( OPNAME, OP ) \
++void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
++        pixels+=line_size;\
++        block +=line_size;\
++    }\
++}\
++void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        uint32_t a,b;\
++        a= LD32(&src1[i*src_stride1  ]);\
++        b= LD32(&src2[i*src_stride2  ]);\
++        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
++        a= LD32(&src1[i*src_stride1+4]);\
++        b= LD32(&src2[i*src_stride2+4]);\
++        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
++    }\
++}\
++\
++void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        uint32_t a,b;\
++        a= LD32(&src1[i*src_stride1  ]);\
++        b= LD32(&src2[i*src_stride2  ]);\
++        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
++    }\
++}\
++\
++void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
++    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
++}\
++
++#else
++#define PIXOP2( OPNAME, OP ) \
++static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
++        pixels+=line_size;\
++        block +=line_size;\
++    }\
++}\
++static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
++        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
++        pixels+=line_size;\
++        block +=line_size;\
++    }\
++}\
++static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
++        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
++        OP(*((uint32_t*)(block+8)), LD32(pixels+8));\
++        OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
++        pixels+=line_size;\
++        block +=line_size;\
++    }\
++}\
++static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        uint32_t a,b;\
++        a= LD32(&src1[i*src_stride1  ]);\
++        b= LD32(&src2[i*src_stride2  ]);\
++        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
++        a= LD32(&src1[i*src_stride1+4]);\
++        b= LD32(&src2[i*src_stride2+4]);\
++        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
++    }\
++}\
++\
++static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    int i;\
++    for(i=0; i<h; i++){\
++        uint32_t a,b;\
++        a= LD32(&src1[i*src_stride1  ]);\
++        b= LD32(&src2[i*src_stride2  ]);\
++        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
++    }\
++}\
++\
++static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
++                                                int src_stride1, int src_stride2, int h){\
++    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
++    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
++}\
++
++#endif
++
++#define op_avg(a, b) a = rnd_avg32(a, b)
++#define op_put(a, b) a = b
++
++PIXOP2(avg, op_avg)
++PIXOP2(put, op_put)
++#undef op_avg
++#undef op_put
++
++
++
++static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++  /* Copy h rows of one 32-bit word each; each pointer advances by its own stride. */
++  int rows_left;
++  for (rows_left = h; rows_left > 0; rows_left--) {
++      ST32(dst, LD32(src));
++      dst += dstStride;
++      src += srcStride;
++  }
++}
++
++static void clear_blocks_avr32(DCTELEM *blocks)
++{
++  int n = 12;          /* 12 passes x 4 stm x 4 registers; 768 bytes if registers are 32-bit */
++  uint64_t tmp1, tmp2; /* two 64-bit temporaries; the asm zeroes %N and %mN of each */
++  blocks += 6*64;      /* start 6*64 elements past the base; stm --%3 walks back down */
++  asm volatile ( "mov\t%1, 0\n"
++                 "mov\t%m1, 0\n"
++                 "mov\t%2, 0\n"
++                 "mov\t%m2, 0\n"
++                 "0:\n"
++                 "stm\t--%3, %1, %m1, %2, %m2\n"
++                 "stm\t--%3, %1, %m1, %2, %m2\n"
++                 "stm\t--%3, %1, %m1, %2, %m2\n"
++                 "stm\t--%3, %1, %m1, %2, %m2\n"
++                 "sub\t%0, 1\n"
++                 "brne\t0b\n"
++                 : "+r"(n), "=&r"(tmp1), "=&r"(tmp2), "+r"(blocks)
++                 : /* no inputs */ : "cc", "memory"); /* sub sets flags; stm writes *blocks */
++}
++
++
++static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++  /* Copy h rows of 8 bytes each: two LD32/ST32 pairs per row. */
++  int rows_left = h;
++  while (rows_left-- > 0) {
++    ST32(dst,     LD32(src));
++    ST32(dst + 4, LD32(src + 4));
++    dst += dstStride;
++    src += srcStride;
++  }
++}
++
++static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
++{
++  /* Copy h rows of 16 bytes each: four LD32/ST32 pairs per row;
++   * dst and src advance by their own stride after every row. */
++  int row;
++
++  for (row = 0; row < h; ++row, dst += dstStride, src += srcStride) {
++    ST32(dst,      LD32(src));
++    ST32(dst + 4,  LD32(src + 4));
++    ST32(dst + 8,  LD32(src + 8));
++    ST32(dst + 12, LD32(src + 12));
++  }
++}
++
++
++static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y); /* four weights built from x and y; A+B+C+D is always 64 */
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  /* For each of h rows, stores one 16-bit result (ST16) computed by the PICO coprocessor from src and the row below it. */
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the high half-word, B in the low one */
++  PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding offset: half of 64 at 6 fractional bits -- confirm */
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the high half-word, D in the low one */
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0); /* the third coefficient set is zeroed */
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++  /* One pass per output row; src and dst both step by the same stride. */
++  for(i=0; i<h; i++)
++    {
++      /* Fetch one word from the current row and one from the row below. */
++      int src0 = LD32(src);
++      int src1 = LD32(src + stride);
++      /* NOTE(review): the two ops target outputs 2 and 3, presumably the low half of OUTPIX0 -- confirm against the PICO manual. */
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
++      src += stride;
++      ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0)); /* the cast keeps only the low 16 bits of the result word */
++      dst += stride;
++    }
++}
++
++
++static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y); /* weights of the reference formula below; A+B+C+D is always 64 */
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  /* For each of h rows, stores one 32-bit result (4 pixels) computed by the PICO coprocessor from src and the row below it. */
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the high half-word, B in the low one */
++  PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding offset: half of 64 at 6 fractional bits -- confirm */
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the high half-word, D in the low one */
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0); /* the third coefficient set is zeroed */
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++  /* One pass per output row; src and dst both step by the same stride. */
++  for(i=0; i<h; i++)
++    {
++      /*
++        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++        dst+= stride;
++        src+= stride;
++      */
++      /* src1 packs src[4] into the top byte and src[stride] into the bottom byte; the shift is done unsigned so bytes >= 0x80 are well defined. */
++      int src0 = LD32(src);
++      int src1 = (int)(((uint32_t)src[4] << 24) | src[stride]);
++      int src2 = LD32(src + stride + 1);
++      /* NOTE(review): offsets 0..3 and 7..10 presumably index pixels across INPIX0..INPIX2 -- confirm against the PICO manual. */
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      src += stride;
++      ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* all four output pixels are stored as one word */
++
++      dst += stride;
++    }
++}
++
++static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y); /* weights of the reference formula below; A+B+C+D is always 64 */
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  /* For each of h rows, stores two 32-bit results (8 pixels) computed by the PICO coprocessor from src and the row below it. */
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the high half-word, B in the low one */
++  PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding offset: half of 64 at 6 fractional bits -- confirm */
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the high half-word, D in the low one */
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0); /* the third coefficient set is zeroed */
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++  /* One pass per output row, handled as two 4-pixel halves with the same op sequence. */
++  for(i=0; i<h; i++)
++    {
++      /*
++        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
++        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
++        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
++        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
++        dst+= stride;
++        src+= stride;
++      */  
++      int src0 = LD32(src);
++      int src1 = (int)(((uint32_t)src[4] << 24) | src[stride]); /* unsigned shift: well defined for bytes >= 0x80 */
++      int src2 = LD32(src + stride + 1);
++      /* Left half: pixels 0..3. */
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      ST32(dst, PICO_GET_W(PICO_OUTPIX0));
++      /* Right half: the same packing, shifted by 4 bytes. */
++      src0 = LD32(src + 4);
++      src1 = (int)(((uint32_t)src[8] << 24) | src[stride + 4]); /* unsigned shift, as above */
++      src2 = LD32(src + stride + 5);
++      
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      src += stride;
++      ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
++
++      dst += stride;
++    }
++}
++
++
++static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y); /* four weights built from x and y; A+B+C+D is always 64 */
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  /* For each of h rows, combines the PICO result with the 16 bits already in dst through rnd_avg32 and stores it back. */
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the high half-word, B in the low one */
++  PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding offset: half of 64 at 6 fractional bits -- confirm */
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the high half-word, D in the low one */
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0); /* the third coefficient set is zeroed */
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++  /* One pass per output row; src and dst both step by the same stride. */
++  for(i=0; i<h; i++)
++    {
++      int src0 = LD32(src);          /* word from the current row */
++      int src1 = LD32(src + stride); /* word from the row below */
++      /* NOTE(review): the two ops target outputs 2 and 3, presumably the low half of OUTPIX0 -- confirm against the PICO manual. */
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
++      src += stride;
++      ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0))); /* the full 32-bit result is passed in; ST16 presumably keeps the low 16 bits -- confirm */
++      dst += stride;
++    }
++}
++
++
++static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y); /* weights of the reference formula below; A+B+C+D is always 64 */
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  /* For each of h rows, combines the 4-pixel PICO result with the word already in dst through rnd_avg32 and stores it back. */
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the high half-word, B in the low one */
++  PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding offset: half of 64 at 6 fractional bits -- confirm */
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the high half-word, D in the low one */
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0); /* the third coefficient set is zeroed */
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++  /* One pass per output row; src and dst both step by the same stride. */
++  for(i=0; i<h; i++)
++    {
++      /*
++        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++        dst+= stride;
++        src+= stride;
++      */
++      /* src is only byte-aligned, so load through LD32 as put_h264_chroma_mc4_pico does rather than through a cast int pointer. */
++      int src0 = LD32(src);
++      int src1 = (int)(((uint32_t)src[4] << 24) | src[stride]); /* unsigned shift: well defined for bytes >= 0x80 */
++      int src2 = LD32(src + stride + 1);
++      /* NOTE(review): offsets 0..3 and 7..10 presumably index pixels across INPIX0..INPIX2 -- confirm against the PICO manual. */
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      src += stride;
++      ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); /* blend with the existing destination word */
++      dst += stride;
++    }
++}
++
++static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
++  const int A=(8-x)*(8-y); /* weights of the reference formula below; A+B+C+D is always 64 */
++  const int B=(  x)*(8-y);
++  const int C=(8-x)*(  y);
++  const int D=(  x)*(  y);
++  int i;
++  /* For each of h rows, combines two 4-pixel PICO results with the words already in dst through rnd_avg32 and stores them back. */
++  PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); /* A in the high half-word, B in the low one */
++  PICO_PUT_W(PICO_COEFF0_B, 32); /* presumably the rounding offset: half of 64 at 6 fractional bits -- confirm */
++  PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); /* C in the high half-word, D in the low one */
++  PICO_PUT_W(PICO_COEFF1_B, 0);
++  PICO_PUT_W(PICO_COEFF2_A, 0); /* the third coefficient set is zeroed */
++  PICO_PUT_W(PICO_COEFF2_B, 0);
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE) 
++             | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(6)
++             | PICO_OFFSET_FRAC_BITS(6));
++  /* One pass per output row, handled as two 4-pixel halves with the same op sequence. */
++  for(i=0; i<h; i++)
++    {
++      /*
++        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
++        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
++        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
++        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
++        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
++        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
++        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
++        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
++        dst+= stride;
++        src+= stride;
++      */  
++      int src0 = LD32(src); /* src is only byte-aligned: load via LD32 like put_h264_chroma_mc8_pico, not via a cast int pointer */
++      int src1 = (int)(((uint32_t)src[4] << 24) | src[stride]); /* unsigned shift; the old volatile-qualified cast had no effect */
++      int src2 = LD32(src + stride + 1);
++      /* Left half: pixels 0..3. */
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
++      /* Right half: the same packing, shifted by 4 bytes. */
++      src0 = LD32(src + 4);
++      src1 = (int)(((uint32_t)src[8] << 24) | src[stride + 4]); /* unsigned shift, as above */
++      src2 = LD32(src + stride + 5);
++      
++      PICO_MVRC_W(PICO_INPIX0, src0);
++      PICO_MVRC_W(PICO_INPIX1, src1);
++      PICO_MVRC_W(PICO_INPIX2, src2);
++      PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
++      PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
++      src += stride;
++      ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
++      dst += stride;
++    }
++}
++
++static struct pico_config_t h264_qpel4_h_lowpass_config = { /* PICO setup shared by put_/avg_h264_qpel4_h_lowpass_pico */
++  .input_mode = PICO_HOR_FILTER_MODE,
++  .output_mode = PICO_PLANAR_MODE,
++  .coeff_frac_bits = 5,
++  .offset_frac_bits = 5,
++  .coeff0_0 = 1,   /* coeff0_0..2 = 1, -5, 20: first three taps of the 1,-5,20,20,-5,1 filter */
++  .coeff0_1 = -5,
++  .coeff0_2 = 20,
++  .coeff0_3 = 16,  /* presumably the rounding offset: half of 32 at 5 fractional bits -- confirm */
++  .coeff1_0 = 20,  /* coeff1_0..2 = 20, -5, 1: last three taps */
++  .coeff1_1 = -5,
++  .coeff1_2 = 1,
++  .coeff1_3 = 0,
++  .coeff2_0 = 0,   /* third coefficient set is all zero */
++  .coeff2_1 = 0,
++  .coeff2_2 = 0,
++  .coeff2_3 = 0 
++};
++
++
++
++static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  const int h=4; /* fixed block height */
++  int i;
++  /* Filters 4 rows of 4 pixels horizontally on the PICO coprocessor and stores one word per row to dst. */
++  set_pico_config(&h264_qpel4_h_lowpass_config);
++
++  for(i=0; i<h; i++){
++    /* Reference computation per row (kept from the C version): */
++    /*
++      OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
++      OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
++      OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
++      OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
++      dst+=dstStride;\
++      src+=srcStride;\ */
++    PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); /* the word starting two bytes left of src */
++    PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); /* 64-bit load from src + 2; presumably fills the other two input registers -- confirm */
++    PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); /* one op per output pixel, offsets advancing by one each time */
++    PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++    PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++    PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++    src += srcStride;
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* store the four results as one word */
++    dst += dstStride;
++  }
++}
++
++static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  const int h=4; /* fixed block height */
++  int i;
++  /* Filters 4 rows of 4 pixels horizontally on the PICO coprocessor and blends each row into dst with rnd_avg32. */
++  set_pico_config(&h264_qpel4_h_lowpass_config);
++  
++  for(i=0; i<h; i++){
++    /* Reference computation per row (kept from the C version): */
++    /*
++      OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
++      OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
++      OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
++      OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
++      dst+=dstStride;\
++      src+=srcStride;\ */
++    /* Same load and op sequence as put_h264_qpel4_h_lowpass_pico; only the final store differs. */
++    PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); /* the word starting two bytes left of src */
++    PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); /* 64-bit load from src + 2; presumably fills the other two input registers -- confirm */
++    PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
++    PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++    PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++    PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++    src += srcStride;
++    ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); /* blend with the word already in dst */
++    dst += dstStride;
++  }
++}
++
++static struct pico_config_t h264_qpel4_v_lowpass_config1 = { /* first-stage setup of put_/avg_h264_qpel4_v_lowpass_pico */
++  .input_mode = PICO_VERT_FILTER_MODE,
++  .output_mode = PICO_PACKED_MODE,
++  .coeff_frac_bits = 5,
++  .offset_frac_bits = 5,
++  .coeff0_0 = 1,   /* all three coefficient sets are identical: 1, -5, 20 with a fourth value of 16 */
++  .coeff0_1 = -5,
++  .coeff0_2 = 20,
++  .coeff0_3 = 16,  /* presumably the rounding offset: half of 32 at 5 fractional bits -- confirm */
++  .coeff1_0 = 1,
++  .coeff1_1 = -5,
++  .coeff1_2 = 20,
++  .coeff1_3 = 16,
++  .coeff2_0 = 1,
++  .coeff2_1 = -5,
++  .coeff2_2 = 20,
++  .coeff2_3 = 16 
++};
++
++
++
++static struct pico_config_t h264_qpel4_v_lowpass_config2 = { /* second-stage (last column) setup of put_/avg_h264_qpel4_v_lowpass_pico */
++  .input_mode = PICO_VERT_FILTER_MODE,
++  .output_mode = PICO_PLANAR_MODE,
++  .coeff_frac_bits = 5,
++  .offset_frac_bits = 5,
++  .coeff0_0 = 1,   /* coeff0_0..2 = 1, -5, 20: first three taps of the 1,-5,20,20,-5,1 filter */
++  .coeff0_1 = -5,
++  .coeff0_2 = 20,
++  .coeff0_3 = 16,  /* presumably the rounding offset: half of 32 at 5 fractional bits -- confirm */
++  .coeff1_0 = 20,  /* coeff1_0..2 = 20, -5, 1: last three taps */
++  .coeff1_1 = -5,
++  .coeff1_2 = 1,
++  .coeff1_3 = 0,
++  .coeff2_0 = 0,   /* third coefficient set is all zero */
++  .coeff2_1 = 0,
++  .coeff2_2 = 0,
++  .coeff2_3 = 0 
++};
++
++static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  /* Vertical 4x4 low-pass on the PICO coprocessor; the reference C computation is kept in the comment below. */
++  /*
++    const int w=4;
++    uint8_t *cm = cropTbl + MAX_NEG_CROP;
++    int i;
++    for(i=0; i<w; i++)
++    {
++    const int srcB= src[-2*srcStride];\
++    const int srcA= src[-1*srcStride];\
++    const int src0= src[0 *srcStride];\
++    const int src1= src[1 *srcStride];\
++    const int src2= src[2 *srcStride];\
++    const int src3= src[3 *srcStride];\
++    const int src4= src[4 *srcStride];\
++    const int src5= src[5 *srcStride];\
++    const int src6= src[6 *srcStride];\
++    OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
++    OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
++    OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
++    OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
++    dst++;\
++    src++;\
++  */  
++  /* Stage 1 uses config1 (packed output, identical coefficient sets). */
++  set_pico_config(&h264_qpel4_v_lowpass_config1);
++  
++  {
++    int srcB= LD32(src - 2*srcStride); /* one word from each of the nine rows -2..6 */
++    int srcA= LD32(src - 1*srcStride);
++    int src0= LD32(src + 0 *srcStride);
++    int src1= LD32(src + 1 *srcStride);
++    int src2= LD32(src + 2 *srcStride);
++    int src3= LD32(src + 3 *srcStride);
++    int src4= LD32(src + 4 *srcStride);
++    int src5= LD32(src + 5 *srcStride);
++    int src6= LD32(src + 6 *srcStride);
++    /* Each output row takes two ops: the first on three rows, the second (PICO_USE_ACC) on the next three loaded in reverse order. */
++    /* First compute the leftmost three columns */
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX0, src3);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* row 0; byte 3 of each row is overwritten at the end */
++    dst += dstStride;
++    PICO_MVRC_W(PICO_INPIX0, srcA);
++    PICO_MVRC_W(PICO_INPIX1, src0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_MVRC_W(PICO_INPIX1, src3);
++    PICO_MVRC_W(PICO_INPIX0, src4);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* row 1 */
++    dst += dstStride;
++    PICO_MVRC_W(PICO_INPIX0, src0);
++    PICO_MVRC_W(PICO_INPIX1, src1);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_MVRC_W(PICO_INPIX1, src4);
++    PICO_MVRC_W(PICO_INPIX0, src5);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* row 2 */
++    dst += dstStride;
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src4);
++    PICO_MVRC_W(PICO_INPIX1, src5);
++    PICO_MVRC_W(PICO_INPIX0, src6);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(dst, PICO_GET_W(PICO_OUTPIX0)); /* row 3; dst now stays on the last row */
++    /* Now compute the last column */
++    /* Zero-initialised: bytes.b is never assigned, and the whole word is handed to the coprocessor. */
++    union wordbytes {
++      int word;
++      struct  {
++        unsigned int t:8;
++        unsigned int u:8;
++        unsigned int l:8;
++        unsigned int b:8; 
++      } bytes; } tmp1 = { 0 }, tmp2 = { 0 }, tmp3 = { 0 };
++    
++    /* Each assignment keeps only the low 8 bits of the row word. */
++    tmp1.bytes.t = srcB;
++    tmp1.bytes.u = src1;
++    tmp1.bytes.l = src4;
++    
++    tmp2.bytes.t = srcA;
++    tmp2.bytes.u = src2;
++    tmp2.bytes.l = src5;
++
++    tmp3.bytes.t = src0;
++    tmp3.bytes.u = src3;
++    tmp3.bytes.l = src6;
++    
++    PICO_MVRC_W(PICO_INPIX0, tmp1.word);
++    PICO_MVRC_W(PICO_INPIX1, tmp2.word);
++    PICO_MVRC_W(PICO_INPIX2, tmp3.word);
++    set_pico_config(&h264_qpel4_v_lowpass_config2); /* stage 2: planar output, split 6-tap coefficients */
++
++    
++    PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
++    PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++    PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++    PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++    /* Scatter the four result bytes into column 3, walking up from the last row. */
++    PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
++    dst[3] = (char)(tmp1.bytes.b);
++    dst[3 - dstStride] = (char)(tmp1.bytes.l);
++    dst[3 - 2*dstStride] = (char)(tmp1.bytes.u);
++    dst[3 - 3*dstStride] = (char)(tmp1.bytes.t);
++    
++  }
++    /*}
++    
++
++    }*/
++}
++
++static void avg_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  /* Vertical 4x4 low-pass on the PICO coprocessor, built in tmp_block and then blended into dst with rnd_avg32. */
++  /*
++    const int w=4;
++    uint8_t *cm = cropTbl + MAX_NEG_CROP;
++    int i;
++    for(i=0; i<w; i++)
++    {
++    const int srcB= src[-2*srcStride];\
++    const int srcA= src[-1*srcStride];\
++    const int src0= src[0 *srcStride];\
++    const int src1= src[1 *srcStride];\
++    const int src2= src[2 *srcStride];\
++    const int src3= src[3 *srcStride];\
++    const int src4= src[4 *srcStride];\
++    const int src5= src[5 *srcStride];\
++    const int src6= src[6 *srcStride];\
++    OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
++    OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
++    OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
++    OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
++    dst++;\
++    src++;\
++  */  
++  uint8_t tmp_block[4*4]; /* filtered 4x4 block, 4 bytes per row */
++  /* Stage 1 uses config1 (packed output, identical coefficient sets). */
++  set_pico_config(&h264_qpel4_v_lowpass_config1);
++  
++  {
++    int srcB= LD32(src - 2*srcStride); /* one word from each of the nine rows -2..6 */
++    int srcA= LD32(src - 1*srcStride);
++    int src0= LD32(src + 0 *srcStride);
++    int src1= LD32(src + 1 *srcStride);
++    int src2= LD32(src + 2 *srcStride);
++    int src3= LD32(src + 3 *srcStride);
++    int src4= LD32(src + 4 *srcStride);
++    int src5= LD32(src + 5 *srcStride);
++    int src6= LD32(src + 6 *srcStride);
++    /* Each output row takes two ops: the first on three rows, the second (PICO_USE_ACC) on the next three loaded in reverse order. */
++    /* First compute the leftmost three columns */
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX0, src3);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(tmp_block, PICO_GET_W(PICO_OUTPIX0)); /* row 0; byte 3 of each row is overwritten below */
++    PICO_MVRC_W(PICO_INPIX0, srcA);
++    PICO_MVRC_W(PICO_INPIX1, src0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_MVRC_W(PICO_INPIX1, src3);
++    PICO_MVRC_W(PICO_INPIX0, src4);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(tmp_block + 4, PICO_GET_W(PICO_OUTPIX0)); /* row 1 */
++    PICO_MVRC_W(PICO_INPIX0, src0);
++    PICO_MVRC_W(PICO_INPIX1, src1);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_MVRC_W(PICO_INPIX1, src4);
++    PICO_MVRC_W(PICO_INPIX0, src5);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(tmp_block + 8, PICO_GET_W(PICO_OUTPIX0)); /* row 2 */
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_OP(0, 0, 0, 3, 6);
++    PICO_MVRC_W(PICO_INPIX2, src4);
++    PICO_MVRC_W(PICO_INPIX1, src5);
++    PICO_MVRC_W(PICO_INPIX0, src6);
++    PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
++    ST32(tmp_block + 12, PICO_GET_W(PICO_OUTPIX0)); /* row 3 */
++    /* Now compute the last column */
++    /* Zero-initialised: bytes.b is never assigned, and the whole word is handed to the coprocessor. */
++    union wordbytes {
++      int word;
++      struct  {
++        unsigned int t:8;
++        unsigned int u:8;
++        unsigned int l:8;
++        unsigned int b:8; 
++      } bytes; } tmp1 = { 0 }, tmp2 = { 0 }, tmp3 = { 0 };
++    
++    /* Each assignment keeps only the low 8 bits of the row word. */
++    tmp1.bytes.t = srcB;
++    tmp1.bytes.u = src1;
++    tmp1.bytes.l = src4;
++    
++    tmp2.bytes.t = srcA;
++    tmp2.bytes.u = src2;
++    tmp2.bytes.l = src5;
++
++    tmp3.bytes.t = src0;
++    tmp3.bytes.u = src3;
++    tmp3.bytes.l = src6;
++    
++    PICO_MVRC_W(PICO_INPIX0, tmp1.word);
++    PICO_MVRC_W(PICO_INPIX1, tmp2.word);
++    PICO_MVRC_W(PICO_INPIX2, tmp3.word);
++    set_pico_config(&h264_qpel4_v_lowpass_config2); /* stage 2: planar output, split 6-tap coefficients */
++
++    
++    PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
++    PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
++    PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
++    PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
++    /* Scatter the four result bytes into column 3 of tmp_block, last row first. */
++    PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
++    tmp_block[3 + 3*4] = (char)(tmp1.bytes.b);
++    tmp_block[3 + 2*4] = (char)(tmp1.bytes.l);
++    tmp_block[3 + 1*4] = (char)(tmp1.bytes.u);
++    tmp_block[3] = (char)(tmp1.bytes.t);
++
++    /* Compute the average */
++    srcB= LD32(dst); /* the src* locals are reused: srcB..src1 hold the four dst rows */
++    srcA= LD32(dst + dstStride);
++    src0= LD32(dst + dstStride*2);
++    src1= LD32(dst + dstStride*3);
++          
++    src2= LD32(tmp_block); /* src2..src5 hold the four filtered rows */
++    src3= LD32(tmp_block + 4);
++    src4= LD32(tmp_block + 8);
++    src5= LD32(tmp_block + 12);
++
++    ST32(dst, rnd_avg32(srcB, src2));
++    ST32(dst + dstStride, rnd_avg32(srcA, src3));
++    ST32(dst + 2*dstStride, rnd_avg32(src0, src4));
++    ST32(dst + 3*dstStride, rnd_avg32(src1, src5));    
++  }
++}
++
++static struct pico_config_t h264_qpel4_hv_lowpass_config = { /* initial PICO setup of the qpel4 hv low-pass functions */
++  .input_mode = PICO_HOR_FILTER_MODE,
++  .output_mode = PICO_PACKED_MODE,
++  .coeff_frac_bits = 10,
++  .offset_frac_bits = 10,
++  .coeff0_0 = 1,    /* coeff0_0..2 = (1, -5, 20) */
++  .coeff0_1 = -5,
++  .coeff0_2 = 20,
++  .coeff0_3 = 512,  /* presumably the rounding offset: half of 1024 at 10 fractional bits -- confirm */
++  .coeff1_0 = -5,   /* coeff1_0..2 = -5 * (1, -5, 20) */
++  .coeff1_1 = 25,
++  .coeff1_2 = -100,
++  .coeff1_3 = 0,
++  .coeff2_0 = 20,   /* coeff2_0..2 = 20 * (1, -5, 20) */
++  .coeff2_1 = -100,
++  .coeff2_2 = 400,
++  .coeff2_3 = 0 
++};
++
++static void put_h264_qpel4_hv_lowpass_pico(uint8_t *dst,  uint8_t *src, int dstStride, int srcStride){
++  /* Two-pass 4x4 filter on the PICO coprocessor: pass 1 saves partial results in tmp_block, pass 2 reloads them and writes dst. */
++  int32_t tmp_block[48]; /* 2 loop passes x 8 stores x 3 words */
++  int32_t *tmp = tmp_block;
++  int i;
++  
++  set_pico_config(&h264_qpel4_hv_lowpass_config);
++  /* Pass 1: starts two bytes left of src and advances two bytes per iteration. */
++  src -= 2;
++  for ( i = 0; i < 2; i++ ){ 
++    int srcB= LD32(src - 2*srcStride); /* one word from each of the nine rows -2..6 */
++    int srcA= LD32(src - 1*srcStride);
++    int src0= LD32(src + 0 *srcStride);
++    int src1= LD32(src + 1 *srcStride);
++    int src2= LD32(src + 2 *srcStride);
++    int src3= LD32(src + 3 *srcStride);
++    int src4= LD32(src + 4 *srcStride);
++    int src5= LD32(src + 5 *srcStride);
++    int src6= LD32(src + 6 *srcStride);
++    /* Eight groups follow; each ends by storing the three VMU outputs (3 words) to tmp. */
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(0, 0, 0, 4, 8);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX0, src3);
++    PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    /* NOTE(review): this op runs on the rows still loaded from the previous group; the order looks deliberate -- do not reorder. */
++    PICO_OP(0, 0, 1, 5, 9);
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    /* Next output row: the same two-group pattern, one source row further down. */
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_OP(0, 0, 4, 8, 0);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_MVRC_W(PICO_INPIX1, src3);
++    PICO_MVRC_W(PICO_INPIX0, src4);
++    PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    
++    PICO_OP(0, 0, 1, 5, 9);
++    PICO_MVRC_W(PICO_INPIX0, srcA);
++    PICO_MVRC_W(PICO_INPIX1, src0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    
++    PICO_MVRC_W(PICO_INPIX0, src2);
++    PICO_OP(0, 0, 4, 8, 0);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_MVRC_W(PICO_INPIX1, src4);
++    PICO_MVRC_W(PICO_INPIX0, src5);
++    PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++
++    PICO_OP(0, 0, 1, 5, 9);
++    PICO_MVRC_W(PICO_INPIX0, src0);
++    PICO_MVRC_W(PICO_INPIX1, src1);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++
++    PICO_MVRC_W(PICO_INPIX0, src3);
++    PICO_OP(0, 0, 4, 8, 0);
++    PICO_MVRC_W(PICO_INPIX2, src4);
++    PICO_MVRC_W(PICO_INPIX1, src5);
++    PICO_MVRC_W(PICO_INPIX0, src6);
++    PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); 
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    
++    PICO_OP(0, 0, 1, 5, 9);
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;    
++    src += 2;
++  }
++  /* Pass 2 setup: src ends up one byte right of its value on entry; tmp is rewound to the start of tmp_block. */
++  src -= 1;
++  tmp -= 48;
++
++  /* Only the mode word is rewritten (planar output, vertical input); the coefficient registers keep their pass-1 values. */
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE)  
++             | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(10)
++             | PICO_OFFSET_FRAC_BITS(10));
++  /* Pass 2: each iteration writes two bytes to each of the four dst rows. */
++  for ( i = 0; i < 2; i++ ){ 
++    int srcB= LD32(src - 2*srcStride);
++    int srcA= LD32(src - 1*srcStride);
++    int src0= LD32(src + 0 *srcStride);
++    int src1= LD32(src + 1 *srcStride);
++    int src2= LD32(src + 2 *srcStride);
++    int src3= LD32(src + 3 *srcStride);
++    int src4= LD32(src + 4 *srcStride);
++    int src5= LD32(src + 5 *srcStride);
++    int src6= LD32(src + 6 *srcStride);
++    
++    /* Every group reloads three saved words into the VMU outputs (tmp presumably post-increments) and continues with PICO_USE_ACC. */
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX0, src3);
++    PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
++    
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_MVRC_W(PICO_INPIX0, srcA);
++    PICO_MVRC_W(PICO_INPIX1, src0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_MVRC_W(PICO_INPIX1, src3);
++    PICO_MVRC_W(PICO_INPIX0, src4);
++    PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
++    PICO_MVRC_W(PICO_INPIX0, srcA);
++    PICO_MVRC_W(PICO_INPIX1, src0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
++    /* Outputs 0..3 are complete: the high half-word goes to row 0, the low half-word to row 1. */
++    ST16(dst + 0*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16)); 
++    ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); 
++    
++    /* Second half of the iteration: the same four groups, shifted down for rows 2 and 3. */
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_MVRC_W(PICO_INPIX0, src0);
++    PICO_MVRC_W(PICO_INPIX1, src1);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_MVRC_W(PICO_INPIX1, src4);
++    PICO_MVRC_W(PICO_INPIX0, src5);
++    PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
++    PICO_MVRC_W(PICO_INPIX0, src0);
++    PICO_MVRC_W(PICO_INPIX1, src1);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
++    PICO_MVRC_W(PICO_INPIX2, src4);
++    PICO_MVRC_W(PICO_INPIX1, src5);
++    PICO_MVRC_W(PICO_INPIX0, src6);
++    PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
++    /* High half-word goes to row 2, low half-word to row 3. */
++    ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16)); 
++    ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); 
++    /* Move on to the next two columns. */
++    dst += 2;
++    src += 2;
++  }
++}
++
++
++
++
++static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst,  uint8_t *src, int dstStride, int srcStride){
++  /* "avg" flavour of the 4x4 hv low-pass: every result is merged into dst with rnd_avg32(). */
++  int32_t tmp_block[48];
++  int32_t *tmp = tmp_block;
++  int i;
++  /* Pass 1 stores 2 x 8 x 3 = 48 words to tmp_block; pass 2 hands the same words back to PICO. */
++  set_pico_config(&h264_qpel4_hv_lowpass_config);
++
++  src -= 2;
++  for ( i = 0; i < 2; i++ ){ 
++    int srcB= LD32(src - 2*srcStride);
++    int srcA= LD32(src - 1*srcStride);
++    int src0= LD32(src + 0 *srcStride);
++    int src1= LD32(src + 1 *srcStride);
++    int src2= LD32(src + 2 *srcStride);
++    int src3= LD32(src + 3 *srcStride);
++    int src4= LD32(src + 4 *srcStride);
++    int src5= LD32(src + 5 *srcStride);
++    int src6= LD32(src + 6 *srcStride);
++    /* Eight groups follow: PICO ops on the nine row words, then three VMU output words go to tmp. */
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(0, 0, 0, 4, 8);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX0, src3);
++    PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    
++    PICO_OP(0, 0, 1, 5, 9);
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_OP(0, 0, 4, 8, 0);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_MVRC_W(PICO_INPIX1, src3);
++    PICO_MVRC_W(PICO_INPIX0, src4);
++    PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    
++    PICO_OP(0, 0, 1, 5, 9);
++    PICO_MVRC_W(PICO_INPIX0, srcA);
++    PICO_MVRC_W(PICO_INPIX1, src0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    
++    PICO_MVRC_W(PICO_INPIX0, src2);
++    PICO_OP(0, 0, 4, 8, 0);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_MVRC_W(PICO_INPIX1, src4);
++    PICO_MVRC_W(PICO_INPIX0, src5);
++    PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++
++    PICO_OP(0, 0, 1, 5, 9);
++    PICO_MVRC_W(PICO_INPIX0, src0);
++    PICO_MVRC_W(PICO_INPIX1, src1);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++
++    PICO_MVRC_W(PICO_INPIX0, src3);
++    PICO_OP(0, 0, 4, 8, 0);
++    PICO_MVRC_W(PICO_INPIX2, src4);
++    PICO_MVRC_W(PICO_INPIX1, src5);
++    PICO_MVRC_W(PICO_INPIX0, src6);
++    PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); 
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;
++    
++    PICO_OP(0, 0, 1, 5, 9);
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
++    PICO_STCM_W(tmp, 
++                PICO_REGVECT_VMU0_OUT, 
++                PICO_REGVECT_VMU1_OUT, 
++                PICO_REGVECT_VMU2_OUT);
++    tmp += 3;    
++    src += 2; /* second iteration reads its row words two bytes further right */
++  }
++
++  src -= 1; /* net offset from the pointer passed in is now +1 byte (-2 +2 +2 -1) */
++  tmp -= 48; /* rewind to tmp_block[0]; pass 1 advanced tmp by 16 * 3 words */
++
++  /* Second pass uses a different PICO configuration, written directly to the CONFIG register. */
++  PICO_PUT_W(PICO_CONFIG, 
++             PICO_OUTPUT_MODE(PICO_PLANAR_MODE)  
++             | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE) 
++             | PICO_COEFF_FRAC_BITS(10)
++             | PICO_OFFSET_FRAC_BITS(10));
++
++  for ( i = 0; i < 2; i++ ){ 
++    int srcB= LD32(src - 2*srcStride);
++    int srcA= LD32(src - 1*srcStride);
++    int src0= LD32(src + 0 *srcStride);
++    int src1= LD32(src + 1 *srcStride);
++    int src2= LD32(src + 2 *srcStride);
++    int src3= LD32(src + 3 *srcStride);
++    int src4= LD32(src + 4 *srcStride);
++    int src5= LD32(src + 5 *srcStride);
++    int src6= LD32(src + 6 *srcStride);
++    /* NOTE(review): PICO_LDCM_W_INC presumably reloads three saved words and advances tmp - confirm. */
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX0, src3);
++    PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
++    PICO_MVRC_W(PICO_INPIX0, srcB);
++    PICO_MVRC_W(PICO_INPIX1, srcA);
++    PICO_MVRC_W(PICO_INPIX2, src0);
++    PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
++    
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_MVRC_W(PICO_INPIX0, srcA);
++    PICO_MVRC_W(PICO_INPIX1, src0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_MVRC_W(PICO_INPIX1, src3);
++    PICO_MVRC_W(PICO_INPIX0, src4);
++    PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
++    PICO_MVRC_W(PICO_INPIX0, srcA);
++    PICO_MVRC_W(PICO_INPIX1, src0);
++    PICO_MVRC_W(PICO_INPIX2, src1);
++    PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
++    /* OUTPIX0 >> 16 is merged into dst row 0 and the unshifted OUTPIX0 value into dst row 1. */
++    ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); 
++    ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0))); 
++    
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_MVRC_W(PICO_INPIX0, src0);
++    PICO_MVRC_W(PICO_INPIX1, src1);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_MVRC_W(PICO_INPIX1, src4);
++    PICO_MVRC_W(PICO_INPIX0, src5);
++    PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
++    PICO_MVRC_W(PICO_INPIX0, src0);
++    PICO_MVRC_W(PICO_INPIX1, src1);
++    PICO_MVRC_W(PICO_INPIX2, src2);
++    PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
++    PICO_MVRC_W(PICO_INPIX2, src4);
++    PICO_MVRC_W(PICO_INPIX1, src5);
++    PICO_MVRC_W(PICO_INPIX0, src6);
++    PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
++
++    PICO_LDCM_W_INC(tmp, 
++                    PICO_REGVECT_VMU0_OUT, 
++                    PICO_REGVECT_VMU1_OUT, 
++                    PICO_REGVECT_VMU2_OUT);
++    PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
++    PICO_MVRC_W(PICO_INPIX0, src1);
++    PICO_MVRC_W(PICO_INPIX1, src2);
++    PICO_MVRC_W(PICO_INPIX2, src3);
++    PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
++    /* Same split for dst rows 2 and 3. */
++    ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); 
++    ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0))); 
++
++    dst += 2; /* each iteration writes two bytes per row (ST16), i.e. two columns */
++    src += 2;
++  }
++}
++
++
++static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 4*dstStride; /* rows 4..7 of the 8x8 block */
++  uint8_t *const srcLower = src + 4*srcStride;
++  put_h264_qpel4_v_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 4x4 */
++  put_h264_qpel4_v_lowpass_pico(dst + 4,      src + 4,      dstStride, srcStride); /* top-right */
++  put_h264_qpel4_v_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  put_h264_qpel4_v_lowpass_pico(dstLower + 4, srcLower + 4, dstStride, srcStride); /* bottom-right */
++}
++
++static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 4*dstStride; /* rows 4..7 of the 8x8 block */
++  uint8_t *const srcLower = src + 4*srcStride;
++  avg_h264_qpel4_v_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 4x4 */
++  avg_h264_qpel4_v_lowpass_pico(dst + 4,      src + 4,      dstStride, srcStride); /* top-right */
++  avg_h264_qpel4_v_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  avg_h264_qpel4_v_lowpass_pico(dstLower + 4, srcLower + 4, dstStride, srcStride); /* bottom-right */
++}
++
++static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 4*dstStride; /* rows 4..7 of the 8x8 block */
++  uint8_t *const srcLower = src + 4*srcStride;
++  put_h264_qpel4_h_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 4x4 */
++  put_h264_qpel4_h_lowpass_pico(dst + 4,      src + 4,      dstStride, srcStride); /* top-right */
++  put_h264_qpel4_h_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  put_h264_qpel4_h_lowpass_pico(dstLower + 4, srcLower + 4, dstStride, srcStride); /* bottom-right */
++}
++
++static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 4*dstStride; /* rows 4..7 of the 8x8 block */
++  uint8_t *const srcLower = src + 4*srcStride;
++  avg_h264_qpel4_h_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 4x4 */
++  avg_h264_qpel4_h_lowpass_pico(dst + 4,      src + 4,      dstStride, srcStride); /* top-right */
++  avg_h264_qpel4_h_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  avg_h264_qpel4_h_lowpass_pico(dstLower + 4, srcLower + 4, dstStride, srcStride); /* bottom-right */
++}
++
++static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 4*dstStride; /* rows 4..7 of the 8x8 block */
++  uint8_t *const srcLower = src + 4*srcStride;
++  put_h264_qpel4_hv_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 4x4 */
++  put_h264_qpel4_hv_lowpass_pico(dst + 4,      src + 4,      dstStride, srcStride); /* top-right */
++  put_h264_qpel4_hv_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  put_h264_qpel4_hv_lowpass_pico(dstLower + 4, srcLower + 4, dstStride, srcStride); /* bottom-right */
++}
++
++static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 4*dstStride; /* rows 4..7 of the 8x8 block */
++  uint8_t *const srcLower = src + 4*srcStride;
++  avg_h264_qpel4_hv_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 4x4 */
++  avg_h264_qpel4_hv_lowpass_pico(dst + 4,      src + 4,      dstStride, srcStride); /* top-right */
++  avg_h264_qpel4_hv_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  avg_h264_qpel4_hv_lowpass_pico(dstLower + 4, srcLower + 4, dstStride, srcStride); /* bottom-right */
++}
++
++static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 8*dstStride; /* rows 8..15 of the 16x16 block */
++  uint8_t *const srcLower = src + 8*srcStride;
++  put_h264_qpel8_v_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 8x8 */
++  put_h264_qpel8_v_lowpass_pico(dst + 8,      src + 8,      dstStride, srcStride); /* top-right */
++  put_h264_qpel8_v_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  put_h264_qpel8_v_lowpass_pico(dstLower + 8, srcLower + 8, dstStride, srcStride); /* bottom-right */
++}
++
++static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 8*dstStride; /* rows 8..15 of the 16x16 block */
++  uint8_t *const srcLower = src + 8*srcStride;
++  avg_h264_qpel8_v_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 8x8 */
++  avg_h264_qpel8_v_lowpass_pico(dst + 8,      src + 8,      dstStride, srcStride); /* top-right */
++  avg_h264_qpel8_v_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  avg_h264_qpel8_v_lowpass_pico(dstLower + 8, srcLower + 8, dstStride, srcStride); /* bottom-right */
++}
++
++static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 8*dstStride; /* rows 8..15 of the 16x16 block */
++  uint8_t *const srcLower = src + 8*srcStride;
++  put_h264_qpel8_h_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 8x8 */
++  put_h264_qpel8_h_lowpass_pico(dst + 8,      src + 8,      dstStride, srcStride); /* top-right */
++  put_h264_qpel8_h_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  put_h264_qpel8_h_lowpass_pico(dstLower + 8, srcLower + 8, dstStride, srcStride); /* bottom-right */
++}
++
++static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 8*dstStride; /* rows 8..15 of the 16x16 block */
++  uint8_t *const srcLower = src + 8*srcStride;
++  avg_h264_qpel8_h_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 8x8 */
++  avg_h264_qpel8_h_lowpass_pico(dst + 8,      src + 8,      dstStride, srcStride); /* top-right */
++  avg_h264_qpel8_h_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  avg_h264_qpel8_h_lowpass_pico(dstLower + 8, srcLower + 8, dstStride, srcStride); /* bottom-right */
++}
++
++static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 8*dstStride; /* rows 8..15 of the 16x16 block */
++  uint8_t *const srcLower = src + 8*srcStride;
++  put_h264_qpel8_hv_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 8x8 */
++  put_h264_qpel8_hv_lowpass_pico(dst + 8,      src + 8,      dstStride, srcStride); /* top-right */
++  put_h264_qpel8_hv_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  put_h264_qpel8_hv_lowpass_pico(dstLower + 8, srcLower + 8, dstStride, srcStride); /* bottom-right */
++}
++
++static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
++  uint8_t *const dstLower = dst + 8*dstStride; /* rows 8..15 of the 16x16 block */
++  uint8_t *const srcLower = src + 8*srcStride;
++  avg_h264_qpel8_hv_lowpass_pico(dst,          src,          dstStride, srcStride); /* top-left 8x8 */
++  avg_h264_qpel8_hv_lowpass_pico(dst + 8,      src + 8,      dstStride, srcStride); /* top-right */
++  avg_h264_qpel8_hv_lowpass_pico(dstLower,     srcLower,     dstStride, srcStride); /* bottom-left */
++  avg_h264_qpel8_hv_lowpass_pico(dstLower + 8, srcLower + 8, dstStride, srcStride); /* bottom-right */
++}
++
++
++#define H264_MC(OPNAME, SIZE) /* emits the 16 OPNAME##h264_qpel##SIZE##_mcXY_pico functions */ \
++static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\
++    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
++}\
++/* mc10: src and its h_lowpass output are handed to OPNAME pixels_l2. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t half[SIZE*SIZE];\
++    put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
++}\
++/* mc20: the OPNAME h_lowpass writes dst directly. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\
++    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\
++}\
++/* mc30: like mc10, but the unfiltered operand is src+1. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t half[SIZE*SIZE];\
++    put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
++}\
++/* mc01: copy SIZE+5 rows (from two rows above src) into full; v_lowpass reads at full_mid. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t full[SIZE*(SIZE+5)];\
++    uint8_t * const full_mid= full + SIZE*2;\
++    uint8_t half[SIZE*SIZE];\
++    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
++    put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc02: the OPNAME v_lowpass writes dst directly from the copied rows. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t full[SIZE*(SIZE+5)];\
++    uint8_t * const full_mid= full + SIZE*2;\
++    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
++    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\
++}\
++/* mc03: like mc01, but the unfiltered operand is full_mid+SIZE (one row down in full). */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t full[SIZE*(SIZE+5)];\
++    uint8_t * const full_mid= full + SIZE*2;\
++    uint8_t half[SIZE*SIZE];\
++    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
++    put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc11: h_lowpass of src paired with v_lowpass of the rows copied at src. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t full[SIZE*(SIZE+5)];\
++    uint8_t * const full_mid= full + SIZE*2;\
++    uint8_t halfH[SIZE*SIZE];\
++    uint8_t halfV[SIZE*SIZE];\
++    put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
++    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
++    put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc31: like mc11, but the rows for v_lowpass are copied from src+1. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t full[SIZE*(SIZE+5)];\
++    uint8_t * const full_mid= full + SIZE*2;\
++    uint8_t halfH[SIZE*SIZE];\
++    uint8_t halfV[SIZE*SIZE];\
++    put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
++    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
++    put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc13: like mc11, but h_lowpass starts one row down (src + stride). */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t full[SIZE*(SIZE+5)];\
++    uint8_t * const full_mid= full + SIZE*2;\
++    uint8_t halfH[SIZE*SIZE];\
++    uint8_t halfV[SIZE*SIZE];\
++    put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
++    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
++    put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc33: h_lowpass from src + stride and v_lowpass rows copied from src+1. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t full[SIZE*(SIZE+5)];\
++    uint8_t * const full_mid= full + SIZE*2;\
++    uint8_t halfH[SIZE*SIZE];\
++    uint8_t halfV[SIZE*SIZE];\
++    put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
++    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
++    put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc22: the OPNAME hv_lowpass writes dst directly. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\
++    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\
++}\
++/* mc21: h_lowpass of src paired with hv_lowpass of src. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t halfH[SIZE*SIZE];\
++    uint8_t halfHV[SIZE*SIZE];\
++    put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
++    put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc23: like mc21, but h_lowpass starts one row down (src + stride). */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t halfH[SIZE*SIZE];\
++    uint8_t halfHV[SIZE*SIZE];\
++    put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
++    put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc12: v_lowpass of the rows copied at src paired with hv_lowpass of src. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t full[SIZE*(SIZE+5)];\
++    uint8_t * const full_mid= full + SIZE*2;\
++    uint8_t halfV[SIZE*SIZE];\
++    uint8_t halfHV[SIZE*SIZE];\
++    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
++    put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++    put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
++}\
++/* mc32: like mc12, but the rows for v_lowpass are copied from src+1. */\
++static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\
++    uint8_t full[SIZE*(SIZE+5)];\
++    uint8_t * const full_mid= full + SIZE*2;\
++    uint8_t halfV[SIZE*SIZE];\
++    uint8_t halfHV[SIZE*SIZE];\
++    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
++    put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
++    put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
++    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
++}\
++
++H264_MC(put_, 4)   /* put_h264_qpel4_mcXY_pico   */
++H264_MC(put_, 8)   /* put_h264_qpel8_mcXY_pico   */
++H264_MC(put_, 16)  /* put_h264_qpel16_mcXY_pico  */
++H264_MC(avg_, 4)   /* avg_h264_qpel4_mcXY_pico   */
++H264_MC(avg_, 8)   /* avg_h264_qpel8_mcXY_pico   */
++H264_MC(avg_, 16)  /* avg_h264_qpel16_mcXY_pico  */
++
++
++
++/* 16-wide pixel ops: run the 8-wide _avr32 routine on the left half, then on the right half. */
++#define dspfunc16(PFX) \
++void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h) \
++{ \
++    PFX ## _pixels8_avr32(&dst[0], &pixels[0], line_size, h); \
++    PFX ## _pixels8_avr32(&dst[8], &pixels[8], line_size, h); } \
++void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h) \
++{ \
++    PFX ## _pixels8_h_avr32(&dst[0], &pixels[0], line_size, h); \
++    PFX ## _pixels8_h_avr32(&dst[8], &pixels[8], line_size, h); } \
++void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h) \
++{ \
++    PFX ## _pixels8_v_avr32(&dst[0], &pixels[0], line_size, h); \
++    PFX ## _pixels8_v_avr32(&dst[8], &pixels[8], line_size, h); } \
++void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h) \
++{ \
++    PFX ## _pixels8_hv_avr32(&dst[0], &pixels[0], line_size, h); \
++    PFX ## _pixels8_hv_avr32(&dst[8], &pixels[8], line_size, h); }
++
++dspfunc16(put)
++dspfunc16(put_no_rnd)
++dspfunc16(avg)
++dspfunc16(avg_no_rnd)
++#undef dspfunc16
++
++static int pix_sum_avr32(uint8_t * pix, int line_size)
++{
++    int s, i;
++    /* Sums 16 rows of 16 bytes. s carries packed halfword partial sums, folded after the loop. */
++    s = 0;
++    for (i = 0; i < 16; i++) {
++      int tmp1,tmp2,tmp3,tmp4,tmp5;
++      __asm__ volatile ( "ld.w\t%0, %6[0]\n\t"
++                         "ld.w\t%1, %6[4]\n\t"
++                         "ld.w\t%2, %6[8]\n\t"
++                         "ld.w\t%3, %6[12]\n\t"
++                         "punpckub.h\t%4, %0:t\n\t"
++                         "padd.h\t%5, %5, %4\n\t"
++                         "punpckub.h\t%4, %0:b\n\t"
++                         "padd.h\t%5, %5, %4\n\t"
++                         "punpckub.h\t%4, %1:t\n\t"
++                         "padd.h\t%5, %5, %4\n\t"
++                         "punpckub.h\t%4, %1:b\n\t"
++                         "padd.h\t%5, %5, %4\n\t"
++                         "punpckub.h\t%4, %2:t\n\t"
++                         "padd.h\t%5, %5, %4\n\t"
++                         "punpckub.h\t%4, %2:b\n\t"
++                         "padd.h\t%5, %5, %4\n\t"
++                         "punpckub.h\t%4, %3:t\n\t"
++                         "padd.h\t%5, %5, %4\n\t"
++                         "punpckub.h\t%4, %3:b\n\t"
++                         "padd.h\t%5, %5, %4\n\t"
++                         : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"+r"(s) /* s is read AND written */
++                         : "r"(pix) : "memory"); /* the asm loads through pix, so earlier stores must be visible */
++      pix += line_size;
++    }
++    __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "+r" (s) ); /* top + bottom halfword; s is also an input */
++
++    return s;
++}
++
++
++//#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
++//#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
++//#define H264_WEIGHT(W,H) \
++//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
++//    int attribute_unused x, y; \
++//    offset <<= log2_denom; \
++//    if(log2_denom) offset += 1<<(log2_denom-1); \
++//    for(y=0; y<H; y++, block += stride){ \
++//      uint32_t tmp0, tmp1;
++//      if(W==2) { \
++//        asm volatile ( "ld.ub\t%[tmp0], %[block][0]\n" \
++//                       "ld.ub\t%[tmp1], %[block][1]\n" \
++//                       "mulhh.w\t%[tmp0], %[tmp0]:b, %[weight]:b\n" \
++//                       "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
++//                       "asr\t%[tmp0], %[log2_denom]\n" \
++//                       "asr\t%[tmp1], %[log2_denom]\n" \
++//                       "satu\t%[tmp0] >> 0, 8\n" \
++//                       "satu\t%[tmp1] >> 0, 8\n" \
++//                       "st.b\t%[block][0], %[tmp0]\n" \
++//                       "st.b\t%[block][1], %[tmp1]\n" \
++//                       : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
++//                       : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \     
++//      } else if ( W==4 ) { \
++//        asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \
++//                       "punpckub.h\t%[tmp1], %[tmp0]:t\n" \
++//                       "punpckub.h\t%[tmp0], %[tmp0]:b\n" \
++//                       "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \
++//                       "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
++//                       "asr\t%[tmp0], %[log2_denom]\n" \
++//                       "asr\t%[tmp1], %[log2_denom]\n" \
++//                       "satu\t%[tmp0] >> 0, 8\n" \
++//                       "satu\t%[tmp1] >> 0, 8\n" \
++//                       "st.b\t%[block][0], %[tmp0]\n" \
++//                       "st.b\t%[block][1], %[tmp1]\n" \
++//                       : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
++//                       : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \     
++//
++//
++//
++//        if(W==4) continue; \
++//        op_scale1(4); \
++//        op_scale1(5); \
++//        op_scale1(6); \
++//        op_scale1(7); \
++//        if(W==8) continue; \
++//        op_scale1(8); \
++//        op_scale1(9); \
++//        op_scale1(10); \
++//        op_scale1(11); \
++//        op_scale1(12); \
++//        op_scale1(13); \
++//        op_scale1(14); \
++//        op_scale1(15); \
++//    } \
++//} \
++//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
++//    int attribute_unused x, y; \
++//    int offset = (offsets + offsetd + 1) >> 1; \
++//    offset = ((offset << 1) + 1) << log2_denom; \
++//    for(y=0; y<H; y++, dst += stride, src += stride){ \
++//        op_scale2(0); \
++//        op_scale2(1); \
++//        if(W==2) continue; \
++//        op_scale2(2); \
++//        op_scale2(3); \
++//        if(W==4) continue; \
++//        op_scale2(4); \
++//        op_scale2(5); \
++//        op_scale2(6); \
++//        op_scale2(7); \
++//        if(W==8) continue; \
++//        op_scale2(8); \
++//        op_scale2(9); \
++//        op_scale2(10); \
++//        op_scale2(11); \
++//        op_scale2(12); \
++//        op_scale2(13); \
++//        op_scale2(14); \
++//        op_scale2(15); \
++//    } \
++//}
++
++
++
++/* Per-byte test of |a - b| < compare on packed unsigned bytes. A result byte
++   is zero wherever the absolute difference is >= <compare>, non-zero otherwise. */
++#define PABS_DIFF_LESS_THAN( a, b, compare) \
++  ({  uint32_t pabs_ab_, pabs_ba_, pabs_res_; \
++      asm ( \
++      /* Saturating a - b and b - a: one of the two is zero in every byte */  \
++        "psubs.ub\t%[tmp], %[opa], %[opb]\n"  \
++        "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \
++        "or\t%[tmp], %[tmp2]\n" /* tmp = ABS ( a - b ) */ \
++        /* compare - ABS saturates to 0 in each byte where ABS >= compare */ \
++        "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \
++        : [tmp] "=&r"(pabs_ab_), [tmp2] "=&r"(pabs_ba_), [mask] "=&r"(pabs_res_)  \
++        : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare)  ); \
++      pabs_res_; })
++
++/* 
++   Set all bytes containing zero in <value> to 255 and the rest to zero.
++   A saturating add of 254 turns every non-zero byte into 255 (zero bytes
++   become 254); a wrapping add of 1 then yields 255 for bytes that were zero
++   and 0 for the rest. The output is earlyclobber ("=&r") because it is
++   written by the first instruction while all_ones is still needed. */ 
++#define SET_ALL_BITS_IN_ZERO_BYTES(value) \
++  ({  uint32_t __tmp__; \
++      asm ( \
++        "padds.ub\t%[tmp], %[val], %[max_minus_one]\n"  \
++        "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \
++        : [tmp] "=&r"(__tmp__) \
++        : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \
++      __tmp__; })
++
++#define  PACKW_SH(upper, lower) /* one packw.sh: <upper> and <lower> words packed into one register */ \
++  ({  uint32_t packw_out_; \
++      asm ( \
++        "packw.sh\t%[tmp], %[u], %[l]\n"  \
++        : [tmp] "=r"(packw_out_) \
++        : [u] "r"(upper), [l] "r"(lower) ); \
++      packw_out_; })
++
++#define  PACKSH_UB(upper, lower) /* pack the halfwords of <upper>,<lower> into unsigned-saturated bytes */ \
++  ({  uint32_t __tmp__; \
++      asm ( \
++        "packsh.ub\t%[tmp], %[u], %[l]\n" /* was packsh.sb, whose signed saturation caps bytes at 127 */ \
++        : [tmp] "=r"(__tmp__) \
++        : [u] "r"(upper), [l] "r"(lower) ); \
++      __tmp__; })
++
++static  void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
++{
++  int i;
++  
++  if ( alpha == 0 )
++    return;
++  
++  alpha = PACKW_SH(alpha, alpha);
++  alpha = PACKSH_UB(alpha, alpha);
++  beta = PACKW_SH(beta, beta);
++  beta = PACKSH_UB(beta, beta);
++
++  for( i = 0; i < 4; i++ ) {
++    uint32_t p0, p1, p2, q0, q1, q2;
++    uint32_t mask, mask2;
++    uint32_t tmp, tmp2, tmp3, tmp4;
++
++    if( tc0[i] < 0 ) {
++      pix += 4;
++      continue;
++    }
++
++/*    for( d = 0; d < 4; d++ ) {
++       const int p0 = pix[-1*stride];
++       const int p1 = pix[-2*stride];
++       const int p2 = pix[-3*stride];
++       const int q0 = pix[0];
++       const int q1 = pix[1*stride];
++       const int q2 = pix[2*stride];
++      
++       if( ABS( p0 - q0 ) < alpha &&
++           ABS( p1 - p0 ) < beta &&
++           ABS( q1 - q0 ) < beta ) { */
++
++    p0 = LD32(pix - stride);
++    p1 = LD32(pix - 2*stride);
++    q0 = LD32(pix);
++    q1 = LD32(pix + stride);
++    
++    /* Check which of the columns should be filtered, if any. */
++    mask = PABS_DIFF_LESS_THAN(p0, q0, alpha);
++    mask |= PABS_DIFF_LESS_THAN(p1, p0, beta);
++    mask |= PABS_DIFF_LESS_THAN(q1, q0, beta);
++        
++    if ( !mask )
++      continue;
++        
++    mask = SET_ALL_BITS_IN_ZERO_BYTES(mask);
++    
++
++    int tc =  PACKW_SH(tc0[i], tc0[i]);
++    int tc0_p = tc;
++    int tc0_m = PACKW_SH(-tc0[i], -tc0[i]);
++
++    /*
++      int i_delta;
++      if( ABS( p2 - p0 ) < beta ) {
++      pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
++      tc++;
++      }*/ 
++
++    p2 = LD32(pix - 3*stride);
++    mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask;
++    
++    if ( mask2 ){
++      mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
++      asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
++           "paddh.ub\t%[tmp], %[tmp], %[p2]\n"
++           "punpckub.h\t%[tmp2], %[tmp]:t\n"
++           "punpckub.h\t%[tmp], %[tmp]:b\n"
++           "punpckub.h\t%[tmp3], %[p1]:t\n"
++           "punpckub.h\t%[tmp4], %[p1]:b\n"
++           "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
++           "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
++           "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
++           "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
++           "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
++           "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
++           "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
++           "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
++           "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
++           "andn\t%[tmp], %[mask2]\n"
++           "and\t%[tmp2], %[q1], %[mask2]\n"
++           "or\t%[tmp], %[tmp2]\n"
++           : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
++           [tmp4]"=&r"(tmp4)
++           : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p),
++           [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
++      ST32(pix - 2*stride, tmp);      
++      tc += 0x00010001;
++    }
++
++        
++    q2 = LD32(pix + 2*stride);
++
++    /*
++      if( ABS( q2 - q0 ) < beta ) {
++      pix[   stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
++      tc++;
++      }
++    */
++    mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask;
++
++    if ( mask2 ){
++      mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
++      asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
++           "paddh.ub\t%[tmp], %[tmp], %[q2]\n"
++           "punpckub.h\t%[tmp2], %[tmp]:t\n"
++           "punpckub.h\t%[tmp], %[tmp]:b\n"
++           "punpckub.h\t%[tmp3], %[q1]:t\n"
++           "punpckub.h\t%[tmp4], %[q1]:b\n"
++           "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
++           "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
++           "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
++           "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
++           "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
++           "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
++           "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
++           "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
++           "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
++           "andn\t%[tmp], %[mask2]\n"
++           "and\t%[tmp2], %[q1], %[mask2]\n"
++           "or\t%[tmp], %[tmp2]\n"
++           : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
++           [tmp4]"=&r"(tmp4)
++           : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p),
++           [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
++      ST32(pix + stride, tmp); 
++      tc += 0x00010001;
++    }
++
++    uint32_t old_p0 = p0;
++    uint32_t old_q0 = q0;
++    
++    /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
++       pix[-stride] = clip_uint8( p0 + i_delta );   
++       pix[0]        = clip_uint8( q0 - i_delta );  */ 
++    
++    asm (
++         /* Check if the two upper pixels should be filtered */
++         "lsr\t%[tmp], %[inv_mask], 16\n"
++         "breq\t0f\n"
++
++         "punpckub.h\t%[tmp], %[p1]:t\n"
++         "punpckub.h\t%[tmp2], %[q1]:t\n"
++         
++         /* p1 - q1 */
++         "psub.h\t%[tmp], %[tmp], %[tmp2]\n"
++
++         "punpckub.h\t%[tmp3], %[q0]:t\n"
++         "punpckub.h\t%[tmp4], %[p0]:t\n"
++
++         /* q0 - p0 */
++         "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n"
++         
++         /* (q0 - p0) << 2 */
++         "plsl.h\t%[tmp2], %[tmp2], 2\n"
++
++         /* ((q0 - p0) << 2) + (p1 - q1) */
++         "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
++
++         "mov\t%[tmp], 0x00040004\n"
++         /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
++         "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
++
++         /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
++         "pasr.h\t%[tmp2], %[tmp2], 3\n"
++
++         "mov\t%[tmp], 0\n"
++         "psub.h\t%[tmp], %[tmp], %[tc]\n"
++
++         /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
++         "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
++         "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
++
++
++         /* pix[-stride] = clip_uint8( p0 + i_delta ); */
++         "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n"
++
++         
++         /* pix[0]        = clip_uint8( q0 - i_delta ); */
++         "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n"
++
++         /* Check if the two lower pixels should be filtered */
++         "lsl\t%[tmp2], %[inv_mask], 16\n"
++         "breq\t1f\n"
++
++         "0:\n"
++         "punpckub.h\t%[p1], %[p1]:b\n"
++         "punpckub.h\t%[q1], %[q1]:b\n"
++
++         /* p1 - q1 */
++         "psub.h\t%[p1], %[p1], %[q1]\n"
++
++         "punpckub.h\t%[q0], %[q0]:b\n"
++         "punpckub.h\t%[p0], %[p0]:b\n"
++
++         /* q0 - p0 */
++         "psub.h\t%[tmp2], %[q0], %[p0]\n"
++
++         /* (q0 - p0) << 2 */
++         "plsl.h\t%[tmp2], %[tmp2], 2\n"
++
++         /* ((q0 - p0) << 2) + (p1 - q1) */
++         "padd.h\t%[tmp2], %[tmp2], %[p1]\n"
++
++         "mov\t%[q1], 0x00040004\n"
++         /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
++         "padd.h\t%[tmp2], %[tmp2], %[q1]\n"
++
++         /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
++         "pasr.h\t%[tmp2], %[tmp2], 3\n"
++ 
++         /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
++         "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
++         "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
++
++         /* pix[-stride] = clip_uint8( p0 + i_delta ); */
++         "padd.h\t%[p0], %[p0], %[tmp2]\n"
++
++         /* pix[0]        = clip_uint8( q0 - i_delta ); */
++         "psub.h\t%[q0], %[q0], %[tmp2]\n"
++
++         "1:\n"
++         "packsh.ub\t%[p0], %[tmp4], %[p0]\n"
++         "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n"
++
++         : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
++         [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1)
++         : [tc]"r"(tc), [inv_mask]"r"(~mask));
++    
++    ST32(pix - stride, (mask & old_p0) | (p0 & ~mask));
++    ST32(pix, (mask & old_q0) | (q0 & ~mask));
++    
++  }
++  pix += 1;
++}
++
++
++
++
++#ifdef CHECK_DSP_FUNCS_AGAINST_C
++
++void dump_block8(uint8_t *block, int line_size, int h){
++  /* Log an 8-pixel-wide, h-row block; successive rows are line_size bytes apart. */
++  int row, col;
++  for ( row = 0; row < h ; row++ ){
++    const uint8_t *line = block + row*line_size;
++    av_log(NULL, AV_LOG_ERROR, "\t");
++    for ( col = 0; col < 8 ; col++ )
++      av_log(NULL, AV_LOG_ERROR, "%d ", line[col]);
++    av_log(NULL, AV_LOG_ERROR, "\n");
++  }
++}
++
++void dump_block4(uint8_t *block, int line_size, int h){
++  /* Log a 4-pixel-wide, h-row block; successive rows are line_size bytes apart. */
++  int row, col;
++  for ( row = 0; row < h ; row++ ){
++    const uint8_t *line = block + row*line_size;
++    av_log(NULL, AV_LOG_ERROR, "\t");
++    for ( col = 0; col < 4 ; col++ )
++      av_log(NULL, AV_LOG_ERROR, "%d ", line[col]);
++    av_log(NULL, AV_LOG_ERROR, "\n");
++  }
++}
++
++void dump_block(uint8_t *block, int line_size, int h, int w){
++  /* Log a w-pixel-wide, h-row block; successive rows are line_size bytes apart. */
++  int row, col;
++  for ( row = 0; row < h ; row++ ){
++    const uint8_t *line = block + row*line_size;
++    av_log(NULL, AV_LOG_ERROR, "\t");
++    for ( col = 0; col < w ; col++ )
++      av_log(NULL, AV_LOG_ERROR, "%d ", line[col]);
++    av_log(NULL, AV_LOG_ERROR, "\n");
++  }
++}
++
++void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++                  int h, char *name, int max_dev){
++  int x, y; /* 8 columns x h rows; the first pixel off by more than max_dev logs both blocks and exits */
++  for ( x = 0; x < 8 ; x++ ){
++    for ( y = 0; y < h ; y++ ){
++      const int got = test[x + line_size_test*y];
++      const int want = correct[x + line_size_correct*y];
++      if ( got - want <= max_dev && want - got <= max_dev )
++        continue;
++      av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
++             x, y, got, want);
++      av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
++      dump_block8(test, line_size_test, h);
++      av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
++      dump_block8(correct, line_size_correct, h);
++      exit(1);
++    }
++  }
++}
++
++void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, 
++                  int h, char *name, int max_dev){
++  int i,j; /* i = column (0..3), j = row (0..h-1); compares a 4-wide block against its reference */
++  for ( i = 0; i < 4 ; i++ ){
++    for ( j = 0; j < h ; j++ ){
++      int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
++      diff = diff < 0 ? -diff : diff;
++      if ( diff > max_dev ){ /* first pixel deviating by more than max_dev ends the run */
++        av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n", 
++               i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);        
++        av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
++        dump_block4(test, line_size_test, h); /* the block is 4 wide; dump_block8 printed 4 columns outside it */
++        av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
++        dump_block4(correct, line_size_correct, h);         
++        exit(1);
++      }
++    }
++  }  
++}
++
++void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
++                 int h, int width, char *name, int max_dev){
++  int x, y; /* width columns x h rows; the first pixel off by more than max_dev logs both blocks and exits */
++  for ( x = 0; x < width ; x++ ){
++    for ( y = 0; y < h ; y++ ){
++      const int got = test[x + line_size_test*y];
++      const int want = correct[x + line_size_correct*y];
++      if ( got - want <= max_dev && want - got <= max_dev )
++        continue;
++      av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
++             x, y, got, want);
++      av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
++      dump_block(test, line_size_test, h, width);
++      av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
++      dump_block(correct, line_size_correct, h, width);
++      exit(1);
++    }
++  }
++}
++
++void dump_dct_block(DCTELEM *block){
++  /* Log the 64 coefficients as 8 tab-indented rows of 8 hex values. */
++  int n;
++  for ( n = 0; n < 64 ; n++ ){
++    if ( (n & 7) == 0 )
++      av_log(NULL, AV_LOG_ERROR, "\t");
++    av_log(NULL, AV_LOG_ERROR, "0x%x ", block[n]);
++    if ( (n & 7) == 7 )
++      av_log(NULL, AV_LOG_ERROR, "\n");
++  }
++}
++
++void test_idct_avr32(DCTELEM *block){
++  DCTELEM testBlock[64];
++  int i, j;
++  /* Checking wrapper: runs idct_avr32 on block and simple_idct on a transposed copy, then compares. */
++  /* Copy transposed block to testBlock */ 
++  for ( i = 0; i < 8 ; i++ ){
++    for ( j = 0; j < 8 ; j++ ){
++      testBlock[i + 8*j] = block[j + i*8]; 
++    }
++  }
++  /* Both transforms work in place: block is overwritten with the AVR32 result. */
++  idct_avr32(block);
++  simple_idct(testBlock); /* array decays to DCTELEM *; &testBlock had type DCTELEM (*)[64] */
++  /* Any differing coefficient is fatal: log both blocks and exit. */
++  for ( i = 0; i < 64 ; i++ ){
++    if ( block[i] != testBlock[i] ){
++      av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n");
++      dump_dct_block(block); 
++      av_log(NULL, AV_LOG_ERROR, "But should be equal to the transposed of:\n");
++      dump_dct_block(testBlock);         
++      exit(1);
++    }
++  }
++}
++
++void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){
++  uint8_t testBlock[64];
++  DCTELEM blockCopy[64];
++  int i, j;
++  /* Checking wrapper: idct_put_avr32 writes dest, simple_idct_put writes a packed 8x8 buffer. */
++  /* Copy transposed block to blockCopy */ 
++  for ( i = 0; i < 8 ; i++ ){
++    for ( j = 0; j < 8 ; j++ ){
++      blockCopy[i + 8*j] = block[j + i*8]; 
++    }
++  }
++
++  idct_put_avr32(dest, line_size, block);
++  simple_idct_put(testBlock, 8, blockCopy); /* array decays to uint8_t *; &testBlock had type uint8_t (*)[64] */
++  /* A per-pixel deviation of up to 1 is accepted (max_dev argument). */
++  check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1);
++}
++
++
++void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){
++  uint8_t testBlock[64];
++  DCTELEM blockCopy[64];
++  int i, j;
++  /* Checking wrapper: both routines add onto the same starting pixels, so dest is snapshotted first. */
++  /* Copy dest to testBlock */ 
++  for ( i = 0; i < 8 ; i++ ){
++    for ( j = 0; j < 8 ; j++ ){
++      testBlock[i + 8*j] = dest[i + j*line_size]; 
++    }
++  }
++
++  /* Copy transposed block to blockCopy */ 
++  for ( i = 0; i < 8 ; i++ ){
++    for ( j = 0; j < 8 ; j++ ){
++      blockCopy[i + 8*j] = block[j + i*8]; 
++    }
++  }
++
++  idct_add_avr32(dest, line_size, block);
++  simple_idct_add(testBlock, 8, blockCopy); /* array decays to uint8_t *; &testBlock had type uint8_t (*)[64] */
++  /* A per-pixel deviation of up to 1 is accepted (max_dev argument). */
++  check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1);
++}
++
++void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
++  uint8_t testBlock[16];
++  DCTELEM blockCopy[16];
++  int i, j;
++  /* Checking wrapper for the 4x4 H.264 IDCT-add: C version on dest/block, AVR32 version on packed copies. */
++  /* Copy dest to testBlock */ 
++  for ( i = 0; i < 4 ; i++ ){
++    for ( j = 0; j < 4 ; j++ ){
++      testBlock[i + 4*j] = dest[i + j*stride]; 
++    }
++  }
++
++  /* Copy block to blockCopy unchanged (no transpose is applied here) */ 
++  for ( i = 0; i < 16 ; i++ ){
++    blockCopy[i] = block[i]; 
++  }
++
++  ff_h264_idct_add_c(dest, block, stride);
++  /* testBlock is packed, hence stride 4 */
++  h264_idct_add_avr32(testBlock, blockCopy, 4);
++  /* max_dev 0: the two results must match exactly */
++  check_block(dest, testBlock, stride, 4, 4, 4, "h264_idct_add", 0);
++}
++
++void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
++  uint8_t packedDest[8*8];
++  DCTELEM coeffs[8*8];
++  int row, col, n;
++
++  /* Snapshot the 8x8 destination into a packed (stride 8) buffer. */
++  for ( row = 0; row < 8 ; row++ )
++    for ( col = 0; col < 8 ; col++ )
++      packedDest[col + 8*row] = dest[col + row*stride];
++
++  /* Give the second transform its own copy of the coefficients. */
++  for ( n = 0; n < 8*8 ; n++ )
++    coeffs[n] = block[n];
++
++  /* C version runs on the caller's buffers, AVR32 version on the copies. */
++  ff_h264_idct8_add_c(dest, block, stride);
++  h264_idct8_add_avr32(packedDest, coeffs, 8);
++
++  /* max_dev 0: the two results must match exactly. */
++  check_block(dest, packedDest, stride, 8, 8, 8, "h264_idct8_add", 0);
++
++}
++
++void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block, 
++                       const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){
++  uint8_t *testBlock, *testBlock2; /* packed copy of the source / destination of the reference run */
++  int i, j;
++  int input_v_size = h + in_v_size; /* source rows copied: h plus the extra rows the caller asks for */
++  int input_h_size = 8 + in_h_size; /* source columns copied; also the stride of both packed buffers */
++  /* NOTE(review): test_pixels_funcs also routes the 16-wide variants here, but buffers and check cover 8 columns. */
++  testBlock = alloca(input_h_size*input_v_size);
++  testBlock2 = alloca(input_h_size*input_v_size);
++  /* Pack the source and seed the reference destination: avg-style functions read what is already there. */
++  for ( j = 0; j < input_v_size ; j++ ){
++    for ( i = 0; i < input_h_size ; i++ ){
++      testBlock[i + input_h_size*j] = pixels[i + j*line_size];
++      /* seed only the 8 x h area that check_block8 compares below */
++      if ( i < 8 && j < h )
++        testBlock2[i + input_h_size*j] = block[i + j*line_size];
++    }
++  }
++  test(block, pixels, line_size, h);
++  correct(testBlock2, testBlock, input_h_size, h);
++  check_block8(block, testBlock2, line_size, input_h_size, h, name, 0);
++}
++
++void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst,
++                               uint8_t *src, int stride, int h, int w, int x, int y, char *name){
++  /* Runs test on the caller's buffers and correct on packed copies, then requires
++   * the w x h results to match exactly. The copies hold (w+1) x (h+1) source pixels
++   * and use a row stride rounded up to a multiple of 4. */
++  const int ref_rows = h + 1;
++  const int ref_stride = ((w + 1) + 3) & ~3;
++  uint8_t *ref_src = alloca(ref_stride*ref_rows);
++  uint8_t *ref_dst = alloca(ref_stride*ref_rows);
++  int row, col;
++
++  /* Source: one extra row and one extra column beyond the w x h block. */
++  for ( row = 0; row <= h ; row++ )
++    for ( col = 0; col <= w ; col++ )
++      ref_src[col + ref_stride*row] = src[col + row*stride];
++
++  /* Destination: start the reference run from the same pixels as dst. */
++  for ( row = 0; row < h ; row++ )
++    for ( col = 0; col < w ; col++ )
++      ref_dst[col + ref_stride*row] = dst[col + row*stride];
++
++  /* Function under test first, then the reference on the copies. */
++  test(dst, src, stride, h, x, y);
++  correct(ref_dst, ref_src, ref_stride, h, x, y);
++
++  /* max_dev 0: any differing pixel logs both blocks and exits. */
++  check_block(dst, ref_dst, stride, ref_stride, h, w, name, 0);
++}
++
++void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst,
++                        uint8_t *src, int stride, int size, char *name){
++  /* Runs correct on the caller's buffers and test on packed copies, then requires
++   * the size x size results to match exactly. The source copy carries a border of
++   * 4 pixels on every side, so ref_src points 4 rows and 4 columns into its buffer. */
++  const int ref_stride = size + 8;
++  uint8_t *ref_src = alloca(ref_stride*(size+8)) + ref_stride*4 + 4;
++  uint8_t *ref_dst = alloca(ref_stride*size);
++  int row, col;
++
++  /* Source block including the 4-pixel border (indices -4 .. size+3). */
++  for ( row = -4; row < size+4 ; row++ )
++    for ( col = -4; col < size+4 ; col++ )
++      ref_src[col + ref_stride*row] = src[col + row*stride];
++
++  /* Destination: start the run on the copy from the same pixels as dst. */
++  for ( row = 0; row < size ; row++ )
++    for ( col = 0; col < size ; col++ )
++      ref_dst[col + ref_stride*row] = dst[col + row*stride];
++
++  /* Reference writes the real dst; the function under test writes the copy. */
++  correct(dst, src, stride);
++  test(ref_dst, ref_src, ref_stride);
++
++  /* max_dev 0: any differing pixel logs both blocks and exits. */
++  check_block(ref_dst, dst, ref_stride, stride, size, size, name, 0);
++}
++
++
++#define test_pixels_funcs(PFX, NUM ) /* emits test_<PFX>_pixels<NUM>_avr32 plus its _h, _v and _hv variants */ \
++void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
++  test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \
++                         block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \
++void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
++  test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \
++                         block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \
++void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
++  test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \
++                         block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \
++void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
++  test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \
++                         block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); }
++/* Last two arguments are in_h_size/in_v_size: (0,0) plain, (1,0) _h, (0,1) _v, (1,1) _hv. The label omits NUM. */
++test_pixels_funcs(put, 8);
++test_pixels_funcs(put_no_rnd, 8);
++test_pixels_funcs(put, 16); /* NOTE(review): 16-wide variants still go through test_put_pixels_funcs8 */
++test_pixels_funcs(put_no_rnd, 16);
++/* Same four wrappers for the avg and avg_no_rnd families. */
++test_pixels_funcs(avg, 8);
++test_pixels_funcs(avg_no_rnd, 8);
++test_pixels_funcs(avg, 16);
++test_pixels_funcs(avg_no_rnd, 16);
++
++#define test_h264_chroma_mc_funcs(PFX, NUM ) /* shares its name with the function above: a macro is not re-expanded inside its own body, so the inner call reaches that function */ \
++void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \
++  test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \
++                         dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \
++
++test_h264_chroma_mc_funcs(put, 2); /* NUM is passed on as the block width w */
++test_h264_chroma_mc_funcs(put, 4);
++test_h264_chroma_mc_funcs(put, 8);
++test_h264_chroma_mc_funcs(avg, 2);
++test_h264_chroma_mc_funcs(avg, 4);
++test_h264_chroma_mc_funcs(avg, 8);
++
++#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) /* emits test_<PFX><NUM>_<TYPE>_pico, which checks the _pico function against the _c one */ \
++void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \
++  test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \
++                         dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); } 
++/* From here on test_qpel_mc_funcs is also a 2-argument macro: one wrapper for each of mc00..mc33. */
++#define test_qpel_mc_funcs(PFX, NUM) \
++  test_qpel_mc_funcs_type(PFX, NUM, mc00);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc10);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc20);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc30);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc01);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc11);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc21);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc31);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc02);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc12);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc22);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc32);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc03);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc13);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc23);\
++  test_qpel_mc_funcs_type(PFX, NUM, mc33)
++/* The inner test_qpel_mc_funcs(...) call in each wrapper is not re-expanded, so it reaches the function defined earlier. */
++test_qpel_mc_funcs(put_h264_qpel, 4);
++test_qpel_mc_funcs(put_h264_qpel, 8);
++test_qpel_mc_funcs(put_h264_qpel, 16);
++test_qpel_mc_funcs(avg_h264_qpel, 4);
++test_qpel_mc_funcs(avg_h264_qpel, 8);
++test_qpel_mc_funcs(avg_h264_qpel, 16);
++
++
++#define dspfunc(PFX, IDX, NUM) /* fills c->PFX_pixels_tab[IDX][0..15]; function mcXY goes to entry X + 4*Y. DSP_FUNC_NAME is defined elsewhere in the file */ \
++    c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
++    c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
++    c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
++    c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
++    c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
++    c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
++    c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
++
++#endif
++
++void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx)
++{
++  /* Installs the AVR32 routines into the function pointers and tables of c. avctx is not read in this function. */
++  /* H264 */
++  /* Chroma MC through PICO: the condition is hard-wired to 0, so these assignments never execute. */
++  if ( 0 /*avr32_use_pico*/ ){
++    c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico);
++    c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico);
++    c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico);
++    
++    c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico);
++    c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico);
++    c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico);
++  }
++  /* Quarter-pel table filler: function mcXY goes to entry X + 4*Y of c->PFX_pixels_tab[IDX]. */
++#define dspfunc(PFX, IDX, NUM) \
++    c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
++    c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
++    c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
++    c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
++    c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
++    c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
++    c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
++    c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
++  /* The H.264 qpel tables are only overridden when avr32_use_pico is non-zero; IDX 0/1/2 = block size 16/8/4. */
++  if ( avr32_use_pico ){
++    dspfunc(put_h264_qpel, 0, 16);
++    dspfunc(put_h264_qpel, 1, 8);
++    dspfunc(put_h264_qpel, 2, 4);
++    dspfunc(avg_h264_qpel, 0, 16);
++    dspfunc(avg_h264_qpel, 1, 8);
++    dspfunc(avg_h264_qpel, 2, 4);
++  }
++  /* IDCT entry points are installed unconditionally. */
++  c->idct_put= DSP_FUNC_NAME(idct_put_avr32);
++  c->idct_add= DSP_FUNC_NAME(idct_add_avr32);
++  c->idct    = DSP_FUNC_NAME(idct_avr32);
++  c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32);
++  c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32);
++
++  /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/
++  /* NOTE(review): presumably the AVR32 IDCT expects transposed coefficients (the test wrappers transpose their input) - confirm. */
++  c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
++  /* fdct and clear_blocks are assigned directly, without going through DSP_FUNC_NAME. */
++  c->fdct = fdct_avr32;
++
++  c->clear_blocks = clear_blocks_avr32;
++  /* Reuse the dspfunc name for the pixel tables: entries 0..3 = plain, _h, _v, _hv; IDX 0 = 16 wide, IDX 1 = 8 wide. */
++#undef dspfunc  
++#define dspfunc(PFX, IDX, NUM) \
++    c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 );     \
++    c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32);  \
++    c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32);  \
++    c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32)
++
++    dspfunc(put, 0, 16);
++    dspfunc(put_no_rnd, 0, 16);
++    dspfunc(put, 1, 8);
++    dspfunc(put_no_rnd, 1, 8);
++
++    dspfunc(avg, 1, 8);
++    dspfunc(avg_no_rnd, 1, 8);
++    dspfunc(avg, 0, 16);
++    dspfunc(avg_no_rnd, 0, 16);
++#undef dspfunc
++
++}
++
++
++
++#if 0 /* compiled out: empty main() stub */
++int main(int argc, char *argv[]){
++  
++
++}
++#endif
++
+diff --git a/libavcodec/avr32/fdct.S b/libavcodec/avr32/fdct.S
+new file mode 100644
+index 0000000..be45b86
+--- /dev/null
++++ b/libavcodec/avr32/fdct.S
+@@ -0,0 +1,541 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++//********************************************************** 
++//* 2-D fDCT, Based on: 		          		  *
++//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical *
++//* Fast 1-D DCT Algorithms with 11 Multiplications",      *
++//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal    *
++//* Processing 1989 (ICASSP '89), pp. 988-991.             *
++//*							  *
++//* Fixed point implementation optimized for the AVR-II    *
++//* instruction set. If a table is used for the            *
++//* coefficients, two of them can be loaded at a time      *
++//* with one word load, reducing the number of loads.      *
++//*							  *
++//*							  *
++//**********************************************************
++
++
++/* This routine is a slow-but-accurate integer implementation of the
++ * forward DCT (Discrete Cosine Transform). Taken from the IJG software
++ *
++ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
++ * on each column.  Direct algorithms are also available, but they are
++ * much more complex and seem not to be any faster when reduced to code.
++ *
++ * This implementation is based on an algorithm described in
++ *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
++ *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
++ *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
++ * The primary algorithm described there uses 11 multiplies and 29 adds.
++ * We use their alternate method with 12 multiplies and 32 adds.
++ * The advantage of this method is that no data path contains more than one
++ * multiplication; this allows a very simple and accurate implementation in
++ * scaled fixed-point arithmetic, with a minimal number of shifts.
++ *
++ * The poop on this scaling stuff is as follows:
++ *
++ * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
++ * larger than the true DCT outputs.  The final outputs are therefore
++ * a factor of N larger than desired; since N=8 this can be cured by
++ * a simple right shift at the end of the algorithm.  The advantage of
++ * this arrangement is that we save two multiplications per 1-D DCT,
++ * because the y0 and y4 outputs need not be divided by sqrt(N).
++ * In the IJG code, this factor of 8 is removed by the quantization step
++ * (in jcdctmgr.c), here it is removed.
++ *
++ * We have to do addition and subtraction of the integer inputs, which
++ * is no problem, and multiplication by fractional constants, which is
++ * a problem to do in integer arithmetic.  We multiply all the constants
++ * by CONST_SCALE and convert them to integer constants (thus retaining
++ * CONST_BITS bits of precision in the constants).  After doing a
++ * multiplication we have to divide the product by CONST_SCALE, with proper
++ * rounding, to produce the correct output.  This division can be done
++ * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
++ * as long as possible so that partial sums can be added together with
++ * full fractional precision.
++ *
++ * The outputs of the first pass are scaled up by PASS1_BITS bits so that
++ * they are represented to better-than-integral precision.  These outputs
++ * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word
++ * with the recommended scaling.  (For 12-bit sample data, the intermediate
++ * array is INT32 anyway.)
++ *
++ * To avoid overflow of the 32-bit intermediate results in pass 2, we must
++ * have 8 + CONST_BITS + PASS1_BITS <= 26.  Error analysis
++ * shows that the values given below are the most effective.
++ *
++ * We can gain a little more speed, with a further compromise in accuracy,
++ * by omitting the addition in a descaling shift.  This yields an incorrectly
++ * rounded result half the time...
++ */
++
++	.global		fdct_avr32
++
++
++
++#define CONST_BITS  13	/* fractional bits kept in the FIX_* constants below */
++#define PASS1_BITS  2	/* extra scaling applied to the outputs of the first (row) pass */
++/* Each FIX_x_y value is x.y * 2^CONST_BITS, rounded to the nearest integer. */
++#define FIX_0_298631336  2446	/* FIX(0.298631336) */
++#define FIX_0_390180644  3196	/* FIX(0.390180644) */
++#define FIX_0_541196100  4433	/* FIX(0.541196100) */
++#define FIX_0_765366865  6270	/* FIX(0.765366865) */
++#define FIX_0_899976223  7373	/* FIX(0.899976223) */
++#define FIX_1_175875602  9633	/* FIX(1.175875602) */
++#define FIX_1_501321110  12299	/* FIX(1.501321110) */
++#define FIX_1_847759065  15137	/* FIX(1.847759065) */
++#define FIX_1_961570560  16069	/* FIX(1.961570560) */
++#define FIX_2_053119869  16819	/* FIX(2.053119869) */
++#define FIX_2_562915447  20995	/* FIX(2.562915447) */
++#define FIX_3_072711026  25172	/* FIX(3.072711026) */
++
++
++/*
++ * Perform an integer forward DCT on one block of samples.
++ */
++
++//void
++//fdct_int32(short *const block)
++//{
++//	int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
++//	int tmp10, tmp11, tmp12, tmp13;
++//	int z1, z2, z3, z4, z5;
++//	short *blkptr;
++//	int *dataptr;
++//	int data[64];
++//	int i;
++//
++//       /* Pass 1: process rows. */
++//       /* Note results are scaled up by sqrt(8) compared to a true DCT; */
++//       /* furthermore, we scale the results by 2**PASS1_BITS. */
++//
++//	dataptr = data;
++//	blkptr = block;
++
++	.text
++fdct_avr32:
++	pushm		r0-r3, r4-r7, lr
++#define		loop_ctr	r0
++#define		blkptr		r12
++#define		x0		r1
++#define		x1		r2
++#define		x2		r3
++#define		x3		r4
++#define		x4		r5
++#define		x5		r6
++#define		x6		r7
++#define		x7		r8
++#define		tmp0		r5
++#define		tmp7		r2
++#define		tmp1		r3
++#define		tmp6		r4
++#define		tmp2		r9
++#define		tmp5		r8
++#define		tmp3		r7
++#define		tmp4		r6
++
++
++	mov		loop_ctr, 8
++//	for (i = 0; i < 8; i++) {
++ROW_LOOP:
++// Pass 1 for one row; the results are stored back over the same row through blkptr.
++	ldm		blkptr, r1, r2, r3, r4 	// r4 = {blk[0],blk[1]}, r3 = {blk[2],blk[3]}, r2 = {blk[4],blk[5]}, r1 = {blk[6],blk[7]}
++
++//		tmp2 = blkptr[2] + blkptr[5];
++//		tmp3 = blkptr[3] + blkptr[4];
++	paddx.h		r5, r3, r2  	// r5 = {tmp2, tmp3}
++//		tmp5 = blkptr[2] - blkptr[5];
++//		tmp4 = blkptr[3] - blkptr[4];
++	psubx.h		r6, r3, r2 	// r6 = {tmp5, tmp4}
++//		tmp0 = blkptr[0] + blkptr[7];
++//		tmp1 = blkptr[1] + blkptr[6];
++	paddx.h		r2, r4, r1	// r2 = {tmp0, tmp1}
++//		tmp7 = blkptr[0] - blkptr[7];
++//		tmp6 = blkptr[1] - blkptr[6];
++	psubx.h		r3, r4, r1	// r3 = {tmp7, tmp6}
++
++//		/* Even part per LL&M figure 1 --- note that published figure is faulty;
++//		 * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
++//		 */
++// NOTE(review): the aliases defined next are not referenced by the instructions below.
++#define		tmp10		r1
++#define		tmp13		r5
++#define		tmp11		r7
++#define		tmp12		r3
++#define		z1		r9
++
++//		tmp10 = tmp0 + tmp3;
++//		tmp13 = tmp0 - tmp3;
++	paddsub.h	r1, r2:t, r5:b	// r1 = {tmp10, tmp13}
++//		tmp11 = tmp1 + tmp2;
++//		tmp12 = tmp1 - tmp2;
++	paddsub.h	r4, r2:b, r5:t	// r4 = {tmp11, tmp12}
++	
++
++//		dataptr[0] = (tmp10 + tmp11) << PASS1_BITS;
++//		dataptr[4] = (tmp10 - tmp11) << PASS1_BITS;
++	paddsub.h	r7, r1:t, r4:t	// r7 = {tmp10 + tmp11, tmp10 - tmp11}
++	ld.w		r10, pc[const_table - .] 	// r10 = {FIX_0_541196100, FIX_0_765366865}
++	plsl.h		r7, r7, PASS1_BITS	// r7 = {dataptr[0], dataptr[4]}
++	
++//		z1 = (tmp12 + tmp13) * FIX_0_541196100;
++	addhh.w		r8, r4:b, r1:b	// r8 = tmp12 + tmp13
++	mulhh.w		r8, r8:b, r10:t		// r8 = z1
++	
++//		dataptr[2] =
++//			DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
++//		dataptr[6] =
++//			DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);
++	mulhh.w		r9, r1:b, r10:b	// r9 = tmp13 * FIX_0_765366865
++	ld.w		r10, pc[const_table - . + 4]	// r10 = {-FIX_1_847759065, FIX_1_175875602}
++	add		r1, r8, r9	
++	satrnds		r1 >> (CONST_BITS - PASS1_BITS), 31	// r1 = dataptr[2]
++
++	mulhh.w		r9, r4:b, r10:t	// r9 = tmp12 * -FIX_1_847759065
++	add		r4, r8, r9
++	satrnds		r4 >> (CONST_BITS - PASS1_BITS), 31	// r4 = dataptr[6]
++		
++
++//		/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
++//		 * cK represents cos(K*pi/16).
++//		 * i0..i3 in the paper are tmp4..tmp7 here.
++//		 */
++
++#define		z2		r5
++#define		z3		r6
++#define		z4		r7
++#define		z5		r8
++
++//		z4 = tmp5 + tmp7;
++//		z3 = tmp4 + tmp6;
++	padd.h		r2, r6, r3 	// r2 = {z4, z3}
++//		z2 = tmp5 + tmp6;
++//		z1 = tmp4 + tmp7;
++	paddx.h		r5, r6, r3 	// r5 = {z2, z1}
++
++	lddpc		r9, pc[const_table - . + 8]	// r9 = {FIX_0_298631336, FIX_2_053119869}
++//		z5 = (z3 + z4) * FIX_1_175875602;	/* sqrt(2) * c3 */
++	addhh.w		r8, r2:t, r2:b	// r8 = z3 + z4
++	mulhh.w		r8, r8:b, r10:b	// r8 = z5
++	lddpc		r10, pc[const_table - . + 12]	// r10 = {FIX_3_072711026, FIX_1_501321110}
++	
++
++//		tmp4 *= FIX_0_298631336;	/* sqrt(2) * (-c1+c3+c5-c7) */
++	mulhh.w		r11, r6:b, r9:t	// r11 = scaled tmp4
++	
++//		tmp5 *= FIX_2_053119869;	/* sqrt(2) * ( c1+c3-c5+c7) */
++	mulhh.w		r6, r6:t, r9:b	// r6 = scaled tmp5
++
++//		tmp6 *= FIX_3_072711026;	/* sqrt(2) * ( c1+c3+c5-c7) */
++	lddpc		r9, pc[const_table - . + 20]	// r9 = {-FIX_1_961570560, -FIX_0_390180644}
++	mulhh.w		lr, r3:b, r10:t	// lr = scaled tmp6
++	
++//		tmp7 *= FIX_1_501321110;	/* sqrt(2) * ( c1+c3-c5-c7) */
++	mulhh.w		r3, r3:t, r10:b	// r3 = scaled tmp7
++
++//		z3 *= -FIX_1_961570560;	/* sqrt(2) * (-c3-c5) */
++	mulhh.w		r10, r2:b, r9:t	// r10 = scaled z3
++
++//		z4 *= -FIX_0_390180644;	/* sqrt(2) * (c5-c3) */
++	mulhh.w		r2, r2:t, r9:b	// r2 = scaled z4
++	lddpc		r9, pc[const_table - . + 16]	// r9 = {-FIX_0_899976223, -FIX_2_562915447}
++//		z3 += z5;
++//		z4 += z5;
++	add		r10, r8
++	add		r2, r8
++	
++//		z1 *= -FIX_0_899976223;	/* sqrt(2) * (c7-c3) */
++	mulhh.w		r8, r5:b, r9:t	// r8 = scaled z1 (z5 in r8 is no longer needed)
++
++//		z2 *= -FIX_2_562915447;	/* sqrt(2) * (-c1-c3) */
++	mulhh.w		r5, r5:t, r9:b	// r5 = scaled z2
++	
++//		dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
++	add		r11, r8
++	add		r11, r10
++	satrnds		r11 >> (CONST_BITS - PASS1_BITS), 31	// r11 = dataptr[7]
++	
++//		dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
++	add		r6, r5
++
++	sthh.w		blkptr[6*2], r4:b, r11:b  	// store dataptr[6], dataptr[7]
++	add		r6, r2
++	satrnds		r6 >> (CONST_BITS - PASS1_BITS), 31	// r6 = dataptr[5]
++
++//		dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
++	add		lr, r5
++	sthh.w		blkptr[4*2], r7:b, r6:b	// store dataptr[4], dataptr[5]
++	add		lr, r10
++	satrnds		lr >> (CONST_BITS - PASS1_BITS), 31	// lr = dataptr[3]
++	
++//		dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
++	add		r3, r8
++	sthh.w		blkptr[2*2], r1:b, lr:b	// store dataptr[2], dataptr[3]
++	add		r3, r2
++	satrnds		r3 >> (CONST_BITS - PASS1_BITS), 31	// r3 = dataptr[1]
++
++		
++
++//		dataptr += 8;			/* advance pointer to next row */
++//		blkptr += 8;
++	sthh.w		blkptr[0], r7:t, r3:b	// store dataptr[0], dataptr[1]
++	sub		blkptr, -16	// blkptr += 16 bytes (one row of eight 16-bit values)
++	sub		loop_ctr, 1
++	brne		ROW_LOOP
++
++//	}
++
++	/* Pass 2: process columns.
++	 * We remove the PASS1_BITS scaling, but leave the results scaled up
++	 * by an overall factor of 8.
++	 */
++
++//	dataptr = data;
++	sub		blkptr, 128		// rewind over the 8 rows (8 * 16 bytes) advanced during pass 1
++
++	mov		loop_ctr, 4		// 4 iterations: each ld.w below carries two adjacent 16-bit columns
++//	for (i = 0; i < 8; i++) {	(C reference; the asm handles two columns per iteration)
++COLOUMN_LOOP:
++	ld.w		r1, blkptr[0]		// r1..r8 = rows 0..7 of the current column pair (rows are 8*2 bytes apart)
++	ld.w		r2, blkptr[1*8*2]
++	ld.w		r3, blkptr[2*8*2]
++	ld.w		r4, blkptr[3*8*2]
++	ld.w		r5, blkptr[4*8*2]
++	ld.w		r6, blkptr[5*8*2]
++	ld.w		r7, blkptr[6*8*2]
++	ld.w		r8, blkptr[7*8*2]
++	
++//		tmp0 = blkptr[0] + blkptr[7*8];
++	padds.sh	r9, r1, r8 	// r9 = tmp0 (both columns)
++//		tmp7 = blkptr[0] - blkptr[7*8];
++	psubs.sh	r1, r1, r8 	// r1 = tmp7
++//		tmp1 = blkptr[1*8] + blkptr[6*8];
++	padds.sh	r8, r2, r7 	// r8 = tmp1
++//		tmp6 = blkptr[1*8] - blkptr[6*8];
++	psubs.sh	r2, r2, r7 	// r2 = tmp6
++//		tmp2 = blkptr[2*8] + blkptr[5*8];
++	padds.sh	r7, r3, r6 	// r7 = tmp2
++//		tmp5 = blkptr[2*8] - blkptr[5*8];
++	psubs.sh	r3, r3, r6 	// r3 = tmp5
++//		tmp3 = blkptr[3*8] + blkptr[4*8];
++	padds.sh	r6, r4, r5 	// r6 = tmp3
++//		tmp4 = blkptr[3*8] - blkptr[4*8];
++	psubs.sh	r4, r4, r5 	// r4 = tmp4
++
++//		/* even part per ll&m figure 1 --- note that published figure is faulty;
++//		 * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
++//		 */
++//
++//		tmp10 = tmp0 + tmp3;
++	padds.sh	r5, r9, r6 	// r5 = tmp10
++//		tmp13 = tmp0 - tmp3;
++	psubs.sh	r9, r9, r6 	// r9 = tmp13
++//		tmp11 = tmp1 + tmp2;
++	padds.sh	r6, r8, r7 	// r6 = tmp11
++//		tmp12 = tmp1 - tmp2;
++	psubs.sh	r8, r8, r7 	// r8 = tmp12
++
++//		dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS);
++//		dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS);
++//Might get an overflow here
++	padds.sh	r7, r5, r6	// r7 = tmp10 + tmp11
++	psubs.sh	r5, r5, r6	// r5 = tmp10 - tmp11
++
++	//Rounding: lr = 1 << (PASS1_BITS + 2) in each halfword, i.e. half of the divisor of the shift below
++	mov		lr,  (1 << (PASS1_BITS + 2))
++	orh		lr, hi(1 << (16 + PASS1_BITS + 2))	
++	padds.sh	r7, r7, lr
++	padds.sh	r5, r5, lr
++	
++	pasr.h		r7, r7, PASS1_BITS + 3	// PASS1_BITS plus the final descale by 3 of the C reference
++	pasr.h		r5, r5, PASS1_BITS + 3
++	st.w		r12[0], r7		// dataptr[0] for both columns
++	st.w		r12[4*8*2], r5		// dataptr[32] for both columns
++				 
++	lddpc		r10, const_table2	// r10 = {FIX_0_541196100, FIX_0_765366865}
++
++
++//		z1 = (tmp12 + tmp13) * FIX_0_541196100;
++	padds.sh	r5, r8, r9
++	mulhh.w		r6, r5:t, r10:t		// r6 = z1 of the column held in the top halfwords
++	mulhh.w		r7, r5:b, r10:t		// r7 = z1 of the column held in the bottom halfwords
++
++//		dataptr[16] =
++//			DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
++	lddpc		r11, const_table2 + 4	// r11 = {-FIX_1_847759065, FIX_1_175875602}
++	mulhh.w		lr, r9:t, r10:b
++	mulhh.w		r9, r9:b, r10:b
++	add		lr, r6
++	add		r9, r7
++	satrnds		lr >> (CONST_BITS + PASS1_BITS + 3), 31
++	satrnds		r9 >> (CONST_BITS + PASS1_BITS + 3), 31
++	sthh.w		r12[2*8*2], lr:b, r9:b	// dataptr[16] for both columns
++			
++//		dataptr[48] =
++//			DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);
++	mulhh.w		lr, r8:t, r11:t
++	mulhh.w		r8, r8:b, r11:t
++	add		lr, r6
++	add		r8, r7
++	satrnds		lr >> (CONST_BITS + PASS1_BITS + 3), 31
++	satrnds		r8 >> (CONST_BITS + PASS1_BITS + 3), 31
++	sthh.w		r12[6*8*2], lr:b, r8:b	// dataptr[48] for both columns
++
++//		/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
++//		 * cK represents cos(K*pi/16).
++//		 * i0..i3 in the paper are tmp4..tmp7 here.
++//		 */
++//
++//		z2 = tmp5 + tmp6;
++//		z3 = tmp4 + tmp6;
++//		z4 = tmp5 + tmp7;
++	padds.sh	r5, r3, r2	// r5 = z2
++	padds.sh	r6, r4, r2	// r6 = z3
++	padds.sh	r7, r3, r1	// r7 = z4
++	
++//		z5 = (z3 + z4) * FIX_1_175875602;	/* sqrt(2) * c3 */
++	padds.sh	r8, r6, r7
++	mulhh.w		r9, r8:t, r11:b	// r9 = z5 (top column)
++	mulhh.w		r8, r8:b, r11:b	// r8 = z5 (bottom column)
++	
++//		z3 *= -FIX_1_961570560;	/* sqrt(2) * (-c3-c5) */
++//		z3 += z5;
++	lddpc		r11, const_table2 + 8	// r11 = {-FIX_1_961570560, -FIX_0_390180644}
++	mulhh.w		r10, r6:t, r11:t
++	mulhh.w		r6, r6:b, r11:t
++	add		r10, r9		// r10 = z3 + z5 (top column)
++	add		r6, r8		// r6 = z3 + z5 (bottom column)
++	
++//		z4 *= -FIX_0_390180644;	/* sqrt(2) * (c5-c3) */
++//		z4 += z5;
++	mulhh.w		lr, r7:t, r11:b
++	mulhh.w		r7, r7:b, r11:b
++	lddpc		r11, const_table2 + 12	// r11 = {FIX_3_072711026, -FIX_2_562915447}
++	st.w		--sp,r0		// spill loop_ctr (r0) so r0 can serve as scratch; reloaded at the end of the iteration
++	add		lr, r9		// lr = z4 + z5 (top column)
++	add		r7, r8		// r7 = z4 + z5 (bottom column)
++
++//		tmp6 *= FIX_3_072711026;	/* sqrt(2) * ( c1+c3+c5-c7) */
++	mulhh.w		r0, r2:t, r11:t	  
++	machh.w		r0, r5:t, r11:b	// r0 = tmp6 * FIX_3_072711026 + z2 * -FIX_2_562915447 (top column)
++	mulhh.w		r2, r2:b, r11:t	  
++	machh.w		r2, r5:b, r11:b	// same for the bottom column
++
++//		z2 *= -FIX_2_562915447;	/* sqrt(2) * (-c1-c3) */
++//		dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
++	add		r0, r10
++	lddpc		r11, const_table2 + 16	// r11 = {FIX_2_053119869, -FIX_2_562915447}
++	add		r2, r6
++	satrnds		r0 >> (CONST_BITS + PASS1_BITS + 3), 31	
++	satrnds		r2 >> (CONST_BITS + PASS1_BITS + 3), 31	
++	sthh.w		r12[3*8*2], r0:b, r2:b		// dataptr[24] for both columns
++//		tmp5 *= FIX_2_053119869;	/* sqrt(2) * ( c1+c3-c5+c7) */
++	mulhh.w		r0, r3:t, r11:t	  
++	machh.w		r0, r5:t, r11:b
++	mulhh.w		r2, r3:b, r11:t	  
++	machh.w		r2, r5:b, r11:b
++	add		r0, lr
++	lddpc		r11, const_table2 + 20	// r11 = {FIX_0_298631336, -FIX_0_899976223}
++	add		r2, r7
++	
++//		dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
++	satrnds		r0 >> (CONST_BITS + PASS1_BITS + 3), 31	
++	satrnds		r2 >> (CONST_BITS + PASS1_BITS + 3), 31	
++	sthh.w		r12[5*8*2], r0:b, r2:b		// dataptr[40] for both columns
++
++
++//		z1 = tmp4 + tmp7;
++	padds.sh	r2, r4, r1	// r2 = z1
++
++//		tmp4 *= FIX_0_298631336;	/* sqrt(2) * (-c1+c3+c5-c7) */
++	mulhh.w		r3, r4:t, r11:t
++	machh.w		r3, r2:t, r11:b
++	mulhh.w		r4, r4:b, r11:t
++	machh.w		r4, r2:b, r11:b
++	add		r3, r10
++	lddpc		r11, const_table2 + 24	// r11 = {FIX_1_501321110, -FIX_0_899976223}
++	add		r4, r6
++	
++//		z1 *= -FIX_0_899976223;	/* sqrt(2) * (c7-c3) */
++//		dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
++	satrnds		r3 >> (CONST_BITS + PASS1_BITS + 3), 31	
++	satrnds		r4 >> (CONST_BITS + PASS1_BITS + 3), 31	
++	sthh.w		r12[7*8*2], r3:b, r4:b		// dataptr[56] for both columns
++
++
++//		tmp7 *= FIX_1_501321110;	/* sqrt(2) * ( c1+c3-c5-c7) */
++	mulhh.w		r3, r1:t, r11:t
++	machh.w		r3, r2:t, r11:b
++	mulhh.w		r4, r1:b, r11:t
++	machh.w		r4, r2:b, r11:b
++	add		r3, lr
++	add		r4, r7
++
++//		dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
++	satrnds		r3 >> (CONST_BITS + PASS1_BITS + 3), 31	
++	satrnds		r4 >> (CONST_BITS + PASS1_BITS + 3), 31	
++	sthh.w		r12[1*8*2], r3:b, r4:b		// dataptr[8] for both columns
++	ld.w		r0, sp++ 	// restore loop_ctr
++	
++//		dataptr++;				/* advance pointer to next column */
++	sub		blkptr, -4	// advance 4 bytes = two 16-bit columns
++	sub		loop_ctr, 1
++	brne		COLOUMN_LOOP
++
++//	}
++
++	popm		r0-r3, r4-r7, pc	// restore the registers saved on entry and return
++	
++//	/* descale */
++//	for (i = 0; i < 64; i++)
++//		block[i] = (short int) DESCALE(data[i], 3);
++
++
++//}
++
++
++	.align	2	// the tables are fetched one 32-bit word (a {top, bottom} pair of constants) at a time
++const_table:	.short	FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602	// pass 1 (rows): byte offsets 0 and 4
++		.short	FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110	// byte offsets 8 and 12
++		.short	-FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644	// byte offsets 16 and 20
++// const_table2 serves pass 2 (columns); from offset 12 on each word pairs a mulhh.w factor with the machh.w factor that follows it
++const_table2:	.short	FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602	// byte offsets 0 and 4
++		.short	-FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447	// byte offsets 8 and 12
++		.short	FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336,  -FIX_0_899976223 	// byte offsets 16 and 20
++		.short	FIX_1_501321110, -FIX_0_899976223	// byte offset 24
++	
++
++
++
+diff --git a/libavcodec/avr32/h264idct.S b/libavcodec/avr32/h264idct.S
+new file mode 100644
+index 0000000..4b23e2d
+--- /dev/null
++++ b/libavcodec/avr32/h264idct.S
+@@ -0,0 +1,451 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++	.global	h264_idct_add_avr32
++
++	/* Macro for performing the 1-D transform on one row line. 
++
++	   The register 'w01' should contain the first two pixels,
++	   and the register 'w23' should contain the last two pixels
++	   in the line. The resulting line is placed in w01 and w23
++	   so that { w01, w23 } = { x0, x1, x3, x2 } (note x3 before x2).
++	   'tmp' and 'tmp2' are scratchpad registers; both are overwritten. */ 
++	.macro	transform_row 	w01, w23, tmp, tmp2 
++	add		\tmp, \w23, \w01 << 1   /* tmp = { xxxx, 2*w1 + w3 } */
++	sub		\tmp2, \w01, \w23 << 1  /* tmp2 = { xxxx, w1 - 2*w3 } */ 
++	bfins		\tmp2, \tmp, 16, 16     /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */
++	pasr.h		\tmp2, \tmp2, 1	        /* tmp2 = { w1 + w3/2, w1/2 - w3 } */
++	paddsub.h	\tmp, \w01:t, \w23:t    /* tmp =  { w0 + w2, w0 - w2 }  */
++	padd.h		\w01, \tmp, \tmp2	/* w01 =  { x0, x1 } = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */
++	psub.h		\w23, \tmp, \tmp2	/* w23 =  { x3, x2 } = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */
++	.endm			        	
++
++	/* Macro for performing the 1-D transform on two columns. 
++	   
++	   The registers w0, w1, w2, w3 should each contain two 
++	   packed samples from the two columns to transform.
++	   tmp and tmp2 are scratchpad registers (overwritten). 
++	  
++	   The resulting transformed columns, each scaled down by >> 6,
++	   are placed in the same positions as the input columns. 
++	*/ 
++	.macro	transform_2columns	w0, w1, w2, w3, tmp, tmp2
++	padd.h		\tmp, \w0, \w2  /* tmp = z0 = w0 + w2 */
++	psub.h		\w0, \w0, \w2   /* w0 = z1 = w0 - w2 */
++	pasr.h		\w2, \w1, 1	/* w2 = w1/2 */
++	pasr.h		\tmp2, \w3, 1	/* tmp2 = w3/2 */
++	psub.h		\w3, \w2, \w3	/* w3 = z2 = w1/2 - w3 */
++	padd.h		\tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */
++	padd.h		\w1, \w0, \w3	/* w1 = x1 = z1 + z2 */
++	psub.h		\w2, \w0, \w3	/* w2 = x2 = z1 - z2 */
++	padd.h		\w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */
++	psub.h		\w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */
++	/* Scale down result: arithmetic shift right by 6 on every halfword. */
++	pasr.h		\w0, \w0, 6
++	pasr.h		\w1, \w1, 6
++	pasr.h		\w2, \w2, 6
++	pasr.h		\w3, \w3, 6
++	.endm			        	
++	
++/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/
++/* 4x4 inverse transform of 'block' added to the 4x4 pixels at 'dst'. */
++h264_idct_add_avr32:
++	/* As used below: r12 = dst, r11 = block, r10 = stride. */
++	stm	--sp,r0-r3,r4-r7, lr 
++
++	/* Setup rounding factor: r0 = 32 in the top halfword. */
++	mov	r0, (1 << 5)
++	lsl	r0, 16	
++
++	/* Load block */
++        ldm	r11,r2-r9 
++        /* r9 = { w00, w01 }, 
++           r8 = { w02, w03 }, 
++           r7 = { w10, w11 }, 
++           r6 = { w12, w13 }, 
++           r5 = { w20, w21 }, 
++           r4 = { w22, w23 }, 
++           r3 = { w30, w31 }, 
++           r2 = { w32, w33 } */
++
++	
++	/* Add the rounding factor to w00. */
++	add			r9, r0
++	
++	/* Transform rows; each row ends up as { x0, x1 } / { x3, x2 }. */
++	transform_row	        r9, r8, r0, r1
++	transform_row	        r7, r6, r0, r1
++	transform_row	        r5, r4, r0, r1
++	transform_row	        r3, r2, r0, r1
++
++	/* Transform columns (two columns per invocation, results >> 6). */
++	transform_2columns	r9, r7, r5, r3, r0, r1
++	transform_2columns	r8, r6, r4, r2, r0, r1
++
++	/* Load predicted pixels of lines 0 and 1. */
++	ld.w			lr, r12[0]
++        ld.w			r11, r12[r10]
++
++	/* Unpack to halfwords. */
++        punpckub.h		r0, lr:t
++        punpckub.h		r1, lr:b
++
++	/* Add with transformed row; paddx.h crosses halves because r8 holds { x3, x2 }. */
++        padd.h			r0, r0, r9
++        paddx.h			r1, r1, r8
++	/* Pack and saturate back to 8-bit pixels. */
++        packsh.ub		r0, r0, r1
++
++	/* Unpack to halfwords. */
++        punpckub.h		lr, r11:t
++        punpckub.h		r11, r11:b
++
++	/* Add with transformed row. */
++        padd.h			lr, lr, r7
++        paddx.h			r11, r11, r6
++	/* Pack and saturate back to 8-bit pixels. */
++        packsh.ub		r1, lr, r11
++
++	/* Store back to frame. */
++	st.w			r12[0], r0
++	st.w			r12[r10], r1
++
++	add			r12, r12, r10 << 1	/* dst += 2 * stride */
++
++	/* Load predicted pixels of lines 2 and 3. */
++	ld.w			lr, r12[0]
++        ld.w			r11, r12[r10]
++
++	/* Unpack to halfwords. */
++        punpckub.h		r0, lr:t
++        punpckub.h		r1, lr:b
++
++	/* Add with transformed row. */
++        padd.h			r0, r0, r5
++        paddx.h			r1, r1, r4
++	/* Pack and saturate back to 8-bit pixels. */
++        packsh.ub		r0, r0, r1
++
++	/* Unpack to halfwords. */
++        punpckub.h		lr, r11:t
++        punpckub.h		r11, r11:b
++
++	/* Add with transformed row. */
++        padd.h			lr, lr, r3
++        paddx.h			r11, r11, r2
++	/* Pack and saturate back to 8-bit pixels. */
++        packsh.ub		r1, lr, r11
++
++	/* Store back to frame. */
++	st.w			r12[0], r0
++	st.w			r12[r10], r1
++	
++	ldm			sp++,r0-r3,r4-r7, pc 
++
++
++	.global	h264_idct8_add_avr32
++//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
++// As used below: r12 = dst, r11 = block, r10 = stride.
++h264_idct8_add_avr32:		
++	stm         --sp,r0-r3,r4-r7, lr 
++
++	/* Push dst and stride on stack (read back later as sp[0] = dst, sp[4] = stride) */
++	stm         --sp,r10,r12 
++
++//    int i;
++//    DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
++//    uint8_t *cm = cropTbl + MAX_NEG_CROP;
++
++//    block[0] += 32;
++// NOTE(review): the asm performs no explicit "+= 32"; the second pass uses
++// "satrnds ... >> 6", which presumably supplies the equivalent rounding.
++
++//    for( i = 0; i < 8; i++ )
++//    {
++	mov		lr, 4		/* 4 iterations: each word load carries two adjacent columns */
++0:
++	ld.w		r7, r11[0*(8*2)]	/* r7..r0 = src[0..7][i], src[0..7][i+1] */
++	ld.w		r6, r11[1*(8*2)]
++	ld.w		r5, r11[2*(8*2)]	
++	ld.w		r4, r11[3*(8*2)]	
++	ld.w		r3, r11[4*(8*2)]	
++	ld.w		r2, r11[5*(8*2)]	
++	ld.w		r1, r11[6*(8*2)]	
++	ld.w		r0, r11[7*(8*2)]	
++	
++/*	
++
++        const int a0 =  src[0][i] + src[4][i];
++        const int a2 =  src[0][i] - src[4][i];
++        const int a4 = (src[2][i]>>1) - src[6][i];
++        const int a6 = (src[6][i]>>1) + src[2][i]; 
++*/
++	padd.h		r8, r7, r3	/* r8 = a0 */	
++	psub.h		r7, r7, r3	/* r7 = a2 */	
++	pasr.h		r3, r5, 1	/* r3 = src[2][i] >> 1 */
++	pasr.h		r9, r1, 1	/* r9 = src[6][i] >> 1 */
++	psub.h		r3, r3, r1	/* r3 = a4 */
++	padd.h		r9, r9, r5	/* r9 = a6 */
++
++/*
++        const int b0 = a0 + a6;
++        const int b2 = a2 + a4;
++        const int b4 = a2 - a4;
++        const int b6 = a0 - a6; 
++*/
++	padd.h		r1, r8, r9	/* r1 = b0 */
++	psub.h		r8, r8, r9	/* r8 = b6 */
++	padd.h		r5, r7, r3	/* r5 = b2 */
++	psub.h		r7, r7, r3	/* r7 = b4 */
++	
++/*	
++        const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1);
++        const int a3 =  src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
++        const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
++        const int a7 =  src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
++*/
++	pasr.h		r3, r0, 1
++	padd.h		r3, r3, r0	/* r3 = src[7][i] + (src[7][i] >> 1) */
++	psub.h		r3, r2, r3
++	psub.h		r3, r3, r4 /* r3 = a1 */
++	
++	pasr.h		r9, r4, 1
++	padd.h		r9, r9, r4	/* r9 = src[3][i] + (src[3][i] >> 1) */
++	psub.h		r9, r0, r9
++	padd.h		r9, r6, r9 /* r9 = a3 */
++	
++	pasr.h		r10, r2, 1	/* r10 (stride) is free as scratch: it was pushed on entry */
++	padd.h		r10, r10, r2
++	padd.h		r10, r10, r0
++	psub.h		r10, r10, r6 /* r10 = a5 */
++	
++	pasr.h		r0, r6, 1
++	padd.h		r0, r0, r6
++	padd.h		r0, r0, r2
++	padd.h		r0, r0, r4 /* r0 = a7 */
++/*
++        const int b1 = (a7>>2) + a1;
++        const int b3 =  a3 + (a5>>2);
++        const int b5 = (a3>>2) - a5;
++        const int b7 =  a7 - (a1>>2);
++*/
++	pasr.h		r2, r0, 2
++	padd.h		r2, r2, r3 /* r2 = b1 */
++	pasr.h		r3, r3, 2
++	psub.h		r3, r0, r3 /* r3 = b7 */
++
++	pasr.h		r0, r10, 2
++	padd.h		r0, r0, r9 /* r0 = b3 */
++	pasr.h		r9, r9, 2
++	psub.h		r9, r9, r10 /* r9 = b5 */
++	
++	
++/*	
++        src[0][i] = b0 + b7;
++        src[7][i] = b0 - b7;
++        src[1][i] = b2 + b5;
++        src[6][i] = b2 - b5;
++        src[2][i] = b4 + b3;
++        src[5][i] = b4 - b3;
++        src[3][i] = b6 + b1;
++        src[4][i] = b6 - b1; */
++
++	padd.h		r4, r1, r3
++	psub.h		r1, r1, r3
++	st.w		r11[0*(8*2)], r4
++	st.w		r11[7*(8*2)], r1
++		
++	padd.h		r3, r5, r9
++	psub.h		r5, r5, r9
++	st.w		r11[1*(8*2)], r3
++	st.w		r11[6*(8*2)], r5
++	
++	padd.h		r9, r7, r0
++	psub.h		r7, r7, r0
++	st.w		r11[2*(8*2)], r9
++	st.w		r11[5*(8*2)], r7
++
++	padd.h		r0, r8, r2
++	psub.h		r8, r8, r2
++	st.w		r11[3*(8*2)], r0
++	st.w		r11[4*(8*2)], r8
++
++	sub		r11, -4		/* advance 4 bytes = two columns */
++	sub		lr, 1
++	brne		0b
++	
++//    }
++
++	lddsp		r12, sp[0]	/* r12 = dst */ 
++	sub		r11, 4*4	/* undo the four 4-byte column advances: r11 = block again */
++	ldm		r11++, r4-r7	/* preload row 0; r11 now points at row 1 */
++	mov		lr, 8		/* second pass: one row per iteration */
++	/* dst and stride stay on the stack; stride is reloaded from sp[4] inside the loop */
++	
++1:		
++//    for( i = 0; i < 8; i++ )
++//    {
++
++	/* r7 = {src[i][0], src[i][1]}  
++           r6 = {src[i][2], src[i][3]}
++	   r5 = {src[i][4], src[i][5]}
++           r4 = {src[i][6], src[i][7]}	*/	
++
++/*
++        const int a0 =  src[i][0] + src[i][4];
++        const int a2 =  src[i][0] - src[i][4];
++        const int a4 = (src[i][2]>>1) - src[i][6];
++        const int a6 = (src[i][6]>>1) + src[i][2]; 
++*/
++	pasr.h		r8, r6, 1	/* r8 = {src[i][2] >> 1, src[i][3] >> 1} */
++	pasr.h		r9, r4, 1	/* r9 = {src[i][6] >> 1, src[i][7] >> 1} */
++	addhh.w		r0, r7:t, r5:t	/* r0 = a0 */
++	subhh.w		r1, r7:t, r5:t	/* r1 = a2 */
++	subhh.w		r2, r8:t, r4:t	/* r2 = a4 */
++	addhh.w		r3, r9:t, r6:t	/* r3 = a6 */		
++		
++/*
++        const int b0 = a0 + a6;
++        const int b2 = a2 + a4;
++        const int b4 = a2 - a4;
++        const int b6 = a0 - a6;	
++*/
++	add		r10, r0, r3	/* r10 = b0 */ 
++	sub		r0, r3		/* r0 = b6 */
++	add		r3, r1, r2	/* r3 = b2 */
++	sub		r1, r2		/* r1 = b4 */		
++/*
++	
++
++          const int a7 =  src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1);
++          const int a1 =  src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1);
++          const int a3 =  src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1);
++          const int a5 =  src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */
++	addhh.w		r8, r8:b, r6:b	/* r8 = (src[i][3] >> 1) + src[i][3] */
++	addhh.w		r2, r4:b, r7:b	/* r2 = src[i][7] + src[i][1] */
++	sub		r2, r8		/* r2 = a3 */	
++
++	addhh.w		r9, r9:b, r4:b	/* r9 = (src[i][7] >> 1) + src[i][7] */
++	subhh.w		r8, r5:b, r6:b	/* r8 = src[i][5] - src[i][3] */
++	sub		r8, r9		/* r8 = a1 */	
++
++	pasr.h		r9, r7, 1
++	addhh.w		r9, r9:b, r7:b	/* r9 = (src[i][1] >> 1) + src[i][1] */
++	addhh.w		r6, r5:b, r6:b	/* r6 = src[i][5] + src[i][3] */
++	add		r6, r9		/* r6 = a7 */	
++		
++	pasr.h		r9, r5, 1
++	addhh.w		r9, r9:b, r5:b	/* r9 = (src[i][5] >> 1) + src[i][5] */
++	subhh.w		r5, r4:b, r7:b	/* r5 = src[i][7] - src[i][1] */
++	add		r5, r9		/* r5 = a5 */	
++			
++/*        const int b1 = (a7>>2) + a1;
++          const int b3 = (a5>>2) + a3;
++          const int b5 = (a3>>2) - a5;
++          const int b7 = -(a1>>2) + a7  ;  */
++	asr		r4, r6, 2
++	add		r4, r8		/* r4 = b1 */
++	asr		r8, 2
++	rsub		r8, r6		/* r8 = b7 */
++	
++	asr		r6, r5, 2
++	add		r6, r2		/* r6 = b3 */
++	asr		r2, 2
++	sub		r2, r5		/* r2 = b5 */
++	
++/*
++        dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ];
++        dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ];
++        dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ];
++        dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ];
++        dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ];
++        dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ];
++        dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ];
++        dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ];
++*/
++	add		r5, r10, r8
++	satrnds		r5 >> 6, 0	/* r5 = (b0 + b7) >> 6 */
++	sub		r10, r8
++	satrnds		r10 >> 6, 0	/* r10 = (b0 - b7) >> 6 */
++	add		r8, r3, r2
++	satrnds		r8 >> 6, 0	/* r8 = (b2 + b5) >> 6 */
++	sub		r3, r2
++	satrnds		r3 >> 6, 0	/* r3 = (b2 - b5) >> 6 */
++	
++	add		r2, r1, r6	
++	satrnds		r2 >> 6, 0	/* r2 = (b4 + b3) >> 6 */
++	sub		r1, r6	
++	satrnds		r1 >> 6, 0	/* r1 = (b4 - b3) >> 6 */
++				
++	add		r6, r0, r4	
++	satrnds		r6 >> 6, 0	/* r6 = (b6 + b1) >> 6 */
++	sub		r0, r4	
++	satrnds		r0 >> 6, 0	/* r0 = (b6 - b1) >> 6 */
++
++	ld.w		r4, r12[0]	/* r4 = dst pixels 0..3 */
++	
++	packw.sh	r8, r5, r8	/* r8 = residuals for pixels 0,1 */
++	packw.sh	r7, r2, r6	/* r7 = residuals for pixels 2,3 */
++	ld.w		r9, r12[4]	/* r9 = dst pixels 4..7 */
++	packw.sh	r6, r0, r1	/* r6 = residuals for pixels 4,5 */
++	packw.sh	r5, r3, r10	/* r5 = residuals for pixels 6,7 */
++	
++	punpckub.h	r10, r4:t
++	punpckub.h	r4, r4:b
++	punpckub.h	r3, r9:t
++	punpckub.h	r9, r9:b
++
++	padd.h		r8, r8, r10
++	padd.h		r7, r7, r4
++	padd.h		r6, r6, r3
++	padd.h		r5, r5, r9
++	
++	lddsp		r10, sp[4]	/* r10 = stride */
++	packsh.ub	r0, r8, r7	/* pack to bytes with unsigned saturation (the cm[] clamp of the C reference) */
++	packsh.ub	r1, r6, r5
++
++	st.w		r12[0], r0
++	st.w		r12[4], r1
++	
++	ldm		r11++, r4-r7	/* load the next row. NOTE(review): on the last iteration this reads 16 bytes past the 8x8 block (values unused) */
++	add		r12, r10	/* dst += stride */
++	
++	sub		lr, 1
++	brne		1b			
++			
++	sub		sp, -8		/* drop the saved dst and stride */
++	ldm		sp++,r0-r3,r4-r7, pc
++
++
++		
++//    }
++//}
+diff --git a/libavcodec/avr32/idct.S b/libavcodec/avr32/idct.S
+new file mode 100644
+index 0000000..e7551ec
+--- /dev/null
++++ b/libavcodec/avr32/idct.S
+@@ -0,0 +1,829 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++
++	.global idct_add_avr32
++	.global idct_put_avr32
++	.global idct_avr32
++	
++
++#define CONST_BITS  13	/* fractional bits of the FIX_* constants below: FIX(x) = round(x * 2^13) */
++#define PASS1_BITS  2	/* extra fractional bits kept in the output of the row pass */
++
++#define ONE	((INT32) 1)	/* NOTE(review): C-style cast; ONE/CONST_SCALE are not referenced by the code visible in this part of the file */
++
++#define CONST_SCALE (ONE << CONST_BITS)
++
++#define	LINE_SIZE	32	/* byte offset used by the optional pref (USE_PREFETCH) in the row loops */
++
++#define FIX_0_298631336  (2446)	/* FIX(0.298631336) */
++#define FIX_0_390180644  (3196)	/* FIX(0.390180644) */
++#define FIX_0_541196100  (4433)	/* FIX(0.541196100) */
++#define FIX_0_765366865  (6270)	/* FIX(0.765366865) */
++#define FIX_0_899976223  (7373)	/* FIX(0.899976223) */
++#define FIX_1_175875602  (9633)	/* FIX(1.175875602) */
++#define FIX_1_501321110  (12299)/* FIX(1.501321110) */
++#define FIX_1_847759065  (15137)/* FIX(1.847759065) */
++#define FIX_1_961570560  (16069)/* FIX(1.961570560) */
++#define FIX_2_053119869  (16819)/* FIX(2.053119869) */
++#define FIX_2_562915447  (20995)/* FIX(2.562915447) */
++#define FIX_3_072711026  (25172)/* FIX(3.072711026) */
++
++
++#define loop_cnt	r11	/* r11 is only reused as a counter after its entry value has been saved to the stack */
++	
++	.text
++	
++idct_add_avr32:
++	pushm		r0-r3, r4-r7, lr	//Free up registers to use for local variables
++
++	// Give room for two variables on the stack (r10 is used below as the coefficient block pointer)
++	sub		sp, 8
++	stdsp		SP[0], r12 // rfp: pointer to the 8 destination pixels updated per column-pass iteration
++	stdsp		SP[4], r11 // iinc: added to rfp after each output line
++		
++	mov 		loop_cnt, 8		//Initialize loop counter (r11 was saved above)
++
++FOR_ROW:
++
++	ldm		r10, r0, r1, r2, r3	//Load 8 DCT coefficients of the current row: r3 = {d0,d1}, r2 = {d2,d3}, r1 = {d4,d5}, r0 = {d6,d7}
++	mov		r6, 0
++#ifdef USE_PREFETCH
++	pref		r10[LINE_SIZE]		//Prefetch next line 
++#endif
++	or		r4, r2, r3 << 16 	//The shift drops the DC coefficient (r3:t) and keeps d1
++	or		r4, r1			//Check if all DCT coefficients except the DC are zero
++	or		r4, r0
++	brne 		AC_ROW			//If there are non-zero AC coefficients perform the row transform
++
++	paddsub.h	r5, r3:t, r6:b		//r5 = {DC + 0, DC - 0}: the DC coefficient from r3:t in both halfwords
++	plsl.h		r5, r5, PASS1_BITS
++	mov		r4, r5
++	st.d		r10++, r4		//Two st.d of the r4/r5 pair fill the 16-byte row with the scaled DC
++	st.d		r10++, r4
++	
++	sub		loop_cnt, 1		//Decrement loop counter
++	brne		FOR_ROW			//Perform loop one more time if loop_cnt is not zero 
++
++	bral		COLOUMN_TRANSFORM	//Perform column transform after row transform is computed 
++		
++		
++AC_ROW:	
++// Full row transform. On entry r3 = {d0,d1}, r2 = {d2,d3}, r1 = {d4,d5},
++// r0 = {d6,d7} (d = dataptr of the current row); the row is rewritten in place.
++	ld.w		r12, pc[coef_table - .]	// r12 = {FIX_0_541196100, -FIX_1_847759065}
++	ld.w		r9, pc[coef_table - . + 4]	// r9 = {FIX_0_765366865, FIX_1_175875602}
++
++	padd.h		r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] 
++	mulhh.w		r5, r4:t, r12:t	// r5 = z1
++	mulhh.w		r6, r0:t, r12:b	// r6 = dataptr[6] * -FIX_1_847759065
++	ld.w		r12, pc[coef_table - . + 8]	// r12 = {-FIX_1_961570560, -FIX_0_390180644}
++	mulhh.w		r7, r2:t, r9:t	// r7 = dataptr[2] * FIX_0_765366865
++	add		r6, r5	// tmp2
++	satrnds		r6 >> (CONST_BITS - PASS1_BITS), 31
++	add		r7, r5	// tmp3
++	satrnds		r7 >> (CONST_BITS - PASS1_BITS), 31
++
++	paddsub.h	r5, r3:t, r1:t	// r5 = {dataptr[0] + dataptr[4], dataptr[0] - dataptr[4]}
++	plsl.h		r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
++	
++	paddsub.h	r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
++	paddsub.h	r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
++	
++	    
++	addhh.w		lr, r3:b, r1:b // lr = z4	
++	addhh.w		r5, r4:b, lr:b	// r5 = z3 + z4
++	mulhh.w		r5, r5:b, r9:b // r5 = z5
++	
++	ld.w		r9, pc[coef_table - . + 12]	// r9 = {FIX_0_298631336, FIX_2_053119869}
++	mulhh.w		r4, r4:b, r12:t // r4 = z3
++	mulhh.w		lr, lr:b, r12:b // lr = z4
++	
++	add		r4, r5	// z3 += z5
++	add		lr, r5	// z4 += z5
++	
++	addhh.w		r5, r2:b, r1:b // r5 = z2	
++	addhh.w		r8, r3:b, r0:b // r8 = z1	
++
++	    
++	mulhh.w		r0, r0:b, r9:t // r0 = tmp0
++	ld.w		r12, pc[coef_table - . + 16]	// r12 = {FIX_3_072711026, FIX_1_501321110}
++	mulhh.w		r1, r1:b, r9:b // r1 = tmp1
++	ld.w		r9, pc[coef_table - . + 20]	// r9 = {-FIX_0_899976223, -FIX_2_562915447}
++	mulhh.w		r2, r2:b, r12:t // r2 = tmp2
++	mulhh.w		r3, r3:b, r12:b // r3 = tmp3
++	mulhh.w		r8, r8:b, r9:t // r8 = z1	
++	mulhh.w		r5, r5:b, r9:b // r5 = z2	
++    
++    
++	add		r0, r8	// tmp0 += z1 + z3
++	add		r0, r4
++	add		r1, r5	// tmp1 += z2 + z4
++	add		r1, lr
++	add		r2, r5	// tmp2 += z2 + z3
++	add		r2, r4	 
++	add		r3, r8	// tmp3 += z1 + z4
++	add		r3, lr	 
++
++	satrnds		r0 >> (CONST_BITS - PASS1_BITS), 31
++	satrnds		r1 >> (CONST_BITS - PASS1_BITS), 31
++	satrnds		r2 >> (CONST_BITS - PASS1_BITS), 31
++	satrnds		r3 >> (CONST_BITS - PASS1_BITS), 31
++	
++	paddsub.h	r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
++	paddsub.h	r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
++	paddsub.h	r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
++	paddsub.h	r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
++
++	sthh.w		r10[0], r4:t, r5:t	// store dataptr[0], dataptr[1]
++	sthh.w		r10[4], r3:t, r2:t	// store dataptr[2], dataptr[3]
++	sthh.w		r10[8], r2:b, r3:b	// store dataptr[4], dataptr[5]
++	sthh.w		r10[12], r5:b, r4:b	// store dataptr[6], dataptr[7]
++	
++	
++
++	sub		r10, -16 	// advance to the next row (16 bytes)
++	sub		loop_cnt, 1
++	brne		FOR_ROW, e
++
++COLOUMN_TRANSFORM:
++
++	sub		r10, 128	//Set pointer to start of DCT block
++
++// Each iteration transforms one column and adds its 8 results to the 8 consecutive pixels at rfp.
++	mov		loop_cnt, 8
++FOR_COLOUMN:	
++	ldins.h		r3:t,r10[0]    // r3:t = dataptr[0]  
++	ldins.h		r1:t,r10[1*8*2]// r1:t = dataptr[1] 
++	ldins.h		r2:t,r10[2*8*2]// r2:t = dataptr[2] 
++	ldins.h		r0:t,r10[5*8*2]// r0:t = dataptr[5] 
++	ldins.h		r3:b,r10[4*8*2]// r3:b = dataptr[4] 
++	ldins.h		r1:b,r10[3*8*2]// r1:b = dataptr[3] 
++	ldins.h		r2:b,r10[6*8*2]// r2:b = dataptr[6] 
++	ldins.h		r0:b,r10[7*8*2]// r0:b = dataptr[7] 
++		
++	or		r4, r1, r3 << 16	// the shift drops dataptr[0] (DC) and keeps dataptr[4]
++	or		r4, r2	
++	or		r4, r0
++	brne 		AC_COLOUMN			//If there are non-zero AC coefficients perform the column transform
++
++	lddsp		r12, SP[0]       // rfp
++	lddsp		r9, SP[4]	// iinc
++	satrnds		r3 >> ( PASS1_BITS + 3 + 16 ), 9	// r3 = {DC, 0} here; the extra 16 moves the top halfword down before descaling
++	ld.d		r0, r12[0]	// load the 8 destination pixels into r0/r1
++	sub		r10, -2	// Increment the dataptr
++	bfins		r3, r3, 16, 16	// copy the descaled DC into the top halfword as well
++	punpckub.h	r2, r1:t
++	padd.h		r2, r2, r3
++	punpckub.h	r1, r1:b
++	padd.h		r1, r1, r3
++	packsh.ub	r1, r2, r1	// pack back to bytes with unsigned saturation
++	punpckub.h	r2, r0:t
++	padd.h		r2, r2, r3
++	punpckub.h	r0, r0:b
++	padd.h		r0, r0, r3
++	packsh.ub	r0, r2, r0
++	st.d		r12[0], r0
++	add		r12, r9	// increment rfp
++	stdsp		SP[0], r12		
++			
++	sub		loop_cnt, 1//Decrement loop counter
++	brne		FOR_COLOUMN//Perform loop one more time if loop_cnt is not zero 
++
++	sub		sp, -8	// release the two stack slots (rfp, iinc)
++	popm		r0-r3, r4-r7, pc//Pop back registers and PC 
++
++AC_COLOUMN:
++	// Full column transform. On entry r3 = {d0,d4}, r1 = {d1,d3}, r2 = {d2,d6}, r0 = {d5,d7}.
++	ld.w		r12, pc[coef_table - .]	// r12 = {FIX_0_541196100, -FIX_1_847759065}
++	ld.w		r9, pc[coef_table - . + 4]	// r9 = {FIX_0_765366865, FIX_1_175875602}
++
++	addhh.w		r4, r2:t, r2:b	// r4 = dataptr[2] + dataptr[6]
++	mulhh.w		r4, r4:b, r12:t	// r4 = z1
++	mulhh.w		r5, r2:b, r12:b	// r5 = dataptr[6] * -FIX_1_847759065
++	ld.w		r12, pc[coef_table - . + 8]	// r12 = {-FIX_1_961570560, -FIX_0_390180644}
++	mulhh.w		r6, r2:t, r9:t	// r6 = dataptr[2] * FIX_0_765366865
++	add		r5, r4	// r5 = tmp2
++	add		r6, r4	// r6 = tmp3
++
++	addhh.w		r7, r3:t, r3:b	// r7 = dataptr[0] + dataptr[4]
++	subhh.w		r8, r3:t, r3:b	// r8 = dataptr[0] - dataptr[4]
++
++	lsl		r7, CONST_BITS	// r7 = tmp0
++	lsl		r8, CONST_BITS	// r8 = tmp1
++			
++	add		r2, r7, r6 // r2 = tmp10
++	sub		r3, r7, r6 // r3 = tmp13
++	add		r4, r8, r5 // r4 = tmp11
++	sub		r5, r8, r5 // r5 = tmp12
++	   
++	padd.h		r6, r0, r1 // r6:t = z4, r6:b = z3
++	addhh.w		r7, r6:t, r6:b
++	mulhh.w		r7, r7:b, r9:b // r7 = z5
++	
++	ld.w		r9, pc[coef_table - . + 12]	// r9 = {FIX_0_298631336, FIX_2_053119869}
++	mulhh.w		r8, r6:b, r12:t // r8 = z3
++	mulhh.w		r6, r6:t, r12:b // r6 = z4
++	
++	add		r8, r7	// z3 += z5
++	add		r6, r7	// z4 += z5
++	
++	paddx.h		r7, r0, r1 // r7:t = z2, r7:b = z1
++
++	mulhh.w		r12, r0:b, r9:t // r12 = tmp0
++	mulhh.w		r0, r0:t, r9:b // r0 = tmp1
++	ld.w		r9, pc[coef_table - . + 16]	// r9 = {FIX_3_072711026, FIX_1_501321110}
++	add		r12, r8	// tmp0 += z3
++	add		r0, r6	// tmp1 += z4
++		
++	ld.w		lr, pc[coef_table - . + 20]	// lr = {-FIX_0_899976223, -FIX_2_562915447}
++	machh.w		r8, r1:b, r9:t // r8 = tmp2 (accumulated onto z3)
++	machh.w		r6, r1:t, r9:b // r6 = tmp3 (accumulated onto z4)
++	mulhh.w		r9, r7:b, lr:t // r9 = z1	
++	mulhh.w		r7, r7:t, lr:b // r7 = z2	
++    
++    
++	add		r12, r9	// tmp0 += z1
++	add		r0, r7	// tmp1 += z2
++	add		r8, r7	// tmp2 += z2
++	add		r6, r9	// tmp3 += z1
++
++	add		r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
++	sub		r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
++	add		r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
++	sub		r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
++	add		r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
++	sub		r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
++	add		r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
++	sub		r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
++	
++	satrnds		r1 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r2 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r6 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r4 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r8 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r5 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r0 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r3 >> (CONST_BITS+PASS1_BITS+3), 9
++
++	packw.sh	r1, r1, r6	// r1 = results 0,1
++	packw.sh	r8, r8, r0	// r8 = results 2,3
++	packw.sh	r3, r3, r5	// r3 = results 4,5
++	packw.sh	r4, r4, r2	// r4 = results 6,7
++		
++	lddsp		r12, SP[0]       // rfp
++	lddsp		r9, SP[4]	// iinc
++	ld.d		r6, r12[0]	// load the 8 destination pixels into r6/r7
++	sub		r10, -2	// Increment the dataptr
++	punpckub.h	r0, r7:t
++	padd.h		r1, r1, r0
++	punpckub.h	r0, r7:b
++	padd.h		r8, r8, r0
++	packsh.ub	r7, r1, r8	// pack back to bytes with unsigned saturation
++	punpckub.h	r0, r6:t
++	padd.h		r3, r3, r0
++	punpckub.h	r0, r6:b
++	padd.h		r4, r4, r0
++	packsh.ub	r6, r3, r4
++	st.d		r12[0], r6	// NOTE(review): a column's results land on one destination line, so output is transposed relative to the block; presumably the caller permutes the coefficients accordingly
++	add		r12, r9	// increment rfp
++	stdsp		SP[0], r12		
++				
++	sub		loop_cnt, 1		//Decrement loop counter
++	brne		FOR_COLOUMN			//Perform loop one more time if loop_cnt is not zero 
++	
++	sub		sp, -8	// release the two stack slots (rfp, iinc)
++	popm		r0-r3, r4-r7, pc	//Pop back registers and PC 
++	
++
++
++//Coefficient table:
++	.align	2
++coef_table:	
++	.short	FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 
++	.short	- FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
++	.short	FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
++
++
++idct_put_avr32:
++	pushm		r0-r3, r4-r7, lr	//Free up registers to use for local variables
++	// In: r12 = rfp (output pointer), r11 = iinc (added to rfp after each 8-byte store), r10 = DCT block
++		//; Give room for some variables on the stack
++	sub		sp, 8
++	stdsp		SP[0], r12 // rfp
++	stdsp		SP[4], r11 // iinc
++		
++	mov 		loop_cnt, 8		//Initialize loop counter
++
++0:
++	// Pass 1: one row (8 coefficients, 16 bytes) per iteration, written back in place at r10
++	ldm		r10, r0, r1, r2, r3	//Load 8 DCT-coefficients from the current row in the DCT-block
++	mov		r6, 0
++#ifdef USE_PREFETCH
++	pref		r10[LINE_SIZE]		//Prefetch next line 
++#endif
++	or		r4, r2, r3 << 16 
++	or		r4, r1			//Check if all DCT-coefficients except the DC are zero
++	or		r4, r0
++	brne 		1f			//If there are non-zero AC coefficients perform row-transform
++
++	paddsub.h	r5, r3:t, r6:b		//r6 = 0, so r5:t = r5:b = DC-coeff taken from r3:t
++	plsl.h		r5, r5, PASS1_BITS
++	mov		r4, r5
++	st.d		r10++, r4
++	st.d		r10++, r4
++	
++	sub		loop_cnt, 1		//Decrement loop counter
++	brne		0b			//Perform loop one more time if loop_cnt is not zero 
++
++	bral		2f	                //Perform column transform after row transform is computed 
++		
++1:	
++
++	ld.w		r12, pc[coef_table_copy - .]
++	ld.w		r9, pc[coef_table_copy - . + 4]
++
++	padd.h		r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] 
++	mulhh.w		r5, r4:t, r12:t
++	mulhh.w		r6, r0:t, r12:b
++	ld.w		r12, pc[coef_table_copy - . + 8]
++	mulhh.w		r7, r2:t, r9:t
++	add		r6, r5	// tmp2
++	satrnds		r6 >> (CONST_BITS - PASS1_BITS), 31
++	add		r7, r5	// tmp3
++	satrnds		r7 >> (CONST_BITS - PASS1_BITS), 31
++
++	paddsub.h	r5, r3:t, r1:t
++	plsl.h		r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
++	
++	paddsub.h	r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
++	paddsub.h	r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
++	
++	    
++
++	addhh.w		lr, r3:b, r1:b // lr = z4	
++	addhh.w		r5, r4:b, lr:b
++	mulhh.w		r5, r5:b, r9:b // r5 = z5
++	
++	ld.w		r9, pc[coef_table_copy - . + 12]
++	mulhh.w		r4, r4:b, r12:t // r4 = z3
++	mulhh.w		lr, lr:b, r12:b // lr = z4
++	
++	add		r4, r5	
++	add		lr, r5	
++	
++	addhh.w		r5, r2:b, r1:b // r5 = z2	
++	addhh.w		r8, r3:b, r0:b // r8 = z1	
++
++	    
++	mulhh.w		r0, r0:b, r9:t // r0 = tmp0
++	ld.w		r12, pc[coef_table_copy - . + 16]
++	mulhh.w		r1, r1:b, r9:b // r1 = tmp1
++	ld.w		r9, pc[coef_table_copy - . + 20]
++	mulhh.w		r2, r2:b, r12:t // r2 = tmp2
++	mulhh.w		r3, r3:b, r12:b // r3 = tmp3
++	mulhh.w		r8, r8:b, r9:t // r8 = z1	
++	mulhh.w		r5, r5:b, r9:b // r5 = z2	
++    
++    
++	add		r0, r8
++	add		r0, r4
++	add		r1, r5
++	add		r1, lr
++	add		r2, r5
++	add		r2, r4	 
++	add		r3, r8
++	add		r3, lr	 
++
++	satrnds		r0 >> (CONST_BITS - PASS1_BITS), 31
++	satrnds		r1 >> (CONST_BITS - PASS1_BITS), 31
++	satrnds		r2 >> (CONST_BITS - PASS1_BITS), 31
++	satrnds		r3 >> (CONST_BITS - PASS1_BITS), 31
++	
++	paddsub.h	r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
++	paddsub.h	r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
++	paddsub.h	r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
++	paddsub.h	r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
++
++	sthh.w		r10[0], r4:t, r5:t
++	sthh.w		r10[4], r3:t, r2:t
++	sthh.w		r10[8], r2:b, r3:b
++	sthh.w		r10[12], r5:b, r4:b
++	
++	
++
++	sub		r10, -16 
++	sub		loop_cnt, 1
++	brne		0b
++
++2:
++	// Pass 2: one column per iteration; 8 result bytes are stored at rfp, then rfp += iinc
++	sub		r10, 128	//Set pointer to start of DCT block
++
++	mov		loop_cnt, 8
++
++0:	
++	ldins.h		r3:t,r10[0]    // r3:t = dataptr[0]  
++	ldins.h		r1:t,r10[1*8*2]// r1:t = dataptr[1] 
++	ldins.h		r2:t,r10[2*8*2]// r2:t = dataptr[2] 
++	ldins.h		r0:t,r10[5*8*2]// r0:t = dataptr[5] 
++	ldins.h		r3:b,r10[4*8*2]// r3:b = dataptr[4] 
++	ldins.h		r1:b,r10[3*8*2]// r1:b = dataptr[3] 
++	ldins.h		r2:b,r10[6*8*2]// r2:b = dataptr[6] 
++	ldins.h		r0:b,r10[7*8*2]// r0:b = dataptr[7] 
++		
++	or		r4, r1, r3 << 16
++	or		r4, r2	
++	or		r4, r0
++	brne 		1f			//If there are non-zero AC coefficients perform column-transform
++
++	lddsp		r12, SP[0]       // rfp
++	lddsp		r9, SP[4]	// iinc
++	satrnds		r3 >> ( PASS1_BITS + 3 + 16 ), 31	
++	packw.sh	r3, r3, r3
++	packsh.ub	r3, r3, r3
++	mov		r2, r3
++	st.d		r12[0], r2
++	add		r12, r9	// increment rfp
++	sub		r10, -2	// Increment the dataptr
++	stdsp		SP[0], r12		
++			
++	sub		loop_cnt, 1//Decrement loop counter
++	brne		0b         //Perform loop one more time if loop_cnt is not zero 
++
++	sub		sp, -8	
++	popm		r0-r3, r4-r7, pc//Pop back registers and PC 
++
++1:
++	
++	ld.w		r12, pc[coef_table_copy - .]
++	ld.w		r9, pc[coef_table_copy - . + 4]
++
++	addhh.w		r4, r2:t, r2:b
++	mulhh.w		r4, r4:b, r12:t	// r4 = z1
++	mulhh.w		r5, r2:b, r12:b
++	ld.w		r12, pc[coef_table_copy - . + 8]
++	mulhh.w		r6, r2:t, r9:t
++	add		r5, r4	// r5 = tmp2
++	add		r6, r4	// r6 = tmp3
++
++	addhh.w		r7, r3:t, r3:b
++	subhh.w		r8, r3:t, r3:b
++
++	lsl		r7, CONST_BITS
++	lsl		r8, CONST_BITS
++			
++	add		r2, r7, r6 // r2 = tmp10
++	sub		r3, r7, r6 // r3 = tmp13
++	add		r4, r8, r5 // r4 = tmp11
++	sub		r5, r8, r5 // r5 = tmp12
++	   
++
++	padd.h		r6, r0, r1 // r6:t = z4, r6:b = z3
++	addhh.w		r7, r6:t, r6:b
++	mulhh.w		r7, r7:b, r9:b // r7 = z5
++	
++	ld.w		r9, pc[coef_table_copy - . + 12]
++	mulhh.w		r8, r6:b, r12:t // r8 = z3
++	mulhh.w		r6, r6:t, r12:b // r6 = z4
++	
++	add		r8, r7	
++	add		r6, r7	
++	
++	paddx.h		r7, r0, r1 // r7:t = z2, r7:b = z1
++
++	mulhh.w		r12, r0:b, r9:t // r12 = tmp0
++	mulhh.w		r0, r0:t, r9:b // r0 = tmp1
++	ld.w		r9, pc[coef_table_copy - . + 16]
++	add		r12, r8
++	add		r0, r6
++		
++	ld.w		lr, pc[coef_table_copy - . + 20]
++	machh.w		r8, r1:b, r9:t // r8 = tmp2
++	machh.w		r6, r1:t, r9:b // r6 = tmp3
++	mulhh.w		r9, r7:b, lr:t // r9 = z1	
++	mulhh.w		r7, r7:t, lr:b // r7 = z2	
++    
++    
++	add		r12, r9
++	add		r0, r7
++	add		r8, r7
++	add		r6, r9
++
++	add		r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
++	sub		r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
++	add		r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
++	sub		r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
++	add		r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
++	sub		r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
++	add		r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
++	sub		r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
++	
++	satrnds		r1 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r2 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r6 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r4 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r8 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r5 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r0 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r3 >> (CONST_BITS+PASS1_BITS+3), 9
++
++	packw.sh	r1, r1, r6
++	packw.sh	r8, r8, r0
++	packw.sh	r3, r3, r5
++	packw.sh	r4, r4, r2
++
++	packsh.ub	r1, r1, r8
++	packsh.ub	r0, r3, r4		
++	lddsp		r12, SP[0]       // rfp
++	lddsp		r9, SP[4]	// iinc
++	st.d		r12[0], r0
++	sub		r10, -2	// Increment the dataptr
++	add		r12, r9	// increment rfp
++	stdsp		SP[0], r12		
++				
++	sub		loop_cnt, 1		//Decrement loop counter
++	brne		0b			//Perform loop one more time if loop_cnt is not zero 
++	
++	sub		sp, -8	
++	popm		r0-r3, r4-r7, pc	//Pop back registers and PC 
++	
++
++
++	.align 2
++coef_table_copy:	
++	.short	FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 
++	.short	- FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
++	.short	FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
++	
++
++idct_avr32:
++	pushm		r0-r3, r4-r7, lr	//Free up registers to use for local variables
++	// In: r12 = DCT block; rows are read from it in pass 1 and results written back to it in pass 2
++		//; Give room for a temporary block on the stack
++	sub		sp, 8*8*2
++		
++	mov 		loop_cnt, 8		//Initialize loop counter
++
++0:
++	// Pass 1: one row per iteration, read at r12 and written to the temporary block at sp (sp += 16)
++	ldm		r12++, r0, r1, r2, r3	//Load 8 DCT-coefficients from the current row in the DCT-block
++	mov		r6, 0
++#ifdef USE_PREFETCH
++	pref		r12[LINE_SIZE]		//Prefetch next line 
++#endif
++	or		r4, r2, r3 << 16 
++	or		r4, r1			//Check if all DCT-coefficients except the DC are zero
++	or		r4, r0
++	brne 		1f			//If there are non-zero AC coefficients perform row-transform
++
++	paddsub.h	r5, r3:t, r6:b		//r6 = 0, so r5:t = r5:b = DC-coeff taken from r3:t
++	plsl.h		r5, r5, PASS1_BITS
++	mov		r4, r5
++	st.d		sp++, r4
++	st.d		sp++, r4
++	
++	sub		loop_cnt, 1		//Decrement loop counter
++	brne		0b			//Perform loop one more time if loop_cnt is not zero 
++
++	bral		2f	                //Perform column transform after row transform is computed 
++		
++1:	
++
++	ld.w		r10, pc[coef_table_idct - .]
++	ld.w		r9, pc[coef_table_idct - . + 4]
++
++	padd.h		r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] 
++	mulhh.w		r5, r4:t, r10:t
++	mulhh.w		r6, r0:t, r10:b
++	ld.w		r10, pc[coef_table_idct - . + 8]
++	mulhh.w		r7, r2:t, r9:t
++	add		r6, r5	// tmp2
++	satrnds		r6 >> (CONST_BITS - PASS1_BITS), 31
++	add		r7, r5	// tmp3
++	satrnds		r7 >> (CONST_BITS - PASS1_BITS), 31
++
++	paddsub.h	r5, r3:t, r1:t
++	plsl.h		r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
++	
++	paddsub.h	r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
++	paddsub.h	r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
++	
++	    
++
++	addhh.w		lr, r3:b, r1:b // lr = z4	
++	addhh.w		r5, r4:b, lr:b
++	mulhh.w		r5, r5:b, r9:b // r5 = z5
++	
++	ld.w		r9, pc[coef_table_idct - . + 12]
++	mulhh.w		r4, r4:b, r10:t // r4 = z3
++	mulhh.w		lr, lr:b, r10:b // lr = z4
++	
++	add		r4, r5	
++	add		lr, r5	
++	
++	addhh.w		r5, r2:b, r1:b // r5 = z2	
++	addhh.w		r8, r3:b, r0:b // r8 = z1	
++
++	    
++	mulhh.w		r0, r0:b, r9:t // r0 = tmp0
++	ld.w		r10, pc[coef_table_idct - . + 16]
++	mulhh.w		r1, r1:b, r9:b // r1 = tmp1
++	ld.w		r9, pc[coef_table_idct - . + 20]
++	mulhh.w		r2, r2:b, r10:t // r2 = tmp2
++	mulhh.w		r3, r3:b, r10:b // r3 = tmp3
++	mulhh.w		r8, r8:b, r9:t // r8 = z1	
++	mulhh.w		r5, r5:b, r9:b // r5 = z2	
++    
++    
++	add		r0, r8
++	add		r0, r4
++	add		r1, r5
++	add		r1, lr
++	add		r2, r5
++	add		r2, r4	 
++	add		r3, r8
++	add		r3, lr	 
++
++	satrnds		r0 >> (CONST_BITS - PASS1_BITS), 31
++	satrnds		r1 >> (CONST_BITS - PASS1_BITS), 31
++	satrnds		r2 >> (CONST_BITS - PASS1_BITS), 31
++	satrnds		r3 >> (CONST_BITS - PASS1_BITS), 31
++	
++	paddsub.h	r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
++	paddsub.h	r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
++	paddsub.h	r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
++	paddsub.h	r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
++
++	sthh.w		sp[0], r4:t, r5:t
++	sthh.w		sp[4], r3:t, r2:t
++	sthh.w		sp[8], r2:b, r3:b
++	sthh.w		sp[12], r5:b, r4:b
++	
++	
++
++	sub		sp, -16 
++	sub		loop_cnt, 1
++	brne		0b
++
++2:
++	// Pass 2: one column per iteration, read from the temporary block (sp += 2), 16 bytes written at r12
++	sub		sp, 8*8*2	//Set pointer to start of DCT block
++	sub		r12, 8*8*2	//Set pointer to start of DCT block
++
++	mov		loop_cnt, 8
++
++0:	
++	ldins.h		r3:t,sp[0]    // r3:t = dataptr[0]  
++	ldins.h		r1:t,sp[1*8*2]// r1:t = dataptr[1] 
++	ldins.h		r2:t,sp[2*8*2]// r2:t = dataptr[2] 
++	ldins.h		r0:t,sp[5*8*2]// r0:t = dataptr[5] 
++	ldins.h		r3:b,sp[4*8*2]// r3:b = dataptr[4] 
++	ldins.h		r1:b,sp[3*8*2]// r1:b = dataptr[3] 
++	ldins.h		r2:b,sp[6*8*2]// r2:b = dataptr[6] 
++	ldins.h		r0:b,sp[7*8*2]// r0:b = dataptr[7] 
++		
++	or		r4, r1, r3 << 16
++	or		r4, r2	
++	or		r4, r0
++	brne 		1f			//If there are non-zero AC coefficients perform column-transform
++
++	satrnds		r3 >> ( PASS1_BITS + 3 + 16 ), 31	
++	packw.sh	r3, r3, r3
++	mov		r2, r3
++	st.d		r12++, r2
++	st.d		r12++, r2
++	sub		sp, -2	// Increment the dataptr
++			
++	sub		loop_cnt, 1//Decrement loop counter
++	brne		0b         //Perform loop one more time if loop_cnt is not zero 
++
++	sub		sp, -(8*8*2 - 8*2)	// Release temp block: the 8 column steps already advanced sp by 8*2 bytes
++	popm		r0-r3, r4-r7, pc//Pop back registers and PC 
++
++1:
++	
++	ld.w		r10, pc[coef_table_idct - .]
++	ld.w		r9, pc[coef_table_idct - . + 4]
++
++	addhh.w		r4, r2:t, r2:b
++	mulhh.w		r4, r4:b, r10:t	// r4 = z1
++	mulhh.w		r5, r2:b, r10:b
++	ld.w		r10, pc[coef_table_idct - . + 8]
++	mulhh.w		r6, r2:t, r9:t
++	add		r5, r4	// r5 = tmp2
++	add		r6, r4	// r6 = tmp3
++
++	addhh.w		r7, r3:t, r3:b
++	subhh.w		r8, r3:t, r3:b
++
++	lsl		r7, CONST_BITS
++	lsl		r8, CONST_BITS
++			
++	add		r2, r7, r6 // r2 = tmp10
++	sub		r3, r7, r6 // r3 = tmp13
++	add		r4, r8, r5 // r4 = tmp11
++	sub		r5, r8, r5 // r5 = tmp12
++	   
++
++	padd.h		r6, r0, r1 // r6:t = z4, r6:b = z3
++	addhh.w		r7, r6:t, r6:b
++	mulhh.w		r7, r7:b, r9:b // r7 = z5
++	
++	ld.w		r9, pc[coef_table_idct - . + 12]
++	mulhh.w		r8, r6:b, r10:t // r8 = z3
++	mulhh.w		r6, r6:t, r10:b // r6 = z4
++	
++	add		r8, r7	
++	add		r6, r7	
++	
++	paddx.h		r7, r0, r1 // r7:t = z2, r7:b = z1
++
++	mulhh.w		r10, r0:b, r9:t // r10 = tmp0
++	mulhh.w		r0, r0:t, r9:b // r0 = tmp1
++	ld.w		r9, pc[coef_table_idct - . + 16]
++	add		r10, r8
++	add		r0, r6
++		
++	ld.w		lr, pc[coef_table_idct - . + 20]
++	machh.w		r8, r1:b, r9:t // r8 = tmp2
++	machh.w		r6, r1:t, r9:b // r6 = tmp3
++	mulhh.w		r9, r7:b, lr:t // r9 = z1	
++	mulhh.w		r7, r7:t, lr:b // r7 = z2	
++    
++    
++	add		r10, r9
++	add		r0, r7
++	add		r8, r7
++	add		r6, r9
++
++	add		r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
++	sub		r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
++	add		r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
++	sub		r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
++	add		r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
++	sub		r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
++	add		r0, r3, r10 // r0 = dataptr[DCTSIZE*3]
++	sub		r3, r3, r10 // r3 = dataptr[DCTSIZE*4]
++	
++	satrnds		r1 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r2 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r6 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r4 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r8 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r5 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r0 >> (CONST_BITS+PASS1_BITS+3), 9
++	satrnds		r3 >> (CONST_BITS+PASS1_BITS+3), 9
++
++	packw.sh	r7, r1, r6
++	packw.sh	r6, r8, r0
++	packw.sh	r5, r3, r5
++	packw.sh	r4, r4, r2
++
++	stm		r12, r4-r7
++	sub		sp, -2	// Increment the dataptr
++	sub		r12, -16
++					
++	sub		loop_cnt, 1		//Decrement loop counter
++	brne		0b			//Perform loop one more time if loop_cnt is not zero 
++	
++	sub		sp, -(8*8*2 - 8*2)	// Release temp block: the 8 column steps already advanced sp by 8*2 bytes
++	popm		r0-r3, r4-r7, pc	//Pop back registers and PC 
++	
++
++
++	.align 2
++coef_table_idct:	
++	.short	FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 
++	.short	- FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
++	.short	FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
++	
+diff --git a/libavcodec/avr32/mc.S b/libavcodec/avr32/mc.S
+new file mode 100644
+index 0000000..07a002d
+--- /dev/null
++++ b/libavcodec/avr32/mc.S
+@@ -0,0 +1,434 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++		
++	
++	/* Keep only bit 0 of each byte of \reg. Uses r8 as the mask source:
++	   r8 >> 1 when \round is set, r8 itself otherwise */
++	.macro	packedmask1	reg, round
++	.if	\round
++	and	\reg, \reg, r8 >> 1	// pixels8_hv loads r8 = 0x02020202 here, so the mask is 0x01010101
++	.else
++	and	\reg, r8		// pixels8_hv loads r8 = 0x01010101 here
++	.endif
++	.endm		
++
++	/* Macro for 8 pixel wide horizontal and vertical interpolation functions */	
++	.macro	pixels8_hv	round, put		
++	/* round selects the per-byte constant loaded into r8; put=0 averages the result with the bytes already in block */
++
++	pushm	r0-r7, lr
++
++	/* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
++	
++	/* Rounding immediate */
++	.if	\round
++	mov	r8, lo(0x02020202)
++	orh	r8, hi(0x02020202) 
++	.else
++	mov	r8, lo(0x01010101)
++	orh	r8, hi(0x01010101) 
++	.endif
++	mov	r7, 2			// outer counter: the block is processed as two 4-pixel wide halves
++	
++	/* Pixel naming convention :	 
++	
++		|-----------------------------------------------------|
++		| s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 |
++		|----d00---d01---d02---d03---d04---d05---d06---d07----|
++		| s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 |
++		|-----------------------------------------------------|
++	*/
++1:	
++	ld.w	r0, r11[0]		// r0 = { s00, s01, s02, s03 } 
++	ld.w	r1, r11[1]		// r1 = { s01, s02, s03, s04 }
++	mov	lr, r9			// lr = remaining rows for this half
++	eor	r2, r0, r1	 	
++	packedmask1	r2, \round		 	 
++	add	r2, r8
++
++	paddh.ub	r0, r0, r1	// r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++	
++	add		r11, r10	// pixels += line_size
++	ld.w	r1, r11[0]		// r1 = { s10, s11, s12, s13 } 
++	ld.w	r3, r11[1]		// r3 = { s11, s12, s13, s14 }
++0:
++	eor	r5, r1, r3	 	
++	packedmask1	r5, \round		 	 
++	add	r2, r5
++	
++	paddh.ub	r1, r1, r3	// r1 = {(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2}
++	eor	r6, r0, r1		
++	packedmask1	r6, \round		 	 
++	add	r2, r2, r6 << 1	
++						
++	ld.w	r3, r11[r10]		// r3 = { s00, s01, s02, s03 } 
++	add	r11, r10		// pixels += line_size
++	ld.w	r4, r11[1]		// r4 = { s01, s02, s03, s04 }
++
++	paddh.ub	r0, r0, r1
++	plsr.b		r2, r2, 2
++	padd.b		r0, r0, r2	// r0 = { d00, d01, d02, d03 }
++	
++	/* Next row */
++	.if	\put
++	eor	r2, r3, r4	 	
++	packedmask1	r2, \round		 	 
++	add	r2, r8
++	.else
++	ld.w	r6, r12[0]
++	eor	r2, r3, r4	 	
++	packedmask1	r2, \round		 	 
++	add	r2, r8
++	pavg.ub	r0, r0, r6
++	.endif
++	st.w	r12[0], r0		// Put data into the block
++		
++	add	r5, r2
++	paddh.ub	r0, r3, r4	// r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++				
++	eor	r6, r0, r1		
++	packedmask1	r6, \round		 	 
++	add	r5, r5, r6 << 1	
++
++	.if	\put
++	paddh.ub	r1, r0, r1
++	plsr.b		r5, r5, 2
++	padd.b		r1, r1, r5	// r1 = { d10, d11, d12, d13 }
++	.else
++	ld.w		r3, r12[r10]
++	paddh.ub	r1, r0, r1
++	plsr.b		r5, r5, 2
++	padd.b		r1, r1, r5	// r1 = { d10, d11, d12, d13 }
++	pavg.ub		r1, r1, r3
++	.endif
++		
++	st.w	r12[r10], r1		// Put data into the block
++	
++	
++	ld.w	r1, r11[r10]		// r1 = { s10, s11, s12, s13 } 
++	add	r11, r10		// pixels += line_size
++	ld.w	r3, r11[1]		// r3 = { s11, s12, s13, s14 }
++	add	r12, r12, r10 << 1	// block += 2*line_size
++	sub	lr, 2			// two rows per pass; exits only when lr hits exactly 0, so h is assumed even and non-zero
++	brne	0b
++
++	mul	r0, r10, r9		// r0 = line_size * h
++	rsub	r0, r0, 4		// r0 = 4 - (line_size * h)  
++	add	r11, r0
++	sub	r11, r10		// pixels += 4 - (line_size * (h+1))
++	add	r12, r0			// block += 4 - (line_size * (h))
++	sub	r7, 1
++	brne	1b
++	
++	popm	r0-r7, pc
++	.endm
++
++
++	/* Macro for 8 pixel wide vertical interpolation functions */	
++	/* round: 1 = combine rows with pavg.ub, 0 = with paddh.ub; put: 1 = store, 0 = pavg.ub with the bytes already in block */
++	.macro	pixels8_v	round, put		
++	pushm	r4-r7,lr	
++	/* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
++
++	/* 
++		Pixel Naming Convention :	
++		|-----------------------------------------------|
++		| s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 |
++		|-d00---d01---d02---d03---d04---d05---d06---d07-|
++		| s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 |
++		|-----------------------------------------------|
++	*/
++	ld.w	r8, r11[r10]		// r8 = { s10, s11, s12, s13 }
++	ld.w	lr, r11++		// lr = { s00, s01, s02, s03 }, src += 4
++	ld.w	r7, r11[0]		// r7 = { s04, s05, s06, s07 }
++	ld.w	r6, r11[r10]		// r6 = { s14, s15, s16, s17 }
++	sub	r10, 4			// stride -= 4 
++	add	r11, r11, r10 << 1	// src += 2*stride 
++	sub	r11, -4			// src += 4                               
++	
++0:	
++	.if	\round
++	pavg.ub r5, r8, lr		// r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
++	pavg.ub r4, r6, r7		// r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
++	.else
++	paddh.ub r5, r8, lr		// r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
++	paddh.ub r4, r6, r7		// r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
++	.endif
++
++	.if	\put
++	st.w	r12++, r5		// *dst++ = { d00, d01, d02, d03 } 
++	ld.w	lr, r11++		// lr = { s10, s11, s12, s13 }, src += 4 
++	st.w	r12[0], r4		// *dst = { d04, d05, d06, d07 }
++	ld.w	r7, r11[0]		// r7 = { s14, s15, s16, s17 }
++	.else
++	ld.w	lr, r12[0]		// lr, r7 are free here: reused for the current block contents
++	ld.w	r7, r12[4]	
++	pavg.ub	r5, r5, lr	 
++	pavg.ub	r4, r4, r7	 
++	st.w	r12++, r5		// *dst++ = { d00, d01, d02, d03 } 
++	ld.w	lr, r11++		// lr = { s10, s11, s12, s13 }, src += 4 
++	st.w	r12[0], r4		// *dst = { d04, d05, d06, d07 }
++	ld.w	r7, r11[0]		// r7 = { s14, s15, s16, s17 }
++	.endif
++	add	r11, r10		// src += stride							
++#ifdef USE_PREFETCH
++	pref		r11[0]
++#endif		
++	add	r12, r10		// dst += stride
++
++	.if	\round
++	pavg.ub r5, r8, lr		// r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
++	pavg.ub r4, r6, r7		// r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
++	.else
++	paddh.ub r5, r8, lr		// r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
++	paddh.ub r4, r6, r7		// r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
++	.endif
++	.if	\put
++	st.w	r12++, r5		// *dst++ = { d00, d01, d02, d03 }                       
++	ld.w	r8, r11++		// r8 = { s10, s11, s12, s13 }, src += 4                 
++	st.w	r12[0], r4		// *dst = { d04, d05, d06, d07 }                         
++	ld.w	r6, r11[0]		// r6 = { s14, s15, s16, s17 }                           
++	.else
++	ld.w	r8, r12[0]		// r8, r6 are free here: reused for the current block contents
++	ld.w	r6, r12[4]	
++	pavg.ub	r5, r5, r8	 
++	pavg.ub	r4, r4, r6	 
++	st.w	r12++, r5		// *dst++ = { d00, d01, d02, d03 }                       
++	ld.w	r8, r11++		// r8 = { s10, s11, s12, s13 }, src += 4                 
++	st.w	r12[0], r4		// *dst = { d04, d05, d06, d07 }                         
++	ld.w	r6, r11[0]		// r6 = { s14, s15, s16, s17 }                           
++	.endif
++	
++	add	r11, r10		// src += stride                                         
++#ifdef USE_PREFETCH
++	pref		r11[0]
++#endif		
++	add	r12, r10		// dst += stride                                         
++	sub	r9, 2			// two rows per pass; exits only when r9 hits exactly 0, so h is assumed even and non-zero
++	brne	0b
++		
++	popm	r4-r7,pc
++	.endm
++
++	/* Macro for 8 pixel wide horizontal interpolation functions */	
++	/* round: 1 = combine neighbours with pavg.ub, 0 = with paddh.ub; put: 1 = store, 0 = pavg.ub with the bytes already in block */
++	.macro	pixels8_h	round, put
++	pushm	r4-r7, lr	
++
++	/* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
++	/*
++		 Pixel Naming Convention:	
++		|--------------------------------------------------------------------|
++		| s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08|
++		|------|-------|-------|-------|-------|-------|-------|-------|-----|
++		| s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18|
++		|--------------------------------------------------------------------|
++	*/
++
++	ld.w	lr, r11[0]	// lr = { s00, s01, s02, s03 }
++	ld.w	r8, r11[1]	// r8 = { s01, s02, s03, s04 }
++	ld.w	r7, r11[4]	// r7 = { s04, s05, s06, s07 }
++	ld.w	r6, r11[5]	// r6 = { s05, s06, s07, s08 }
++	add	r11, r10	// src += stride			
++
++0:	
++	.if	\round
++	pavg.ub lr, r8, lr	// lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++	pavg.ub r7, r6, r7	// r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
++	.else
++	paddh.ub lr, r8, lr	// lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++	paddh.ub r7, r6, r7	// r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
++	.endif
++	.if	\put
++	ld.w	r5, r11[0]	// r5 = { s00, s01, s02, s03 }
++	ld.w	r4, r11[1]	// r4 = { s01, s02, s03, s04 }
++	.else
++	ld.w	r8, r12[0]	// r8, r6 are free here: reused for the current block contents
++	ld.w	r6, r12[4]
++	ld.w	r5, r11[0]	// r5 = { s00, s01, s02, s03 }
++	ld.w	r4, r11[1]	// r4 = { s01, s02, s03, s04 }
++	pavg.ub	lr, lr, r8
++	pavg.ub	r7, r7, r6
++	.endif
++	st.w	r12[0], lr	// dst = { d00, d01, d02, d03 }
++	st.w	r12[4], r7	// dst = { d04, d05, d06, d07 }
++	ld.w	r8, r11[4]	// r8 = { s04, s05, s06, s07 }
++	ld.w	r6, r11[5]	// r6 = { s05, s06, s07, s08 }
++	add	r11, r10	// src += stride						
++#ifdef USE_PREFETCH
++	pref		r11[0]
++#endif		
++	add	r12, r10	// dst += stride
++
++	.if	\round
++	pavg.ub r5, r4, r5	// r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++	pavg.ub r4, r6, r8	// r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
++	.else
++	paddh.ub r5, r4, r5	// r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
++	paddh.ub r4, r6, r8	// r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
++	.endif
++	.if	\put
++	ld.w	lr, r11[0]	// lr = { s00, s01, s02, s03 }                           
++	ld.w	r8, r11[1]	// r8 = { s01, s02, s03, s04 }                           
++	.else
++	ld.w	r7, r12[0]	// r7, r6 are free here: reused for the current block contents
++	ld.w	r6, r12[4]
++	ld.w	lr, r11[0]	// lr = { s00, s01, s02, s03 }                           
++	ld.w	r8, r11[1]	// r8 = { s01, s02, s03, s04 }                           
++	pavg.ub	r5, r5, r7
++	pavg.ub	r4, r4, r6
++	.endif
++	st.w	r12[0], r5	// dst = { d00, d01, d02, d03 }                          
++	st.w	r12[4], r4	// dst = { d04, d05, d06, d07 }                          
++	ld.w	r7, r11[4]	// r7 = { s04, s05, s06, s07 }                           
++	ld.w	r6, r11[5]	// r6 = { s05, s06, s07, s08 }                           
++	add	r11, r10	// src += stride						                                         
++#ifdef USE_PREFETCH
++	pref		r11[0]
++#endif		
++	add	r12, r10	// dst += stride                                         
++	sub	r9, 2		// two rows per pass; exits only when r9 hits exactly 0, so h is assumed even and non-zero
++	brne	0b
++		
++	popm	r4-r7, pc
++	.endm
++	
++	/* Macro for 8 pixel wide copy functions (put=1: plain copy, put=0: pavg.ub with the bytes already in block) */	
++	.macro	pixels8	put
++	stm		--sp, r3-r7,lr
++	/* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
++	mov		lr, r9			// lr = row counter; r9 is reused below as a data register
++	sub		r3, r10, 2	        // stride2 = stride - 2 
++0:	
++	.if	\put
++	ld.w		r9, r11[r10]	        // r9 = { s10, s11, s12, s13 }
++	ld.w		r7, r11++	        // r7 = { s00, s01, s02, s03 }, src += 4
++	ld.w		r6, r11[0]	        // r6 = { s04, s05, s06, s07 }
++	ld.w		r8, r11[r10]	        // r8 = { s14, s15, s16, s17 }
++	.else
++	ld.w		r9, r11[r10]	        // r9 = { s10, s11, s12, s13 }
++	ld.d		r4, r12[0]		// r5:r4 = current first row of block
++	ld.w		r7, r11++	        // r7 = { s00, s01, s02, s03 }, src += 4
++	ld.w		r6, r11[0]	        // r6 = { s04, s05, s06, s07 }
++	ld.w		r8, r11[r10]	        // r8 = { s14, s15, s16, s17 }
++	pavg.ub		r6, r6, r4
++	pavg.ub		r7, r7, r5
++	ld.d		r4, r12[r10]		// r5:r4 = current second row of block
++	.endif
++	st.d		r12, r6			// *dst = { s00, s01, s02, s03, s04, s05, s06, s07 }   
++	add		r11, r11, r3 << 1	// src += stride2 * 2
++	.ifeq	\put
++	pavg.ub		r8, r8, r4
++	pavg.ub		r9, r9, r5	
++	.endif	
++	st.d		r12[r10 << 0], r8	// *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 } 
++	add		r12, r12, r10 << 1	// dst += 2*stride
++	sub		lr, 2			// two rows per pass; exits only when lr hits exactly 0, so h is assumed even and non-zero
++	brne		0b
++	ldm		sp++, r3-r7,pc
++
++	.endm	
++	
++	.global	put_no_rnd_pixels8_hv_avr32
++	.text
++put_no_rnd_pixels8_hv_avr32:
++	pixels8_hv	0, 1	// round=0, put=1
++
++	.global	put_pixels8_hv_avr32
++	.text
++put_pixels8_hv_avr32:
++	pixels8_hv	1, 1	// round=1, put=1
++
++	.global	avg_no_rnd_pixels8_hv_avr32
++	.text
++avg_no_rnd_pixels8_hv_avr32:
++	pixels8_hv	0, 0	// round=0, put=0
++
++	.global	avg_pixels8_hv_avr32
++	.text
++avg_pixels8_hv_avr32:
++	pixels8_hv	1, 0	// round=1, put=0
++
++	.global	put_no_rnd_pixels8_v_avr32
++	.text
++put_no_rnd_pixels8_v_avr32:
++	pixels8_v	0, 1	// round=0, put=1
++
++	.global	put_pixels8_v_avr32
++	.text
++put_pixels8_v_avr32:
++	pixels8_v	1, 1	// round=1, put=1
++
++	.global	avg_no_rnd_pixels8_v_avr32
++	.text
++avg_no_rnd_pixels8_v_avr32:
++	pixels8_v	0, 0	// round=0, put=0
++
++	.global	avg_pixels8_v_avr32
++	.text
++avg_pixels8_v_avr32:
++	pixels8_v	1, 0	// round=1, put=0
++
++	.global	put_no_rnd_pixels8_h_avr32
++	.text
++put_no_rnd_pixels8_h_avr32:
++	pixels8_h	0, 1	// round=0, put=1
++
++	.global	put_pixels8_h_avr32
++	.text
++put_pixels8_h_avr32:
++	pixels8_h	1, 1	// round=1, put=1
++
++	.global	avg_no_rnd_pixels8_h_avr32
++	.text
++avg_no_rnd_pixels8_h_avr32:
++	pixels8_h	0, 0	// round=0, put=0
++
++	.global	avg_pixels8_h_avr32
++	.text
++avg_pixels8_h_avr32:
++	pixels8_h	1, 0	// round=1, put=0
++
++	.global	put_pixels8_avr32
++	.global	put_no_rnd_pixels8_avr32
++	.text
++put_pixels8_avr32:
++put_no_rnd_pixels8_avr32:
++	pixels8	1	// put=1; the plain copy has no rounding step, so both names share one body
++
++	.global	avg_no_rnd_pixels8_avr32
++	.global	avg_pixels8_avr32
++	.text
++avg_pixels8_avr32:
++avg_no_rnd_pixels8_avr32:
++	pixels8	0	// put=0; both names share one body, which always averages with pavg.ub
+diff --git a/libavcodec/avr32/pico.h b/libavcodec/avr32/pico.h
+new file mode 100644
+index 0000000..32201ba
+--- /dev/null
++++ b/libavcodec/avr32/pico.h
+@@ -0,0 +1,260 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++#ifndef __PICO_H__
++#define __PICO_H__
++
++
++
++/* Coprocessor Number */
++#define PICO_CPNO  1
++
++/* Pixel Coprocessor Register file */
++#define PICO_REGVECT_INPIX2  cr0
++#define PICO_REGVECT_INPIX1  cr1
++#define PICO_REGVECT_INPIX0  cr2
++#define PICO_REGVECT_OUTPIX2 cr3
++#define PICO_REGVECT_OUTPIX1 cr4
++#define PICO_REGVECT_OUTPIX0 cr5
++#define PICO_REGVECT_COEFF0_A cr6
++#define PICO_REGVECT_COEFF0_B cr7
++#define PICO_REGVECT_COEFF1_A cr8
++#define PICO_REGVECT_COEFF1_B cr9
++#define PICO_REGVECT_COEFF2_A cr10
++#define PICO_REGVECT_COEFF2_B cr11
++#define PICO_REGVECT_VMU0_OUT cr12
++#define PICO_REGVECT_VMU1_OUT cr13
++#define PICO_REGVECT_VMU2_OUT cr14
++#define PICO_REGVECT_CONFIG   cr15
++
++#define PICO_INPIX2  0
++#define PICO_INPIX1  1
++#define PICO_INPIX0  2
++#define PICO_OUTPIX2 3
++#define PICO_OUTPIX1 4
++#define PICO_OUTPIX0 5
++#define PICO_COEFF0_A 6
++#define PICO_COEFF0_B 7
++#define PICO_COEFF1_A 8
++#define PICO_COEFF1_B 9
++#define PICO_COEFF2_A 10
++#define PICO_COEFF2_B 11
++#define PICO_VMU0_OUT 12
++#define PICO_VMU1_OUT 13
++#define PICO_VMU2_OUT 14
++#define PICO_CONFIG   15
++
++/* Config Register */
++#define PICO_COEFF_FRAC_BITS_OFFSET  0
++#define PICO_COEFF_FRAC_BITS_SIZE  4
++#define PICO_OFFSET_FRAC_BITS_OFFSET  4
++#define PICO_OFFSET_FRAC_BITS_SIZE  4
++#define PICO_INPUT_MODE_OFFSET  8
++#define PICO_INPUT_MODE_SIZE  2
++#define PICO_OUTPUT_MODE_OFFSET 10
++#define PICO_OUTPUT_MODE_SIZE 1
++/* Memory image used by set_pico_config()/get_pico_config(): CONFIG word first, then VMU outputs, then coefficients. Bit-fields only match the *_OFFSET values if the compiler allocates them MSB-first -- TODO confirm for the target ABI. */
++struct pico_config_t {
++  unsigned int          : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE; /* unnamed padding above the output_mode bit */
++  unsigned int          output_mode : PICO_OUTPUT_MODE_SIZE;       /* enum pico_output_mode */
++  unsigned int          input_mode : PICO_INPUT_MODE_SIZE;         /* enum pico_input_mode */
++  unsigned int          offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE; /* fractional bits of the coeffN_3 terms (see dump_pico_config) */
++  unsigned int          coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE;   /* fractional bits of coeffN_0..coeffN_2 */
++  int                   vmu2_out;
++  int                   vmu1_out;
++  int                   vmu0_out;
++  short                 coeff2_2; /* presumably COEFF2_B holds coeff2_2/coeff2_3 and COEFF2_A holds coeff2_0/coeff2_1 -- verify */
++  short                 coeff2_3;
++  short                 coeff2_0;
++  short                 coeff2_1;
++  short                 coeff1_2;
++  short                 coeff1_3;
++  short                 coeff1_0;
++  short                 coeff1_1;
++  short                 coeff0_2;
++  short                 coeff0_3;
++  short                 coeff0_0;
++  short                 coeff0_1;
++};
++
++/* Field builders: shift a field value into its position in the CONFIG register word. Arguments are parenthesized so expressions such as (a | b) expand correctly. */
++#define PICO_COEFF_FRAC_BITS(x) ((x) << PICO_COEFF_FRAC_BITS_OFFSET)
++#define PICO_OFFSET_FRAC_BITS(x) ((x) << PICO_OFFSET_FRAC_BITS_OFFSET)
++#define PICO_INPUT_MODE(x) ((x) << PICO_INPUT_MODE_OFFSET)
++#define PICO_OUTPUT_MODE(x) ((x) << PICO_OUTPUT_MODE_OFFSET)
++/* Field extractors: shift the CONFIG word down and mask to the field width. */
++#define GET_PICO_COEFF_FRAC_BITS(x) (((x) >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1))
++#define GET_PICO_OFFSET_FRAC_BITS(x) (((x) >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1))
++#define GET_PICO_INPUT_MODE(x) (((x) >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1))
++#define GET_PICO_OUTPUT_MODE(x) (((x) >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1))
++/* Values of the input_mode / output_mode fields of the CONFIG register, as decoded by dump_pico_config(). */
++enum pico_input_mode { PICO_TRANSFORMATION_MODE, /* 0 */
++                       PICO_HOR_FILTER_MODE,     /* 1 */
++                       PICO_VERT_FILTER_MODE };  /* 2 */
++
++enum pico_output_mode { PICO_PACKED_MODE,   /* 0 */
++                        PICO_PLANAR_MODE }; /* 1 */
++
++/* Bits in coefficients */
++#define PICO_COEFF_BITS 12
++
++/* Operation bits */
++#define PICO_MATRIX (0)
++#define PICO_USE_ACC (1 << 2)
++#define PICO_SINGLE_VECTOR (1 << 3)
++
++
++#define __str(x...) #x
++#define __xstr(x...) __str(x)
++/* Raw PICO coprocessor access. Every asm form is volatile; forms that read or write memory through ptr also clobber "memory" so the compiler keeps them ordered with C loads/stores. */
++#define PICO_PUT_W(pico_reg, x) \
++  __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
++#define PICO_GET_W(pico_reg) \
++  __builtin_mvcr_w(PICO_CPNO, pico_reg)
++/* Word moves via inline asm; pico_reg is pasted after "cr", so pass the bare register number. */
++#define PICO_MVCR_W(x, pico_reg) \
++  asm volatile ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
++
++#define PICO_MVRC_W(pico_reg, x) \
++  asm volatile ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
++/* Double-word variants of the moves above. */
++#define PICO_PUT_D(pico_reg, x) \
++  __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
++#define PICO_GET_D(pico_reg) \
++  __builtin_mvcr_d(PICO_CPNO, pico_reg)
++
++#define PICO_MVCR_D(x, pico_reg) \
++  asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
++#define PICO_MVRC_D(pico_reg, x) \
++  asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
++/* Store multiple coprocessor registers to memory at ptr (writes memory behind the compiler's back, hence the clobber). */
++#define PICO_STCM_W(ptr, pico_regs...) \
++  asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
++#define PICO_STCM_D(ptr, pico_regs...) \
++  asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
++/* Same, with pre-decrement addressing; ptr is updated by the instruction. */
++#define PICO_STCM_W_DEC(ptr, pico_regs...) \
++  asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
++#define PICO_STCM_D_DEC(ptr, pico_regs...) \
++  asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
++/* Load multiple coprocessor registers from memory at ptr (pending C stores to *ptr must be completed first, hence the clobber). */
++#define PICO_LDCM_W(ptr, pico_regs...) \
++  asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
++#define PICO_LDCM_D(ptr, pico_regs...) \
++  asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
++/* Same, with post-increment addressing; ptr is updated by the instruction. */
++#define PICO_LDCM_W_INC(ptr, pico_regs...) \
++  asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
++#define PICO_LDCM_D_INC(ptr, pico_regs...) \
++  asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
++/* Issue a coprocessor operation; op and dst_addr are OR-ed into the last builtin argument. */
++#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
++  __builtin_cop(PICO_CPNO, addr0, addr1, addr2, (op) | (dst_addr));
++/* Load PICO registers COEFF0_A..CONFIG (cr6-cr15) from *config with a single ldcm.d; config must hold a complete struct pico_config_t. */
++static inline void set_pico_config(struct pico_config_t *config){
++  PICO_LDCM_D(config, 
++              PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, 
++              PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
++              PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
++              PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
++              PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);              
++}
++/* Store PICO registers COEFF0_A..CONFIG (cr6-cr15) into *config with a single stcm.d. NOTE(review): *config is written by inline asm, not by C code. */
++static inline void get_pico_config(struct pico_config_t *config){
++  PICO_STCM_D(config, 
++              PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, 
++              PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
++              PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
++              PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
++              PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);              
++}
++/* Debug helper: read the current PICO configuration with get_pico_config() and print every field through av_log() at AV_LOG_INFO. Takes no arguments, returns nothing. */
++static inline void dump_pico_config(void){
++  struct pico_config_t pico_config;
++  const char *input_mode, *output_mode; /* point at string literals, hence const */
++  get_pico_config(&pico_config);
++
++  /* Fixed-point scaling factors are printed first because the coefficient lines below depend on them. */
++  av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n");
++  av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits);
++  av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits);
++
++  switch ( pico_config.input_mode ){
++  case PICO_TRANSFORMATION_MODE:
++    input_mode = "Transformation Mode";
++    break;
++  case PICO_HOR_FILTER_MODE:
++    input_mode = "Horisontal Filter Mode";
++    break;
++  case PICO_VERT_FILTER_MODE:
++    input_mode = "Vertical Filter Mode";
++    break;
++  default:
++    input_mode = "Unknown Mode!!";
++    break;
++  }
++  av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode);
++
++  switch ( pico_config.output_mode ){
++  case PICO_PLANAR_MODE:
++    output_mode = "Planar Mode";
++    break;
++  case PICO_PACKED_MODE:
++    output_mode = "Packed Mode";
++    break;
++  default:
++    output_mode = "Unknown Mode!!";
++    break;
++  }
++
++  av_log(NULL, AV_LOG_INFO, "\toutput_mode = %s\n", output_mode);
++  /* coeffN_0..coeffN_2 are scaled by coeff_frac_bits; coeffN_3 is scaled by offset_frac_bits (presumably the additive offset term). */
++  av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits));
++  av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits));
++  av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits));
++  av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits));
++
++  av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits));
++  av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits));
++  av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits));
++  av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits));
++
++  av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits));
++  av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits));
++  av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits));
++  av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits));
++}
++
++
++
++#endif
++
+diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h
+index 26b4f8d..1f8fabf 100644
+--- a/libavcodec/bitstream.h
++++ b/libavcodec/bitstream.h
+@@ -171,7 +171,7 @@ typedef struct RL_VLC_ELEM {
+ #endif
+ 
+ /* used to avoid missaligned exceptions on some archs (alpha, ...) */
+-#if defined(ARCH_X86) || defined(ARCH_X86_64)
++#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_AVR32)
+ #    define unaligned16(a) (*(const uint16_t*)(a))
+ #    define unaligned32(a) (*(const uint32_t*)(a))
+ #    define unaligned64(a) (*(const uint64_t*)(a))
+@@ -813,6 +813,44 @@ void free_vlc(VLC *vlc);
+  * if the vlc code is invalid and max_depth>1 than the number of bits removed
+  * is undefined
+  */
++/* AVR32 variant: the first two table levels fetch an entry (code, length) with one 32-bit load and split it through a union; a negative length n means "look up -n more bits in the sub-table starting at index code". */
++#if defined(ARCH_AVR32)
++#define GET_VLC(code, name, gb, table, bits, max_depth)\
++{\
++    int n, index, nb_bits;\
++    union { VLC_TYPE vlc[2];\
++            uint32_t u32; } table_elem; /* overlays one table entry with a 32-bit word */\
++\
++    index= SHOW_UBITS(name, gb, bits);\
++    table_elem.u32 = unaligned32(&table[index]); \
++    code = table_elem.vlc[0];\
++    n    = table_elem.vlc[1];\
++\
++    if(max_depth > 1 && n < 0 ){ /* second level */\
++        LAST_SKIP_BITS(name, gb, bits)\
++        UPDATE_CACHE(name, gb)\
++\
++        nb_bits = -n;\
++\
++        index= SHOW_UBITS(name, gb, nb_bits) + code;\
++        table_elem.u32 = unaligned32(&table[index]); \
++        code = table_elem.vlc[0];\
++        n    = table_elem.vlc[1];\
++        if(max_depth > 2 && n < 0){ /* third level: reads the two fields directly, not through the union */\
++            LAST_SKIP_BITS(name, gb, nb_bits)\
++            UPDATE_CACHE(name, gb)\
++\
++            nb_bits = -n;\
++\
++            index= SHOW_UBITS(name, gb, nb_bits) + code;\
++            code = table[index][0];\
++            n    = table[index][1];\
++        }\
++    }\
++    SKIP_BITS(name, gb, n)\
++}
++
++#else
+ #define GET_VLC(code, name, gb, table, bits, max_depth)\
+ {\
+     int n, index, nb_bits;\
+@@ -821,7 +859,7 @@ void free_vlc(VLC *vlc);
+     code = table[index][0];\
+     n    = table[index][1];\
+ \
+-    if(max_depth > 1 && n < 0){\
++    if(max_depth > 1 && n < 0 ){\
+         LAST_SKIP_BITS(name, gb, bits)\
+         UPDATE_CACHE(name, gb)\
+ \
+@@ -843,7 +881,38 @@ void free_vlc(VLC *vlc);
+     }\
+     SKIP_BITS(name, gb, n)\
+ }
++#endif
+ 
++#if defined(ARCH_AVR32) /* AVR32 variant: fetch each RL_VLC_ELEM with one 32-bit load through a union */
++#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
++{\
++    int n, index, nb_bits;\
++    union { RL_VLC_ELEM vlc;\
++            uint32_t u32; } table_elem; /* assumes sizeof(RL_VLC_ELEM) == 4 -- TODO confirm */\
++\
++    index= SHOW_UBITS(name, gb, bits);\
++    table_elem.u32 = unaligned32(&table[index]); \
++    level = table_elem.vlc.level;\
++    n     = table_elem.vlc.len;\
++\
++    if(max_depth > 1 && n < 0 ){ /* negative len: -n more bits index a sub-table at offset level */\
++        SKIP_BITS(name, gb, bits)\
++        if(need_update){\
++            UPDATE_CACHE(name, gb)\
++        }\
++\
++        nb_bits = -n;\
++\
++        index= SHOW_UBITS(name, gb, nb_bits) + level;\
++        table_elem.u32 = unaligned32(&table[index]); \
++        level = table_elem.vlc.level;\
++        n     = table_elem.vlc.len;\
++    }\
++    run= table_elem.vlc.run; /* taken from whichever entry was fetched last */\
++    SKIP_BITS(name, gb, n)\
++}
++
++#else
+ #define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
+ {\
+     int n, index, nb_bits;\
+@@ -852,7 +921,7 @@ void free_vlc(VLC *vlc);
+     level = table[index].level;\
+     n     = table[index].len;\
+ \
+-    if(max_depth > 1 && n < 0){\
++    if(max_depth > 1 && n < 0 ){\
+         SKIP_BITS(name, gb, bits)\
+         if(need_update){\
+             UPDATE_CACHE(name, gb)\
+@@ -867,7 +936,7 @@ void free_vlc(VLC *vlc);
+     run= table[index].run;\
+     SKIP_BITS(name, gb, n)\
+ }
+-
++#endif
+ 
+ /**
+  * parses a vlc code, faster then get_vlc()
+diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
+index 56c42b9..8fc10c6 100644
+--- a/libavcodec/dsputil.c
++++ b/libavcodec/dsputil.c
+@@ -4197,6 +4197,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
+ #ifdef ARCH_BFIN
+     dsputil_init_bfin(c,avctx);
+ #endif
++#ifdef ARCH_AVR32
++    dsputil_init_avr32(c,avctx);
++#endif
+ 
+     for(i=0; i<64; i++){
+         if(!c->put_2tap_qpel_pixels_tab[0][i])
+diff --git a/libavcodec/h264.c b/libavcodec/h264.c
+index 865e80a..8f7c3f1 100644
+--- a/libavcodec/h264.c
++++ b/libavcodec/h264.c
+@@ -3258,7 +3258,12 @@ static void free_tables(H264Context *h){
+ 
+ static void init_dequant8_coeff_table(H264Context *h){
+     int i,q,x;
++#ifdef ARCH_AVR32
++    const int transpose = 0;
++#else
+     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
++#endif
++
+     h->dequant8_coeff[0] = h->dequant8_buffer[0];
+     h->dequant8_coeff[1] = h->dequant8_buffer[1];
+ 
+@@ -3281,7 +3286,13 @@ static void init_dequant8_coeff_table(H264Context *h){
+ 
+ static void init_dequant4_coeff_table(H264Context *h){
+     int i,j,q,x;
++    // Yes this is ugly as hell....
++#ifdef ARCH_AVR32
++    const int transpose = 0;
++#else
+     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
++#endif
++
+     for(i=0; i<6; i++ ){
+         h->dequant4_coeff[i] = h->dequant4_buffer[i];
+         for(j=0; j<i; j++){
+@@ -4663,7 +4674,11 @@ static int decode_slice_header(H264Context *h){
+         if (MPV_common_init(s) < 0)
+             return -1;
+ 
++#ifdef ARCH_AVR32
++        if ( 1 ){
++#else
+         if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
++#endif
+             memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
+             memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
+         }else{
+diff --git a/libavutil/common.h b/libavutil/common.h
+index 3ae5971..7e52b90 100644
+--- a/libavutil/common.h
++++ b/libavutil/common.h
+@@ -283,23 +283,39 @@ static inline int mid_pred(int a, int b, int c)
+  * @param amax maximum value of the clip range
+  * @return cliped value
+  */
++#if defined(ARCH_AVR32)
++#define clip(a, amin, amax) /* AVR32: branch-free clip; operands are converted to int exactly as the C function's parameters would be */ \
++  ({ int __tmp__; \
++     asm ("min\t%0, %1, %2\n" /* tmp = min(a, amax) */ \
++          "max\t%0, %0, %3\n" /* tmp = max(tmp, amin) */ \
++          : "=&r"(__tmp__) : "r"((int)(a)), "r"((int)(amax)), "r"((int)(amin))); \
++     __tmp__; })
++#else
+ static inline int clip(int a, int amin, int amax)
+ {
+     if (a < amin)      return amin;
+     else if (a > amax) return amax;
+     else               return a;
+ }
++#endif
+ 
+ /**
+  * clip a signed integer value into the 0-255 range
+  * @param a value to clip
+  * @return cliped value
+  */
++#if defined(ARCH_AVR32)
++#define clip_uint8(a) /* AVR32: statement-expression replacement for the C function; yields an int */ \
++  ({ int __tmp__ = a; \
++     asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); /* presumably saturates the value in place to an unsigned 8-bit range -- verify against the AVR32 manual */ \
++     __tmp__; })
++#else
+ static inline uint8_t clip_uint8(int a)
+ {
+     if (a&(~255)) return (-a)>>31;
+     else          return a;
+ }
++#endif
+ 
+ /* math */
+ int64_t ff_gcd(int64_t a, int64_t b);
+diff --git a/libavutil/internal.h b/libavutil/internal.h
+index 285d304..a8b0718 100644
+--- a/libavutil/internal.h
++++ b/libavutil/internal.h
+@@ -210,6 +210,15 @@ if((y)<(x)){\
+     }\
+ }
+ 
++/* XXX: Hack for uclibc which declares lrintf but does not implement it: route lrintf/llrint through rint() and cast, so the result has an integer type like the real functions. */
++#ifdef ARCH_AVR32
++#undef HAVE_LRINTF
++#define HAVE_LRINTF 1
++#define lrintf(x) ((long)rint(x))
++#define llrint(x) ((long long)rint(x))
++#endif
++ 
++
+ #ifndef HAVE_LRINTF
+ /* XXX: add ISOC specific test to avoid specific BSD testing. */
+ /* better than nothing implementation. */
+diff --git a/libfaad2/common.h b/libfaad2/common.h
+index f809042..6c5fb21 100644
+--- a/libfaad2/common.h
++++ b/libfaad2/common.h
+@@ -67,7 +67,7 @@ extern "C" {
+ /* Use if target platform has address generators with autoincrement */
+ //#define PREFER_POINTERS
+ 
+-#if defined(_WIN32_WCE) || defined(__arm__)
++#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__)
+ #define FIXED_POINT
+ #endif
+ 
+diff --git a/libmpcodecs/ad_libmad.c b/libmpcodecs/ad_libmad.c
+index 076359a..51b77fe 100644
+--- a/libmpcodecs/ad_libmad.c
++++ b/libmpcodecs/ad_libmad.c
+@@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){
+   sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2;
+   sh->samplerate=this->frame.header.samplerate;
+   sh->i_bps=this->frame.header.bitrate/8;
++#ifdef WORDS_BIGENDIAN
++  sh->sample_format = AF_FORMAT_S16_BE;
++#else
++  sh->sample_format = AF_FORMAT_S16_LE;
++#endif
+   sh->samplesize=2;
+   
+   return 1;
+diff --git a/libswscale/pico-avr32.h b/libswscale/pico-avr32.h
+new file mode 100644
+index 0000000..7ac6200
+--- /dev/null
++++ b/libswscale/pico-avr32.h
+@@ -0,0 +1,137 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++#ifndef __PICO_H__
++#define __PICO_H__
++
++/* Coprocessor Number */
++#define PICO_CPNO  1
++
++/* Pixel Coprocessor Register file */
++#define PICO_REGVECT_INPIX2  cr0
++#define PICO_REGVECT_INPIX1  cr1
++#define PICO_REGVECT_INPIX0  cr2
++#define PICO_REGVECT_OUTPIX2 cr3
++#define PICO_REGVECT_OUTPIX1 cr4
++#define PICO_REGVECT_OUTPIX0 cr5
++#define PICO_REGVECT_COEFF0_A cr6
++#define PICO_REGVECT_COEFF0_B cr7
++#define PICO_REGVECT_COEFF1_A cr8
++#define PICO_REGVECT_COEFF1_B cr9
++#define PICO_REGVECT_COEFF2_A cr10
++#define PICO_REGVECT_COEFF2_B cr11
++#define PICO_REGVECT_VMU0_OUT cr12
++#define PICO_REGVECT_VMU1_OUT cr13
++#define PICO_REGVECT_VMU2_OUT cr14
++#define PICO_REGVECT_CONFIG   cr15
++
++#define PICO_INPIX2  0
++#define PICO_INPIX1  1
++#define PICO_INPIX0  2
++#define PICO_OUTPIX2 3
++#define PICO_OUTPIX1 4
++#define PICO_OUTPIX0 5
++#define PICO_COEFF0_A 6
++#define PICO_COEFF0_B 7
++#define PICO_COEFF1_A 8
++#define PICO_COEFF1_B 9
++#define PICO_COEFF2_A 10
++#define PICO_COEFF2_B 11
++#define PICO_VMU0_OUT 12
++#define PICO_VMU1_OUT 13
++#define PICO_VMU2_OUT 14
++#define PICO_CONFIG   15
++
++/* Config Register */
++#define PICO_COEFF_FRAC_BITS  0
++#define PICO_COEFF_FRAC_BITS_WIDTH  4
++#define PICO_OFFSET_FRAC_BITS  4
++#define PICO_OFFSET_FRAC_BITS_WIDTH  4
++#define PICO_INPUT_MODE  8
++#define PICO_INPUT_MODE_WIDTH  2
++#define PICO_OUTPUT_MODE 10
++
++#define PICO_TRANSFORMATION_MODE 0 
++#define PICO_HOR_FILTER_MODE 1 
++#define PICO_VERT_FILTER_MODE 2 
++
++#define PICO_PLANAR_MODE 1
++#define PICO_PACKED_MODE 0
++
++/* Bits in coefficients */
++#define PICO_COEFF_BITS 12
++
++/* Operation bits */
++#define PICO_USE_ACC (1 << 2)
++#define PICO_SINGLE_VECTOR (1 << 3)
++
++
++#define __str(x...) #x
++#define __xstr(x...) __str(x)
++/* Raw PICO coprocessor access. asm forms that read or write memory through ptr clobber "memory" so the compiler keeps them ordered with C loads/stores. */
++#define PICO_PUT_W(pico_reg, x) \
++  __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
++#define PICO_GET_W(pico_reg) \
++  __builtin_mvcr_w(PICO_CPNO, pico_reg)
++
++#define PICO_PUT_D(pico_reg, x) \
++  __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
++#define PICO_GET_D(pico_reg) \
++  __builtin_mvcr_d(PICO_CPNO, pico_reg)
++
++/* Store multiple coprocessor registers to memory at ptr. */
++#define PICO_STCM_W(ptr, pico_regs...) \
++  asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
++#define PICO_STCM_D(ptr, pico_regs...) \
++  asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
++/* Same, with pre-decrement addressing; ptr is updated by the instruction. */
++#define PICO_STCM_W_DEC(ptr, pico_regs...) \
++  asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
++#define PICO_STCM_D_DEC(ptr, pico_regs...) \
++  asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
++/* Load multiple coprocessor registers from memory at ptr. */
++#define PICO_LDCM_W(ptr, pico_regs...) \
++  asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
++#define PICO_LDCM_D(ptr, pico_regs...) \
++  asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs)  :: "r"(ptr) : "memory");
++/* Same, with post-increment addressing; ptr is updated by the instruction. */
++#define PICO_LDCM_W_INC(ptr, pico_regs...) \
++  asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
++#define PICO_LDCM_D_INC(ptr, pico_regs...) \
++  asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs)  : "+r"(ptr) :: "memory");
++/* Issue a coprocessor operation; op and dst_addr are OR-ed into the last builtin argument. */
++#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
++  __builtin_cop(PICO_CPNO, addr0, addr1, addr2, (op) | (dst_addr));
++
++
++#endif
++
+diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
+index ecd28f5..3221d0c 100644
+--- a/libswscale/swscale_internal.h
++++ b/libswscale/swscale_internal.h
+@@ -173,7 +173,7 @@ typedef struct SwsContext{
+ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
+ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
+ 
+-char *sws_format_name(int format);
++char *sws_format_name(enum PixelFormat format);
+ 
+ //FIXME replace this with something faster
+ #define isPlanarYUV(x) ((x)==PIX_FMT_YUV410P || (x)==PIX_FMT_YUV420P	\
+diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
+index 71759bc..fa83985 100644
+--- a/libswscale/yuv2rgb.c
++++ b/libswscale/yuv2rgb.c
+@@ -44,6 +44,10 @@
+ #include "yuv2rgb_mlib.c"
+ #endif
+ 
++#ifdef ARCH_AVR32
++#include "yuv2rgb_avr32.c"
++#endif
++
+ #define DITHER1XBPP // only for mmx
+ 
+ const uint8_t  __attribute__((aligned(8))) dither_2x2_4[2][8]={
+@@ -601,6 +605,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
+ 	if(t) return t;
+     }
+ #endif
++#ifdef ARCH_AVR32
++    {
++      SwsFunc t= yuv2rgb_init_avr32(c);
++      if(t) return t;
++    }
++#endif
+ #ifdef HAVE_ALTIVEC
+     if (c->flags & SWS_CPU_CAPS_ALTIVEC)
+     {
+@@ -678,6 +688,10 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange,
+ //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
+     oy -= 256*brightness;
+ 
++#ifdef ARCH_AVR32
++    yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation);
++#endif
++
+     for (i = 0; i < 1024; i++) {
+ 	int j;
+ 
+diff --git a/libswscale/yuv2rgb_avr32.c b/libswscale/yuv2rgb_avr32.c
+new file mode 100644
+index 0000000..4a8341e
+--- /dev/null
++++ b/libswscale/yuv2rgb_avr32.c
+@@ -0,0 +1,416 @@
++/*
++ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials provided
++ * with the distribution.
++ *
++ * 3. The name of ATMEL may not be used to endorse or promote products
++ * derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
++ * DAMAGE.
++ */
++#include "pico-avr32.h"
++
++/* RGB(uv_part): set r, g, b (from the enclosing scope) to the per-chroma table entries for one U/V byte. uv_part is a string pasted into the indexed load to pick that byte (callers pass "t", "u", "l", "b"); also uses c, U and V from the enclosing scope. */
++#define RGB(uv_part)  \
++      __asm__ volatile (        \
++                        "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* tmp = c->table_gV[V] */ \
++                        "ld.w\t%1, %4[%8:" uv_part "  << 2]\n\t" /* g = c->table_gU[U] */  \
++                        "ld.w\t%2, %5[%8:" uv_part "  << 2]\n\t" /* b = c->table_bU[U] */  \
++                        "add\t%1, %0\n\t" /* g += tmp */\
++                        "ld.w\t%0, %6[%7:" uv_part "  << 2]" /* r = c->table_rV[V] */ \
++                        : "=&r" (r), "=&r" (g), "=&r" (b) \
++                        : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \
++                        "r" (&c->table_rV[0]), "r" (V), "r" (U));
++
++/* YUV2RGB1: map luma bytes src[2*idx] and src[2*idx+1] through the r/g/b tables and store two pixels in r,g,b byte order at dst[6*idx .. 6*idx+5]. Uses tmp, r, g, b from the enclosing scope; y is a scratch output. */
++#undef YUV2RGB1
++#define YUV2RGB1(dst, src, y, idx) \
++  { int tmp2;    __asm__ volatile (      \
++                        "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
++                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
++                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
++                        "st.b\t%7[6*%8 + 0], %1\n\t"         /* dst[6*idx+0] = tmp (r) */   \
++                        "st.b\t%7[6*%8 + 1], %2\n\t"         /* dst[6*idx+1] = tmp2 (g) */   \
++                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
++                        "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx+1] */ \
++                        "st.b\t%7[6*%8 + 2], %1\n\t"         /* dst[6*idx+2] = tmp (b) */   \
++                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
++                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
++                        "st.b\t%7[6*%8 + 3], %1\n\t"         /* dst[6*idx+3] = tmp (r) */   \
++                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
++                        "st.b\t%7[6*%8 + 4], %2\n\t"         /* dst[6*idx+4] = tmp2 (g) */   \
++                        "st.b\t%7[6*%8 + 5], %1"         /* dst[6*idx+5] = tmp (b) */   \
++                        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
++                        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
++/* YUV2RGB2: same instruction sequence as YUV2RGB1 (two pixels, r,g,b byte order at dst[6*idx .. 6*idx+5]). Uses tmp, r, g, b from the enclosing scope; y is a scratch output. */
++#undef YUV2RGB2
++#define YUV2RGB2(dst, src, y, idx) \
++  { int tmp2;    __asm__ volatile (      \
++                        "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
++                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
++                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
++                        "st.b\t%7[6*%8 + 0], %1\n\t"         /* dst[6*idx+0] = tmp (r) */   \
++                        "st.b\t%7[6*%8 + 1], %2\n\t"         /* dst[6*idx+1] = tmp2 (g) */   \
++                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
++                        "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx+1] */ \
++                        "st.b\t%7[6*%8 + 2], %1\n\t"         /* dst[6*idx+2] = tmp (b) */   \
++                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
++                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
++                        "st.b\t%7[6*%8 + 3], %1\n\t"         /* dst[6*idx+3] = tmp (r) */   \
++                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
++                        "st.b\t%7[6*%8 + 4], %2\n\t"         /* dst[6*idx+4] = tmp2 (g) */   \
++                        "st.b\t%7[6*%8 + 5], %1"         /* dst[6*idx+5] = tmp (b) */   \
++                        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
++                        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
++
++/* YUV2BGR1: map luma bytes src[2*idx] and src[2*idx+1] through the r/g/b tables and store two pixels in b,g,r byte order at dst[6*idx .. 6*idx+5]. Uses tmp, r, g, b from the enclosing scope; y is a scratch output. */
++#undef YUV2BGR1
++#define YUV2BGR1(dst, src, y, idx) \
++  { int tmp2;    __asm__ volatile (      \
++                        "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
++                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
++                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
++                        "st.b\t%7[6*%8 + 2], %1\n\t"         /* dst[6*idx+2] = tmp (r) */   \
++                        "st.b\t%7[6*%8 + 1], %2\n\t"         /* dst[6*idx+1] = tmp2 (g) */   \
++                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
++                        "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx+1] */ \
++                        "st.b\t%7[6*%8 + 0], %1\n\t"         /* dst[6*idx+0] = tmp (b) */   \
++                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
++                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
++                        "st.b\t%7[6*%8 + 5], %1\n\t"         /* dst[6*idx+5] = tmp (r) */   \
++                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
++                        "st.b\t%7[6*%8 + 4], %2\n\t"         /* dst[6*idx+4] = tmp2 (g) */   \
++                        "st.b\t%7[6*%8 + 3], %1"         /* dst[6*idx+3] = tmp (b) */   \
++                        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
++                        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
++/* YUV2BGR2: same instruction sequence as YUV2BGR1 (two pixels, b,g,r byte order at dst[6*idx .. 6*idx+5]). Uses tmp, r, g, b from the enclosing scope; y is a scratch output. */
++#undef YUV2BGR2
++#define YUV2BGR2(dst, src, y, idx) \
++  { int tmp2;    __asm__ volatile (      \
++                        "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \
++                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
++                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
++                        "st.b\t%7[6*%8 + 2], %1\n\t"         /* dst[6*idx+2] = tmp (r) */   \
++                        "st.b\t%7[6*%8 + 1], %2\n\t"         /* dst[6*idx+1] = tmp2 (g) */   \
++                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
++                        "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx+1] */ \
++                        "st.b\t%7[6*%8 + 0], %1\n\t"         /* dst[6*idx+0] = tmp (b) */   \
++                        "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \
++                        "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \
++                        "st.b\t%7[6*%8 + 5], %1\n\t"         /* dst[6*idx+5] = tmp (r) */   \
++                        "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \
++                        "st.b\t%7[6*%8 + 4], %2\n\t"         /* dst[6*idx+4] = tmp2 (g) */   \
++                        "st.b\t%7[6*%8 + 3], %1"         /* dst[6*idx+3] = tmp (b) */   \
++                        : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
++                        : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
++
++
++/* Convert a planar YUV slice (src[0..2], srcStride[]) to packed 24-bit BGR in dst[0], two rows and eight columns per step. Only dstW>>3 groups of 8 pixels are written per row; returns srcSliceH. The caller's srcStride[] is left unmodified. */
++int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, 
++                    int srcSliceH, uint8_t* dst[], int dstStride[]){
++  int y;
++  /* Work on local copies of the chroma strides: doubling srcStride[] in place would compound on every call that reuses the array. */
++  int strideU = srcStride[1], strideV = srcStride[2];
++  if(c->srcFormat == PIX_FMT_YUV422P){
++    strideU *= 2; /* chroma rows are indexed with y>>1 below, so step two chroma rows per luma row pair */
++    strideV *= 2;
++  }
++
++  for(y=0; y<srcSliceH; y+=2){
++    uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY  )*dstStride[0]);
++    uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
++    uint8_t *r, *g, *b; /* table pointers set by RGB(); indexed bytewise by the YUV2BGR macros */
++    uint8_t *py_1= src[0] + y*srcStride[0];
++    uint8_t *py_2= py_1 + srcStride[0];
++    uint8_t *pu= src[1] + (y>>1)*strideU;
++    uint8_t *pv= src[2] + (y>>1)*strideV;
++    unsigned int h_size= c->dstW>>3;
++    while (h_size--) {
++      uint32_t U, V, Y1, Y2, tmp;
++      U = ((uint32_t*)pu)[0];
++      V = ((uint32_t*)pv)[0];
++      /* Four chroma samples per word; each RGB() selects one of them and feeds two pixels on each of the two rows. */
++      RGB("t")
++        YUV2BGR1(dst_1, py_1, Y1, 0) 
++        YUV2BGR1(dst_2, py_2, Y2, 0) 
++
++      RGB("u")
++        YUV2BGR2(dst_1, py_1, Y1, 1) 
++        YUV2BGR2(dst_2, py_2, Y2, 1)
++
++      RGB("l")
++        YUV2BGR1(dst_1, py_1, Y1, 2) 
++        YUV2BGR1(dst_2, py_2, Y2, 2)
++
++      RGB("b")
++        YUV2BGR2(dst_1, py_1, Y1, 3) 
++        YUV2BGR2(dst_2, py_2, Y2, 3)
++
++      /* Advance by 4 chroma samples, 8 luma samples and 8 output pixels (24 bytes). */
++
++      pu += 4;
++      pv += 4;
++      py_1 += 8;
++      py_2 += 8;
++      dst_1 += 24;
++      dst_2 += 24;
++    }
++  }
++  return srcSliceH;
++}
++
++/* Same walk as yuv2bgr24_avr32 but using the YUV2RGB*() asm macros: two rows
++ * and eight pixels per step, 24 output bytes per row; returns srcSliceH. */
++static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, 
++                             int srcSliceH, uint8_t* dst[], int dstStride[]){
++  int y;
++  
++  if(c->srcFormat == PIX_FMT_YUV422P){
++    srcStride[1] *= 2;  /* writes through to the caller's array */
++    srcStride[2] *= 2;  /* doubled so (y>>1)*stride below is y times the incoming stride */
++  }
++  for(y=0; y<srcSliceH; y+=2){  /* rows y and y+1 share one chroma row */
++    uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY  )*dstStride[0]);
++    uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
++    uint8_t *r, *g, *b;  /* not assigned in this function; presumably set inside RGB() - TODO confirm */
++    uint8_t *py_1= src[0] + y*srcStride[0];
++    uint8_t *py_2= py_1 + srcStride[0];
++    uint8_t *pu= src[1] + (y>>1)*srcStride[1];
++    uint8_t *pv= src[2] + (y>>1)*srcStride[2];
++    unsigned int h_size= c->dstW>>3;  /* groups of 8 pixels; a dstW%8 tail is not converted */
++    while (h_size--) {
++      uint32_t U, V, Y1, Y2, tmp;
++      U = ((uint32_t*)pu)[0];  /* 4 U bytes fetched as one word */
++      V = ((uint32_t*)pv)[0];  /* 4 V bytes fetched as one word */
++      
++      RGB("t")  /* macro defined outside this view; the letter presumably selects one byte of U/V */
++        YUV2RGB1(dst_1, py_1, Y1, 0) 
++        YUV2RGB1(dst_2, py_2, Y2, 0) 
++
++      RGB("u")
++        YUV2RGB2(dst_1, py_1, Y1, 1) 
++        YUV2RGB2(dst_2, py_2, Y2, 1)
++
++      RGB("l")
++        YUV2RGB1(dst_1, py_1, Y1, 2) 
++        YUV2RGB1(dst_2, py_2, Y2, 2)
++
++      RGB("b")
++        YUV2RGB2(dst_1, py_1, Y1, 3) 
++        YUV2RGB2(dst_2, py_2, Y2, 3)
++ 
++      pu += 4;      /* 4 chroma samples consumed ... */
++      pv += 4;
++      py_1 += 8;    /* ... for 8 luma samples on each of the two rows ... */
++      py_2 += 8;
++      dst_1 += 24;  /* ... producing 8 pixels * 3 bytes per row */
++      dst_2 += 24;
++    }
++  }
++  return srcSliceH;
++}
++
++#define SCALE(x, bits) (((x) + ( 1 << ((bits) - 1))) >> (bits)) /* right shift by bits, rounding to nearest; bits is now parenthesised */
++#define COEFF_FRAC_BITS  9  /* fractional bits of the PICO coefficients (also written to PICO_CONFIG) */
++#define OFFSET_FRAC_BITS  2 /* fractional bits of the PICO offsets (also written to PICO_CONFIG) */
++
++/* Coefficients used in the pico */
++static struct {
++  short coeff2_2;  /* NOTE: this struct is handed as-is to PICO_LDCM_D(); keep the field order */
++  short coeff2_3;  /* coeffN_3 = offset term of output row N */
++  short coeff2_0;  /* coeffN_0 = weight of Y */
++  short coeff2_1;  /* coeffN_1 = weight of U */
++  short coeff1_2;  /* coeffN_2 = weight of V */
++  short coeff1_3;
++  short coeff1_0;  /* row 1 is always the G channel */
++  short coeff1_1;
++  short coeff0_2;
++  short coeff0_3;
++  short coeff0_0;  /* rows 0 and 2 hold R and B, swapped depending on the RGB24/BGR24 target */
++  short coeff0_1;
++} pico_coeff;
++
++/* YUV -> packed 24-bit conversion driven through the PICO coprocessor macros; returns srcSliceH. */
++static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, 
++                                int srcSliceH, uint8_t* dst[], int dstStride[]){
++  int y;
++  static int first_time = 1;  /* NOTE(review): never read in this function */
++
++  /* Initialize pico */
++  PICO_LDCM_D(&pico_coeff, 
++              PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, 
++              PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
++              PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B);              
++  
++  PICO_PUT_W(PICO_CONFIG, 
++             (PICO_PACKED_MODE << PICO_OUTPUT_MODE 
++              | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE 
++              | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS
++              | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS));
++  /* The coefficient load and config write above are repeated on every call. */
++
++  if(c->srcFormat == PIX_FMT_YUV422P){
++    srcStride[1] *= 2;  /* writes through to the caller's array */
++    srcStride[2] *= 2;  /* doubled so (y>>1)*stride below is y times the incoming stride */
++  }
++
++  for(y=0; y<srcSliceH; y+=2){  /* rows y and y+1 share one chroma row */
++    uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY  )*dstStride[0]);
++    uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
++    uint8_t *r, *g, *b;  /* NOTE(review): not referenced in the visible code of this function */
++    uint8_t *py_1= src[0] + y*srcStride[0];
++    uint8_t *py_2= py_1 + srcStride[0];
++    uint8_t *pu= src[1] + (y>>1)*srcStride[1];
++    uint8_t *pv= src[2] + (y>>1)*srcStride[2];
++    unsigned int h_size= c->dstW>>3;  /* groups of 8 pixels; a dstW%8 tail is not converted */
++    int *py_1_int = (int *)py_1;  /* word-wide views: each load moves 4 samples */
++    int *py_2_int = (int *)py_2;
++    int *pu_int = (int *)pu;
++    int *pv_int = (int *)pv;
++    while (h_size--) {
++      PICO_PUT_W(PICO_INPIX0, *py_1_int++);  /* row 1, luma 0..3 */
++      PICO_PUT_W(PICO_INPIX1, *pu_int++);    /* 4 U samples, used for all 8 pixels of both rows */
++      PICO_PUT_W(PICO_INPIX2, *pv_int++);    /* 4 V samples, used for all 8 pixels of both rows */
++      PICO_OP(0, 0, 0, 4, 8);  /* NOTE(review): operand meaning is defined outside this view; */
++      PICO_OP(0, 1, 1, 4, 8);  /* the last two arguments presumably index the U and V bytes, */
++      PICO_OP(0, 2, 2, 5, 9);  /* each reused for two neighbouring luma samples - TODO confirm */
++      PICO_OP(0, 3, 3, 5, 9);
++      PICO_PUT_W(PICO_INPIX0, *py_1_int++);  /* row 1, luma 4..7, loaded before the store below */
++      PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
++      PICO_OP(0, 0, 0, 6, 10);
++      PICO_OP(0, 1, 1, 6, 10);
++      PICO_OP(0, 2, 2, 7, 11);
++      PICO_OP(0, 3, 3, 7, 11);
++      PICO_PUT_W(PICO_INPIX0, *py_2_int++);  /* row 2, luma 0..3 */
++      PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
++      
++      PICO_OP(0, 0, 0, 4, 8);  /* second row goes through the same U/V words again */
++      PICO_OP(0, 1, 1, 4, 8);
++      PICO_OP(0, 2, 2, 5, 9);
++      PICO_OP(0, 3, 3, 5, 9);
++      PICO_PUT_W(PICO_INPIX0, *py_2_int++);  /* row 2, luma 4..7 */
++      PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
++      PICO_OP(0, 0, 0, 6, 10);
++      PICO_OP(0, 1, 1, 6, 10);
++      PICO_OP(0, 2, 2, 7, 11);
++      PICO_OP(0, 3, 3, 7, 11);
++      PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
++
++      dst_1 += 24;  /* 8 pixels * 3 bytes written per row and iteration */
++      dst_2 += 24;
++    }
++  }
++  return srcSliceH;
++}
++
++extern int avr32_use_pico;
++
++SwsFunc yuv2rgb_init_avr32 (SwsContext *c){
++  /* Choose the AVR32 converter for the 24-bit destination format of c;
++   * yields NULL for every destination format other than BGR24/RGB24.
++   * With avr32_use_pico set, both formats use yuv2bgr24_avr32_pico
++   * (channel order then presumably comes from pico_coeff - TODO confirm). */
++  const int use_pico = avr32_use_pico;
++
++  if (c->dstFormat == PIX_FMT_BGR24) {
++    if (!use_pico) {
++      MSG_ERR("AVR32 BGR24: Using optimized color space conversion\n");
++      return yuv2bgr24_avr32;
++    }
++    MSG_ERR("AVR32 BGR24: Using PICO for color space conversion\n");
++    return yuv2bgr24_avr32_pico;
++  }
++
++  if (c->dstFormat == PIX_FMT_RGB24) {
++    if (!use_pico) {
++      MSG_ERR("AVR32 RGB24: Using optimized color space conversion\n");
++      return yuv2rgb24_avr32;
++    }
++    MSG_ERR("AVR32 RGB24: Using PICO for color space conversion\n");
++    return yuv2bgr24_avr32_pico;
++  }
++  return NULL;
++}
++
++/* Fill pico_coeff with the fixed-point YUV->RGB weights and offsets derived from inv_table; returns 0. */
++int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){
++  const int isRgb = (c->dstFormat == PIX_FMT_RGB24);  /* selects which coefficient rows get R and B */
++ 
++  int64_t crv =  inv_table[0];
++  int64_t cbu =  inv_table[1];
++  int64_t cgu = -inv_table[2];
++  int64_t cgv = -inv_table[3];
++  int64_t cy  = 1<<16;  /* luma gain, 16.16 fixed point */
++  int64_t oy  = 0;
++  
++  if(!fullRange){
++    cy= (cy*255) / 219;
++    oy= 16<<16;
++  }
++  
++  cy = (cy *contrast             )>>16;
++  crv= (crv*contrast * saturation)>>32;
++  cbu= (cbu*contrast * saturation)>>32;
++  cgu= (cgu*contrast * saturation)>>32;
++  cgv= (cgv*contrast * saturation)>>32;
++
++  oy -= 256*brightness;  /* NOTE(review): oy is computed but not used by any coefficient below */
++  
++  pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* G <- Y */
++  pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */
++  pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */ 
++  pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS)
++                         + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */ 
++  
++  if ( isRgb ){
++    pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
++    pico_coeff.coeff0_1 = 0; /* R <- U */
++    pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */ 
++    pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
++                           + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */ 
++    
++    pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
++    pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
++    pico_coeff.coeff2_2 = 0; /* B <- V */ 
++    pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS) 
++                           + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */       
++  } else {
++    pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
++    pico_coeff.coeff2_1 = 0; /* R <- U */
++    pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */ 
++    pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
++                           + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */ 
++    
++    pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
++    pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
++    pico_coeff.coeff0_2 = 0; /* B <- V */ 
++    pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
++                           + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */       
++  }
++  return 0; /* the function is declared int but previously fell off the end */
++}
++
++
++#undef RGB
+diff --git a/libvo/vo_fbdev2.c b/libvo/vo_fbdev2.c
+index 053c193..7017770 100644
+--- a/libvo/vo_fbdev2.c
++++ b/libvo/vo_fbdev2.c
+@@ -22,6 +22,9 @@
+ #include "sub.h"
+ #include "mp_msg.h"
+ 
++/* Draw directly to framebuffer */
++#define USE_CONVERT2FB
++
+ static vo_info_t info = {
+ 	"Framebuffer Device",
+ 	"fbdev2",
+@@ -178,6 +181,15 @@ static int fb_preinit(int reset)
+ 	}
+ 	fb_orig_vinfo = fb_vinfo;
+ 
++	/* Reset panning offset */
++	fb_vinfo.yoffset = 0;
++	if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
++		mp_msg(MSGT_VO, MSGL_ERR,
++		       "[fbdev2] FBIOPAN_DISPLAY failed: %s\n",
++		       strerror(errno));
++		return 0;
++	}
++
+ 	fb_bpp = fb_vinfo.bits_per_pixel;
+ 
+ 	/* 16 and 15 bpp is reported as 16 bpp */
+@@ -289,6 +301,10 @@ static int config(uint32_t width, uint32_t height, uint32_t d_width,
+ 		mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno));
+ 		return 1;
+ 	}
++#else
++	if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2)
++	    && fb_vinfo.yoffset == 0)
++		center += fb_line_len * fb_vinfo.yres;
+ #endif
+ 	if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres);
+ 
+@@ -299,14 +315,22 @@ static int query_format(uint32_t format)
+ {
+ 	// open the device, etc.
+ 	if (fb_preinit(0)) return 0;
+-	if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) {
++	if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) {
+ 		int fb_target_bpp = format & 0xff;
+ 		set_bpp(&fb_vinfo, fb_target_bpp);
+ 		fb_vinfo.xres_virtual = fb_vinfo.xres;
+-		fb_vinfo.yres_virtual = fb_vinfo.yres;
++		fb_vinfo.yres_virtual = fb_vinfo.yres * 2;
+ 		if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
+-			mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno));
+-			return 0;
++			mp_msg(MSGT_VO, MSGL_WARN,
++			       "[fbdev2] Can't double virtual y resolution: %s\n",
++			       strerror(errno));
++			fb_vinfo.yres_virtual = fb_vinfo.yres; /* fall back to a single buffer */
++			if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
++				mp_msg(MSGT_VO, MSGL_ERR,
++				       "[fbdev2] Can't put VSCREENINFO: %s\n",
++				       strerror(errno));
++				return 0; /* 0 = format not supported, as in the code this replaces; -1 would read as a capability mask */
++			}
+ 		}
+ 		fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
+ 		fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length +
+@@ -367,16 +391,67 @@ static void check_events(void)
+ 
+ static void flip_page(void)
+ {
+-#ifndef USE_CONVERT2FB
+ 	int i, out_offset = 0, in_offset = 0;
+ 
+-	for (i = 0; i < in_height; i++) {
+-		memcpy(center + out_offset, next_frame + in_offset,
+-				in_width * fb_pixel_size);
+-		out_offset += fb_line_len;
+-		in_offset += in_width * fb_pixel_size;
+-	}
++#ifndef USE_CONVERT2FB
++	if (1) {
++#else
++	if (fb_vinfo.yres_virtual == fb_vinfo.yres) {
+ #endif
++		for (i = 0; i < in_height; i++) {
++			memcpy(center + out_offset, next_frame + in_offset,
++			       in_width * fb_pixel_size);
++			out_offset += fb_line_len;
++			in_offset += in_width * fb_pixel_size;
++		}
++	} else {
++		if (fb_vinfo.yoffset == 0) {
++			fb_vinfo.yoffset += fb_vinfo.yres;
++			center -= fb_line_len * fb_vinfo.yres;
++		} else {
++			fb_vinfo.yoffset = 0;
++			center += fb_line_len * fb_vinfo.yres;
++		}
++
++		if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
++			mp_msg(MSGT_VO, MSGL_ERR,
++			       "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n",
++			       strerror(errno));
++		}
++	}
++}
++/* Direct rendering: offer the buffer at 'center' to the decoder; VO_TRUE if accepted, else VO_FALSE. */
++static uint32_t get_image(mp_image_t *mpi)
++{
++	if(mpi->flags&MP_IMGFLAG_READABLE)
++		return VO_FALSE; // refuse buffers the decoder reads back: slow video ram
++	if(mpi->type==MP_IMGTYPE_STATIC)
++		return VO_FALSE; // our buffer cannot serve as a static image
++  
++	if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) {
++		// the codec accepts our stride/width, so it can write straight into 'center'
++
++		// only plane 0 is set up (YUY2 and RGB formats)
++		mpi->planes[0] = center;
++		mpi->width = in_width;
++		mpi->stride[0] = fb_line_len;
++
++		// NOTE(review): no centering is done here; 'center' is presumably already offset - confirm
++
++		mpi->flags |= MP_IMGFLAG_DIRECT;
++
++		return VO_TRUE;
++	}
++
++	return VO_FALSE;
++}
++
++static uint32_t put_image(mp_image_t *mpi)
++{
++	/* Frames rendered directly into our buffer, or delivered through
++	 * draw callbacks, are already out; anything else is not handled. */
++	const int done_mask = MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK;
++	return (mpi->flags & done_mask) ? VO_TRUE : VO_FALSE;
+ }
+ 
+ static void uninit(void)
+@@ -403,6 +478,10 @@ static int control(uint32_t request, void *data, ...)
+   switch (request) {
+   case VOCTRL_QUERY_FORMAT:
+     return query_format(*((uint32_t*)data));
++  case VOCTRL_GET_IMAGE:
++    return get_image(data);
++  case VOCTRL_DRAW_IMAGE:
++    return put_image(data);
+   }
+   return VO_NOTIMPL;
+ }
+diff --git a/version.sh b/version.sh
+index 44b5c5d..cf22a68 100755
+--- a/version.sh
++++ b/version.sh
+@@ -1,2 +1,2 @@
+ #!/bin/sh
+-echo "#define VERSION \"1.0rc1-$1\"" > version.h
++echo "#define VERSION \"1.0rc1.atmel.2-$1\"" > version.h
diff --git a/recipes/mplayer/files/mplayer-imageon-svn.patch b/recipes/mplayer/files/mplayer-imageon-svn.patch
new file mode 100644
index 0000000000..744a520b13
--- /dev/null
+++ b/recipes/mplayer/files/mplayer-imageon-svn.patch
@@ -0,0 +1,367 @@
+
+#
+# Patch managed by http://www.holgerschurig.de/patcher.html
+#
+
+Index: trunk/configure
+===================================================================
+--- trunk.orig/configure	2007-10-07 20:31:56.000000000 +0100
++++ trunk/configure	2007-10-07 20:34:38.000000000 +0100
+@@ -545,6 +545,7 @@
+ _vesa=auto
+ _fbdev=auto
+ _w100=no
++_imageon=no
+ _dvb=auto
+ _dvbhead=auto
+ _dxr2=auto
+@@ -860,6 +861,8 @@
+   --disable-fbdev)	_fbdev=no	;;
+   --enable-w100)        _w100=yes       ;;
+   --disable-w100)       _w100=no        ;;
++  --enable-imageon)     _imageon=yes    ;;
++  --disable-imageon)    _imageon=no     ;;
+   --enable-dvb)		_dvb=yes	;;
+   --disable-dvb)        _dvb=no		;;
+   --enable-dvbhead)	_dvbhead=yes	;;
+@@ -4447,6 +4450,19 @@
+ fi
+ echores "$_w100"
+ 
++echocheck "ATI Imageon 100 (imageon)"
++if test "$_imageon" = yes ; then
++  _def_imageon='#define HAVE_IMAGEON 1'
++  _ld_imageon='-lw100'
++  _libs_mplayer="$_libs_mplayer $_ld_imageon"
++  _vosrc="$_vosrc vo_imageon.c"
++  _vomodules="imageon $_vomodules"
++else
++  _def_imageon='#undef HAVE_IMAGEON'
++  _novomodules="imageon $_novomodules"
++fi
++echores "$_imageon"
++
+ 
+ echocheck "DVB"
+ if test "$_dvb" = auto ; then
+@@ -8453,6 +8469,7 @@
+ $_def_xmga
+ $_def_fbdev
+ $_def_w100
++$_def_imageon
+ $_def_dxr2
+ $_def_dxr3
+ $_def_ivtv
+Index: trunk/libvo/vo_imageon.c
+===================================================================
+--- /dev/null	1970-01-01 00:00:00.000000000 +0000
++++ trunk/libvo/vo_imageon.c	2007-10-07 20:31:57.000000000 +0100
+@@ -0,0 +1,308 @@
++#include <stdio.h>
++#include <stdlib.h>
++#include <inttypes.h>
++#include <string.h>
++
++#include <mp_msg.h>
++#include <video_out.h>
++#include <video_out_internal.h>
++
++static vo_info_t info = 
++{
++	"ATI IMAGEON 100 driver",
++	"imageon",
++	"Manuel Teira",
++	"C760-Openzaurus Testing version"
++};
++
++LIBVO_EXTERN(imageon)
++
++#include <acapi.h>
++
++static struct w100privdata_t {
++	uint8_t config;		/* 1 once config() has allocated the surfaces, 0 after preinit() */
++	ac_device_t *dev;	/* handle returned by ac_init() */
++	uint16_t xres;		/* screen size passed to ac_init() */
++	uint16_t yres;
++	uint16_t ovwidth;	/* size of the overlay surface */
++	uint16_t ovheight;
++	ac_surface_t insurface;	/* source-sized surface filled by draw_slice() */
++	ac_surface_t ovsurface;	/* overlay surface that flip_page() scale-blits into */
++	uint16_t srcwidth;	/* source size as given to config() */
++	uint16_t srcheight;
++	uint8_t rotate;		/* transform applied in flip_page(); config() sets AC_ROT90 */
++	uint8_t scale;		/* larger of the two values returned by ac_get_scaler() */
++	ac_point_t ovdst;	/* overlay position handed to ac_overlay_setpos() */
++	ac_point_t dstpos;	/* blit destination inside the overlay surface */
++	ac_overlayprops_t ovprops;
++	uint32_t format;	/* NOTE(review): not written or read in the visible code */
++} w100_privdata;
++
++static int preinit(const char *arg)
++{
++	/* Initialise libw100 through ac_init() with a hard-coded 640x480 size and
++	 * AC_ROT90 (the framebuffer is not queried yet); 0 on success, 1 on failure. */
++	struct w100privdata_t *pdata = &w100_privdata;
++	pdata->config = 0;
++	pdata->xres = 640;
++	pdata->yres = 480;
++	pdata->dev = ac_init(pdata->xres, pdata->yres, AC_ROT90);
++	if (pdata->dev) {
++		return 0;
++	} else {
++		mp_msg(MSGT_VO, MSGL_FATAL, "vo_imageon: ac_init failed\n");
++		return 1;
++	}
++}
++
++
++static void draw_osd(void)
++{	/* intentionally empty: this driver draws no OSD */
++}
++
++void check_events(void)	/* no events are handled; only logs the call at verbose level */
++{
++	mp_msg(MSGT_VO, MSGL_V, "check_events got called\n");
++}
++/* (Re)allocate the input and overlay surfaces for a new source size and enable the overlay; 0 on success, -1 on failure. */
++static int config(uint32_t srcwidth, uint32_t srcheight, 
++		  uint32_t dstwidth, uint32_t dstheight, 
++		  uint32_t flags, char *title, uint32_t format)
++{
++	struct w100privdata_t *pdata = &w100_privdata;
++	uint8_t xscale, yscale;
++	uint16_t scaledwidth, scaledheight;
++	
++	mp_msg(MSGT_VO, MSGL_V,
++	       "vo_imageon: srcwidth:%d, srcheight:%d, "
++	       "dstwidth:%d, dstheight:%d\n",
++	       srcwidth, srcheight, dstwidth, dstheight);
++	
++	if (pdata->config) {
++		ac_overlay_disable(pdata->dev);
++		ac_free_surface(pdata->dev, &pdata->insurface);
++		ac_free_surface(pdata->dev, &pdata->ovsurface);
++		pdata->config = 0;	/* surfaces are gone: a failure below must not lead to a second free */
++	}
++	
++	pdata->srcwidth = srcwidth;
++	pdata->srcheight = srcheight;
++
++	//For the moment, only YUV420 is supported
++	pdata->ovprops.format = OVLFORMAT_YUV420;
++	pdata->ovprops.portrait_mode = 0;
++	pdata->ovprops.inv_hor = 0;
++	pdata->ovprops.inv_ver = 0;
++	pdata->ovprops.yuv2rgb = 0;
++
++	pdata->rotate = AC_ROT90;
++	
++	if (flags & VOFLAG_FULLSCREEN) {
++		pdata->ovwidth = 240;
++		pdata->ovheight = 320;
++		xscale = ac_get_scaler(pdata->ovheight, srcwidth);
++		yscale = ac_get_scaler(pdata->ovwidth, srcheight);
++		pdata->ovdst.x = 0;
++		pdata->ovdst.y = 0;
++		pdata->ovprops.video_hor_exp = 1;
++		pdata->ovprops.video_ver_exp = 1;
++        } else {
++		pdata->ovwidth = (dstheight + 0xf) & ~0xf;	/* rounded up to a multiple of 16 */
++		pdata->ovheight = (dstwidth + 0xf) & ~0xf;
++		xscale = ac_get_scaler(dstwidth, srcwidth);
++		yscale = ac_get_scaler(dstheight, srcheight);
++		pdata->ovdst.x = (pdata->xres - pdata->ovheight) / 2;
++		pdata->ovdst.y = (pdata->yres - pdata->ovwidth) / 2;
++		pdata->ovprops.video_hor_exp = 0;
++		pdata->ovprops.video_ver_exp = 0;
++        }
++
++	pdata->scale = (xscale > yscale) ? xscale : yscale;	/* one scaler for both axes */
++	scaledwidth = ac_apply_scaler(srcwidth, pdata->scale);
++	scaledheight = ac_apply_scaler(srcheight, pdata->scale);
++	pdata->dstpos.x = (pdata->ovwidth - scaledheight) / 2; 
++	pdata->dstpos.y = (pdata->ovheight - scaledwidth) / 2;
++
++	if (ac_alloc_surface(pdata->dev, &pdata->ovsurface,
++			     FMT_YUV420, 
++			     pdata->ovwidth, 
++			     pdata->ovheight,
++			     AC_MEM_INTERNAL) == NULL) {
++		mp_msg(MSGT_VO, MSGL_FATAL, "Unable to allocate ov surface\n");
++		return -1;
++	}
++
++	if (ac_alloc_surface(pdata->dev, &pdata->insurface,
++			     FMT_YUV420, srcwidth, srcheight, 
++			     AC_MEM_INTERNAL) == NULL) {
++		mp_msg(MSGT_VO, MSGL_WARN,
++		       "No room in internal memory for insurface\n");
++		if (ac_alloc_surface(pdata->dev, &pdata->insurface,
++				     FMT_YUV420, srcwidth, srcheight,
++				     AC_MEM_EXTERNAL) == NULL) {
++			mp_msg(MSGT_VO, MSGL_FATAL, 
++			       "Unable to allocate surface\n");
++			ac_free_surface(pdata->dev, &pdata->ovsurface);
++			return -1;
++		}
++	}
++	
++	ac_clear_surface(pdata->dev, &pdata->ovsurface);
++	ac_clear_surface(pdata->dev, &pdata->insurface);
++
++
++	mp_msg(MSGT_VO, MSGL_V, 
++	       "vo_imageon: rotate:%d scale:%d ovwidth:%d, ovheight:%d, "
++	       "ovdst(x:%d, y:%d) dstpos(x:%d,y:%d)\n",
++	       pdata->rotate,
++	       pdata->scale,
++	       pdata->ovwidth,
++	       pdata->ovheight,
++	       pdata->ovdst.x,
++	       pdata->ovdst.y,
++	       pdata->dstpos.x,
++	       pdata->dstpos.y);
++
++	ac_overlay_setup(pdata->dev, &pdata->ovsurface, &pdata->ovsurface.rect,
++			 &pdata->ovprops, 0);
++	ac_overlay_setpos(pdata->dev, &pdata->ovdst);
++	ac_overlay_enable(pdata->dev);
++
++	pdata->config = 1;
++	return 0;
++}
++/* Upload one slice of every plane from host memory into the input surface; always returns 0. */
++static int draw_slice(uint8_t *image[], int stride[], 
++		      int w, int h, int x, int y)
++{
++	struct w100privdata_t *pdata = &w100_privdata;
++	ac_rect_t dstrect;
++	ac_surface_t *dstsurface = &pdata->insurface;	/* NOTE(review): unused; the call below takes &pdata->insurface */
++	int plane;
++	mp_msg(MSGT_VO, MSGL_V, 
++	       "vo_imageon: draw_slice(w:%d,h:%d,x:%d,y:%d)\n",
++	       w, h, x, y);
++        
++	ac_reset_ctx(pdata->dev);
++	for (plane = 0; plane <= V_PLANE; plane++) {
++		mp_msg(MSGT_VO, MSGL_V, "Plane: %d, Stride: %d\n",
++		       plane, stride[plane]);	/* stride is only logged, never passed on */
++		dstrect.x = x;	/* the same rectangle is used for every plane; */
++		dstrect.y = y;	/* presumably ac_host2planerect() applies the */
++		dstrect.w = w;	/* chroma subsampling itself - TODO confirm */
++		dstrect.h = h;
++		ac_host2planerect(pdata->dev,
++				  image[plane],
++				  &dstrect, 
++				  &pdata->insurface,
++				  plane);
++	}
++	return 0;
++}
++
++static int draw_frame(uint8_t *frame[])
++{
++	/* Whole-frame output is not implemented; image data is taken through draw_slice(). */
++	mp_msg(MSGT_VO, MSGL_V, "vo_imageon: draw_frame() not implemented!\n");
++	return -1;	/* non-zero = failure; previously fell off the end of an int function */
++}
++/* Scale-blit every plane of the input surface into the overlay surface, applying the configured rotation. */
++static void flip_page(void)
++{
++	struct w100privdata_t *pdata = &w100_privdata;
++	int plane;
++	ac_rect_t srcrect;
++	ac_point_t dstpoint;
++	ac_surface_t *insurface = &pdata->insurface;
++	ac_surface_t *ovsurface = &pdata->ovsurface;
++	ac_surfspec_t *surfspec = &ac_surfspecs[ovsurface->format];	/* plane count and subsampling of the overlay format */
++
++	mp_msg(MSGT_VO, MSGL_V, "vo_imageon: flip_page\n");
++
++	srcrect.x = 0;	/* always blit from the top-left corner of the source */
++	srcrect.y = 0;
++
++	ac_reset_ctx(pdata->dev);
++	if (pdata->rotate != AC_ROT0) {
++		ac_set_xform(pdata->dev, pdata->rotate, AC_NOMIRROR);
++	}
++	ac_disable_dbuf_update(pdata->dev);	/* re-enabled after all planes are blitted */
++	ac_waitidle(pdata->dev);
++	for (plane = Y_PLANE; plane < surfspec->nplanes; plane++) {
++		ac_setsrcplane(pdata->dev, insurface, plane);
++		ac_setdstplane(pdata->dev, ovsurface, plane);
++		ac_prepare_scaleblt(pdata->dev, DP_DST_8BPP);
++		srcrect.w = pdata->srcwidth / surfspec->planes[plane].xsubsampling;	/* per-plane size and position */
++		srcrect.h = pdata->srcheight / surfspec->planes[plane].ysubsampling;
++		dstpoint.x = pdata->dstpos.x / surfspec->planes[plane].xsubsampling;
++		dstpoint.y = pdata->dstpos.y / surfspec->planes[plane].ysubsampling;
++		mp_msg(MSGT_VO, MSGL_V, 
++		       "vo_imageon: scaleblt src(x:%d,y:%d,w:%d,h:%d)"
++		       "dst(%d,%d)"
++		       "scale(%d)\n",
++		       srcrect.x, srcrect.y,
++		       srcrect.w, srcrect.h,
++		       dstpoint.x, dstpoint.y,
++		       pdata->scale);
++		ac_scaleblt(pdata->dev, &srcrect, &dstpoint,
++			    pdata->scale, pdata->scale);	/* same scaler on both axes */
++	}
++	ac_enable_dbuf_update(pdata->dev);
++}
++
++static void uninit(void)
++{
++	struct w100privdata_t *pdata = &w100_privdata;
++	ac_overlay_disable(pdata->dev);	/* NOTE(review): insurface/ovsurface are not freed here; */
++	ac_finish(pdata->dev);		/* presumably ac_finish() releases them - confirm */
++}
++
++static int control(uint32_t request, void *data, ...)
++{
++	struct w100privdata_t *pdata = &w100_privdata;	/* NOTE(review): unused in this function */
++	switch (request) {
++	case VOCTRL_QUERY_FORMAT:
++		return query_format(*((uint32_t *)data));
++	case VOCTRL_FULLSCREEN:
++		mp_msg(MSGT_VO, MSGL_V, "vo_imageon: Asked for fullscreen\n");	/* only logged; falls out to VO_NOTIMPL */
++	}
++	return VO_NOTIMPL;	/* every request other than QUERY_FORMAT ends here */
++}
++
++static int query_format(uint32_t format)	/* returns VFCAP_* flags for planar YUV formats, 0 otherwise */
++{
++	mp_msg(MSGT_VO, MSGL_V, 
++	       "vo_imageon: query_format was called: %x (%s)\n",
++	       format, vo_format_name(format));
++
++	if (IMGFMT_IS_RGB(format)) {
++		return 0;	/* every RGB format is rejected */
++		/* NOTE(review): the rest of this branch is unreachable after the return above */
++		switch (IMGFMT_RGB_DEPTH(format)) {
++		case 16:
++			return VFCAP_CSP_SUPPORTED | VFCAP_CSP_SUPPORTED_BY_HW |
++				VFCAP_HWSCALE_UP | VFCAP_HWSCALE_DOWN | 
++				VFCAP_OSD | VFCAP_ACCEPT_STRIDE;
++			break;
++		}
++	} else {
++		/* Planar YUV Formats */
++		switch (format) {
++		case IMGFMT_YV12:
++		case IMGFMT_IYUV:
++		case IMGFMT_I420:
++		case IMGFMT_YVU9:	/* NOTE(review): config() always allocates FMT_YUV420 surfaces and */
++		case IMGFMT_IF09:	/* draw_slice() always uploads planes 0..V_PLANE; confirm that the */
++		case IMGFMT_Y8:		/* formats from YVU9 to Y800 really work before advertising them */
++		case IMGFMT_Y800:
++			return VFCAP_CSP_SUPPORTED | VFCAP_CSP_SUPPORTED_BY_HW |
++				VFCAP_HWSCALE_UP | VFCAP_HWSCALE_DOWN | 
++				VFCAP_OSD | VFCAP_ACCEPT_STRIDE;
++			break;
++		}
++	}
++
++	return 0;	/* anything not matched above is unsupported */
++}
++
++
diff --git a/recipes/mplayer/files/omapfb.patch b/recipes/mplayer/files/omapfb.patch
new file mode 100644
index 0000000000..860cf070f4
--- /dev/null
+++ b/recipes/mplayer/files/omapfb.patch
@@ -0,0 +1,10 @@
+--- /tmp/video_out.c	2009-01-14 16:39:38.000000000 +0100
++++ trunk/libvo/video_out.c	2009-01-14 16:40:11.000000000 +0100
+@@ -86,6 +86,7 @@
+ extern vo_functions_t video_out_bl;
+ extern vo_functions_t video_out_fbdev;
+ extern vo_functions_t video_out_fbdev2;
++extern vo_functions_t video_out_omapfb;
+ extern vo_functions_t video_out_svga;
+ extern vo_functions_t video_out_png;
+ extern vo_functions_t video_out_ggi;
diff --git a/recipes/mplayer/files/pld-onlyarm5-svn.patch b/recipes/mplayer/files/pld-onlyarm5-svn.patch
new file mode 100644
index 0000000000..0924060c6c
--- /dev/null
+++ b/recipes/mplayer/files/pld-onlyarm5-svn.patch
@@ -0,0 +1,405 @@
+--- MPlayer-1.0pre8/libavcodec/arm/dsputil_arm_s.S.orig	2006-07-03 09:53:33.000000000 +0100
++++ MPlayer-1.0pre8/libavcodec/arm/dsputil_arm_s.S	2006-07-03 10:06:58.000000000 +0100
+@@ -16,6 +16,13 @@
+ @ License along with this library; if not, write to the Free Software
+ @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ @
++#if defined(__ARM_ARCH_5TE__) || \
++    defined(__ARM_ARCH_5TEJ__)
++/* pld belongs to the v5TE E extension: plain v5/v5T assemblers reject it */
++#define PLD(code...)   code
++#else
++#define PLD(code...)
++#endif
+ 
+ .macro  ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
+         mov \Rd0, \Rn0, lsr #(\shift * 8)
+@@ -74,7 +81,7 @@
+ put_pixels16_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11, lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -85,7 +92,7 @@
+         ldmia r1, {r4-r7}
+         add r1, r1, r2
+         stmia r0, {r4-r7}
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         add r0, r0, r2
+         bne 1b
+@@ -95,7 +102,7 @@
+         ldmia r1, {r4-r8}
+         add r1, r1, r2
+         ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r9-r12}
+         add r0, r0, r2
+@@ -106,7 +113,7 @@
+         ldmia r1, {r4-r8}
+         add r1, r1, r2
+         ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r9-r12}
+         add r0, r0, r2
+@@ -117,7 +124,7 @@
+         ldmia r1, {r4-r8}
+         add r1, r1, r2
+         ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r9-r12}
+         add r0, r0, r2
+@@ -136,7 +143,7 @@
+ put_pixels8_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r5,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -147,7 +154,7 @@
+         ldmia r1, {r4-r5}
+         add r1, r1, r2
+         subs r3, r3, #1
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmia r0, {r4-r5}
+         add r0, r0, r2
+         bne 1b
+@@ -157,7 +164,7 @@
+         ldmia r1, {r4-r5, r12}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+         add r0, r0, r2
+@@ -168,7 +175,7 @@
+         ldmia r1, {r4-r5, r12}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+         add r0, r0, r2
+@@ -179,7 +186,7 @@
+         ldmia r1, {r4-r5, r12}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+         add r0, r0, r2
+@@ -198,7 +205,7 @@
+ put_pixels8_x2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r10,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -210,7 +217,7 @@
+         ldmia r1, {r4-r5, r10}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -223,7 +230,7 @@
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+         ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+@@ -236,7 +243,7 @@
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
+         ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+@@ -248,7 +255,7 @@
+         ldmia r1, {r4-r5, r10}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -267,7 +274,7 @@
+ put_no_rnd_pixels8_x2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r10,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -279,7 +286,7 @@
+         ldmia r1, {r4-r5, r10}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -292,7 +299,7 @@
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+         ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+@@ -305,7 +312,7 @@
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
+         ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+@@ -317,7 +324,7 @@
+         ldmia r1, {r4-r5, r10}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -338,7 +345,7 @@
+ put_pixels8_y2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -352,13 +359,13 @@
+         add r1, r1, r2
+ 6:      ldmia r1, {r6-r7}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+         ldmia r1, {r4-r5}
+         add r1, r1, r2
+         stmia r0, {r8-r9}
+         add r0, r0, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -369,18 +376,18 @@
+ 2:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
+         RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+         subs r3, r3, #1
+         RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -392,18 +399,18 @@
+ 3:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
+         RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+         subs r3, r3, #1
+         RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -415,18 +422,18 @@
+ 4:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
+         RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+         subs r3, r3, #1
+         RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -447,7 +454,7 @@
+ put_no_rnd_pixels8_y2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -461,13 +468,13 @@
+         add r1, r1, r2
+ 6:      ldmia r1, {r6-r7}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+         ldmia r1, {r4-r5}
+         add r1, r1, r2
+         stmia r0, {r8-r9}
+         add r0, r0, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -478,18 +485,18 @@
+ 2:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
+         NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+         subs r3, r3, #1
+         NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -501,18 +508,18 @@
+ 3:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
+         NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+         subs r3, r3, #1
+         NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -524,18 +531,18 @@
+ 4:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
+         NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+         subs r3, r3, #1
+         NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -562,7 +569,7 @@
+         ldmia r1, {r8-r10}
+ .endif
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+ .if \align == 0
+         ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
+ .elseif \align == 1
+@@ -624,7 +631,7 @@
+ put_pixels8_xy2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+         adrl r12, 5f
+         ands r4, r1, #3
+@@ -661,7 +668,7 @@
+ put_no_rnd_pixels8_xy2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+         adrl r12, 5f
+         ands r4, r1, #3
diff --git a/recipes/mplayer/files/pld-onlyarm5.patch b/recipes/mplayer/files/pld-onlyarm5.patch
new file mode 100644
index 0000000000..3b8c576439
--- /dev/null
+++ b/recipes/mplayer/files/pld-onlyarm5.patch
@@ -0,0 +1,405 @@
+--- MPlayer-1.0pre8/libavcodec/armv4l/dsputil_arm_s.S.orig	2006-07-03 09:53:33.000000000 +0100
++++ MPlayer-1.0pre8/libavcodec/armv4l/dsputil_arm_s.S	2006-07-03 10:06:58.000000000 +0100
+@@ -16,6 +16,13 @@
+ @ License along with this library; if not, write to the Free Software
+ @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ @
++#if defined(__ARM_ARCH_5E__) || \
++    defined(__ARM_ARCH_5TE__) || \
++    defined(__ARM_ARCH_5TEJ__)
++#define PLD(code...)   code  /* ARMv5 'E' variants implement pld: emit it */
++#else
++#define PLD(code...)         /* any other target: the prefetch hint is dropped */
++#endif
+ 
+ .macro  ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
+         mov \Rd0, \Rn0, lsr #(\shift * 8)
+@@ -74,7 +81,7 @@
+ put_pixels16_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11, lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -85,7 +92,7 @@
+         ldmia r1, {r4-r7}
+         add r1, r1, r2
+         stmia r0, {r4-r7}
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         add r0, r0, r2
+         bne 1b
+@@ -95,7 +102,7 @@
+         ldmia r1, {r4-r8}
+         add r1, r1, r2
+         ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r9-r12}
+         add r0, r0, r2
+@@ -106,7 +113,7 @@
+         ldmia r1, {r4-r8}
+         add r1, r1, r2
+         ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r9-r12}
+         add r0, r0, r2
+@@ -117,7 +124,7 @@
+         ldmia r1, {r4-r8}
+         add r1, r1, r2
+         ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r9-r12}
+         add r0, r0, r2
+@@ -136,7 +143,7 @@
+ put_pixels8_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r5,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -147,7 +154,7 @@
+         ldmia r1, {r4-r5}
+         add r1, r1, r2
+         subs r3, r3, #1
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmia r0, {r4-r5}
+         add r0, r0, r2
+         bne 1b
+@@ -157,7 +164,7 @@
+         ldmia r1, {r4-r5, r12}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+         add r0, r0, r2
+@@ -168,7 +175,7 @@
+         ldmia r1, {r4-r5, r12}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+         add r0, r0, r2
+@@ -179,7 +186,7 @@
+         ldmia r1, {r4-r5, r12}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
+-        pld [r1]
++        PLD ( pld [r1] )
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+         add r0, r0, r2
+@@ -198,7 +205,7 @@
+ put_pixels8_x2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r10,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -210,7 +217,7 @@
+         ldmia r1, {r4-r5, r10}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -223,7 +230,7 @@
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+         ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+@@ -236,7 +243,7 @@
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
+         ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+@@ -248,7 +255,7 @@
+         ldmia r1, {r4-r5, r10}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -267,7 +274,7 @@
+ put_no_rnd_pixels8_x2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r10,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -279,7 +286,7 @@
+         ldmia r1, {r4-r5, r10}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -292,7 +299,7 @@
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+         ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+@@ -305,7 +312,7 @@
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
+         ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+         subs r3, r3, #1
+         stmia r0, {r4-r5}
+@@ -317,7 +324,7 @@
+         ldmia r1, {r4-r5, r10}
+         add r1, r1, r2
+         ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -338,7 +345,7 @@
+ put_pixels8_y2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -352,13 +359,13 @@
+         add r1, r1, r2
+ 6:      ldmia r1, {r6-r7}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+         ldmia r1, {r4-r5}
+         add r1, r1, r2
+         stmia r0, {r8-r9}
+         add r0, r0, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -369,18 +376,18 @@
+ 2:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
+         RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+         subs r3, r3, #1
+         RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -392,18 +399,18 @@
+ 3:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
+         RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+         subs r3, r3, #1
+         RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -415,18 +422,18 @@
+ 4:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
+         RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+         subs r3, r3, #1
+         RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -447,7 +454,7 @@
+ put_no_rnd_pixels8_y2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+         adr r5, 5f
+         ands r4, r1, #3
+@@ -461,13 +468,13 @@
+         add r1, r1, r2
+ 6:      ldmia r1, {r6-r7}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+         ldmia r1, {r4-r5}
+         add r1, r1, r2
+         stmia r0, {r8-r9}
+         add r0, r0, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+         subs r3, r3, #1
+         stmia r0, {r8-r9}
+@@ -478,18 +485,18 @@
+ 2:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
+         NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+         subs r3, r3, #1
+         NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -501,18 +508,18 @@
+ 3:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
+         NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+         subs r3, r3, #1
+         NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -524,18 +531,18 @@
+ 4:
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+ 6:      ldmia r1, {r7-r9}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
+         NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+         stmia r0, {r10-r11}
+         add r0, r0, r2
+         ldmia r1, {r4-r6}
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+         ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+         subs r3, r3, #1
+         NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+@@ -562,7 +569,7 @@
+         ldmia r1, {r8-r10}
+ .endif
+         add r1, r1, r2
+-        pld [r1]
++        PLD ( pld [r1] )
+ .if \align == 0
+         ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
+ .elseif \align == 1
+@@ -624,7 +631,7 @@
+ put_pixels8_xy2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+         adrl r12, 5f
+         ands r4, r1, #3
+@@ -661,7 +668,7 @@
+ put_no_rnd_pixels8_xy2_arm:
+         @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+         @ block = word aligned, pixles = unaligned
+-        pld [r1]
++        PLD ( pld [r1] )
+         stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+         adrl r12, 5f
+         ands r4, r1, #3
diff --git a/recipes/mplayer/files/powerpc-is-ppc.diff b/recipes/mplayer/files/powerpc-is-ppc.diff
new file mode 100644
index 0000000000..f8143c460e
--- /dev/null
+++ b/recipes/mplayer/files/powerpc-is-ppc.diff
@@ -0,0 +1,11 @@
+--- /tmp/configure	2007-03-30 19:40:34.000000000 +0200
++++ MPlayer-1.0rc1/configure	2007-03-30 19:40:58.795251000 +0200
+@@ -1213,7 +1213,7 @@
+     _optimizing=''
+     ;;
+ 
+-  ppc)
++  ppc|powerpc)
+     _def_arch='#define ARCH_POWERPC 1'
+     _def_dcbzl='#define NO_DCBZL 1'
+     _target_arch='TARGET_ARCH_POWERPC = yes'
diff --git a/recipes/mplayer/files/pxa-video_out.patch b/recipes/mplayer/files/pxa-video_out.patch
new file mode 100644
index 0000000000..0c4c4feb8c
--- /dev/null
+++ b/recipes/mplayer/files/pxa-video_out.patch
@@ -0,0 +1,22 @@
+Index: MPlayer-1.0rc1/libvo/video_out.c
+===================================================================
+--- MPlayer-1.0rc1.orig/libvo/video_out.c
++++ MPlayer-1.0rc1/libvo/video_out.c
+@@ -87,6 +87,7 @@ extern vo_functions_t video_out_fbdev;
+ extern vo_functions_t video_out_fbdev2;
+ extern vo_functions_t video_out_w100;
+ extern vo_functions_t video_out_imageon;
++extern vo_functions_t video_out_pxa;
+ extern vo_functions_t video_out_svga;
+ extern vo_functions_t video_out_png;
+ extern vo_functions_t video_out_ggi;
+@@ -206,6 +207,9 @@ vo_functions_t* video_out_drivers[] =
+ #ifdef HAVE_IMAGEON
+ 	&video_out_imageon,
+ #endif
++#ifdef HAVE_PXA
++    &video_out_pxa,
++#endif
+ #ifdef HAVE_SVGALIB
+ 	&video_out_svga,
+ #endif
diff --git a/recipes/mplayer/files/pxa_configure.patch b/recipes/mplayer/files/pxa_configure.patch
new file mode 100644
index 0000000000..079d3086de
--- /dev/null
+++ b/recipes/mplayer/files/pxa_configure.patch
@@ -0,0 +1,47 @@
+Index: MPlayer-1.0rc1/configure
+===================================================================
+--- MPlayer-1.0rc1.orig/configure
++++ MPlayer-1.0rc1/configure
+@@ -1600,6 +1600,7 @@ _vesa=auto
+ _fbdev=auto
+ _w100=no
+ _imageon=no
++_pxa=no
+ _dvb=auto
+ _dvbhead=auto
+ _dxr2=auto
+@@ -1803,6 +1804,8 @@ for ac_option do
+   --disable-w100)       _w100=no        ;;
+   --enable-imageon)     _imageon=yes    ;;
+   --disable-imageon)    _imageon=no     ;;
++  --enable-pxa)         _pxa=yes        ;;
++  --disable-pxa)        _pxa=no         ;;
+   --enable-dvb)		_dvb=yes	;;
+   --disable-dvb)        _dvb=no		;;
+   --enable-dvbhead)	_dvbhead=yes	;;
+@@ -4296,6 +4299,17 @@ else
+ fi
+ echores "$_imageon"
+ 
++echocheck "PXA27x Overlay Support"
++if test "$_pxa" = yes ; then
++  _def_pxa='#define HAVE_PXA 1'
++  _vosrc="$_vosrc vo_pxa.c"
++  _vomodules="pxa $_vomodules"
++else
++  _def_pxa='#undef HAVE_PXA'
++  _novomodules="pxa $_novomodules"
++fi
++echores "$_pxa"
++
+ 
+ echocheck "DVB"
+ if test "$_dvb" = auto ; then
+@@ -8398,6 +8412,7 @@ $_def_syncfb
+ $_def_fbdev
+ $_def_w100
+ $_def_imageon
++$_def_pxa
+ $_def_dxr2
+ $_def_dxr3
+ $_def_ivtv
diff --git a/recipes/mplayer/files/simple_idct_armv5te.S b/recipes/mplayer/files/simple_idct_armv5te.S
new file mode 100644
index 0000000000..3706f3a4ea
--- /dev/null
+++ b/recipes/mplayer/files/simple_idct_armv5te.S
@@ -0,0 +1,715 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+.arch armv5te
+
+/* IMPORTANT: this value should be the same as defined in dsputil.h */
+#define MAX_NEG_CROP 1024
+
+/*
+ * ARM EABI guarantees 8 byte stack alignment, so we can use LDRD instructions
+ * for accessing stack and load two registers per cycle to improve performance
+ * on ARM11 and XScale
+ */
+#ifdef __ARM_EABI__
+#define DWORD_ALIGNED_STACK 1
+#endif
+
+#define W1  22725   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W2  21407   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W3  19266   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W4  16383   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W5  12873   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W6  8867    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W7  4520    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define ROW_SHIFT 11 /* right shift applied to every row-pass result */
+#define COL_SHIFT 20 /* presumably the column-pass shift; (1<<(COL_SHIFT-1)) is its rounding bias */
+
+#define W13 (W1 | (W3 << 16)) /* packed pair: bottom halfword = W1, top halfword = W3 */
+#define W26 (W2 | (W6 << 16)) /* packed pair: bottom halfword = W2, top halfword = W6 */
+#define W57 (W5 | (W7 << 16)) /* packed pair: bottom halfword = W5, top halfword = W7 */
+
+#define W22 ((-W2 & 0xFFFF) | (W2 << 16)) /* bottom = -W2 (16-bit two's complement), top = W2 */
+#define W44 ((-W4 & 0xFFFF) | (W4 << 16)) /* bottom = -W4 (16-bit two's complement), top = W4 */
+#define W66 ((-W6 & 0xFFFF) | (W6 << 16)) /* bottom = -W6 (16-bit two's complement), top = W6 */
+
+#define M51 ((-W5 & 0xFFFF) | ((-W1 & 0xFFFF) << 16)) /* bottom = -W5, top = -W1 (both negated) */
+
+        .text
+
+/*
+ * Local pool of 64-bit constant pairs for the 'idct_rows_armv5te' function
+ * (each pair is fetched with a single ldrd). It is aligned at a 16 byte boundary
+ * so that it does not cross a cache line boundary (occupies a single cache line).
+ */
+        .balign 16
+w2266idct_rows_armv5te:
+        .long W22                      /* first ldrd register:  -W2 | (W2 << 16) */
+        .long W66                      /* second ldrd register: -W6 | (W6 << 16) */
+w1357idct_rows_armv5te:
+        .long W13                      /* first ldrd register:  W1 | (W3 << 16) */
+        .long W57                      /* second ldrd register: W5 | (W7 << 16) */
+
+/*
+ * Row pass: processes eight 16-byte rows in place, last row first. Benchmarks
+ * on a few video files show that about 80-90% of uses of this function have
+ * all rows empty except for the row[0].
+ *
+ * On entry:
+ * a1              - row address
+ * lr              - return address
+ *
+ * On exit:
+ * a1              - row address (same value as on entry)
+ *
+ * Registers usage within this function:
+ *  a1             - row address
+ *  a2             - temporary register
+ *  v5, v6, v7, v8 - row data
+ *  v1, v2, v3, v4 - A0, A1, A2 and A3 variables
+ *  a3, a4         - used for loading constants
+ *  ip             - temporary register
+ *  lr             - temporary register, also holds initial row address value
+ *                   to check end of loop condition
+ */
+        .balign 32
+        .type idct_rows_armv5te, %function
+        .func idct_rows_armv5te
+idct_rows_armv5te:
+        str    a1, [sp, #-4]!           /* save row address (reloaded into lr later) */
+        str    lr, [sp, #-4]!           /* save return address */
+        mov    lr, a1                   /* lr = first row address (loop end marker) */
+        ldrd   v7, [a1, #(8 * 16 - 8)]! /* v7 = row[5:4], v8 = row[7:6] */
+1:
+        ldrd   v5, [a1, #-8]!           /* v5 = row[1:0], v6 = row[3:2] */
+        orrs   v1, v7, v8               /* Z = (row[4..7] are all zero) */
+        cmpeq  v1, v6                   /* if so, v1 == 0: also test row[3:2] */
+        cmpeq  v1, v5, lsr #16          /* ... and row[1] */
+        bne    2f                       /* jump to process full row */
+        /* only row[0] is not empty here */
+        mov    v5, v5, lsl #19          /* top halfword = row[0] << 3 */
+        cmp    a1, lr                   /* was this the first row? */
+        orr    v5, v5, v5, lsr #16      /* copy it to the bottom halfword too */
+        str    v5, [a1]                 /* all eight outputs = row[0] << 3 */
+        str    v5, [a1, #4]
+        str    v5, [a1, #8]
+        str    v5, [a1, #12]
+        ldrned v7, [a1, #-8]!           /* v7 = row[5:4], v8 = row[7:6] */
+        bne    1b
+        ldr    pc, [sp], #8             /* return, also dropping the saved a1 */
+
+2:      /* process full row */
+        /* the next code fragment calculates A variables */
+
+        ldr    a2, w44                 /* a2 = -W4 | (W4 << 16) */
+        ldrd   a3, w2266idct_rows_armv5te /* a3 = -W2 | (W2 << 16) */
+                                       /* a4 = -W6 | (W6 << 16) */
+        mov    v1, #(1<<(ROW_SHIFT-1)) /* rounding bias */
+        smlatb v1, a2, v5, v1          /* v1 = W4*row[0]+(1<<(ROW_SHIFT-1)) */
+
+        cmp    a1, lr                  /* loop-end test; lr is reused as a temporary below */
+
+        smlabb v2, a2, v7, v1          /* v2 = v1 - W4*row[4] */
+        smlatb v1, a2, v7, v1          /* v1 = v1 + W4*row[4] */
+
+        smlabb v3, a4, v6, v2          /* v3 = v2 - W6*row[2] */
+        smlabb v4, a3, v6, v1          /* v4 = v1 - W2*row[2] */
+
+        smlatb v3, a3, v8, v3          /* v3 += W2*row[6] */
+        smlabb v4, a4, v8, v4          /* v4 -= W6*row[6] */
+
+        ldrd   a3, w1357idct_rows_armv5te /* a3 = W1 | (W3 << 16) */
+                                       /* a4 = W5 | (W7 << 16) */
+
+        rsb    v2, v3, v2, lsl #1      /* v2 = 2*v2 - v3 */
+        rsb    v1, v4, v1, lsl #1      /* v1 = 2*v1 - v4 */
+
+        /* all A variables are now calculated (and stored in v1, v2, v3, v4 registers) */
+
+        smulbt a2, a3, v5              /* b0 = W1*row[1] */
+        smultt ip, a3, v5              /* tmp = W3*row[1] */
+        smultt lr, a4, v6              /* -b1 = W7*row[3] */
+        smlatt a2, a3, v6, a2          /* b0 += W3*row[3] */
+        smlabt lr, a3, v7, lr          /* -b1 += W1*row[5] */
+        smlabt a2, a4, v7, a2          /* b0 += W5*row[5] */
+        smlabt lr, a4, v8, lr          /* -b1 += W5*row[7] */
+        smlatt a2, a4, v8, a2          /* b0 += W7*row[7] */
+        sub    lr, ip, lr              /* b1 = tmp - (-b1) */
+
+        /* B0 is now calculated (a2), B1 is now calculated (lr) */
+
+        add    ip, v1, a2              /* ip = (A0 + B0) */
+        sub    a2, v1, a2              /* a2 = (A0 - B0) */
+        mov    ip, ip, asr #ROW_SHIFT
+        mov    a2, a2, asr #ROW_SHIFT
+        strh   ip, [a1, #0]            /* row[0] = (A0 + B0) >> ROW_SHIFT */
+        strh   a2, [a1, #14]           /* row[7] = (A0 - B0) >> ROW_SHIFT */
+
+        ldr    v1, m51                 /* v1 = ((-W5 & 0xFFFF) | ((-W1 & 0xFFFF) << 16)) */
+
+        add    ip, v2, lr              /* ip = (A1 + B1) */
+        sub    a2, v2, lr              /* a2 = (A1 - B1) */
+        mov    ip, ip, asr #ROW_SHIFT
+        mov    a2, a2, asr #ROW_SHIFT
+        strh   ip, [a1, #2]            /* row[1] = (A1 + B1) >> ROW_SHIFT */
+        strh   a2, [a1, #12]           /* row[6] = (A1 - B1) >> ROW_SHIFT */
+
+        smulbt a2, a4, v5              /* b2 = W5*row[1] */
+        smultt v2, a4, v5              /* b3 = W7*row[1] */
+        smlatt a2, v1, v6, a2          /* b2 -= W1*row[3] */
+        smlatt v2, a3, v7, v2          /* b3 += W3*row[5] */
+        smlatt a2, a4, v7, a2          /* b2 += W7*row[5] */
+        smlatt v2, v1, v8, v2          /* b3 -= W1*row[7] */
+        smlatt a2, a3, v8, a2          /* b2 += W3*row[7] */
+        smlabt v2, v1, v6, v2          /* b3 -= W5*row[3] */
+
+        /* B2 is now calculated (a2), B3 is now calculated (v2) */
+
+        ldr    lr, [sp, #4]            /* lr = initial row address (saved a1) */
+
+        add    ip, v3, a2              /* ip = (A2 + B2) */
+        sub    a2, v3, a2              /* a2 = (A2 - B2) */
+        mov    ip, ip, asr #ROW_SHIFT
+        mov    a2, a2, asr #ROW_SHIFT
+        strh   ip, [a1, #4]            /* row[2] = (A2 + B2) >> ROW_SHIFT */
+        strh   a2, [a1, #10]           /* row[5] = (A2 - B2) >> ROW_SHIFT */
+
+        add    ip, v4, v2              /* ip = (A3 + B3) */
+        sub    a2, v4, v2              /* a2 = (A3 - B3) */
+        mov    ip, ip, asr #ROW_SHIFT
+        mov    a2, a2, asr #ROW_SHIFT
+        strh   ip, [a1, #6]            /* row[3] = (A3 + B3) >> ROW_SHIFT */
+        strh   a2, [a1, #8]            /* row[4] = (A3 - B3) >> ROW_SHIFT */
+
+        ldrned v7, [a1, #-8]!          /* v7 = row[5:4], v8 = row[7:6] */
+        bne    1b
+        ldr    pc, [sp], #8            /* return, also dropping the saved a1 */
+        .endfunc
+
+/******************************************************************************/
+
+/*
+ * A global pool of 32-bit constants (used from all the functions in this module).
+ * It is aligned at a 32 byte boundary in order to ensure that it does not cross a
+ * cache line boundary (occupies only a single cache line).
+ */
+        .balign 32
+simple_idct_croptbl_armv5te:
+        .long (ff_cropTbl + MAX_NEG_CROP) /* address of ff_cropTbl advanced by MAX_NEG_CROP bytes */
+m51:    .long M51                      /* (-W5 & 0xFFFF) | ((-W1 & 0xFFFF) << 16) */
+w44:    .long W44                      /* (-W4 & 0xFFFF) | (W4 << 16) */
+xxx:    .long (((1<<(COL_SHIFT-1))/W4)*W4) /* column rounding bias, rounded down to a multiple of W4 */
+m7:     .long (-W7)                    /* -W7 as a full 32-bit value */
+
+/*
+ * Enforce 8 byte stack alignment if it is not provided by the ABI. Used at the
+ * beginning of global functions. If the stack is not properly aligned, the real
+ * return address is pushed to the stack (thus fixing stack alignment) and the lr
+ * register is set to the thunk function 'unaligned_return_thunk_armv5te', which is
+ * responsible for providing a correct return from the function in this case.
+ */
+        .macro idct_stackalign_armv5te
+#ifndef DWORD_ALIGNED_STACK
+        tst    sp, #4                  /* is bit 2 of sp set, i.e. sp not a multiple of 8? */
+        strne  lr, [sp, #-4]!          /* if so, push the real return address (sp -= 4) */
+        adrne  lr, unaligned_return_thunk_armv5te /* ... and return through the thunk instead */
+#endif
+        .endm
+
+/*
+ * Process two columns at once (computes the even 'A' and the odd 'B' partial sums).
+ * DWORD_CONST_SUFFIX selects the constant pool: labels w2266<suffix> and w1357<suffix>.
+ * Register usage within this macro:
+ *  a1             - column address (post-incremented by 4 by the first load)
+ *  a2             - temporary register (packed constants from w44, later from m51)
+ *  A0b (v1), A0t (v2), A1b (v3), A1t (v4), A2b (v5), A2t (v6), A3b (v7), A3t (v8)
+ *  B0b (v1), B0t (v2), B1b (v3), B1t (v4), B2b (v5), B2t (v6), B3b (v7), B3t (v8)
+ *  a3, a4         - used for loading constants (w2266*, w1357*, m7)
+ *  ip             - temporary register (coefficient pairs col[4], col[6], col[5], col[3])
+ *  lr             - temporary register (coefficient pairs col[2], col[1], col[7]); clobbered
+ *
+ * Data on exit ('b' suffix - first column (also bottom 16-bits of a register),
+ *               't' suffix - second column (also top 16-bits of a register)):
+ *  A0b, A0t, A1b, A1t, A2b, A2t, A3b, A3t - are pushed to the stack (sp is lowered by 32, A0b is at [sp])
+ *  B0b, B0t, B1b, B1t, B2b, B2t, B3b, B3t - are returned in v1, v2, v3, v4, v5, v6, v7, v8 registers
+ *  a1 - address of the next pair of columns; the flags are clobbered by the zero tests
+ */
+        .macro idct_two_col_armv5te DWORD_CONST_SUFFIX
+        ldr    v4, [a1], #4            /* v4 = col_t[0]:col_b[0] */
+        ldr    a2, w44                 /* a2 = -W4 | (W4 << 16) */
+        ldr    v1, xxx                 /* v1 = (((1<<(COL_SHIFT-1))/W4)*W4) */
+        ldr    ip, [a1, #(16*4 - 4)]   /* ip = col_t[4]:col_b[4] */
+        ldrd   a3, w2266\DWORD_CONST_SUFFIX /* a3 = -W2 | (W2 << 16) */
+                                       /* a4 = -W6 | (W6 << 16) */
+        smlatt v2, a2, v4, v1          /* A0t = W4 * (col_t[0] + ((1<<(COL_SHIFT-1))/W4)) */
+        smlatb v1, a2, v4, v1          /* A0b = W4 * (col_b[0] + ((1<<(COL_SHIFT-1))/W4)) */
+
+        ldr    lr, [a1, #(16*2 - 4)]   /* lr = col_t[2]:col_b[2] */
+
+        smlabb v3, a2, ip, v1          /* A1b = A0b - W4*col_b[4] */
+        smlatb v1, a2, ip, v1          /* A0b = A0b + W4*col_b[4] */
+        smlabt v4, a2, ip, v2          /* A1t = A0t - W4*col_t[4] */
+        smlatt v2, a2, ip, v2          /* A0t = A0t + W4*col_t[4] */
+
+        ldr    ip, [a1, #(16*6 - 4)]   /* ip = col_t[6]:col_b[6] */
+
+        smlabb v5, a4, lr, v3          /* A2b = A1b - W6*col_b[2] */
+        smlabb v7, a3, lr, v1          /* A3b = A0b - W2*col_b[2] */
+        smlabt v6, a4, lr, v4          /* A2t = A1t - W6*col_t[2] */
+        smlabt v8, a3, lr, v2          /* A3t = A0t - W2*col_t[2] */
+
+        ldr    lr, [a1, #(16*1 - 4)]   /* lr = col_t[1]:col_b[1] */
+
+        smlatb v5, a3, ip, v5          /* A2b += W2*col_b[6] */
+        smlabb v7, a4, ip, v7          /* A3b -= W6*col_b[6] */
+        smlatt v6, a3, ip, v6          /* A2t += W2*col_t[6] */
+        smlabt v8, a4, ip, v8          /* A3t -= W6*col_t[6] */
+
+        ldrd   a3, w1357\DWORD_CONST_SUFFIX /* a3 = W1 | (W3 << 16) */
+                                       /* a4 = W5 | (W7 << 16) */
+
+        rsb    v3, v5, v3, lsl #1      /* A1b = 2*A1b - A2b */
+        rsb    v1, v7, v1, lsl #1      /* A0b = 2*A0b - A3b */
+        rsb    v4, v6, v4, lsl #1      /* A1t = 2*A1t - A2t */
+        rsb    v2, v8, v2, lsl #1      /* A0t = 2*A0t - A3t */
+
+        ldr    ip, [a1, #(16*5 - 4)]   /* ip = col_t[5]:col_b[5] */
+        ldr    a2, m51                 /* a2 = ((-W5 & 0xFFFF) | ((-W1 & 0xFFFF) << 16)) */
+
+        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, v8} /* push A0b..A3t so that v1-v8 can hold the B sums */
+
+        smulbb v1, a3, lr              /* B0b = W1*col_b[1] */
+        smulbt v2, a3, lr              /* B0t = W1*col_t[1] */
+        smultb v3, a3, lr              /* B1b = W3*col_b[1] */
+        smultt v4, a3, lr              /* B1t = W3*col_t[1] */
+        smulbb v5, a4, lr              /* B2b = W5*col_b[1] */
+        smulbt v6, a4, lr              /* B2t = W5*col_t[1] */
+        smultb v7, a4, lr              /* B3b = W7*col_b[1] */
+        smultt v8, a4, lr              /* B3t = W7*col_t[1] */
+
+        ldr    lr, [a1, #(16*7 - 4)]   /* lr = col_t[7]:col_b[7] */
+
+        cmp    ip, #0                  /* both col[5] values zero => their products can be skipped */
+        beq    2f                      /* jump probability is typically more than 75% */
+
+        smlabt v2, a4, ip, v2          /* B0t += W5*col_t[5] */
+        smlatt v4, a2, ip, v4          /* B1t -= W1*col_t[5] */
+        smlatt v6, a4, ip, v6          /* B2t += W7*col_t[5] */
+        smlatt v8, a3, ip, v8          /* B3t += W3*col_t[5] */
+        smlabb v1, a4, ip, v1          /* B0b += W5*col_b[5] */
+        smlatb v3, a2, ip, v3          /* B1b -= W1*col_b[5] */
+        smlatb v5, a4, ip, v5          /* B2b += W7*col_b[5] */
+        smlatb v7, a3, ip, v7          /* B3b += W3*col_b[5] */
+2:
+        ldr    ip, [a1, #(16*3 - 4)]   /* ip = col_t[3]:col_b[3] */
+
+        cmp    lr, #0                  /* both col[7] values zero => their products can be skipped */
+        beq    3f                      /* jump probability is typically more than 90% */
+
+        smlatt v2, a4, lr, v2          /* B0t += W7*col_t[7] */
+        smlabt v4, a2, lr, v4          /* B1t -= W5*col_t[7] */
+        smlatt v6, a3, lr, v6          /* B2t += W3*col_t[7] */
+        smlatt v8, a2, lr, v8          /* B3t -= W1*col_t[7] */
+
+        smlatb v1, a4, lr, v1          /* B0b += W7*col_b[7] */
+        smlabb v3, a2, lr, v3          /* B1b -= W5*col_b[7] */
+        smlatb v5, a3, lr, v5          /* B2b += W3*col_b[7] */
+        smlatb v7, a2, lr, v7          /* B3b -= W1*col_b[7] */
+3:
+        cmp    ip, #0                  /* both col[3] values zero => their products can be skipped */
+        beq    4f                      /* jump probability is typically more than 65% */
+
+        ldr    a4, m7                  /* a4 = -W7 (the W5/W7 pair in a4 is not needed any more) */
+
+        smlatt v2, a3, ip, v2          /* B0t += W3*col_t[3] */
+        smlatt v6, a2, ip, v6          /* B2t -= W1*col_t[3] */
+        smlabt v8, a2, ip, v8          /* B3t -= W5*col_t[3] */
+        smlabt v4, a4, ip, v4          /* B1t -= W7*col_t[3] */
+
+        smlatb v1, a3, ip, v1          /* B0b += W3*col_b[3] */
+        smlatb v5, a2, ip, v5          /* B2b -= W1*col_b[3] */
+        smlabb v7, a2, ip, v7          /* B3b -= W5*col_b[3] */
+        smlabb v3, a4, ip, v3          /* B1b -= W7*col_b[3] */
+4:
+        .endm
+
+/******************************************************************************/
+
+/*
+ * a local pool with 64-bit constants for 'simple_idct_put_armv5te' function,
+ * we align it at 16 byte boundary in order to ensure that it does not cross
+ * cache line boundary (occupies only a single cache line)
+ */
+        .balign 16
+w2266simple_idct_put_armv5te:           /* pair loaded by 'ldrd a3, w2266...' in idct_two_col_armv5te */
+        .long W22
+        .long W66
+w1357simple_idct_put_armv5te:           /* pair loaded by 'ldrd a3, w1357...' in idct_two_col_armv5te */
+        .long W13
+        .long W57
+/* simple_idct_put_armv5te: row pass + column pass over one block; results are stored as bytes looked up in the crop table */
+        .balign 32
+        .global simple_idct_put_armv5te
+        .type simple_idct_put_armv5te, %function
+        .func simple_idct_put_armv5te
+simple_idct_put_armv5te:
+        /* In: a1 = destination, a2 = destination line stride (bytes), a3 = block of 8x8 16-bit coefficients */
+        idct_stackalign_armv5te
+
+        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, v8, lr}
+        strd   a1, [sp, #-12]!         /* save destination and stride; 12 bytes (one unused word) follow the 36 pushed above */
+
+        mov    a1, a3                  /* a1 = coefficient block */
+        bl     idct_rows_armv5te       /* NOTE(review): a1 is assumed to be preserved by idct_rows_armv5te - confirm */
+
+        add    a2, a1, #16             /* a2 = end marker: 8 columns of 2 bytes each */
+        strd   a1, [sp, #-8]!          /* [sp] = current column address, [sp, #4] = end marker */
+1:
+        idct_two_col_armv5te simple_idct_put_armv5te
+        str    a1, [sp, #(0 + 32)]     /* remember the next column address (32 = A sums pushed by the macro) */
+        ldrd   a3, [sp, #(8 + 32)]     /* a3 = destination, a4 = stride */
+        ldr    lr, simple_idct_croptbl_armv5te /* lr = crop table base used for the byte lookups below */
+
+        ldrd   a1, [sp], #8            /* pop A0b, A0t */
+        add    ip, a3, #2
+        str    ip, [sp, #(8 + 32 - 8)] /* saved destination += 2 (two output bytes per row and iteration) */
+
+        add    ip, a1, v1              /* row 0, first column:  A0b + B0b */
+        sub    v1, a1, v1              /* row 7, first column:  A0b - B0b */
+        add    a1, a2, v2              /* row 0, second column: A0t + B0t */
+        sub    v2, a2, v2              /* row 7, second column: A0t - B0t */
+        ldrb   a1, [lr, a1, asr #COL_SHIFT]
+        ldrb   ip, [lr, ip, asr #COL_SHIFT]
+        ldrb   v2, [lr, v2, asr #COL_SHIFT]
+        ldrb   v1, [lr, v1, asr #COL_SHIFT]
+        orr    ip, ip, a1, asl #8      /* pack both columns of row 0 into one halfword */
+        ldrd   a1, [sp], #8            /* pop A1b, A1t */
+        orr    v1, v1, v2, asl #8      /* row 7 halfword, stored at the end of the iteration */
+        strh   ip, [a3], a4            /* store row 0, step to row 1 */
+
+        add    ip, a1, v3              /* row 1 = A1 + B1, row 6 = A1 - B1 */
+        sub    v3, a1, v3
+        add    a1, a2, v4
+        sub    v4, a2, v4
+        ldrb   a1, [lr, a1, asr #COL_SHIFT]
+        ldrb   ip, [lr, ip, asr #COL_SHIFT]
+        ldrb   v4, [lr, v4, asr #COL_SHIFT]
+        ldrb   v3, [lr, v3, asr #COL_SHIFT]
+        orr    ip, ip, a1, asl #8
+        ldrd   a1, [sp], #8            /* pop A2b, A2t */
+        orr    v3, v3, v4, asl #8      /* row 6 halfword */
+        strh   ip, [a3], a4            /* store row 1, step to row 2 */
+
+        add    ip, a1, v5              /* row 2 = A2 + B2, row 5 = A2 - B2 */
+        sub    v5, a1, v5
+        add    a1, a2, v6
+        sub    v6, a2, v6
+        ldrb   a1, [lr, a1, asr #COL_SHIFT]
+        ldrb   ip, [lr, ip, asr #COL_SHIFT]
+        ldrb   v6, [lr, v6, asr #COL_SHIFT]
+        ldrb   v5, [lr, v5, asr #COL_SHIFT]
+        orr    ip, ip, a1, asl #8
+        ldrd   a1, [sp], #8            /* pop A3b, A3t */
+        orr    v5, v5, v6, asl #8      /* row 5 halfword */
+        strh   ip, [a3], a4            /* store row 2, step to row 3 */
+
+        add    ip, a1, v7              /* row 3 = A3 + B3, row 4 = A3 - B3 */
+        sub    v7, a1, v7
+        add    a1, a2, v8
+        sub    v8, a2, v8
+        ldrb   a1, [lr, a1, asr #COL_SHIFT]
+        ldrb   ip, [lr, ip, asr #COL_SHIFT]
+        ldrb   v8, [lr, v8, asr #COL_SHIFT]
+        ldrb   v7, [lr, v7, asr #COL_SHIFT]
+        orr    ip, ip, a1, asl #8
+        strh   ip, [a3], a4            /* store row 3, step to row 4 */
+
+        ldrd   a1, [sp, #0]            /* a1 = next column address, a2 = end marker */
+        orr    v7, v7, v8, asl #8      /* row 4 halfword */
+
+        strh   v7, [a3], a4            /* rows 4..7 are stored from the values kept in v7, v5, v3, v1 */
+        strh   v5, [a3], a4
+        cmp    a1, a2                  /* all 8 columns done? (strh does not change the flags) */
+        strh   v3, [a3], a4
+        strh   v1, [a3], a4
+
+        bne    1b
+
+        add    sp, sp, #20             /* drop the loop state (8 bytes) and the saved arguments (12 bytes) */
+        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, v8, pc}
+        .endfunc
+
+/******************************************************************************/
+
+/*
+ * a local pool with 64-bit constants for 'simple_idct_add_armv5te' function, we
+ * align it at 16 byte boundary in order to ensure that it does not cross
+ * cache line boundary (occupies only a single cache line)
+ */
+        .balign 16
+w2266simple_idct_add_armv5te:           /* pair loaded by 'ldrd a3, w2266...' in idct_two_col_armv5te */
+        .long W22
+        .long W66
+w1357simple_idct_add_armv5te:           /* pair loaded by 'ldrd a3, w1357...' in idct_two_col_armv5te */
+        .long W13
+        .long W57
+/* simple_idct_add_armv5te: row pass + column pass; each result is added to the destination byte and looked up in the crop table */
+        .balign 32
+        .global simple_idct_add_armv5te
+        .type simple_idct_add_armv5te, %function
+        .func simple_idct_add_armv5te
+simple_idct_add_armv5te:
+        /* In: a1 = destination, a2 = destination line stride (bytes), a3 = block of 8x8 16-bit coefficients */
+        idct_stackalign_armv5te
+
+        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, v8, lr}
+        strd   a1, [sp, #-12]!         /* save destination and stride; 12 bytes (one unused word) follow the 36 pushed above */
+
+        mov    a1, a3                  /* a1 = coefficient block */
+        bl     idct_rows_armv5te       /* NOTE(review): a1 is assumed to be preserved by idct_rows_armv5te - confirm */
+
+        add    a2, a1, #16             /* a2 = end marker: 8 columns of 2 bytes each */
+        strd   a1, [sp, #-8]!          /* [sp] = current column address, [sp, #4] = end marker */
+
+        sub    sp, sp, #8              /* 8 bytes of scratch space used to spill v1/v2 inside the loop */
+1:
+        idct_two_col_armv5te simple_idct_add_armv5te
+        ldrd   a3, [sp, #(8 + 40)]     /* a3 = destination, a4 = stride (40 = 32 pushed by the macro + 8 scratch) */
+        str    a1, [sp, #(0 + 40)]     /* remember the next column address */
+
+        ldrd   a1, [sp], #8            /* pop A0b, A0t */
+        add    ip, a3, #2
+        str    ip, [sp, #(8 + 40 - 8)] /* saved destination += 2 */
+
+        add    ip, a1, v1              /* row 0 = A0 + B0, row 7 = A0 - B0 */
+        sub    v1, a1, v1
+        add    a1, a2, v2
+        sub    v2, a2, v2
+        strd   v1, [sp, #(32 - 8)] /* save v1 and v2 to stack in order to use them as temporary registers */
+        ldrb   v1, [a3, #1]            /* current destination bytes of row 0 */
+        ldrb   v2, [a3]
+        ldr    lr, simple_idct_croptbl_armv5te /* lr = crop table base used for the byte lookups below */
+        add    v1, v1, a1, asr #COL_SHIFT
+        ldrd   a1, [sp], #8            /* pop A1b, A1t */
+        add    ip, v2, ip, asr #COL_SHIFT
+        ldrb   v2, [lr, v1]
+        ldrb   ip, [lr, ip]
+
+        add    v1, a1, v3              /* row 1 = A1 + B1, row 6 = A1 - B1 */
+        sub    v3, a1, v3
+
+        ldrb   a1, [a3, a4]            /* first destination byte of row 1 (a3 still points to row 0) */
+        orr    ip, ip, v2, asl #8
+        strh   ip, [a3], a4            /* store row 0, step to row 1 */
+
+        ldrb   v2, [a3, #1]
+
+        add    ip, a2, v4
+        sub    v4, a2, v4
+        add    ip, v2, ip, asr #COL_SHIFT
+        add    v1, a1, v1, asr #COL_SHIFT
+        ldrb   v2, [lr, ip]
+        ldrb   ip, [lr, v1]
+        ldrb   v1, [a3, a4]            /* first destination byte of row 2 */
+        ldrd   a1, [sp], #8            /* pop A2b, A2t */
+        orr    ip, ip, v2, asl #8
+        strh   ip, [a3], a4            /* store row 1, step to row 2 */
+
+        ldrb   v2, [a3, #1]
+        add    ip, a1, v5              /* row 2 = A2 + B2, row 5 = A2 - B2 */
+        sub    v5, a1, v5
+        add    a1, a2, v6
+        sub    v6, a2, v6
+        add    a1, v2, a1, asr #COL_SHIFT
+        add    ip, v1, ip, asr #COL_SHIFT
+        ldrb   v2, [lr, a1]
+        ldrb   ip, [lr, ip]
+        ldrb   v1, [a3, a4]            /* first destination byte of row 3 */
+        ldrd   a1, [sp], #8            /* pop A3b, A3t; sp now points to the spilled v1/v2 */
+        orr    ip, ip, v2, asl #8
+        strh   ip, [a3], a4            /* store row 2, step to row 3 */
+
+        ldrb   v2, [a3, #1]
+        add    ip, a1, v7              /* row 3 = A3 + B3, row 4 = A3 - B3 */
+        sub    v7, a1, v7
+        add    a1, a2, v8
+        sub    v8, a2, v8
+        add    a1, v2, a1, asr #COL_SHIFT
+        add    ip, v1, ip, asr #COL_SHIFT
+        ldrb   v2, [lr, a1]
+        ldrb   ip, [lr, ip]
+        ldrb   v1, [a3, a4]            /* first destination byte of row 4 */
+        add    a2, lr, v7, asr #COL_SHIFT /* a2 = crop table base + shifted row 4 value (first column) */
+        orr    ip, ip, v2, asl #8
+        strh   ip, [a3], a4            /* store row 3, step to row 4 */
+
+        ldrb   v2, [a3, #1]
+        add    v8, lr, v8, asr #COL_SHIFT
+        mov    v7, a3        /* a good news, now we have two more spare registers v7 and v8 */
+        ldrb   ip, [a2, v1]
+        ldrb   v8, [v8, v2]
+        ldrb   v1, [v7, a4]!           /* v7 runs one row ahead of a3 to prefetch the destination bytes */
+        ldrb   v2, [v7, #1]
+        orr    ip, ip, v8, asl #8
+        strh   ip, [a3], a4            /* store row 4, step to row 5 */
+
+        ldrb   a1, [v7, a4]!           /* destination bytes of row 6 */
+        ldrb   a2, [v7, #1]
+
+        add    v6, v2, v6, asr #COL_SHIFT
+        add    v5, v1, v5, asr #COL_SHIFT
+        ldrb   v6, [lr, v6]
+        ldrb   v5, [lr, v5]
+        ldrd   v1, [sp, #0]         /* restore v1 and v2 that were saved earlier */
+        orr    v5, v5, v6, asl #8
+        strh   v5, [a3], a4            /* store row 5, step to row 6 */
+        ldrb   v5, [v7, a4]!           /* destination bytes of row 7 */
+        ldrb   v6, [v7, #1]
+
+        add    v4, a2, v4, asr #COL_SHIFT
+        add    v3, a1, v3, asr #COL_SHIFT
+        ldrb   v4, [lr, v4]
+        ldrb   v3, [lr, v3]
+
+        ldrd   a1, [sp, #8]            /* a1 = next column address, a2 = end marker */
+        add    v2, v6, v2, asr #COL_SHIFT
+        add    v1, v5, v1, asr #COL_SHIFT
+        ldrb   v2, [lr, v2]
+        ldrb   v1, [lr, v1]
+        cmp    a1, a2                  /* all 8 columns done? (the flags survive until 'bne') */
+        orr    v3, v3, v4, asl #8
+        strh   v3, [a3], a4            /* store row 6 */
+        orr    v1, v1, v2, asl #8
+        strh   v1, [a3], a4            /* store row 7 */
+
+        bne    1b
+
+        add    sp, sp, #28             /* drop scratch (8), loop state (8) and saved arguments (12) */
+        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, v8, pc}
+        .endfunc
+
+/******************************************************************************/
+
+/*
+ * a local pool with 64-bit constants for 'simple_idct_armv5te' function, we
+ * align it at 16 byte boundary in order to ensure that it does not cross
+ * cache line boundary (occupies only a single cache line)
+ */
+        .balign 16
+w2266simple_idct_armv5te:               /* pair loaded by 'ldrd a3, w2266...' in idct_two_col_armv5te */
+        .long W22
+        .long W66
+w1357simple_idct_armv5te:               /* pair loaded by 'ldrd a3, w1357...' in idct_two_col_armv5te */
+        .long W13
+        .long W57
+/* simple_idct_armv5te: row pass + column pass; the 16-bit results are written back into the block */
+        .balign 32
+        .global simple_idct_armv5te
+        .type simple_idct_armv5te, %function
+        .func simple_idct_armv5te
+simple_idct_armv5te:
+        /* In: a1 = block of 8x8 16-bit coefficients, transformed in place */
+        idct_stackalign_armv5te
+
+        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, v8, lr}
+        strd   a1, [sp, #-12]!         /* reserves 12 bytes; the stored a1/a2 values are not read back in this function */
+
+        bl     idct_rows_armv5te       /* NOTE(review): a1 is assumed to be preserved by idct_rows_armv5te - confirm */
+
+        add    a2, a1, #16             /* a2 = end marker: 8 columns of 2 bytes each */
+        str    a2, [sp, #-8]!          /* [sp] = end marker, [sp, #4] is unused */
+1:
+        idct_two_col_armv5te simple_idct_armv5te
+
+        ldr    lr, [sp, #32]           /* lr = end marker (32 = A sums pushed by the macro) */
+
+        ldrd   a3, [sp], #8            /* pop A0b, A0t */
+
+        cmp    lr, a1                  /* last column pair? No instruction below changes the flags before 'bne' */
+
+        add    a2, a3, v1              /* row 0 = A0 + B0 */
+        add    ip, a4, v2
+        mov    a2, a2, asr #COL_SHIFT
+        mov    ip, ip, asr #COL_SHIFT
+        strh   a2, [a1, #(16*0 - 4)]   /* -4: a1 already points to the next pair of columns */
+        strh   ip, [a1, #(16*0 + 2 - 4)]
+        sub    a2, a3, v1              /* row 7 = A0 - B0 */
+        sub    ip, a4, v2
+        ldrd   a3, [sp], #8            /* pop A1b, A1t */
+        mov    a2, a2, asr #COL_SHIFT
+        mov    ip, ip, asr #COL_SHIFT
+        strh   a2, [a1, #(16*7 - 4)]
+        strh   ip, [a1, #(16*7 + 2 - 4)]
+
+        add    a2, a3, v3              /* row 1 = A1 + B1 */
+        add    ip, a4, v4
+        mov    a2, a2, asr #COL_SHIFT
+        mov    ip, ip, asr #COL_SHIFT
+        strh   a2, [a1, #(16*1 - 4)]
+        strh   ip, [a1, #(16*1 + 2 - 4)]
+        sub    a2, a3, v3              /* row 6 = A1 - B1 */
+        sub    ip, a4, v4
+        ldrd   a3, [sp], #8            /* pop A2b, A2t */
+        mov    a2, a2, asr #COL_SHIFT
+        mov    ip, ip, asr #COL_SHIFT
+        strh   a2, [a1, #(16*6 - 4)]
+        strh   ip, [a1, #(16*6 + 2 - 4)]
+
+        add    a2, a3, v5              /* row 2 = A2 + B2 */
+        add    ip, a4, v6
+        mov    a2, a2, asr #COL_SHIFT
+        mov    ip, ip, asr #COL_SHIFT
+        strh   a2, [a1, #(16*2 - 4)]
+        strh   ip, [a1, #(16*2 + 2 - 4)]
+        sub    a2, a3, v5              /* row 5 = A2 - B2 */
+        sub    ip, a4, v6
+        ldrd   a3, [sp], #8            /* pop A3b, A3t */
+        mov    a2, a2, asr #COL_SHIFT
+        mov    ip, ip, asr #COL_SHIFT
+        strh   a2, [a1, #(16*5 - 4)]
+        strh   ip, [a1, #(16*5 + 2 - 4)]
+
+        add    a2, a3, v7              /* row 3 = A3 + B3 */
+        add    ip, a4, v8
+        mov    a2, a2, asr #COL_SHIFT
+        mov    ip, ip, asr #COL_SHIFT
+        strh   a2, [a1, #(16*3 - 4)]
+        strh   ip, [a1, #(16*3 + 2 - 4)]
+        sub    a2, a3, v7              /* row 4 = A3 - B3 */
+        sub    ip, a4, v8
+        mov    a2, a2, asr #COL_SHIFT
+        mov    ip, ip, asr #COL_SHIFT
+        strh   a2, [a1, #(16*4 - 4)]
+        strh   ip, [a1, #(16*4 + 2 - 4)]
+
+        bne    1b
+
+        add    sp, sp, #20             /* drop the loop state (8 bytes) and the 12 bytes reserved at entry */
+        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, v8, pc}
+        .endfunc
+
+/******************************************************************************/
+
+unaligned_return_thunk_armv5te:         /* lr is pointed here by idct_stackalign_armv5te when it had to realign sp */
+        ldr    pc, [sp], #4            /* pop the real return address pushed by that macro and jump to it */
diff --git a/recipes/mplayer/files/vo_omapfb.c b/recipes/mplayer/files/vo_omapfb.c
new file mode 100644
index 0000000000..5a43404300
--- /dev/null
+++ b/recipes/mplayer/files/vo_omapfb.c
@@ -0,0 +1,586 @@
+/*
+ 
+Copyright (C) 2008 Gregoire Gentil <gregoire@gentil.com>
+This file adds an optimized vo output to mplayer for the OMAP platform. This is a first pass and an attempt to help to improve
+media playing on the OMAP platform. The usual disclaimer comes here: this code is provided without any warranty.
+Many bugs and issues still exist. Feedback is welcome.
+
+This output uses the yuv420_to_yuv422 conversion from Mans Rullgard, and is heavily inspired by the work of Siarhei Siamashka.
+I would like to thank both of them here; without them this code would certainly not exist.
+
+Two options of the output are available:
+fb_overlay_only (disabled by default): only the overlay is drawn. X11 stuff is ignored.
+dbl_buffer (disabled by default): add double buffering. Some tearsync flags are probably missing in the code.
+
+Syntax is the following:
+mplayer -ao alsa -vo omapfb /test.avi
+mplayer -nosound -vo omapfb:fb_overlay_only:dbl_buffer /test.avi
+
+You need to have two planes on your system. On beagleboard, it means something like: video=omapfb:vram:2M,vram:4M
+
+Known issues:
+1) A green line or some vertical lines (if mplayer decides to draw bands instead of frame) may appear.
+It's an interpolation bug in the color conversion that needs to be fixed
+
+2) The color conversion only accepts widths and heights that are multiples of 16 pixels.
+
+3) The scaling down is disabled as the scaling down kernel patch for the OMAP3 platform doesn't seem to work yet.
+
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <linux/fb.h>
+
+#include "config.h"
+#include "video_out.h"
+#include "video_out_internal.h"
+#include "fastmemcpy.h"
+#include "sub.h"
+#include "mp_msg.h"
+
+#include "omapfb.h"
+
+#include "libswscale/swscale.h"
+#include "libmpcodecs/vf_scale.h"
+#include "libavcodec/avcodec.h"
+
+#include "aspect.h"
+
+#include "subopt-helper.h"
+
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+#include <X11/Xatom.h>
+#include "wskeys.h"
+
+static vo_info_t info = {
+	"omapfb video driver",	/* presumably the long driver name - confirm against vo_info_t */
+	"omapfb",	/* short name; matches the '-vo omapfb' syntax shown in the header comment */
+	"",	/* left empty (vo_pxa.c puts the author here) */
+	""	/* left empty (vo_pxa.c puts a free-form comment here) */
+};
+
+LIBVO_EXTERN(omapfb)
+
+static int fb_overlay_only = 0; // if set, we need only framebuffer overlay, but do not need any x11 code
+static int dbl_buffer = 0; // suboption: use two framebuffer pages when config() finds enough plane memory
+static int fullscreen_flag = 0; // non-zero while fullscreen is requested; set in config(), toggled in control()
+static int plane_ready = 0; // set at the end of config(); draw_slice() does nothing before that
+
+extern void yuv420_to_yuv422(uint8_t *yuv, uint8_t *y, uint8_t *u, uint8_t *v, int w, int h, int yw, int cw, int dw);
+static struct fb_var_screeninfo sinfo_p0; // variable screen info of /dev/fb0, read in preinit()
+static struct fb_var_screeninfo sinfo; // variable screen info of /dev/fb1, the plane driven by this file
+static struct omapfb_mem_info minfo; // answer of OMAPFB_QUERY_MEM on /dev/fb1
+static struct omapfb_plane_info pinfo; // answer of OMAPFB_QUERY_PLANE; modified and written back by omapfb_update()
+static struct {
+    unsigned x; // copied to sinfo.xoffset when this page is shown
+    unsigned y; // copied to sinfo.yoffset when this page is shown
+    uint8_t *buf; // start of this page inside the mmap()ed plane memory
+} fb_pages[2];
+static int dev_fd = -1; // descriptor of /dev/fb1 (briefly of /dev/fb0 inside preinit())
+static int fb_page_flip = 0; // 1 when double buffering is active, otherwise 0
+static int page = 0; // index into fb_pages of the page currently drawn into
+static void omapfb_update(int x, int y, int out_w, int out_h, int show);
+
+extern void mplayer_put_key( int code );
+#include "osdep/keycodes.h"
+
+#define TRANSPARENT_COLOR_KEY 0xff0 /* colour key: background of the X11 window and trans_key given to omapfb */
+
+static Display *display = NULL; // pointer to X Display structure.
+static int screen_num; // number of screen to place the window on.
+static Window win = 0; // window marking the overlay area: WinID, or the window created in x11_init()
+static Window parent = 0; // parent of 'win'; filled in by XQueryTree() in x11_init() when WinID is used
+
+/* This is used to intercept window closing requests.  */
+static Atom wm_delete_window;
+
+/**
+ * Compute the absolute position of a window by summing the offsets of the window and
+ * of all its ancestors (needed in windowed mode and with the -wid option). 'ww'/'wh'
+ * may be NULL. If the attributes cannot be read, the outputs are left untouched.
+ */
+static void x11_get_window_abs_position(Display *display, Window window,
+                                             int *wx, int *wy, int *ww, int *wh)
+{
+    Window root, parent = 0;
+    Window *child = NULL;
+    unsigned int n_children = 0;
+    XWindowAttributes attribs;
+
+    /* A zero status means 'attribs' was not filled in: do not report garbage */
+    if (!XGetWindowAttributes(display, window, &attribs))
+        return;
+
+    /* Position (and optionally size) relative to the direct parent */
+    *wx = attribs.x;
+    *wy = attribs.y;
+    if (ww)
+        *ww = attribs.width;
+    if (wh)
+        *wh = attribs.height;
+    /* XQueryTree returns zero on failure; its outputs must not be used then */
+    if (!XQueryTree(display, window, &root, &parent, &child, &n_children))
+        return;
+    if (child)
+        XFree(child);   /* only the parent is of interest, drop the child list */
+    if (parent)
+    {
+      int x = 0, y = 0;
+      /* Walk up the window tree and accumulate the ancestors' offsets */
+      x11_get_window_abs_position(display, parent, &x, &y, NULL, NULL);
+      *wx += x;
+      *wy += y;
+    }
+}
+
+
+/**
+ * Ask the window manager to change the fullscreen state of an x11 window
+ * action = 1 (set fullscreen)
+ * action = 0 (set windowed mode)
+ */
+static void x11_set_fullscreen_state(Display *display, Window window, int action)
+{
+    Atom state_atom = XInternAtom(display, "_NET_WM_STATE", False);
+    Atom fullscreen_atom = XInternAtom(display, "_NET_WM_STATE_FULLSCREEN", False);
+    XEvent xev;
+    XClientMessageEvent *msg = &xev.xclient;
+
+    /* build the _NET_WM_STATE client message; the unused data slots are zeroed */
+    msg->type = ClientMessage;
+    msg->serial = 0;
+    msg->send_event = True;
+    msg->message_type = state_atom;
+    msg->window = window;
+    msg->format = 32;
+    msg->data.l[0] = action;
+    msg->data.l[1] = fullscreen_atom;
+    msg->data.l[2] = msg->data.l[3] = msg->data.l[4] = 0;
+
+    if (XSendEvent(display, DefaultRootWindow(display), False, SubstructureRedirectMask | SubstructureNotifyMask, &xev) == 0) {
+        mp_msg(MSGT_VO, MSGL_ERR, "[omapfb] failure in x11_set_fullscreen_state\n");
+        exit(1);
+    }
+    XSync(display, False);   /* wait until the request has been processed by the server */
+}
+
+
+XClassHint classhint = {"mediaplayer-ui", "mediaplayer-ui"}; /* class hint that x11_init() sets on the window it creates */
+
+
+/**
+ * Open the X display and set up the window that reserves screen area for the framebuffer overlay (WinID, or a new window)
+ */
+static void x11_init()
+{
+    display = XOpenDisplay(getenv("DISPLAY"));
+    if (display == NULL) {
+        mp_msg(MSGT_VO, MSGL_ERR, "[omapfb] failure in x11_init, can't open display\n");
+        exit(1);
+    }
+
+    screen_num = DefaultScreen(display);
+
+    if (WinID > 0)
+    {
+        Window root;
+        Window *child = NULL;
+        unsigned int n_children = 0;
+
+        win = WinID;
+        /* Look up the parent of the foreign window; XQueryTree returns zero on failure */
+        if (!XQueryTree(display, win, &root, &parent, &child, &n_children))
+            parent = 0;
+        if (child)
+            XFree(child);
+
+        XUnmapWindow(display, win);
+        if (parent)
+            XSelectInput(display, parent, StructureNotifyMask);
+        XMapWindow(display, win);
+
+        wm_delete_window = XInternAtom(display, "WM_DELETE_WINDOW", False);
+        XSetWMProtocols(display, win, &wm_delete_window, 1);
+    } else {
+        win = XCreateSimpleWindow(display, RootWindow(display, screen_num),
+                                  sinfo_p0.xres / 2 - sinfo.xres / 2, sinfo_p0.yres / 2 - sinfo.yres / 2, sinfo.xres, sinfo.yres, 0,
+                                  WhitePixel(display, screen_num),
+                                  TRANSPARENT_COLOR_KEY);
+
+        XSetClassHint(display, win, &classhint);
+
+        XStoreName(display, win, "MPlayer");
+        XMapWindow(display, win);
+
+        /* Set WM_DELETE_WINDOW atom in WM_PROTOCOLS property (to get window_delete requests).  */
+        wm_delete_window = XInternAtom(display, "WM_DELETE_WINDOW", False);
+        XSetWMProtocols(display, win, &wm_delete_window, 1);
+        XSelectInput(display, win, StructureNotifyMask | KeyPressMask);
+    }
+}
+
+
+void print_properties(Window win2)
+{
+	/* Debug helper: print every property of win2, requested as XA_STRING, to stdout */
+	Atom *p;
+	int num = 0, j, format;
+	char *aname;
+	Atom type;
+	unsigned long nitems, bytes_after;
+	unsigned char *ret;
+	p = XListProperties(display, win2, &num);
+	printf("found %d properties for window %d\n", num, (int)win2);
+	for (j = 0; p && j < num; j++) {
+		aname = XGetAtomName(display, p[j]);
+		if (!aname) {
+			printf("NULL\n");
+			continue;
+		}
+		ret = NULL; /* Xlib may return no data (e.g. the property is not a string) */
+		if (Success == XGetWindowProperty(display, win2, p[j], 0L, ~0L, False, XA_STRING,
+					&type, &format, &nitems, &bytes_after, &ret)) {
+			printf("%s = %s\n", aname, ret ? (char *)ret : "(null)");
+			if (ret)
+				XFree(ret);
+		}
+		XFree(aname);
+	}
+	if (p)	/* XListProperties returns NULL when the window has no properties */
+		XFree(p);
+}
+
+
+static int x11_check_events()
+{
+    /* Process the X11 events queued on 'display'; returns the VO_EVENT_* flags collected */
+    XEvent ev;
+    int flags = 0;
+
+    if (display == NULL) {
+        mp_msg(MSGT_VO, MSGL_ERR, "[omapfb] 'x11_check_events' called out of sequence\n");
+        exit(1);
+    }
+
+    /* Drain every pending event without blocking */
+    while (XPending(display)) {
+        XNextEvent(display, &ev);
+        if (ev.type == UnmapNotify || ev.type == MapNotify || ev.type == ConfigureNotify) {
+            /* visibility or geometry changed: request the plane off on unmap, on otherwise */
+            omapfb_update(0, 0, 0, 0, ev.type != UnmapNotify);
+        } else if (ev.type == KeyPress) {
+            KeySym sym = XKeycodeToKeysym(display, ev.xkey.keycode, 0);
+            int key = ((sym & 0xff00) != 0 ? ((sym & 0x00ff) + 256) : (sym));
+            flags |= VO_EVENT_KEYPRESS;
+            vo_x11_putkey(key);
+        } else if (ev.type == ClientMessage && (Atom)ev.xclient.data.l[0] == wm_delete_window) {
+            /* window close request: handled as if ESC had been pressed */
+            mplayer_put_key(KEY_ESC);
+        }
+    }
+    return flags;
+}
+
+
+static void x11_uninit()
+{
+    if (display == NULL)    /* never opened or already closed: nothing to do */
+        return;
+    XCloseDisplay(display);
+    display = NULL;
+}
+
+
+/**
+ * Parse the suboptions, read the screen geometry from /dev/fb0, then open and query the video plane /dev/fb1 (kept in dev_fd); returns 0 or -1
+ */
+static int preinit(const char *arg)
+{
+    opt_t subopts[] = {
+        {"fb_overlay_only", OPT_ARG_BOOL, &fb_overlay_only, NULL},
+        {"dbl_buffer", OPT_ARG_BOOL, &dbl_buffer, NULL},
+        {NULL}
+    };
+
+    if (subopt_parse(arg, subopts) != 0) {
+        mp_msg(MSGT_VO, MSGL_FATAL, "[omapfb] unknown suboptions: %s\n", arg);
+        return -1;
+    }
+
+    dev_fd = open("/dev/fb0", O_RDWR);
+    if (dev_fd == -1) {
+        mp_msg(MSGT_VO, MSGL_FATAL, "[omapfb] Error /dev/fb0\n");
+        return -1;
+    }
+    if (ioctl(dev_fd, FBIOGET_VSCREENINFO, &sinfo_p0) == -1)   /* fb0 is only needed for the screen size */
+        goto query_failed;
+    close(dev_fd);
+
+    dev_fd = open("/dev/fb1", O_RDWR);
+    if (dev_fd == -1) {
+        mp_msg(MSGT_VO, MSGL_FATAL, "[omapfb] Error /dev/fb1\n");
+        return -1;
+    }
+    if (ioctl(dev_fd, FBIOGET_VSCREENINFO, &sinfo) == -1 || ioctl(dev_fd, OMAPFB_QUERY_PLANE, &pinfo) == -1 ||
+        ioctl(dev_fd, OMAPFB_QUERY_MEM, &minfo) == -1)   /* config() and omapfb_update() rely on all three answers */
+        goto query_failed;
+
+    if (!fb_overlay_only)
+        x11_init();
+    return 0;
+query_failed:   /* dev_fd is still open here: report, release it and fail */
+    mp_msg(MSGT_VO, MSGL_FATAL, "[omapfb] Error querying framebuffer device\n");
+    close(dev_fd);
+    dev_fd = -1;
+    return -1;
+}
+
+
+static void omapfb_update(int x, int y, int out_w, int out_h, int show)
+{
+    if (!fb_overlay_only) /* with X11 the caller's rectangle is replaced by the absolute geometry of 'win' */
+        x11_get_window_abs_position(display, win, &x, &y, &out_w, &out_h);
+    /* Switch the plane off when the rectangle is off-screen, smaller than the source or more than 8 times larger */
+    if ((x < 0) || (y < 0)
+
+// If you develop the right scaling-down patch in kernel, uncomment the line below and comment the next one
+//        || (out_w < sinfo.xres / 4) || (out_h < sinfo.yres / 4)
+        || (out_w < sinfo.xres) || (out_h < sinfo.yres)
+
+// If you don't have the right scaling-up patch in kernel, comment the line below and uncomment the next one
+/* Kernel patch to enable scaling up on the omap3
+======================================================
+--- a/drivers/video/omap/dispc.c	2008-11-01 20:08:04.000000000 -0700
++++ b/drivers/video/omap/dispc.c	2008-11-01 20:09:02.000000000 -0700
+@@ -523,9 +523,6 @@
+ 	if ((unsigned)plane > OMAPFB_PLANE_NUM)
+ 		return -ENODEV;
+ 
+-	if (out_width != orig_width || out_height != orig_height)
+-		return -EINVAL;
+-
+ 	enable_lcd_clocks(1);
+ 	if (orig_width < out_width) {
+ 		/*
+======================================================
+*/
+        || (out_w > sinfo.xres * 8) || (out_h > sinfo.yres * 8)
+//        || (out_w > sinfo.xres) || (out_h > sinfo.yres)
+
+        || (x + out_w > sinfo_p0.xres) || (y + out_h > sinfo_p0.yres)) {
+        pinfo.enabled = 0;
+        pinfo.pos_x = 0;
+        pinfo.pos_y = 0;
+        ioctl(dev_fd, OMAPFB_SETUP_PLANE, &pinfo);
+        return;
+    }
+    /* The rectangle is acceptable: program position, output size and the requested visibility */
+    pinfo.enabled = show;
+    pinfo.pos_x = x;
+    pinfo.pos_y = y;
+    pinfo.out_width  = out_w;
+    pinfo.out_height = out_h;
+    ioctl(dev_fd, OMAPFB_SETUP_PLANE, &pinfo);
+}
+
+
+static int config(uint32_t width, uint32_t height, uint32_t d_width,
+		uint32_t d_height, uint32_t flags, char *title,
+		uint32_t format)
+{
+    uint8_t *fbmem;
+    unsigned int i;
+    struct omapfb_color_key color_key;
+    /* Map the video plane, size it for a width x height video and show it; returns 0 on success, -1 on error */
+    fullscreen_flag = flags & VOFLAG_FULLSCREEN;
+    /* NOTE(review): the mapping is never munmap()ed in this file, so every config() call adds a new one */
+    fbmem = mmap(NULL, minfo.size, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0);
+    if (fbmem == MAP_FAILED) {
+        mp_msg(MSGT_VO, MSGL_FATAL, "[omapfb] Error mmap\n");
+        return -1;
+    }
+    /* Fill the whole plane memory; 0x80008000 is presumably black in YUV 4:2:2 (luma 0x00, chroma 0x80) - confirm */
+    for (i = 0; i < minfo.size / 4; i++)
+        ((uint32_t*)fbmem)[i] = 0x80008000;
+    /* Plane size: the video size clipped to the screen and rounded down to a multiple of 16 */
+    sinfo.xres = FFMIN(sinfo_p0.xres, width)  & ~15;
+    sinfo.yres = FFMIN(sinfo_p0.yres, height) & ~15;
+    sinfo.xoffset = 0;
+    sinfo.yoffset = 0;
+    sinfo.nonstd = OMAPFB_COLOR_YUY422;
+
+    fb_pages[0].x = 0;
+    fb_pages[0].y = 0;
+    fb_pages[0].buf = fbmem;
+    /* Double buffering needs TWO pages of xres * yres * 2 bytes each; page 1 starts right behind page 0 */
+    if (dbl_buffer && minfo.size >= sinfo.xres * sinfo.yres * 4) {
+        sinfo.xres_virtual = sinfo.xres;
+        sinfo.yres_virtual = sinfo.yres * 2;
+        fb_pages[1].x = 0;
+        fb_pages[1].y = sinfo.yres;
+        fb_pages[1].buf = fbmem + sinfo.xres * sinfo.yres * 2;
+        fb_page_flip = 1;
+    } else {
+        sinfo.xres_virtual = sinfo.xres;
+        sinfo.yres_virtual = sinfo.yres;
+        fb_page_flip = 0;
+    }
+
+    ioctl(dev_fd, FBIOPUT_VSCREENINFO, &sinfo);
+
+    if (WinID <= 0) {
+        if (fullscreen_flag) {
+            if (!fb_overlay_only)
+                x11_set_fullscreen_state(display, win, 1);
+            omapfb_update(0, 0, sinfo_p0.xres, sinfo_p0.yres, 1);
+        } else {
+            if (!fb_overlay_only)
+                x11_set_fullscreen_state(display, win, 0);
+            omapfb_update(sinfo_p0.xres / 2 - sinfo.xres / 2, sinfo_p0.yres / 2 - sinfo.yres / 2, sinfo.xres, sinfo.yres, 1);
+        }
+    }
+
+    color_key.channel_out = OMAPFB_CHANNEL_OUT_LCD;
+    color_key.background = 0x0;
+    color_key.trans_key = TRANSPARENT_COLOR_KEY;
+    if (fb_overlay_only)
+        color_key.key_type = OMAPFB_COLOR_KEY_DISABLED;
+    else
+        color_key.key_type = OMAPFB_COLOR_KEY_GFX_DST;
+    ioctl(dev_fd, OMAPFB_SET_COLOR_KEY, &color_key);
+
+    plane_ready = 1;
+    return 0;
+}
+
+
+static void draw_alpha(int x0, int y0, int w, int h, unsigned char *src, unsigned char *srca, int stride)
+{   /* Blend an OSD bitmap into the current page; the destination stride is the same 2 * xres byte pitch used for the y0 offset */
+    vo_draw_alpha_yuy2(w, h, src, srca, stride, fb_pages[page].buf + sinfo.xres * y0 * 2 + x0 * 2, sinfo.xres * 2);
+}
+
+
+static void draw_osd(void)
+{   /* Hands the plane size and draw_alpha() to vo_draw_text(); presumably it calls draw_alpha() for each OSD element - confirm */
+    vo_draw_text(sinfo.xres, sinfo.yres, draw_alpha);
+}
+
+
+static int draw_frame(uint8_t *src[])
+{   /* Draws nothing and always returns 1; NOTE(review): presumably the libvo error value, pictures arrive through draw_slice() */
+    return 1;
+}
+
+
+static int draw_slice(uint8_t *src[], int stride[], int w, int h, int x, int y)
+{
+    if (x != 0 || !plane_ready || y < 0 || (unsigned)y >= sinfo.yres)
+        return 0;   /* only full-width slices starting inside a configured plane are drawn */
+    /* config() clipped the plane to the screen and to a multiple of 16; clip the slice so it stays inside the page */
+    if (w > (int)sinfo.xres)
+        w = sinfo.xres;
+    if (h > (int)sinfo.yres - y)
+        h = sinfo.yres - y;
+
+    ioctl(dev_fd, OMAPFB_SYNC_GFX);
+    yuv420_to_yuv422(fb_pages[page].buf + 2 * sinfo.xres * y, src[0], src[1], src[2], w & ~15, h, stride[0], stride[1], 2 * sinfo.xres_virtual);
+    return 0;
+}
+
+
+static void flip_page(void)
+{
+    if (!fb_page_flip)      /* single buffered: there is nothing to pan to */
+        return;
+    sinfo.xoffset = fb_pages[page].x;   /* show the page that was just drawn ... */
+    sinfo.yoffset = fb_pages[page].y;
+    ioctl(dev_fd, FBIOPAN_DISPLAY, &sinfo);
+    page ^= fb_page_flip;               /* ... and draw into the other one next */
+}
+
+
+static int query_format(uint32_t format)
+{
+    /* For simplicity only YV12 is advertised (other formats could be added if/when
+     * needed): YV12 gets the capability flags below, everything else gets 0. */
+    const int yv12_caps = VFCAP_CSP_SUPPORTED | VFCAP_CSP_SUPPORTED_BY_HW |
+                          VFCAP_OSD | VFCAP_SWSCALE | VFCAP_ACCEPT_STRIDE;
+
+    return (format == IMGFMT_YV12) ? yv12_caps : 0;
+}
+
+
+/**
+ * Disable the video plane and (with X11) colour keying, then close the framebuffer device and the X display
+ */
+static void uninit()
+{
+    plane_ready = 0;            /* draw_slice() must not touch the plane until config() ran again */
+    pinfo.enabled = 0;
+    ioctl(dev_fd, OMAPFB_SETUP_PLANE, &pinfo);
+    if (!fb_overlay_only) {
+        struct omapfb_color_key color_key;
+        memset(&color_key, 0, sizeof(color_key));   /* do not hand uninitialized stack bytes to the kernel */
+        color_key.channel_out = OMAPFB_CHANNEL_OUT_LCD;
+        color_key.key_type = OMAPFB_COLOR_KEY_DISABLED;
+        ioctl(dev_fd, OMAPFB_SET_COLOR_KEY, &color_key);
+    }
+    close(dev_fd);
+    dev_fd = -1;                /* later ioctls fail cleanly instead of hitting a reused descriptor */
+    if (!fb_overlay_only)
+        x11_uninit();
+}
+
+
+static int control(uint32_t request, void *data, ...)
+{
+    /* Handles VOCTRL_QUERY_FORMAT and VOCTRL_FULLSCREEN; every other request gets VO_NOTIMPL */
+    int want_fullscreen;
+
+    if (request == VOCTRL_QUERY_FORMAT)
+        return query_format(*((uint32_t*)data));
+    if (request != VOCTRL_FULLSCREEN)
+        return VO_NOTIMPL;
+    if (WinID > 0)      /* a window was supplied through WinID: the toggle is refused */
+        return VO_FALSE;
+
+    /* toggle: leave fullscreen when it is active, enter it otherwise */
+    want_fullscreen = !fullscreen_flag;
+    if (!fb_overlay_only)
+        x11_set_fullscreen_state(display, win, want_fullscreen);
+    fullscreen_flag = want_fullscreen;
+    if (want_fullscreen)
+        omapfb_update(0, 0, sinfo_p0.xres, sinfo_p0.yres, 1);
+    else
+        omapfb_update(sinfo_p0.xres / 2 - sinfo.xres / 2, sinfo_p0.yres / 2 - sinfo.yres / 2, sinfo.xres, sinfo.yres, 1);
+    return VO_TRUE;
+}
+
+
+static void check_events(void)
+{
+    if (fb_overlay_only)    /* overlay-only mode never opens an X display, so there are no events */
+        return;
+    x11_check_events();
+}
diff --git a/recipes/mplayer/files/vo_pxa.c b/recipes/mplayer/files/vo_pxa.c
new file mode 100644
index 0000000000..1488d14064
--- /dev/null
+++ b/recipes/mplayer/files/vo_pxa.c
@@ -0,0 +1,980 @@
+/*
+ * Video driver for PXA 27x Overlay 2, in conjunction with kernel driver
+ * by Tim Chick <tim (DOT) chick (AT) csr (DOT) com>
+ * (C) 2007
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "config.h"
+#include "video_out.h"
+#include "video_out_internal.h"
+#include "sub.h"
+#include "aspect.h"
+#include "mp_msg.h"
+#include "subopt-helper.h"
+
+#include "vo_pxa.h"
+
+static vo_info_t info = {   /* NOTE(review): field order is defined by vo_info_t in video_out.h - confirm */
+	"PXA 27x Framebuffer",                              /* presumably the descriptive driver name */
+	"pxa",                                              /* presumably the short name used to select the driver */
+	"Tim Chick <tim (DOT) chick (AT) csr (DOT) com>",   /* author, as in the file header */
+	"For Sharp Zaurus SL-C1000 etc"                     /* free-form remark */
+};
+
+LIBVO_EXTERN(pxa);
+
+static pxa_priv_t st_pxa_priv;  /* single driver state instance used by every function in this file */
+
+/*****************************************************************************
+ * preinit
+ *
+ * Preinitializes driver
+ *   arg - currently it's vo_subdevice
+ *   returns: zero on successful initialization, non-zero on error.
+ *
+ ****************************************************************************/
+static int preinit(const char *vo_subdevice)
+{
+    pxa_priv_t *priv = &st_pxa_priv;
+    int rc;
+
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: preinit() was called\n");
+
+    /* Reset the driver state. The memset leaves every descriptor at 0, which
+     * is a valid fd (stdin), so mark the ones opened later as "not open" */
+    memset(priv, 0, sizeof(*priv));
+    priv->fd = -1;
+    priv->overlay_fd = -1;
+
+    /* We need to open the base framebuffer device, to change and restore modes */
+    priv->base_fd = open( "/dev/fb0", O_RDWR );
+    if( priv->base_fd < 0 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: Could not open base framebuffer device\n");
+        return -1;
+    }
+
+    /* Get the base fb var data, so we can restore if we change video modes */
+    rc = ioctl( priv->base_fd, FBIOGET_VSCREENINFO, &(priv->base_orig_fb_var) );
+    if( rc == -1 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: FBIOGET_VSCREENINFO preinit base_fd failed %d\n",
+                errno );
+
+        /* If this failed, close down the FD so we don't try to set this again */
+        close( priv->base_fd );
+        priv->base_fd = -1;
+
+        return -1;
+    }
+
+    return 0;
+}
+
+
+/*****************************************************************************
+ * config
+ *
+ * Config the display driver.
+ * params:
+ *   src_width,src_height: image source size
+ *   dst_width,dst_height: size of the requested window size, just a hint
+ *   flags: VOFLAG_* bits; VOFLAG_MODESWITCHING requests the 240x320 mode switch
+ *   title: window title, if available
+ *   format: fourcc of pixel format
+ * returns : zero on successful initialization, non-zero on error.
+ *
+ ****************************************************************************/
+static int config(uint32_t src_width, uint32_t src_height,
+                  uint32_t dst_width, uint32_t dst_height, uint32_t flags,
+                  char *title, uint32_t format)
+{
+    pxa_priv_t *priv = &st_pxa_priv;
+    int rc;
+    
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: config() src_width:%d, src_height:%d, dst_width:%d, dst_height:%d\n",
+	   src_width, src_height, dst_width, dst_height);
+
+    /* Check format */
+    if( !vo_pxa_query_format(format) )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: unsupported fourcc for this driver: %x (%s)\n",
+                format, vo_format_name(format) );
+        goto err_out;
+    }
+    priv->format = format;
+    
+    /* Change resolution? Only when the caller asked for mode switching */
+    priv->vm = flags & VOFLAG_MODESWITCHING;
+    if( priv->vm )
+    {
+        priv->my_fb_var = priv->base_orig_fb_var;
+        
+        /* Hard coded values suck, never mind */
+        priv->my_fb_var.xres = 240;
+        priv->my_fb_var.yres = 320;
+        priv->my_fb_var.pixclock = 134617;
+        priv->my_fb_var.left_margin = 20;
+        priv->my_fb_var.right_margin = 46;
+        priv->my_fb_var.upper_margin = 1;
+        priv->my_fb_var.lower_margin = 0;
+        priv->my_fb_var.hsync_len = 20;
+        priv->my_fb_var.vsync_len = 2;
+    
+        rc = ioctl( priv->base_fd, FBIOPUT_VSCREENINFO, &(priv->my_fb_var) );
+        
+        if( rc == -1 )
+        {
+            mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: config() Set FBIOPUT_VSCREENINFO on base_fd failed %d\n",
+                    errno );
+            priv->vm = 0;
+            goto err_out;
+        }
+        
+        /* We need this sleep, to make the change in resolution actually happen, before we open the overlay */
+        sleep(1);
+    }
+ 
+   
+    /* Open up the overlay fbdev */
+    priv->fd = open( "/dev/fb2", O_RDWR );
+
+    if( priv->fd < 0 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: Could not open /dev/fb2: %d\n", errno );
+        goto err_out;
+    }
+    
+    /* Read in fb var data */
+    rc = ioctl( priv->fd, FBIOGET_VSCREENINFO, &(priv->my_fb_var) );
+    
+    if( rc == -1 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: config() FBIOGET_VSCREENINFO from fd failed %d\n",
+                errno );
+        goto err_out;
+    }
+    
+    /* Store away the source dimensions, so we can place in centre of screen later in vm mode */
+    priv->src_width = src_width;
+    priv->src_height = src_height;
+    
+    /* Set up the buffer */
+    if( priv->vm )
+    {
+        /* Ignore size, as the rest of the screen is toast. Use max size */
+        priv->my_fb_var.xres = 240;
+        priv->my_fb_var.yres = 320;
+
+        /* Landscape sources get rotated onto the portrait panel. Assign the
+         * flag either way so that a value left over from an earlier
+         * config() call cannot be picked up by draw_slice()/draw_osd() */
+        priv->rotate = ( priv->src_width > priv->src_height );
+
+        priv->width  = 240;
+        priv->height = 320;
+    }
+    else
+    {
+        /* No mode switch: never rotate (this also clears any stale flag) */
+        priv->rotate = 0;
+        priv->my_fb_var.xres = src_width;
+        priv->my_fb_var.yres = src_height;
+        priv->width = src_width;
+        priv->height = src_height;
+    }
+    
+    priv->my_fb_var.nonstd = ( 4 << 20)  /* Format YV12 */
+                           | ( 0 <<  0)  /* x position */
+                           | ( 0 << 10); /* y position */ 
+    /* We have to set the bits per pixel to a valid value, even though it is
+     * incorrect for YV12
+     */
+    priv->my_fb_var.bits_per_pixel = 16;
+    
+    rc = ioctl( priv->fd, FBIOPUT_VSCREENINFO, &(priv->my_fb_var) );
+    
+    if( rc == -1 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: config() FBIOPUT_VSCREENINFO to fd failed: %d\n",
+                errno );
+        goto err_out;
+    }
+        
+    /* Next get the fixed fbvars, so we can mmap the data for all 3 planes */
+    rc = ioctl( priv->fd, FBIOGET_FSCREENINFO, &(priv->my_fb_fix) );
+    
+    if( rc == -1 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: config() FBIOGET_FSCREENINFO from fd failed: %d\n", 
+                errno );
+        goto err_out;
+    }
+    
+    priv->fb_mem_base = mmap( NULL, priv->my_fb_fix.smem_len, (PROT_READ | PROT_WRITE ),
+                        MAP_SHARED,
+                        priv->fd,
+                        0 );
+
+    if( priv->fb_mem_base == MAP_FAILED )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: mmap fd buffer failed: %d\n", errno );
+        goto err_out;
+    }
+
+    /* Finally, find the offsets of each plane by getting the var data again */
+    rc = ioctl( priv->fd, FBIOGET_VSCREENINFO, &(priv->my_fb_var) );
+    
+    if( rc == -1 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: config() FBIOGET_VSCREENINFO from fd (2) failed %d\n", 
+                errno );
+        goto err_out;
+    }
+    
+    /* Fill the overlay with black (red/green/blue describe the Y/U/V planes here) */
+    memset( priv->fb_mem_base + priv->my_fb_var.red.offset, 16, priv->my_fb_var.red.length );
+    memset( priv->fb_mem_base + priv->my_fb_var.green.offset, 128, priv->my_fb_var.green.length );
+    memset( priv->fb_mem_base + priv->my_fb_var.blue.offset, 128, priv->my_fb_var.blue.length );
+    
+    /* Now open the OSD overlay - overlay 1, and fill with transparent */
+    sleep( 1 );
+    
+    priv->overlay_fd = open( "/dev/fb1", O_RDWR );
+
+    if( priv->overlay_fd < 0 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: Could not open /dev/fb1: %d\n", errno );
+        goto err_out;
+    }
+    
+    /* Read in fb var data */
+    rc = ioctl( priv->overlay_fd, FBIOGET_VSCREENINFO, &(priv->osd_fb_var) );
+    
+    if( rc == -1 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: config() FBIOGET_VSCREENINFO from overlay_fd failed; %d\n",
+                errno );
+        goto err_out;
+    }
+
+    priv->osd_fb_var.xres = priv->width;
+    priv->osd_fb_var.yres = priv->height;
+    priv->osd_fb_var.nonstd = ( 0 <<  0)  /* x position */
+                            | ( 0 << 10); /* y position */
+    /* Use 15 bit mode, with top bit transparency */
+    priv->osd_fb_var.bits_per_pixel = 16;
+    
+    rc = ioctl( priv->overlay_fd, FBIOPUT_VSCREENINFO, &(priv->osd_fb_var) );
+    
+    if( rc == -1 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: config() FBIOPUT_VSCREENINFO to overlay_fd failed: %d\n",
+                errno );
+        goto err_out;
+    }
+        
+    /* Next get the fixed fbvars, so we can mmap the data */
+    rc = ioctl( priv->overlay_fd, FBIOGET_FSCREENINFO, &(priv->osd_fb_fix) );
+    
+    if( rc == -1 )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: config() FBIOGET_FSCREENINFO from overlay_fd failed %d\n",
+                errno );
+        goto err_out;
+    }
+    
+    priv->osd_mem_base = mmap( NULL, priv->osd_fb_fix.smem_len, (PROT_READ | PROT_WRITE ),
+                               MAP_SHARED,
+                               priv->overlay_fd,
+                               0 );
+
+    if( priv->osd_mem_base == MAP_FAILED )
+    {
+        mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: mmap osd_mem_base failed: %d\n", errno );
+        goto err_out;
+    }
+    
+    /* Fill the overlay with transparent */
+    vo_pxa_clear_osd( priv->osd_mem_base, priv->osd_fb_fix.smem_len );
+
+    /* We are good to go! */
+    mp_msg( MSGT_VO, MSGL_V, "vo_pxa: Opened video overlay %d x %d fourcc %s\n",
+            priv->my_fb_var.xres,
+            priv->my_fb_var.yres,
+            vo_format_name(format) );
+    
+    return 0;
+    
+    err_out:
+
+    /* No cleanup is done here for the moment: anything opened or mapped
+     * above stays recorded in priv */
+    return -1;
+}
+
+
+/*****************************************************************************
+ *
+ * control
+ *
+ * Control display
+ *
+ ****************************************************************************/
+static int control(uint32_t request, void *data, ...)
+{
+    /* Trace every request at verbose level before dispatching it */
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: control %08x\n", request );
+
+    /* Format queries are the only request answered here; data then points
+     * at the fourcc being asked about */
+    if( request == VOCTRL_QUERY_FORMAT )
+    {
+        return vo_pxa_query_format( *(uint32_t *)data );
+    }
+    return VO_NOTIMPL;
+}
+
+
+/*****************************************************************************
+ *
+ * draw_frame
+ *
+ * Display a new RGB/BGR frame of the video to the screen.
+ * params:
+ *   src[0] - pointer to the image
+ *
+ ****************************************************************************/
+int draw_frame(uint8_t *src[])
+{
+    (void)src;  /* whole-frame drawing is not implemented; the image is never read */
+    mp_msg(MSGT_VO, MSGL_ERR, "vo_pxa: dummy draw_frame() was called\n");
+    return -1;
+}
+
+
+/*****************************************************************************
+ *
+ * draw_slice
+ *
+ * Draw a planar YUV slice to the buffer:
+ * params:
+ *   src[3] = source image planes (Y,U,V)
+ *   stride[3] = source image planes line widths (in bytes)
+ *   w,h = width*height of area to be copied (in Y pixels)
+ *   x,y = position at the destination image (in Y pixels)
+ *
+ ****************************************************************************/
+int draw_slice(uint8_t *src[], int stride[], int w,int h, int x,int y)
+{
+    pxa_priv_t *priv = &st_pxa_priv;
+    
+    /* This routine is the only display routine actually implemented */
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: draw_slice() w %d h %d x %d y %d stride %d %d %d\n",
+           w, h, x, y, stride[0], stride[1], stride[2] );
+    
+    /* Three paths follow: vm mode with rotation, vm mode without rotation
+     * (both centre/crop against the hard-coded 240x320 panel), and the plain
+     * copy used without mode switching. Always returns 0.
+     */
+
+    /* In vm mode rotate if wider than long */
+    if( priv->vm )
+    {
+        /* Do we need to rotate? */
+        if( priv->rotate )
+        {
+            /* Yes, rotated version */
+            int dst_x_offset = 0;
+            int dst_y_offset = 0;
+            int src_x_offset = 0;
+            int src_y_offset = 0;
+        
+            /* Figure out  dst offset */
+            if( priv->src_width < 320 )
+            {
+                dst_x_offset = ( ( 320 -  priv->src_width ) / 2 );
+                /* Make it a multiple of 16 */
+                dst_x_offset &= ~(0xf);
+            }
+        
+            if( priv->src_height < 240 )
+            {
+                dst_y_offset = ( ( 240 -  priv->src_height ) / 2 );
+                /* Make it a multiple of 16 */
+                dst_y_offset &= ~(0xf);
+            }
+        
+            dst_x_offset += x;
+            dst_y_offset += y;
+        
+            if( ( dst_x_offset >= 320 ) || ( dst_y_offset >= 240 ) )
+            {
+                /* Nothing to do - drawing off the screen! */
+                return( 0 );
+            }
+        
+            /* Limit to drawable area */
+            if( ( w + dst_x_offset ) > 320 )
+            {
+                w = ( 320 - dst_x_offset );
+            }
+        
+            if( ( h + dst_y_offset ) > 240 )
+            {
+                h = ( 240 - dst_y_offset );
+            }
+            
+            /* And source offset */
+            if( priv->src_width > 320 )
+            {
+                src_x_offset = ( ( priv->src_width - 320 ) / 2 );
+                /* Make it a multiple of 16 */
+                src_x_offset &= ~(0xf);
+            }
+        
+            if( priv->src_height > 240 )
+            {
+                src_y_offset = ( ( priv->src_height - 240 ) / 2 );
+                /* Make it a multiple of 16 */
+                src_y_offset &= ~(0xf);
+            }
+            
+        
+            /* Y first */
+            vo_pxa_copy_and_rotate( src[0] + src_x_offset + (src_y_offset * stride[0]), stride[0],
+                                    priv->fb_mem_base + priv->my_fb_var.red.offset  + (240 * dst_x_offset) + (240 - dst_y_offset - h),
+                                    w, h, 240 );
+            /* Now U, then V: halve the x offset before scaling by the 120 byte chroma pitch */
+            vo_pxa_copy_and_rotate( src[1] + src_x_offset/2 + (src_y_offset/2 * stride[1]), stride[1],
+                                    priv->fb_mem_base + priv->my_fb_var.green.offset + (120 * (dst_x_offset/2)) + (120 - dst_y_offset/2 - h/2),
+                                    w/2, h/2, 120 );
+            vo_pxa_copy_and_rotate( src[2] + src_x_offset/2 + (src_y_offset/2 * stride[2]), stride[2],
+                                    priv->fb_mem_base + priv->my_fb_var.blue.offset + (120 * (dst_x_offset/2)) + (120 - dst_y_offset/2 - h/2),
+                                    w/2, h/2, 120 );
+        }
+        else
+        {
+            /* Don't rotate */
+            int i;
+            uint8_t *my_src;
+            uint8_t *dest;
+            int dst_x_offset = 0;
+            int dst_y_offset = 0;
+            int src_x_offset = 0;
+            int src_y_offset = 0;
+        
+            /* Figure out  dst offset */
+            if( priv->src_width < 240 )
+            {
+                dst_x_offset = ( ( 240 -  priv->src_width ) / 2 );
+                /* Make it a multiple of 16 */
+                dst_x_offset &= ~(0xf);
+            }
+        
+            if( priv->src_height < 320 )
+            {
+                dst_y_offset = ( ( 320 -  priv->src_height ) / 2 );
+                /* Make it a multiple of 16 */
+                dst_y_offset &= ~(0xf);
+            }
+        
+            dst_x_offset += x;
+            dst_y_offset += y;
+        
+            if( ( dst_x_offset >= 240 ) || ( dst_y_offset >= 320 ) )
+            {
+                /* Nothing to do - drawing off the screen! */
+                return( 0 );
+            }
+        
+            /* Limit to drawable area */
+            if( ( w + dst_x_offset ) > 240 )
+            {
+                w = ( 240 - dst_x_offset );
+            }
+        
+            if( ( h + dst_y_offset ) > 320 )
+            {
+                h = ( 320 - dst_y_offset );
+            }
+
+            /* And source offset */
+            if( priv->src_width > 240 )
+            {
+                src_x_offset = ( ( priv->src_width - 240 ) / 2 );
+                /* Make it a multiple of 16 */
+                src_x_offset &= ~(0xf);
+            }
+        
+            if( priv->src_height > 320 )
+            {
+                src_y_offset = ( ( priv->src_height - 320 ) / 2 );
+                /* Make it a multiple of 16 */
+                src_y_offset &= ~(0xf);
+            }
+            
+            /* First Y */
+            for( i = 0; i<h; i++ )
+            {
+                dest = priv->fb_mem_base + 
+                    priv->my_fb_var.red.offset + 
+                    ( (dst_y_offset+i) * priv->my_fb_fix.line_length ) +
+                    dst_x_offset;
+                my_src = src[0] + src_x_offset + (stride[0] * (i+src_y_offset));
+                memcpy( dest, my_src, w );
+            }
+
+            /* Now U */
+            for( i = 0; i<(h/2); i++ )
+            {
+                dest = priv->fb_mem_base + 
+                    priv->my_fb_var.green.offset + 
+                    ( ((dst_y_offset/2)+i) * (priv->my_fb_fix.line_length/2) ) +
+                    dst_x_offset/2;
+                my_src = src[1] + src_x_offset/2 + (stride[1] * (i+(src_y_offset/2)));
+                memcpy( dest, my_src, w/2 );
+            }
+    
+            /* Finally V */
+            for( i = 0; i<(h/2); i++ )
+            {
+                dest = priv->fb_mem_base + 
+                    priv->my_fb_var.blue.offset + 
+                    ( ((dst_y_offset/2)+i) * (priv->my_fb_fix.line_length/2) ) +
+                    dst_x_offset/2;
+                my_src = src[2] + src_x_offset/2 + (stride[2] * (i+(src_y_offset/2)));
+                memcpy( dest, my_src, w/2 );
+            }
+            
+        }
+    }
+    else
+    {
+        /* Not vm (mode switching) mode */
+        uint8_t *my_src;
+        uint8_t *dest;
+        size_t length;
+        int i;
+        
+        /* It would be faster to check if source and dest have same geometry and copy
+         * whole block
+         * For the moment we just copy a line at a time
+         */
+
+        /* Limit area written to */
+        if( x >= priv->my_fb_fix.line_length )
+        {
+            return 0;
+        }
+    
+        if( w + x > priv->my_fb_fix.line_length )
+        {
+            w = priv->my_fb_fix.line_length - x;
+        }
+    
+        if( y>= priv->my_fb_var.yres )
+        {
+            return 0;
+        }
+    
+        if( h + y > priv->my_fb_var.yres )
+        {
+            h = priv->my_fb_var.yres - y;
+        }
+    
+        /* First Y */
+        for( i = 0; i<h; i++ )
+        {
+            dest = priv->fb_mem_base + 
+                priv->my_fb_var.red.offset + 
+                ( (y+i) * priv->my_fb_fix.line_length ) +
+                x;
+            my_src = src[0] + stride[0] * i;
+            memcpy( dest, my_src, w );
+        }
+
+        /* Now U (the chroma planes are half width, so the x offset is halved too) */
+        for( i = 0; i<(h/2); i++ )
+        {
+            dest = priv->fb_mem_base + 
+                priv->my_fb_var.green.offset + 
+                ( ((y/2)+i) * (priv->my_fb_fix.line_length/2) ) +
+                x/2;
+            my_src = src[1] + stride[1] * i;
+            memcpy( dest, my_src, w/2 );
+        }
+    
+        /* Finally V */
+        for( i = 0; i<(h/2); i++ )
+        {
+            dest = priv->fb_mem_base + 
+                priv->my_fb_var.blue.offset + 
+                ( ((y/2)+i) * (priv->my_fb_fix.line_length/2) ) +
+                x/2;
+            my_src = src[2] + stride[2] * i;
+            memcpy( dest, my_src, w/2 );
+        }
+    }
+    return 0;
+}
+
+static void draw_osd(void)
+{
+    pxa_priv_t *priv = &st_pxa_priv;
+    void (*paint)( int, int, int, int, unsigned char *, unsigned char *, int );
+
+    /* This runs for every frame so that drivers without a separate OSD
+     * overlay can mix the OSD into the picture. Here the OSD has an overlay
+     * of its own, so it is only redrawn when vo_update_osd() reports that
+     * something changed.
+     */
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: draw_osd() was called\n");
+
+    if( !vo_update_osd( priv->width, priv->height) )
+    {
+        /* Unchanged: keep whatever the OSD overlay currently shows */
+        return;
+    }
+
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: Clear and update OSD\n");
+
+    /* Reset the whole OSD overlay to transparent and note that we did */
+    vo_pxa_clear_osd( priv->osd_mem_base, priv->osd_fb_fix.smem_len );
+    priv->osd_cleared = 1;
+
+    /* Repaint using the blitter that matches the current orientation */
+    if( priv->rotate )
+    {
+        paint = vo_pxa_draw_alpha_with_rotate;
+    }
+    else
+    {
+        paint = vo_pxa_draw_alpha;
+    }
+    vo_draw_text( priv->width, priv->height, paint );
+}
+
+/*****************************************************************************
+ *
+ * flip_page
+ *
+ * Blit/Flip buffer to the screen. Must be called after each frame!
+ *
+ *
+ ****************************************************************************/
+static void flip_page(void)
+{   /* Only traces the call: nothing is blitted or swapped in this function */
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: flip_page() was called\n");
+}
+
+/*****************************************************************************
+ *
+ * check_events
+ *
+ *
+ ****************************************************************************/
+static void check_events(void)
+{   /* Only traces the call: no events are polled or handled here */
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: check_events() was called\n");
+}
+
+/*****************************************************************************
+ *
+ * uninit
+ *
+ *
+ ****************************************************************************/
+static void uninit(void)
+{
+    pxa_priv_t *priv = &st_pxa_priv;
+    int rc;
+
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: uninit() was called\n");
+
+    if( priv->vm )
+    {
+        /* We need these sleeps, to make the change in resolution actually happen */
+        sleep(1);
+
+        /* Restore original resolution */
+        if( priv->base_fd >= 0 )
+        {
+            rc = ioctl( priv->base_fd, FBIOPUT_VSCREENINFO, &(priv->base_orig_fb_var) );
+            if( rc == -1 )
+            {
+                mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: uninit() FBIOPUT_VSCREENINFO to base_fd failed %d\n", errno );
+            }
+        }
+        /* We need these sleeps, to make the change in resolution actually happen */
+        /* For some reason, if we change the resolution the overlay buffer never gets deleted? */
+        sleep(1);
+    }
+
+    /* We need to force the overlays to be really disabled, otherwise they
+     * will come back as zombies after suspend, resume
+     * This trick seems to work, but will not be needed once kernel driver
+     * is fixed
+     */
+    if( priv->fd >= 0 )
+    {
+        rc = ioctl( priv->fd, FBIOGET_VSCREENINFO, &(priv->my_fb_var) );
+        if( rc == -1 )
+        {
+            mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: uninit() FBIOGET_VSCREENINFO from fd failed %d\n", errno );
+        }
+        priv->my_fb_var.bits_per_pixel = 0;
+        rc = ioctl( priv->fd, FBIOPUT_VSCREENINFO, &(priv->my_fb_var) );
+        if( rc == -1 )
+        {
+            mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: uninit() FBIOPUT_VSCREENINFO from fd failed %d\n", errno );
+        }
+        /* Drop the video overlay mapping and descriptor opened by config() */
+        if( ( priv->fb_mem_base != NULL ) && ( priv->fb_mem_base != MAP_FAILED ) )
+            munmap( priv->fb_mem_base, priv->my_fb_fix.smem_len );
+        priv->fb_mem_base = NULL;
+        close( priv->fd );
+        priv->fd = -1;
+    }
+
+    /* priv is zero-filled by preinit()'s memset, so treat 0 as "never opened" rather than as stdin */
+    if( priv->overlay_fd > 0 )
+    {
+        rc = ioctl( priv->overlay_fd, FBIOGET_VSCREENINFO, &(priv->my_fb_var) );
+        if( rc == -1 )
+        {
+            mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: uninit() FBIOGET_VSCREENINFO from overlay_fd failed %d\n", errno );
+        }
+        priv->my_fb_var.bits_per_pixel = 0;
+        rc = ioctl( priv->overlay_fd, FBIOPUT_VSCREENINFO, &(priv->my_fb_var) );
+        if( rc == -1 )
+        {
+            mp_msg( MSGT_VO, MSGL_ERR, "vo_pxa: uninit() FBIOPUT_VSCREENINFO from overlay_fd failed %d\n", errno );
+        }
+        /* Drop the OSD overlay mapping and descriptor opened by config() */
+        if( ( priv->osd_mem_base != NULL ) && ( priv->osd_mem_base != MAP_FAILED ) )
+            munmap( priv->osd_mem_base, priv->osd_fb_fix.smem_len );
+        priv->osd_mem_base = NULL;
+        close( priv->overlay_fd );
+    }
+    priv->overlay_fd = -1;
+
+    if( priv->base_fd >= 0 )
+    {
+        close( priv->base_fd );
+        priv->base_fd = -1;
+    }
+}
+
+/*****************************************************************************
+ *
+ * Internal functions, not part of mplayer API
+ *
+ ****************************************************************************/
+
+static int vo_pxa_query_format( uint32_t format )
+{
+    /* What is reported for every format this driver accepts */
+    const int planar_caps = VFCAP_CSP_SUPPORTED | VFCAP_CSP_SUPPORTED_BY_HW
+                          | VFCAP_HWSCALE_UP | VFCAP_HWSCALE_DOWN | VFCAP_OSD
+                          | VFCAP_ACCEPT_STRIDE;
+
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: vo_pxa_query_format was called: %x (%s)\n",
+	   format, vo_format_name(format));
+
+    /* Planar YUV Formats: all three are treated alike */
+    if( ( format == IMGFMT_YV12 ) ||
+        ( format == IMGFMT_IYUV ) ||
+        ( format == IMGFMT_I420 ) )
+    {
+        return planar_caps;
+    }
+
+    return 0;   /* every other fourcc is rejected */
+}
+
+static void vo_pxa_copy_and_rotate( uint8_t *src, int stride, uint8_t *dst, int w, int h, int dst_stride )
+{
+    /* Source column j, read from its last row up to its first, becomes the h
+     * bytes at dst + j * dst_stride. Eight rows at a time are packed into a
+     * Vo_Pxa_Pixel_Data8 (first row in the low byte of .a) and stored at once;
+     * the h % 8 leftover rows are copied bytewise, which matches the packed
+     * layout on a little-endian target, so nothing outside w x h is touched. */
+    const int full_groups = h / 8;
+    int i,j;
+
+    for( j=0; j<w; j++ )
+    {
+        const uint8_t *my_src = src + j + ( stride * (h - 1) );
+        Vo_Pxa_Pixel_Data8 *img_dst_pixel_data8 = (Vo_Pxa_Pixel_Data8 *)dst;
+        uint8_t *my_dst;
+
+        for( i=0; i<full_groups; i++ )
+        {
+            register Vo_Pxa_Pixel_Data8 build_pixels;
+            /* Widen each sample before shifting: a uint8_t promotes to
+             * signed int, and shifting a value >= 128 left by 24 overflows */
+            build_pixels.a  = (unsigned int)*my_src;        my_src -= stride;
+            build_pixels.a |= (unsigned int)*my_src << 8;   my_src -= stride;
+            build_pixels.a |= (unsigned int)*my_src << 16;  my_src -= stride;
+            build_pixels.a |= (unsigned int)*my_src << 24;  my_src -= stride;
+            build_pixels.b  = (unsigned int)*my_src;        my_src -= stride;
+            build_pixels.b |= (unsigned int)*my_src << 8;   my_src -= stride;
+            build_pixels.b |= (unsigned int)*my_src << 16;  my_src -= stride;
+            build_pixels.b |= (unsigned int)*my_src << 24;  my_src -= stride;
+            *img_dst_pixel_data8++ = build_pixels;
+        }
+
+        /* Remaining 0..7 rows, one byte at a time */
+        my_dst = (uint8_t *)img_dst_pixel_data8;
+        for( i=full_groups*8; i<h; i++ )
+        {
+            *my_dst++ = *my_src;
+            my_src -= stride;
+        }
+
+        /* Allow source not as big as dest */
+        dst += dst_stride;
+    }
+}
+
+static void vo_pxa_draw_alpha( int x, int y, int w, int h, unsigned char *src,
+                               unsigned char *srca, int stride )
+{
+    /* Dump data into our 15bit buffer with transparency: grey src pixels
+     * whose srca entry is non-zero are drawn, all others become 0x8000 */
+    pxa_priv_t *priv = &st_pxa_priv;
+    int i,j;
+    unsigned char *src_ptr;
+    unsigned char *a_ptr;
+    unsigned short *out_ptr;
+
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: vo_pxa_draw_alpha() x %d y %d w %d h %d\n", x, y, w, h );
+
+    /* Limit to the OSD overlay size, as the rotating variant does */
+    if( ( x >= priv->width ) || ( y >= priv->height ) )
+        return;
+    if( ( x + w ) > priv->width )  w = priv->width - x;
+    if( ( y + h ) > priv->height ) h = priv->height - y;
+
+    for( i=0; i<h; i++ )
+    {
+        out_ptr = priv->osd_mem_base + x + ( priv->width * ( y + i ) );
+        src_ptr = src + ( i * stride );
+        a_ptr = srca + ( i * stride );
+        for( j=0; j<w; j++ )
+        {
+            /* srca is only used as an on/off mask (0 = transparent) */
+            if( *a_ptr++ )
+            {
+                /* Top 5 bits of the 0-255 grey level go into all three fields */
+                unsigned int grey = *src_ptr >> 3;
+                *out_ptr++ = grey | (grey << 5) | (grey<<10);
+            }
+            else
+            {
+                *out_ptr++ = 0x8000;
+            }
+            src_ptr++;
+        }
+    }
+}
+
+static void vo_pxa_draw_alpha_with_rotate( int x, int y, int w, int h, unsigned char *src,
+                                           unsigned char *srca, int stride )
+{
+    /* Dump data into our 15bit buffer with transparency, rotated: every
+     * source column is walked from its last row up to its first and written
+     * as one run of consecutive OSD pixels. The run for column x+col starts
+     * at osd_mem_base + y + priv->width * (x + col).
+     */
+    pxa_priv_t *priv = &st_pxa_priv;
+    unsigned short *out_ptr;
+    int col, row;
+    int in_off;
+
+    mp_msg(MSGT_VO, MSGL_V, "vo_pxa: vo_pxa_draw_alpha_with_rotate() x %d y %d w %d h %d\n", x, y, w, h );
+
+    /* NOTE(review): 320 and 240 are hard-coded while the pitch below uses
+     * priv->width - presumably the vm-mode 240x320 overlay; confirm */
+    /* Boxes starting beyond that area are dropped whole */
+    if( x >= 320 )
+    {
+        return;
+    }
+
+    if( y >= 240 )
+    {
+        return;
+    }
+
+    /* Limit to size of screen/memory */
+    if( ( w + x ) > 320 )
+    {
+        w = 320 - x;
+    }
+
+    if( ( y + h ) > 240 )
+    {
+        h = 240 - y;
+    }
+
+    /* We ignore the alpha channel, other than off or on */
+    for( col=0; col<w; col++ )
+    {
+        /* Both inputs share one offset: start on the last row of this column */
+        in_off = col + ( stride * (h - 1) );
+        out_ptr = priv->osd_mem_base + y + ( priv->width * ( x + col ) );
+
+        for( row=0; row<h; row++ )
+        {
+            /* srca is a 0-255 transparency level where 0 is transparent; it is
+             * only used as an on/off switch. 0x8000 is the transparent value.
+             */
+            unsigned short pixel = 0x8000;
+
+            if( srca[in_off] )
+            {
+                /* src is a 0-255 grey level: its top five bits are placed
+                 * at bit offsets 0, 5 and 10 */
+                unsigned int grey = src[in_off] >> 3;
+                pixel = grey | (grey << 5) | (grey<<10);
+            }
+            *out_ptr++ = pixel;
+
+            /* Step one source row up for the next output pixel */
+            in_off -= stride;
+        }
+    }
+}
+
+static void vo_pxa_clear_osd( uint16_t *mem_base, int len )
+{
+    /* Fill the area with 0x8000 (transparent): each 16-byte block receives
+     * four words of 0x80008000. mem_base is assumed to be word aligned and a
+     * trailing partial block is left alone. Hand-written because the compiler
+     * kept emitting ldmia-from-stack + stmia instead of a plain stmia loop.
+     * The block count is used as-is (one store per block); buffers shorter
+     * than one block are rejected, as the down-counter would wrap otherwise. */
+    if( len < 16 ) return;
+    __asm__ __volatile__ (
+        "mov        r4, %0              \n\t"   /* r4 = write pointer            */
+        "mov        r5, %1, lsr #4      \n\t"   /* r5 = number of 16-byte blocks */
+        "mov        r0, #0x80000000     \n\t"
+        "orr        r0, r0, #0x00008000 \n\t"   /* r0 = two transparent pixels   */
+        "mov        r1, r0              \n\t"
+        "mov        r2, r0              \n\t"
+        "mov        r3, r0              \n\t"
+        "1:                             \n\t"
+        "subs       r5, r5, #1          \n\t"   /* stmia leaves the flags alone  */
+        "stmia      r4!, {r0, r1, r2, r3} \n\t"
+        "bne        1b                  \n\t"
+        : 
+        : "r"(mem_base), "r"(len)
+        : "memory", "r0", "r1", "r2", "r3", "r4", "r5", "cc" );
+}
diff --git a/recipes/mplayer/files/vo_pxa.h b/recipes/mplayer/files/vo_pxa.h
new file mode 100644
index 0000000000..31cc1a7862
--- /dev/null
+++ b/recipes/mplayer/files/vo_pxa.h
@@ -0,0 +1,51 @@
+/*
+ * Video driver for PXA 27x Overlay 2, in conjunction with kernel driver
+ * by Tim Chick <tim (DOT) chick (AT) csr (DOT) com>
+ * (C) 2007
+ */
+
+#include <linux/fb.h>
+
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+typedef struct pxa_priv_s {	/* per-driver state of the PXA 27x overlay video output */
+    uint8_t *fb_mem_base;	/* NOTE(review): presumably the mapped overlay video memory - confirm in vo_pxa.c */
+    uint16_t *osd_mem_base;	/* 16-bit OSD pixels; same pointer type vo_pxa_clear_osd() takes */
+    
+    int fd;			/* NOTE(review): the three descriptors presumably map to the */
+    int base_fd;		/* video overlay, base framebuffer and OSD overlay devices - */
+    int overlay_fd;		/* confirm which is which in vo_pxa.c */
+    struct fb_var_screeninfo my_fb_var;		/* <linux/fb.h> variable screen info */
+    struct fb_fix_screeninfo my_fb_fix;		/* <linux/fb.h> fixed screen info */
+    struct fb_var_screeninfo base_orig_fb_var;	/* presumably the base mode saved for restore - TODO confirm */
+    struct fb_var_screeninfo osd_fb_var;
+    struct fb_fix_screeninfo osd_fb_fix;   
+    int vm;
+    uint32_t format;		/* presumably the fourcc of the source image - TODO confirm */
+    int src_width;
+    int src_height;
+    int width;
+    int height;
+    int rotate;
+    int osd_cleared;		/* presumably set once the OSD has been wiped - TODO confirm */
+} pxa_priv_t;
+
+typedef struct vo_pxa_pixel_data8 {
+    unsigned int a,b;
+} Vo_Pxa_Pixel_Data8;
+
+#define UNUSED(v) ((void)(v))
+
+/* Internal API */
+static int vo_pxa_query_format( uint32_t format );
+static void vo_pxa_copy_and_rotate( uint8_t *src, int stride, uint8_t *dst, int w, int h, int dst_stride );
+static void vo_pxa_draw_alpha( int x, int y, int w, int h, unsigned char *src,
+                               unsigned char *srca, int stride );
+static void vo_pxa_draw_alpha_with_rotate( int x, int y, int w, int h, unsigned char *src,
+                                           unsigned char *srca, int stride );
+
+static void vo_pxa_clear_osd( uint16_t *mem_base, int len );
diff --git a/recipes/mplayer/files/vo_w100.c b/recipes/mplayer/files/vo_w100.c
new file mode 100644
index 0000000000..702707c656
--- /dev/null
+++ b/recipes/mplayer/files/vo_w100.c
@@ -0,0 +1,947 @@
+/*
+ * Video driver for ATI Imageon 100 (w100)
+ * by AGAWA Koji <i (AT) atty (DOT) jp>
+ * (C) 2004
+ */
+/* English in this source code is written by machine translation.
+   Meaning also not leading, permitting. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config.h"
+#include "video_out.h"
+#include "video_out_internal.h"
+#include "sub.h"
+#include "aspect.h"
+#include "mp_msg.h"
+#include "subopt-helper.h"
+
+#include "vo_w100_api.h"
+#include "vo_w100_fb.h"
+
+#define UNUSED(v) ((void)(v))
+
+static vo_info_t info = {
+	"ATI Imageon 100",
+	"w100",
+	"AGAWA Koji <i (AT) atty (DOT) jp>",
+	"for Sharp Linux Zaurus SL-C700/750/760/860"
+};
+
+LIBVO_EXTERN(w100);
+
+// ----------------------------------------------------------------
+#define MAX_FRAMES 20
+typedef struct vidix_yuv_s
+{
+	unsigned y,u,v;
+}vidix_yuv_t;
+
+typedef struct vidix_rect_s
+{
+	unsigned x,y,w,h;	/* in pixels */
+	vidix_yuv_t pitch;	/* line-align in bytes */
+}vidix_rect_t;
+
+typedef struct w100_yuv_planes_s {
+	uint8_t *y;
+	uint8_t *u;
+	uint8_t *v;
+} w100_yuv_planes_t;
+
+typedef struct w100_priv_s {
+    uint32_t format;		/* fourcc accepted by config() */
+    int src_width;		/* source size as stored by config(), i.e. after its rotate swap */
+    int src_height;
+    int nframes;		/* total num of frames (as many as config() could place in VRAM) */
+    int current_frame;		/* current frame to display */
+    int rotate;			/* "rotate" sub-option; preinit() presets -1, which makes config() choose */
+    int current_rotate;		/* rotation config() settled on; it accepts only 0 or 3 */
+
+    /* w100 info */
+    int vram_size[2];		/* sizes from GetAvailableVideoMem(), indexed INTERNAL_VRAM / EXTERNAL_VRAM */
+    void *vram_addr[2];		/* address w100_offset2addr() returned for each VRAM */
+    w100_yuv_planes_t frame_addrs[MAX_FRAMES];	/* per-frame plane pointers the CPU writes through */
+    w100_yuv_planes_t frame_offsets[MAX_FRAMES];	/* per-frame plane VRAM offsets, handed to GetRealMemAddr() */
+    int is_graphic_window_enabled;	/* passed to AtiCore_SetGraphicWindowOnOff() */
+    int eq_brightness;		/* for mplayer: equalizer value * 10, see control() */
+    int display_brightness;	/* for w100: passed to AtiCore_SetDisplayBrightness() */
+
+    /* overlay info */
+    uint16_t overlay_handle;	/* filled in by AtiCore_AllocOverlay() */
+    ATI_OVERLAYPROP overlay_prop;	/* filled in by config(), passed to AtiCore_SetupOverlay() */
+    int overlay_pos_x;		/* passed to AtiCore_SetOverlayPos() */
+    int overlay_pos_y;
+    int overlay_expand_h;	/* written to the video_hor_exp field of mmVIDEO_CTRL */
+    int overlay_expand_v;	/* written to the video_ver_exp field of mmVIDEO_CTRL */
+    int overlay_pitch_y;	/* config() sets all three to 16 and aligns dstrides to them */
+    int overlay_pitch_u;
+    int overlay_pitch_v;
+    video_y_offset_u video_y_offset;	/* register images written by w100_set_yuv_addrs() */
+    video_u_offset_u video_u_offset;
+    video_v_offset_u video_v_offset;
+} w100_priv_t;
+
+static w100_priv_t st_w100_priv;
+static vidix_yuv_t dstrides;
+
+static int test_rotate(int *arg)
+{
+    /* Sub-option check: 1 when the rotate value lies in -1..3, else 0. */
+    const int rot = *arg;
+    return (rot >= -1 && rot <= 3) ? 1 : 0;
+}
+
+static opt_t subopts[] = {
+    { "rotate", OPT_ARG_INT, &st_w100_priv.rotate, (opt_test_f)test_rotate },
+    { NULL }
+};
+
+static void draw_alpha(int x0,int y0, int w,int h,
+		       unsigned char* src, unsigned char *srca, int stride)	/* OSD blend callback handed to vo_draw_text() */
+{
+    w100_priv_t *priv = &st_w100_priv;
+    uint8_t *psrc, *psrca, *pdst;
+    pdst = priv->frame_addrs[priv->current_frame].y;	/* only the Y plane is touched */
+    pdst += (x0 * priv->overlay_prop.SrcPitch) + (priv->overlay_prop.SrcPitch - 1 - y0);	/* OSD x picks the frame row, OSD y the column counted back from the row end */
+    psrc = src;
+    psrca = srca;
+    while (h--) {	/* one pass per OSD row */
+	int j;
+	for (j = 0; j < w; ++j) {
+	    if (psrca[j])	/* a zero srca byte leaves the frame pixel untouched */
+		pdst[j * priv->overlay_prop.SrcPitch] =
+		    ((pdst[j * priv->overlay_prop.SrcPitch] * psrca[j]) >> 8) + psrc[j];	/* dst = dst * srca / 256 + src */
+	}
+	psrc += stride;
+	psrca += stride;
+	pdst -= 1;	/* next OSD row lands one frame column to the left */
+    }
+#if 0	/* disabled: per-format blend through the generic vo_draw_alpha_* helpers */
+    w100_priv_t *priv = &st_w100_priv;
+    uint32_t apitch, bespitch;
+    void *lvo_mem;
+    lvo_mem = priv->frame_addrs[priv->current_frame].y;
+    apitch = priv->overlay_pitch_y - 1;
+    switch (priv->format) {
+    case IMGFMT_YV12:
+    case IMGFMT_IYUV:
+    case IMGFMT_I420:
+    case IMGFMT_YVU9:
+    case IMGFMT_IF09:
+    case IMGFMT_Y8:
+    case IMGFMT_Y800:
+	bespitch = (priv->src_width + apitch) & (~apitch);
+	vo_draw_alpha_yv12(w,h,src,srca,stride,lvo_mem+bespitch*y0+x0,bespitch);
+	break;
+    case IMGFMT_YUY2:
+	bespitch = (priv->src_width*2 + apitch) & (~apitch);
+	vo_draw_alpha_yuy2(w,h,src,srca,stride,lvo_mem+bespitch*y0+2*x0,bespitch);
+	break;
+    case IMGFMT_UYVY:
+	bespitch = (priv->src_width*2 + apitch) & (~apitch);
+	vo_draw_alpha_yuy2(w,h,src,srca,stride,lvo_mem+bespitch*y0+2*x0+1,bespitch);
+	break;
+    case IMGFMT_RGB32:
+    case IMGFMT_BGR32:
+	bespitch = (priv->src_width*4 + apitch) & (~apitch);
+	vo_draw_alpha_rgb32(w,h,src,srca,stride,lvo_mem+y0*bespitch+4*x0,bespitch);
+	break;
+    case IMGFMT_RGB24:
+    case IMGFMT_BGR24:
+	bespitch = (priv->src_width*3 + apitch) & (~apitch);
+	vo_draw_alpha_rgb24(w,h,src,srca,stride,lvo_mem+y0*bespitch+3*x0,bespitch);
+	break;
+    case IMGFMT_RGB16:
+    case IMGFMT_BGR16:
+	bespitch = (priv->src_width*2 + apitch) & (~apitch);
+	vo_draw_alpha_rgb16(w,h,src,srca,stride,lvo_mem+y0*bespitch+2*x0,bespitch);
+	break;
+    case IMGFMT_RGB15:
+    case IMGFMT_BGR15:
+	bespitch = (priv->src_width*2 + apitch) & (~apitch);
+	vo_draw_alpha_rgb15(w,h,src,srca,stride,lvo_mem+y0*bespitch+2*x0,bespitch);
+	break;
+    default:
+	return;
+    }
+#endif
+}
+
+/* Copy one unrotated 4:2:0 slice (w x h luma pixels at x,y) into the current frame; returns 0. */
+static uint32_t w100_draw_slice_420(uint8_t *image[], int stride[],
+				     int w, int h, int x, int y)
+{
+    w100_priv_t *priv = &st_w100_priv;
+    uint8_t *src, *dest;
+    int i;
+
+    /* image[0] -> y plane, full resolution */
+    dest = priv->frame_addrs[priv->current_frame].y;
+    dest += dstrides.y * y + x;
+    src = image[0];
+    for (i = 0; i < h; ++i) {
+	memcpy(dest, src, w);
+	src += stride[0];
+	dest += dstrides.y;
+    }
+
+    /* image[1] -> u plane: half resolution, so pitch, row and column are all halved */
+    dest = priv->frame_addrs[priv->current_frame].u;
+    dest += dstrides.v * y / 4 + x / 2;	/* (pitch / 2) * (y / 2) + x / 2 */
+    src = image[1];
+    for (i = 0; i < h / 2; ++i) {
+	memcpy(dest, src, w / 2);
+	src += stride[1];
+	dest += dstrides.v / 2;
+    }
+
+    /* image[2] -> v plane, same geometry (config() sets dstrides.u == dstrides.v) */
+    dest = priv->frame_addrs[priv->current_frame].v;
+    dest += dstrides.u * y / 4 + x / 2;
+    src = image[2];
+    for (i = 0; i < h / 2; ++i) {
+	memcpy(dest, src, w / 2);
+	src += stride[2];
+	dest += dstrides.u / 2;
+    }
+
+    return 0;
+}
+
+/*
+ * Draw a 4:2:0 slice turned by 90 degrees: each source row goes down one frame column.
+ * w must be a positive multiple of 8 and h at least 2; other slices are skipped. Returns 0.
+ */
+static uint32_t w100_draw_slice_420_rotate3(uint8_t *image[], int stride[],
+					    int w, int h, int x, int y)
+{
+    w100_priv_t *priv = &st_w100_priv;
+    void *src, *dest;
+    int i, dpitch2 = 0, h_ = h;		/* h_ remembers the luma height */
+
+    if (w <= 0 || (w & 7) || h < 2)	/* the counted-down asm loops would miss zero and run away */
+	return 0;
+    for (i = 0; i < 3; ++i) {
+	src = image[i];
+	switch (i) {
+	case 0:
+	    dest = priv->frame_addrs[priv->current_frame].y;
+	    dest += dstrides.y * x + dstrides.y - y;
+	    dpitch2 = dstrides.y << 1;	/* two luma rows per store pair */
+	    break;
+	case 1:
+	    dest = priv->frame_addrs[priv->current_frame].u;
+	    dest += (dstrides.y >> 1) * (x >> 1) + (dstrides.y >> 1) - (y >> 1);
+	    dpitch2 = dstrides.y;	/* chroma pitch is half the luma pitch */
+	    h = h_ >> 1;
+	    w >>= 1;			/* stays halved for plane 2 as well */
+	    break;
+	case 2:
+	    dest = priv->frame_addrs[priv->current_frame].v;
+	    dest += (dstrides.y >> 1) * (x >> 1) + (dstrides.y >> 1) - (y >> 1);
+	    h = h_ >> 1;
+	    dpitch2 = dstrides.y;
+	    break;
+	}
+
+	__asm__ __volatile__ (
+	    "1:					\n\t"	/* once per source row */
+	    "mov	r8, %[w]			\n\t"	/* r8 = bytes left in this row */
+	    "sub	%[dest], %[dest], #1		\n\t"	/* step one column to the left */
+	    "mov	r4, %[dest]			\n\t"	/* r4 = even destination rows */
+	    "add	r5, %[dest], %[dpitch2], lsr #1			\n\t"	/* r5 = odd rows */
+
+	    "2:					\n\t"	/* four source bytes per pass */
+	    "ldrb	r0, [%[src]]			\n\t"
+	    "ldrb	r1, [%[src], #1]		\n\t"
+	    "add	%[src], %[src], #2		\n\t"
+	    "strb	r0, [r4]			\n\t"
+	    "strb	r1, [r5]			\n\t"
+	    "add	r4, r4, %[dpitch2]		\n\t"
+	    "add	r5, r5, %[dpitch2]		\n\t"
+	    "ldrb	r0, [%[src]]			\n\t"
+	    "ldrb	r1, [%[src], #1]		\n\t"
+	    "add	%[src], %[src], #2		\n\t"
+	    "strb	r0, [r4]			\n\t"
+	    "strb	r1, [r5]			\n\t"
+	    "add	r4, r4, %[dpitch2]		\n\t"
+	    "add	r5, r5, %[dpitch2]		\n\t"
+	    "subs	r8, r8, #4			\n\t"
+	    "bne	2b				\n\t"
+	    "add	%[src], %[src], %[srcdiff]	\n\t"	/* skip the source row padding */
+	    "subs	%[h], %[h], #1			\n\t"
+	    "bne	1b				\n\t"
+	    : [src]"+&r"(src), [dest]"+&r"(dest), [h]"+&r"(h)	/* written while the inputs are still live */
+	    : [dpitch2]"r"(dpitch2), [w]"r"(w), [srcdiff]"r"(stride[i] - w)
+	    : "memory", "cc", "r0", "r1", "r4", "r5", "r8");	/* subs changes the flags */
+    }
+    return 0;
+}
+
+static uint32_t w100_draw_slice_packed(uint8_t *image[], int stride[],
+					int w, int h, int x, int y)	/* stub that config() installs for formats other than YV12/I420/IYUV */
+{
+#if 0	/* disabled copy loop; it refers to names (st_w100_mem, vidix_play, ...) this file never defines */
+    uint8_t *src;
+    uint8_t *dest;
+    int i;
+
+    dest = st_w100_mem + vidix_play.offsets[st_next_frame] + vidix_play.offset.y;
+    dest += dstrides.y * y + x;
+    src = image[0];
+    for (i = 0; i < h; ++i) {
+	memcpy(dest, src, w * st_image_bpp);
+	src += stride[0];
+	dest += dstrides.y;
+    }
+#endif
+    return 0;	/* nothing is drawn */
+}
+
+static uint32_t w100_get_image(mp_image_t *mpi)	/* VOCTRL_GET_IMAGE handler; direct rendering is compiled out */
+{
+#if 0	/* disabled direct-rendering path; it refers to names this file never defines */
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: w100_get_image called.\n");
+
+    if (mpi->type == MP_IMGTYPE_STATIC && st_num_frames > 1)
+	return VO_FALSE;
+    if (mpi->flags & MP_IMGFLAG_READABLE)
+	return VO_FALSE; /* slow video ram */
+    if (((mpi->stride[0] == dstrides.y &&
+	  (!(mpi->flags & MP_IMGFLAG_PLANAR) ||
+	   (mpi->stride[1] == dstrides.u && mpi->stride[2]==dstrides.v)))
+	 || (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH))) &&
+	(!(vidix_play.flags & VID_PLAY_INTERLEAVED_UV))) {
+	if (mpi->flags & MP_IMGFLAG_ACCEPT_WIDTH) {
+	    // check if only width is enough to represent strides:
+	    if (mpi->flags & MP_IMGFLAG_PLANAR) {
+		if ((dstrides.y >> 1) != dstrides.v || dstrides.v != dstrides.u)
+		    return VO_FALSE;
+	    } else {
+		if (dstrides.y % (mpi->bpp / 8))
+		    return VO_FALSE;
+	    }
+	}
+	mpi->planes[0] = st_w100_mem + vidix_play.offsets[st_next_frame]
+	    + vidix_play.offset.y;
+	mpi->width = mpi->stride[0] = dstrides.y;
+	if (mpi->flags & MP_IMGFLAG_PLANAR) {
+	    mpi->planes[1] = st_w100_mem + vidix_play.offsets[st_next_frame]
+		+ vidix_play.offset.v;
+	    mpi->stride[1] = dstrides.v >> mpi->chroma_x_shift;
+	    mpi->planes[2] = st_w100_mem + vidix_play.offsets[st_next_frame]
+		+ vidix_play.offset.u;
+	    mpi->stride[2] = dstrides.u >> mpi->chroma_x_shift;
+	} else
+	    mpi->width /= mpi->bpp / 8;
+	mpi->flags |= MP_IMGFLAG_DIRECT;
+	return VO_TRUE;
+    }
+#endif
+    return VO_FALSE;	/* the only live statement: every request is refused */
+}
+
+static void w100_set_yuv_addrs(w100_priv_t *priv, w100_yuv_planes_t *offsets)	/* point the overlay at one frame's three planes */
+{
+    uint32_t val;
+
+    priv->video_y_offset.f.y_offset = GetRealMemAddr((uint32_t)offsets->y);	/* offsets travel as pointers, hence the casts */
+    priv->video_u_offset.f.u_offset = GetRealMemAddr((uint32_t)offsets->u);
+    priv->video_v_offset.f.v_offset = GetRealMemAddr((uint32_t)offsets->v);
+    AtiCore_WriteReg(mmVIDEO_Y_OFFSET, (uint32_t *)&priv->video_y_offset);	/* the register images are kept in priv */
+    AtiCore_WriteReg(mmVIDEO_U_OFFSET, (uint32_t *)&priv->video_u_offset);
+    AtiCore_WriteReg(mmVIDEO_V_OFFSET, (uint32_t *)&priv->video_v_offset);
+
+    val = 0x7B;	/* NOTE(review): undocumented magic value - check against the W100 register description */
+    AtiCore_WriteReg(mmDISP_DB_BUF_CNTL, &val);
+}
+
+static void w100_set_overlay_expand(w100_priv_t *priv, int exp_h, int exp_v)	/* record and program the overlay expansion */
+{
+    video_ctrl_u video_ctrl;
+
+    priv->overlay_expand_h = exp_h;	/* kept so w100_setup() can re-apply them */
+    priv->overlay_expand_v = exp_v;
+
+    AtiCore_ReadReg(mmVIDEO_CTRL, (uint32_t *)&video_ctrl);	/* read-modify-write: only the two fields below change */
+    video_ctrl.f.video_hor_exp = exp_h;
+    video_ctrl.f.video_ver_exp = exp_v;
+    AtiCore_WriteReg(mmVIDEO_CTRL, (uint32_t *)&video_ctrl);
+}
+
+static int w100_setup(w100_priv_t *priv)	/* (re)program the overlay from priv; returns 1 on success, 0 on failure */
+{
+    if (!AtiCore_AllocOverlay(&priv->overlay_handle)) {
+	mp_msg(MSGT_VO, MSGL_FATAL,
+	       "vo_w100: AtiCore_AllocOverlay failed.\n");
+	return 0;
+    }
+    if (!AtiCore_SetupOverlay(priv->overlay_handle, &priv->overlay_prop)) {
+	mp_msg(MSGT_VO, MSGL_FATAL,
+	       "vo_w100: AtiCore_SetupOverlay failed.\n");
+	return 0;	/* the handle is not released here; uninit() releases it */
+    }
+    AtiCore_SetOverlayPos(priv->overlay_handle,
+			  priv->overlay_pos_x, priv->overlay_pos_y);
+    AtiCore_SetOverlayOnOff(priv->overlay_handle, 1);
+    w100_set_yuv_addrs(priv, &priv->frame_offsets[priv->current_frame]);
+    w100_set_overlay_expand(priv, priv->overlay_expand_h, priv->overlay_expand_v);
+    AtiCore_SetDisplayBrightness(priv->display_brightness);
+    AtiCore_SetGraphicWindowOnOff(priv->is_graphic_window_enabled);
+
+/* 	graphic_ctrl_t gc; */
+/* 	AtiCore_ReadReg(mmGRAPHIC_CTRL, &gc); */
+/* 	gc.low_power_on = 0; */
+/* 	AtiCore_WriteReg(mmGRAPHIC_CTRL, &gc); */
+
+    return 1;
+}
+
+static void *w100_offset2addr(uint32_t offset)	/* address AtiCore reports for a VRAM offset */
+{
+    void *addr = NULL;	/* stays NULL if the call below does not fill it in */
+    AtiCore_SetupMemoryTransfer(offset, &addr);
+    AtiCore_TerminateMemoryTransfer();
+    return addr;
+}
+
+// ---------------------------------------------------------------- interfaces
+/*
+ * Preinitializes driver (real INITIALIZATION): attaches to AtiCore, parses
+ * the sub-options and reads the VRAM layout.
+ *   arg - currently it's vo_subdevice
+ *   returns: zero on successful initialization, non-zero on error.
+ */
+static int preinit(const char *vo_subdevice)
+{
+    w100_priv_t *priv = &st_w100_priv;
+
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: preinit() was called\n");
+    if (!AtiCore_ProcessAttach())
+	return -1;
+
+    /* fill w100_priv_t information */
+    memset(priv, 0, sizeof(*priv));
+    priv->rotate = -1;	/* -1 lets config() pick the rotation */
+
+    if (subopt_parse(vo_subdevice, subopts) != 0) {
+	AtiCore_ProcessDetach();	/* undo the attach above before failing */
+	return -1;
+    }
+
+    priv->is_graphic_window_enabled = 1;
+    priv->eq_brightness = 0;	/* FIXME */
+
+    GetAvailableVideoMem(&priv->vram_size[INTERNAL_VRAM],
+			 &priv->vram_size[EXTERNAL_VRAM]);
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: VRAM size %dKB/%dKB\n",
+	   priv->vram_size[INTERNAL_VRAM] / 1024,
+	   priv->vram_size[EXTERNAL_VRAM] / 1024);
+
+    priv->vram_addr[INTERNAL_VRAM] = w100_offset2addr(VRAM_OFFSET_INTERNAL);
+    priv->vram_addr[EXTERNAL_VRAM] = w100_offset2addr(VRAM_OFFSET_EXTERNAL);
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: VRAM address 0x%08x/0x%08x\n",
+	   priv->vram_addr[INTERNAL_VRAM], priv->vram_addr[EXTERNAL_VRAM]);
+
+    lcd_background_color_u lbc;	/* LCD background colour: all components zero */
+    lbc.f.lcd_bg_red = 0;
+    lbc.f.lcd_bg_green = 0;
+    lbc.f.lcd_bg_blue = 0;
+    AtiCore_WriteReg(mmLCD_BACKGROUND_COLOR, &lbc);
+
+    return 0;
+}
+
+/*
+ * Initialize (means CONFIGURE) the display driver.
+ * params:
+ *   src_width,src_height: image source size
+ *   dst_width,dst_height: requested window size, just a hint (overwritten here)
+ *   flags: VOFLAG_* bits; only VOFLAG_FULLSCREEN is looked at
+ *   title: window title, if available (unused)
+ *   format: fourcc of pixel format
+ * returns : zero on successful initialization, non-zero on error.
+ */
+static int config(uint32_t src_width, uint32_t src_height,
+		       uint32_t dst_width, uint32_t dst_height, uint32_t flags,
+		       char *title, uint32_t format)
+{
+    w100_priv_t *priv = &st_w100_priv;
+    int fs = flags & VOFLAG_FULLSCREEN;
+    int y_pitch, uv_pitch;
+    int x_res = 480, y_res = 640;
+    uint32_t apitch;
+    int i;
+    uint32_t plane_flags = 0;
+
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: config() was called\n");
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: src_width:%d, src_height:%d, dst_width:%d, dst_height:%d\n",
+	   src_width, src_height, dst_width, dst_height);
+
+    if (!query_format(format)) {
+	printf("vo_w100: unsupported fourcc for this w100 driver: %x (%s)\n",
+	       format, vo_format_name(format));
+	return -1;
+    }
+    priv->format = format;
+
+    // rotate: a negative sub-option means "landscape sources get rotation 3"
+    if (priv->rotate < 0) {
+	if (src_width > src_height) {
+	    priv->current_rotate = 3;
+	} else {
+	    priv->current_rotate = 0;
+	}
+    } else
+	priv->current_rotate = priv->rotate;
+    if (priv->current_rotate != 0 && priv->current_rotate != 3) {
+	mp_msg(MSGT_VO, MSGL_FATAL, "vo_w100: Rotate %d not supported\n", priv->current_rotate);
+	return -1;
+    }
+
+    if (priv->current_rotate == 1 || priv->current_rotate == 3) {
+	i = src_width;
+	src_width = src_height;
+	src_height = i;
+    }
+
+    dst_width = src_width;
+    dst_height = src_height;
+
+    if (fs) {
+	int arg[] = { 0, 0, 1, 1, 2, 2, 2, 2, 3 };	/* integer zoom 1..8 -> expand code */
+	int arg2[] = { 1, 2, 4, 8 };			/* expand code -> size multiplier */
+	int hor_exp = src_width ? x_res / src_width : 0;	/* no division by zero */
+	int ver_exp = (src_height > 32) ? y_res / (src_height - 32) : 0;
+	int expand = 0;		/* out-of-range zoom factors fall back to no expansion */
+	mp_msg(MSGT_VO, MSGL_V, "vo_w100: hor_exp:%d, ver_exp:%d\n",
+	       hor_exp, ver_exp);
+	if ((hor_exp > 0 && hor_exp <= 8) &&
+	    (ver_exp > 0 && ver_exp <= 8)) {
+	    if (arg[hor_exp] > arg[ver_exp])
+		expand = arg[ver_exp];
+	    else
+		expand = arg[hor_exp];
+	}
+	priv->overlay_expand_h = priv->overlay_expand_v = expand;
+	dst_width *= arg2[expand];
+	dst_height *= arg2[expand];
+	if (dst_height > y_res)
+	    dst_height = y_res;
+    }
+
+    // centre the overlay on the screen
+    priv->overlay_pos_x = (x_res - dst_width) / 2;
+    priv->overlay_pos_y = (y_res - dst_height) / 2;
+
+    // Hardware scaling
+    geometry(&priv->overlay_pos_x, &priv->overlay_pos_y,
+	     &dst_width, &dst_height, x_res, y_res);
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: overlay pos(%d, %d)\n",
+	   priv->overlay_pos_x, priv->overlay_pos_y);
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: src size(%dx%d), dst size(%dx%d)\n",
+	   src_width, src_height, dst_width, dst_height);
+
+    /* select first frame */
+    priv->current_frame = 0;
+
+    priv->src_width = src_width;
+    priv->src_height = src_height;
+    priv->overlay_pitch_y = 16;
+    priv->overlay_pitch_u = 16;
+    priv->overlay_pitch_v = 16;
+
+    switch (format) {
+    case IMGFMT_YV12:
+    case IMGFMT_IYUV:
+    case IMGFMT_I420:
+    case IMGFMT_YVU9:
+    case IMGFMT_IF09:
+    case IMGFMT_Y8:
+    case IMGFMT_Y800:
+	y_pitch = (src_width + 15) & ~15;
+	uv_pitch = ((src_width / 2) + 7) & ~7;
+	break;
+    default:
+	return -1;
+    }
+
+    /* If the planes do not fit into internal VRAM, put the V plane into external VRAM (bit 2). */
+    if (y_pitch * src_height + uv_pitch * src_height > priv->vram_size[INTERNAL_VRAM])
+	plane_flags = 4;
+
+    if (vo_doublebuffering) {
+	if (y_pitch * src_height + uv_pitch * src_height * 2> priv->vram_size[INTERNAL_VRAM])
+	    plane_flags = 4;
+    }
+
+    /* NOTE(review): the original comment here was mis-encoded; the graphic window is always switched off. */
+/*     priv->is_graphic_window_enabled = (plane_flags != 0) ? 0 : 1; */
+    priv->is_graphic_window_enabled = 0;
+
+    uint32_t p[2] = {	/* next free offset in each VRAM */
+	VRAM_OFFSET_INTERNAL,
+	VRAM_OFFSET_EXTERNAL + 640 * 480 * 2
+    };
+    i = 0;
+    while (i < MAX_FRAMES) {
+	int sel;
+	/* Y-plane */
+	sel = plane_flags & 1 ? EXTERNAL_VRAM : INTERNAL_VRAM;
+	priv->frame_offsets[i].y = (void *)p[sel];
+	priv->frame_addrs[i].y = w100_offset2addr(p[sel]);
+	p[sel] += y_pitch * src_height;
+	/* U-plane */
+	sel = plane_flags & 2 ? EXTERNAL_VRAM : INTERNAL_VRAM;
+	priv->frame_offsets[i].u = (void *)p[sel];
+	priv->frame_addrs[i].u = w100_offset2addr(p[sel]);
+	p[sel] += uv_pitch * (src_height / 2);
+	/* V-plane */
+	sel = plane_flags & 4 ? EXTERNAL_VRAM : INTERNAL_VRAM;
+	priv->frame_offsets[i].v = (void *)p[sel];
+	priv->frame_addrs[i].v = w100_offset2addr(p[sel]);
+	p[sel] += uv_pitch * (src_height / 2);
+	if ((p[INTERNAL_VRAM] - VRAM_OFFSET_INTERNAL >= priv->vram_size[INTERNAL_VRAM]) ||
+	    (p[EXTERNAL_VRAM] - VRAM_OFFSET_EXTERNAL >= priv->vram_size[EXTERNAL_VRAM]))
+	    break;	/* frame i does not fit, so it is not counted */
+	mp_msg(MSGT_VO, MSGL_V, "vo_w100: frame_offsets[%d].y = 0x%08x\n", i, priv->frame_offsets[i].y);
+	mp_msg(MSGT_VO, MSGL_V, "vo_w100: frame_offsets[%d].u = 0x%08x\n", i, priv->frame_offsets[i].u);
+	mp_msg(MSGT_VO, MSGL_V, "vo_w100: frame_offsets[%d].v = 0x%08x\n", i, priv->frame_offsets[i].v);
+	++i;
+    }
+    priv->nframes = i;
+    if (priv->nframes == 0) {	/* flip_page() computes "% nframes", and frame 0 itself does not fit */
+	mp_msg(MSGT_VO, MSGL_FATAL, "vo_w100: not enough VRAM for even one frame\n");
+	return -1;
+    }
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: nframes = %d\n", priv->nframes);
+
+    priv->overlay_prop.lpSrcBitmap = (void *)(priv->frame_offsets[0].y);
+    priv->overlay_prop.XCoord = 0;
+    priv->overlay_prop.YCoord = 0;
+    priv->overlay_prop.SrcPitch = y_pitch;
+    priv->overlay_prop.SrcHeight = src_height;
+    priv->overlay_prop.OverlayWidth = dst_width;
+    priv->overlay_prop.OverlayHeight = dst_height;
+    priv->overlay_prop.lpOverlayKey = 0;
+    priv->overlay_prop.OverlayFormat = OVLTYPE_YUV420;
+
+    priv->display_brightness = 127;
+
+    w100_set_yuv_addrs(priv, &priv->frame_offsets[0]);
+
+    /* clear every frame */
+    memset(priv->vram_addr[INTERNAL_VRAM], 0, priv->vram_size[INTERNAL_VRAM]);
+    memset(priv->vram_addr[EXTERNAL_VRAM] + 640 * 480 * 2, 0,
+	   priv->vram_size[EXTERNAL_VRAM] - 640 * 480 * 2);
+
+    switch (format) {
+    case IMGFMT_YV12:
+    case IMGFMT_I420:
+    case IMGFMT_IYUV:
+    case IMGFMT_YVU9:
+    case IMGFMT_IF09:
+    case IMGFMT_Y800:
+    case IMGFMT_Y8:
+	apitch = priv->overlay_pitch_y - 1;
+	dstrides.y = (src_width + apitch) & ~apitch;
+	apitch = priv->overlay_pitch_v - 1;
+	dstrides.v = (src_width + apitch) & ~apitch;
+	apitch = priv->overlay_pitch_u - 1;
+	dstrides.u = (src_width + apitch) & ~apitch;
+/* 	st_image_bpp = 1; */
+	break;
+    case IMGFMT_RGB32:
+    case IMGFMT_BGR32:
+	apitch = priv->overlay_pitch_y - 1;
+	dstrides.y = (src_width * 4 + apitch) & ~apitch;
+	dstrides.u = dstrides.v = 0;
+/* 	st_image_bpp = 4; */
+	break;
+    case IMGFMT_RGB24:
+    case IMGFMT_BGR24:
+	apitch = priv->overlay_pitch_y - 1;
+	dstrides.y = (src_width * 3 + apitch) & ~apitch;
+	dstrides.u = dstrides.v = 0;
+/* 	st_image_bpp = 3; */
+	break;
+    default:
+	apitch = priv->overlay_pitch_y - 1;
+	dstrides.y = (src_width * 2 + apitch) & ~apitch;
+	dstrides.u = dstrides.v = 0;
+/* 	st_image_bpp = 2; */
+	break;
+    }
+
+    if (format == IMGFMT_YV12 || format == IMGFMT_I420 || format == IMGFMT_IYUV) {
+	switch (priv->current_rotate) {
+	case 0:
+	    video_out_w100.draw_slice = w100_draw_slice_420;
+	    break;
+	case 1:
+	    break;
+	case 2:
+	    break;
+	case 3:
+	    video_out_w100.draw_slice = w100_draw_slice_420_rotate3;
+	    break;
+	default:
+	    video_out_w100.draw_slice = w100_draw_slice_420;
+	    break;
+	}
+    }
+    /* 	else if (format == IMGFMT_YVU9 || format == IMGFMT_IF09) */
+    /* 	    vo_server->draw_slice = w100_draw_slice_410; */
+    else
+	video_out_w100.draw_slice = w100_draw_slice_packed;
+
+    if (!w100_setup(priv))
+	return -1;
+
+    return 0;
+}
+
+/*
+ * Control interface: image requests, format queries and the brightness equalizer.
+ */
+static int control(uint32_t request, void *data, ...)
+{
+    w100_priv_t *priv = &st_w100_priv;
+    switch (request) {
+    case VOCTRL_GET_IMAGE:
+	return w100_get_image(data);
+    case VOCTRL_QUERY_FORMAT:
+	return query_format(*((uint32_t *)data));
+    case VOCTRL_SET_EQUALIZER:
+      {
+	va_list ap;
+	int value;
+	va_start(ap, data);
+	value = va_arg(ap, int);
+	va_end(ap);
+
+	if (!strcasecmp(data, "brightness")) {
+	    int br;
+	    priv->eq_brightness = value * 10;
+	    br = (priv->eq_brightness + 1000) * 127 / 2000;	/* rescale -1000..1000 to 0..127 */
+	    if (br < 0)
+		br = 0;
+	    if (br > 127)
+		br = 127;
+	    if (br > 64)	/* NOTE(review): the two halves of the range are swapped - presumably the register encoding, confirm */
+		br -= 64;
+	    else
+		br += 64;
+	    priv->display_brightness = br;
+
+	    mp_msg(MSGT_VO, MSGL_V,
+		   "vo_w100: control(VOCTRL_SET_EQUALIZER) %d %d\n",
+		   value, br);
+
+	    if (AtiCore_SetDisplayBrightness(priv->display_brightness))
+		return VO_TRUE;
+	    else
+		return VO_FALSE;
+	}
+	return VO_FALSE;	/* unknown equalizer: do not fall into the GET case below */
+      }
+    case VOCTRL_GET_EQUALIZER:
+      {
+	va_list ap;
+	int *value;
+
+	va_start(ap, data);
+	value = va_arg(ap, int*);
+	va_end(ap);
+
+	if (!strcasecmp(data, "brightness")) {
+	    *value = priv->eq_brightness;
+	    return VO_TRUE;
+	} else
+	    return VO_FALSE;
+      }
+    }
+
+    return VO_NOTIMPL;
+}
+
+/*
+ * Display a new RGB/BGR frame of the video to the screen.
+ * Not implemented by this driver: logs the call and returns -1.
+ *   src[0] - pointer to the image (ignored)
+ */
+int draw_frame(uint8_t *src[])
+{
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: dummy draw_frame() was called\n");
+    return -1;
+}
+
+/*
+ * Draw a planar YUV slice to the buffer.
+ * Placeholder only: it logs the call and returns -1. config() stores one of
+ * the w100_draw_slice_* functions in video_out_w100.draw_slice instead.
+ *   src[3] = source image planes (Y,U,V)
+ *   stride[3] = source image planes line widths (in bytes)
+ *   w,h = width*height of area to be copied (in Y pixels)
+ *   x,y = position at the destination image (in Y pixels)
+ */
+int draw_slice(uint8_t *src[], int stride[], int w,int h, int x,int y)
+{
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: dummy draw_slice() was called\n");
+    return -1;
+}
+
+
+/*
+ * Draws OSD to the screen buffer (Y plane of the current frame, via draw_alpha)
+ */
+static void draw_osd(void)
+{
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: draw_osd() was called\n");
+    vo_draw_text(st_w100_priv.src_height, st_w100_priv.src_width, draw_alpha);	/* NOTE(review): height is passed first, i.e. swapped; this pairs with the transposing draw_alpha(), but looks wrong when current_rotate == 0 - confirm */
+}
+
+/*
+ * Blit/Flip buffer to the screen. Must be called after each frame!
+ */
+void flip_page(void)
+{
+    w100_priv_t *priv = &st_w100_priv;
+
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: flip_page() was called\n");
+    if (vo_doublebuffering && priv->nframes > 0) {	/* nframes == 0 would divide by zero below */
+	w100_set_yuv_addrs(priv, &priv->frame_offsets[priv->current_frame]);	/* show the frame just drawn */
+	priv->current_frame = (priv->current_frame + 1) % priv->nframes;	/* then draw into the next one */
+    }
+}
+
+/*
+ * This func is called after every frames to handle keyboard and
+ * other events. It's called in PAUSE mode too!
+ * Here it only reacts to g_sigcont: it re-attaches to AtiCore and reruns w100_setup().
+ */
+extern int g_sigcont;	/* defined outside this file; NOTE(review): presumably set by a SIGCONT handler - confirm */
+void check_events(void)
+{
+    w100_priv_t *priv = &st_w100_priv;
+
+    if (g_sigcont) {
+	mp_msg(MSGT_VO, MSGL_INFO, "vo_w100: SIGCONT recived.\n");
+
+	/* Immediately after resuming, because kernel modifies the register, it
+	   waits for that. */
+	usleep(1000 * 1000);	/* one second */
+
+	/* re-attach */
+#if 0
+	/* Hmm... With respect to of context is necessary, but really it
+	   fails. It does not release and also there is no problem. */
+	if (!AtiCore_ReleaseOverlay(priv->overlay_handle)) {
+	    mp_msg(MSGT_VO, MSGL_FATAL,
+		   "vo_w100: AtiCore_ReleaseOverlay failed.\n");
+	    exit_player(NULL);
+	}
+#endif
+	if (!AtiCore_ProcessDetach()) {
+	    mp_msg(MSGT_VO, MSGL_FATAL,
+		   "vo_w100: AtiCore_ProcessDetach failed.\n");
+	    exit_player(NULL);
+	}
+	if (!AtiCore_ProcessAttach()) {
+	    mp_msg(MSGT_VO, MSGL_FATAL,
+		   "vo_w100: AtiCore_ProcessAttach failed.\n");
+	    exit_player(NULL);
+	}
+
+	/* re-setup */
+	if (!w100_setup(priv))
+	    exit_player(NULL);
+
+	g_sigcont = 0;	/* handled; wait for the next one */
+    }
+}
+
+/*
+ * Closes driver. Should restore the original state of the system.
+ */
+static void uninit(void)
+{
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: uninit() was called\n");
+
+    AtiCore_SetOverlayOnOff(st_w100_priv.overlay_handle, 0);	/* overlay off */
+    AtiCore_ReleaseOverlay(st_w100_priv.overlay_handle);	/* counterpart of AtiCore_AllocOverlay() in w100_setup() */
+    AtiCore_SetGraphicWindowOnOff(1);	/* config() had switched the graphic window off */
+    AtiCore_ProcessDetach();	/* counterpart of AtiCore_ProcessAttach() in preinit() */
+}
+
+
+// ----------------------------------------------------------------
+static int query_format(uint32_t format)	/* returns VFCAP_* capability bits, or 0 when the format is unsupported */
+{
+    mp_msg(MSGT_VO, MSGL_V, "vo_w100: query_format was called: %x (%s)\n",
+	   format, vo_format_name(format));
+
+    if (IMGFMT_IS_RGB(format)) {
+	/* RGB/BGR Formats */
+	// TODO
+	return 0;	/* every RGB format is rejected for now */
+
+	switch (IMGFMT_RGB_DEPTH(format)) {	/* unreachable until the return above is removed */
+	case 16:
+	    return VFCAP_CSP_SUPPORTED | VFCAP_CSP_SUPPORTED_BY_HW
+		| VFCAP_HWSCALE_UP | VFCAP_HWSCALE_DOWN | VFCAP_OSD | VFCAP_ACCEPT_STRIDE;
+	    break;
+	}
+    } else {
+	/* Planar YUV Formats */
+	switch (format) {
+	case IMGFMT_YV12:
+	case IMGFMT_IYUV:
+	case IMGFMT_I420:
+	case IMGFMT_YVU9:
+	case IMGFMT_IF09:
+	case IMGFMT_Y8:
+	case IMGFMT_Y800:
+	    return VFCAP_CSP_SUPPORTED | VFCAP_CSP_SUPPORTED_BY_HW
+		| VFCAP_HWSCALE_UP | VFCAP_HWSCALE_DOWN | VFCAP_OSD | VFCAP_ACCEPT_STRIDE;
+	    break;
+	}
+    }
+
+    return 0;	/* anything not listed above */
+}
+
+static void dump_vo_info(void)	/* debug aid: logs the vo_* globals at MSGL_DBG2; nothing in this file calls it */
+{
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: ================================\n");
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_flags:%x\n", vo_flags);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_depthonscreen:%d\n", vo_depthonscreen);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_screenwidth:%d\n", vo_screenwidth);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_screenheight:%d\n", vo_screenheight);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_dx:%d\n", vo_dx);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_dy:%d\n", vo_dy);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_dwidth:%d\n", vo_dwidth);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_dheight:%d\n", vo_dheight);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_dbpp:%d\n", vo_dbpp);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_grabpointer:%d\n", vo_grabpointer);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_doublebuffering:%d\n", vo_doublebuffering);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_directrendering:%d\n", vo_directrendering);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_vsync:%d\n", vo_vsync);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_fs:%d\n", vo_fs);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_fsmode:%d\n", vo_fsmode);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_panscan:%f\n", vo_panscan);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_adapter_num:%d\n", vo_adapter_num);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_refresh_rate:%d\n", vo_refresh_rate);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_gamma_brightness:%d\n", vo_gamma_brightness);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_gamma_saturation:%d\n", vo_gamma_saturation);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_gamma_contrast:%d\n", vo_gamma_contrast);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_gamma_hue:%d\n", vo_gamma_hue);
+/*     mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_gamma_red_intensity:%d\n", vo_gamma_red_intensity); */
+/*     mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_gamma_green_intensity:%d\n", vo_gamma_green_intensity); */
+/*     mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_gamma_blue_intensity:%d\n", vo_gamma_blue_intensity); */
+/*     mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_mouse_timer_const:%d\n", vo_mouse_timer_const); */
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_nomouse_input:%d\n", vo_nomouse_input);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_pts:%d\n", vo_pts);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_fps:%f\n", vo_fps);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: vo_colorkey:%d\n", vo_colorkey);
+    mp_msg(MSGT_VO, MSGL_DBG2, "vo_w100: ================================\n");
+}
diff --git a/recipes/mplayer/files/vo_w100_api.h b/recipes/mplayer/files/vo_w100_api.h
new file mode 100644
index 0000000000..59cf58be26
--- /dev/null
+++ b/recipes/mplayer/files/vo_w100_api.h
@@ -0,0 +1,306 @@
+/* -*- mode: c++; tab-width: 4 -*- */
+
+/* $Id$ */
+
+/*
+ * Copyright (C) 2003-2004 AGAWA Koji <i (AT) atty (DOT) jp>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+/**
+ * @file vo_w100_api.h
+ * @brief Type and prototype declarations for the AtiCore_* API (ATI W100).
+ *
+ * 
+ *
+ * @author AGAWA Koji
+ * @date $Date$
+ * @version $Revision$
+ */
+
+#ifndef W100API_H_INCLUDED
+#define W100API_H_INCLUDED
+
+#ifdef __cplusplus
+# define EXTERN_C_BEGIN extern "C" {
+# define EXTERN_C_END }
+#else
+# define EXTERN_C_BEGIN
+# define EXTERN_C_END
+#endif
+
+EXTERN_C_BEGIN
+
+#include <inttypes.h>
+
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+
+#include "vo_w100_fb.h"
+
+// DP_GUI_MASTER_CNTL.GMC_Dst_DataType
+// DP_DATATYPE.Dp_Dst_DataType
+/* #define DSTTYPE_8BPP            2       // 8 bpp grey scale */
+/* #define DSTTYPE_16BPP_1555      3       //16 bpp aRGB 1555 */
+/* #define DSTTYPE_16BPP_444       5       //16 bpp aRGB 4444 */
+#define DSTTYPE_8BPP            1
+
+// DP_GUI_MASTER_CNTL.GMC_Src_DataType
+// DP_DATATYPE.Dp_Src_DataType
+#define SRCTYPE_1BPP_OPA        0       //mono (expanded to frgd, bkgd)
+#define SRCTYPE_1BPP_TRA        1       //mono (expanded to frgd, leave_alone)
+#define SRCTYPE_EQU_DST         3       //color (same as DST)
+#define SRCTYPE_SOLID_COLOR_BLT 4       //solid color for Blt (use frgd)
+#define SRCTYPE_4BPP            5       //4 bpp
+#define SRCTYPE_12BPP_PACKED    6       //12 bpp packed
+
+#define ROP3_SRCCOPY            0xCC
+
+#define OVLTYPE_YUV420          0x07
+
+#define INTERNAL_VRAM 0
+#define EXTERNAL_VRAM 1
+
+#define VRAM_OFFSET_INTERNAL 0x00000000
+#define VRAM_OFFSET_EXTERNAL 0x0F000000
+
+typedef struct {
+    /* NOTE(review): original note was mis-encoded; it presumably said the field order and types are unconfirmed. */
+    int16_t XCoord;
+    int16_t YCoord;
+} ATI_POINT;
+
+typedef struct {
+    /* NOTE(review): original note was mis-encoded; it presumably said the field order and types are unconfirmed. */
+    int16_t XCoord;
+    int16_t YCoord;
+    int16_t Width;
+    int16_t Height;
+} ATI_RECT;
+
+typedef struct {
+    /* NOTE(review): original note was mis-encoded; it presumably said the field order and types are unconfirmed. */
+    uint32_t Count;                        /* +0 */
+    uint8_t ScaleXFactor;                 /* +4 (original note mis-encoded; presumably "confirmed") */
+    uint8_t ScaleYFactor;                 /* +5 (original note mis-encoded; presumably "confirmed") */
+    uint8_t BlendOn;                      /* +6 (original note mis-encoded; presumably "confirmed") */
+    uint8_t dummy1;
+} ATI_STRETCH;                          /* 8bytes? */
+
+typedef struct {
+    uint32_t *lpSrcBitmap;
+    uint16_t XCoord;                      /* +4  (original note mis-encoded; presumably "confirmed") */
+    uint16_t YCoord;                      /* +6  (original note mis-encoded; presumably "confirmed") */
+    uint16_t SrcPitch;                    /* +8  (original note mis-encoded; presumably "confirmed") */
+    uint16_t SrcHeight;                   /* +10 (original note mis-encoded; presumably "confirmed") */
+    uint16_t OverlayWidth;
+    uint16_t OverlayHeight;
+    uint16_t *lpOverlayKey;                /* +16 (original note mis-encoded; presumably "confirmed") */
+	// presumably a pointer to uint16_t key[2] (translated from a mis-encoded note)
+    uint8_t OverlayFormat;               /* +20 (original note mis-encoded; presumably "confirmed") */
+    uint8_t dummy1;
+    uint16_t dummy2;
+} ATI_OVERLAYPROP;                      /* 24bytes? */
+
+typedef struct {
+    int HInvert; /* presumably a horizontal-invert flag -- TODO confirm */
+    int VInvert; /* presumably a vertical-invert flag -- TODO confirm */
+} ATI_EXTVIDEOPROP;
+
+typedef struct {
+    ATI_EXTVIDEOPROP ExtVideoProp; /* only member; the type name suggests the full layout is not known */
+} ATI_UNKNOWN1;
+
+typedef struct {
+	ATI_UNKNOWN1 u1;			// NOTE(review): original note was mis-encoded and is not recoverable
+	uint8_t HExpansion;                   /* +8  (original note mis-encoded; presumably "confirmed") */
+    uint8_t VExpansion;                   /* +9  (original note mis-encoded; presumably "confirmed") */
+    uint8_t RConversion;                  /* +12 per original note; NOTE(review): declared directly after VExpansion (+9), so +12 would require padding -- confirm */
+/*     ATI_UNKNOWN1 x; */
+} ATI_EXTENDEDOVERLAYPROP;              /* 16byte? */
+
+/**
+ * Start using the accelerator (translated from a mis-encoded comment; wording approximate).
+ *
+ * @return              1:success, 0:fail
+ */
+int AtiCore_ProcessAttach(void);
+int AtiCore_ProcessAttachSpecialMode(uint32_t);
+
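+/**
+ * Stop using the accelerator (translated from a mis-encoded comment; wording approximate).
+ *
+ * @return              *unknown* (original note mis-encoded; presumably said the return value is unknown)
+ */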
+int AtiCore_ProcessDetach(void);
+
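+/**
+ * Create a surface in video memory (translated from a mis-encoded comment; wording approximate).
+ *
+ * @arg     handle      (presumably output) handle of the surface
+ * @arg     offset      (presumably output) offset of the surface
+ * @arg     size        size of the surface
+ * @arg     direction   0 or 1; selects the allocation direction (original description mis-encoded -- TODO confirm)
+ * @return              1:success, 0:fail
+ */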
+int AtiCore_AllocateSurface(uint16_t *handle, uint32_t *offset,
+							uint32_t size, uint32_t direction);
+
+/**
+ * Destroy a surface (translated from a mis-encoded comment; wording approximate).
+ *
+ * @arg     handle      handle of the surface
+ * @return              1:success, 0:fail
+ */
+int AtiCore_DestroySurface(uint16_t handle);
+
+/**
+ * @param   rop         presumably an 8-bit flag value (translated from a mis-encoded comment)
+ */
+int AtiCore_SetRopOperation(uint32_t rop);
+
+int AtiCore_SetDstType(uint32_t);
+int AtiCore_SetSrcType(uint32_t);
+int AtiCore_SetSrcClippingRect(ATI_CLIPRECT *cliprect);
+int AtiCore_SetDstClippingRect(ATI_CLIPRECT *cliprect);
+int AtiCore_SetSrcPitchOffset(int pitch, int offset);
+int AtiCore_SetDstPitchOffset(int pitch, int offset);
+
+int AtiCore_BitBltFilpRotate(int blt090Rotate,
+                             ATI_RECT *dstRect, ATI_RECT *srcRect);
+int AtiCore_StretchBlt(ATI_STRETCH *option,
+                       ATI_POINT *point, ATI_RECT *srcRect);
+
+
+/**
+ * Wait for an operation (BitBlt etc.) to complete (translated from a mis-encoded comment; wording approximate).
+ *
+ * @param   msec        timeout (msec)
+ * @return              1:operation completed, 0:operation has not finished yet
+ */
+int AtiCore_WaitComplete(int msec);
+
+/**
+ * Allocate an overlay (translated from a mis-encoded comment; wording approximate).
+ *
+ * @param   handle      (presumably output) handle of the overlay
+ * @return              1:success, 0:fail
+ */
+int AtiCore_AllocOverlay(uint16_t *handle);
+
+int AtiCore_ReleaseOverlay(uint16_t handle);
+
+/**
+ * @return              1:success, 0:fail
+ */
+int AtiCore_SetupOverlay(uint16_t handle, ATI_OVERLAYPROP *prop);
+
+int AtiCore_SetupOverlayExtended(uint16_t handle, ATI_EXTENDEDOVERLAYPROP *prop);
+
+/**
+ * @return              1:success, 0:fail
+ */
+int AtiCore_SetOverlayOnOff(uint16_t handle, int isEnable);
+
+int AtiCore_SetOverlayPos(uint16_t handle, uint16_t x, uint16_t y);
+
+int AtiCore_SetupMemoryTransfer(uint32_t offset, void **regdata);
+int AtiCore_TerminateMemoryTransfer(void);
+
+int AtiCore_GetFrontBufferPitchOffset(uint32_t *pitch, uint32_t *offset);
+
+/**
+ * @return              1:success, 0:fail
+ */
+int AtiCore_SetDisplayBrightness(int brightness);
+
+/**
+ * @return              1:success, 0:fail
+ */
+int GetAvailableVideoMem(uint32_t *internal, uint32_t *external);
+
+/*
+ * 1 ; 0
+ */
+int AtiCore_SetGraphicWindowOnOff(int );
+
+int AtiCore_ReadReg(uint32_t reg, void *val);
+int AtiCore_WriteReg(uint32_t reg, void *val);
+
+uint32_t GetRealMemAddr(uint32_t offset);
+
+int AtiCore_SetBkgColour(uint32_t);
+
+/* ================================================================ */
+/* from libqte.so.2.3.2 */
+/*
+AtiCore_AlphaBlend
+AtiCore_BitBlt
+AtiCore_BrushType
+AtiCore_CursorOnOff
+AtiCore_DrawPixel
+AtiCore_Flush
+AtiCore_GammaCorrection
+AtiCore_GetCRC
+AtiCore_GetCursorPos
+AtiCore_GetDeviceInfo
+AtiCore_GetGPIO_Data
+AtiCore_GetGraphicExtended
+AtiCore_GetGraphicWindowPos
+AtiCore_GetLargestVideoMemBlock
+AtiCore_GetLastError
+AtiCore_GetMultiCRC
+AtiCore_GetOverlayPos
+AtiCore_GetPitchOffsetProperty
+AtiCore_Host
+AtiCore_LoadCursorBitMap
+AtiCore_PaintRect
+AtiCore_PolyScanline
+AtiCore_Polyline
+AtiCore_ProcessAttachMinimal
+AtiCore_ProcessAttachSpecialMode
+AtiCore_ProcessDetachMinimal
+AtiCore_ProcessDetachSpecialMode
+AtiCore_ReadCfgReg
+AtiCore_ScanlineShading
+AtiCore_SetApertures
+AtiCore_SetBkgColour
+AtiCore_SetBytePixelOrder
+AtiCore_SetCursorPos
+AtiCore_SetDisplayParameters
+AtiCore_SetDriverBehaviour
+AtiCore_SetFrgColour
+AtiCore_SetFrontBuffer
+AtiCore_SetGPIO_Data
+AtiCore_SetGraphicWindowPos
+AtiCore_SetOverlayPosUsingGraphicWindowXY
+AtiCore_SetPartialCursor
+AtiCore_SetupGraphicExtended
+AtiCore_SetupGraphicWindow
+AtiCore_SetupPM4
+AtiCore_SmallText
+AtiCore_SubmitPM4Packet
+AtiCore_TransBitBlt
+AtiCore_WriteCfgReg
+ */
+
+EXTERN_C_END
+
+#endif /* W100API_H_INCLUDED */
diff --git a/recipes/mplayer/files/vo_w100_fb.h b/recipes/mplayer/files/vo_w100_fb.h
new file mode 100644
index 0000000000..39318c645b
--- /dev/null
+++ b/recipes/mplayer/files/vo_w100_fb.h
@@ -0,0 +1,4338 @@
+/*
+ * linux/drivers/video/w100fb.h
+ *
+ * Frame Buffer Device for ATI w100 (Wallaby)
+ *
+ * Copyright (C) 2002, ATI Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * ChangeLog:
+ *
+ */
+
+#if !defined (_W100FB_H)
+#define _W100FB_H
+
+/* Block CIF Start: */
+#define mmCHIP_ID                                                    0x0000
+#define mmREVISION_ID                                                0x0004
+#define mmWRAP_BUF_A                                                 0x0008
+#define mmWRAP_BUF_B                                                 0x000C
+#define mmWRAP_TOP_DIR                                               0x0010
+#define mmWRAP_START_DIR                                             0x0014
+#define mmCIF_CNTL                                                   0x0018
+#define mmCFGREG_BASE                                                0x001C
+#define mmCIF_IO                                                     0x0020
+#define mmCIF_READ_DBG                                               0x0024
+#define mmCIF_WRITE_DBG                                              0x0028
+#define cfgIND_ADDR_A_0                                              0x0000
+#define cfgIND_ADDR_A_1                                              0x0001
+#define cfgIND_ADDR_A_2                                              0x0002
+#define cfgIND_DATA_A                                                0x0003
+#define cfgREG_BASE                                                  0x0004
+#define cfgINTF_CNTL                                                 0x0005
+#define cfgSTATUS                                                    0x0006
+#define cfgCPU_DEFAULTS                                              0x0007
+#define cfgIND_ADDR_B_0                                              0x0008
+#define cfgIND_ADDR_B_1                                              0x0009
+#define cfgIND_ADDR_B_2                                              0x000A
+#define cfgIND_DATA_B                                                0x000B
+#define cfgPM4_RPTR                                                  0x000C
+#define cfgSCRATCH                                                   0x000D
+#define cfgPM4_WRPTR_0                                               0x000E
+#define cfgPM4_WRPTR_1                                               0x000F
+/* Block CIF End: */
+
+/* Block CP Start: */
+#define mmCP_RB_CNTL                                                 0x0210
+#define mmCP_RB_BASE                                                 0x0214
+#define mmCP_RB_RPTR_ADDR                                            0x0218
+#define mmCP_RB_RPTR                                                 0x021C
+#define mmCP_RB_RPTR_WR                                              0x02F8
+#define mmCP_RB_WPTR                                                 0x0220
+#define mmCP_IB_BASE                                                 0x0228
+#define mmCP_IB_BUFSZ                                                0x022C
+#define mmCP_CSQ_CNTL                                                0x0230
+#define mmCP_CSQ_APER_PRIMARY                                        0x0300
+#define mmCP_CSQ_APER_INDIRECT                                       0x0340
+#define mmCP_ME_CNTL                                                 0x0240
+#define mmCP_ME_RAM_ADDR                                             0x0244
+#define mmCP_ME_RAM_RADDR                                            0x0248
+#define mmCP_ME_RAM_DATAH                                            0x024C
+#define mmCP_ME_RAM_DATAL                                            0x0250
+#define mmCP_DEBUG                                                   0x025C
+#define mmSCRATCH_REG0                                               0x0260
+#define mmSCRATCH_REG1                                               0x0264
+#define mmSCRATCH_REG2                                               0x0268
+#define mmSCRATCH_REG3                                               0x026C
+#define mmSCRATCH_REG4                                               0x0270
+#define mmSCRATCH_REG5                                               0x0274
+#define mmSCRATCH_UMSK                                               0x0280
+#define mmSCRATCH_ADDR                                               0x0284
+#define mmCP_CSQ_ADDR                                                0x02E4
+#define mmCP_CSQ_DATA                                                0x02E8
+#define mmCP_CSQ_STAT                                                0x02EC
+#define mmCP_STAT                                                    0x02F0
+#define mmGEN_INT_CNTL                                               0x0200
+#define mmGEN_INT_STATUS                                             0x0204
+/* Block CP End: */
+
+/* Block DISPLAY Start: */
+#define mmLCD_FORMAT                                                 0x0410
+#define mmGRAPHIC_CTRL                                               0x0414
+#define mmGRAPHIC_OFFSET                                             0x0418
+#define mmGRAPHIC_PITCH                                              0x041C
+#define mmCRTC_TOTAL                                                 0x0420
+#define mmACTIVE_H_DISP                                              0x0424
+#define mmACTIVE_V_DISP                                              0x0428
+#define mmGRAPHIC_H_DISP                                             0x042C
+#define mmGRAPHIC_V_DISP                                             0x0430
+#define mmVIDEO_CTRL                                                 0x0434
+#define mmGRAPHIC_KEY                                                0x0438
+#define mmVIDEO_Y_OFFSET                                             0x043C
+#define mmVIDEO_Y_PITCH                                              0x0440
+#define mmVIDEO_U_OFFSET                                             0x0444
+#define mmVIDEO_U_PITCH                                              0x0448
+#define mmVIDEO_V_OFFSET                                             0x044C
+#define mmVIDEO_V_PITCH                                              0x0450
+#define mmVIDEO_H_POS                                                0x0454
+#define mmVIDEO_V_POS                                                0x0458
+#define mmBRIGHTNESS_CNTL                                            0x045C
+#define mmCURSOR1_OFFSET                                             0x0460
+#define mmCURSOR1_H_POS                                              0x0464
+#define mmCURSOR1_V_POS                                              0x0468
+#define mmCURSOR1_COLOR0                                             0x046C
+#define mmCURSOR1_COLOR1                                             0x0470
+#define mmCURSOR2_OFFSET                                             0x0474
+#define mmCURSOR2_H_POS                                              0x0478
+#define mmCURSOR2_V_POS                                              0x047C
+#define mmCURSOR2_COLOR0                                             0x0480
+#define mmCURSOR2_COLOR1                                             0x0484
+#define mmDISP_INT_CNTL                                              0x0488
+#define mmCRTC_SS                                                    0x048C
+#define mmCRTC_LS                                                    0x0490
+#define mmCRTC_REV                                                   0x0494
+#define mmCRTC_DCLK                                                  0x049C
+#define mmCRTC_GS                                                    0x04A0
+#define mmCRTC_VPOS_GS                                               0x04A4
+#define mmCRTC_GCLK                                                  0x04A8
+#define mmCRTC_GOE                                                   0x04AC
+#define mmCRTC_FRAME                                                 0x04B0
+#define mmCRTC_FRAME_VPOS                                            0x04B4
+#define mmGPIO_DATA                                                  0x04B8
+#define mmGPIO_CNTL1                                                 0x04BC
+#define mmGPIO_CNTL2                                                 0x04C0
+#define mmLCDD_CNTL1                                                 0x04C4
+#define mmLCDD_CNTL2                                                 0x04C8
+#define mmGENLCD_CNTL1                                               0x04CC
+#define mmGENLCD_CNTL2                                               0x04D0
+#define mmDISP_DEBUG                                                 0x04D4
+#define mmDISP_DB_BUF_CNTL                                           0x04D8
+#define mmDISP_CRC_SIG                                               0x04DC
+#define mmCRTC_DEFAULT_COUNT                                         0x04E0
+#define mmLCD_BACKGROUND_COLOR                                       0x04E4
+#define mmCRTC_PS2                                                   0x04E8
+#define mmCRTC_PS2_VPOS                                              0x04EC
+#define mmCRTC_PS1_ACTIVE                                            0x04F0
+#define mmCRTC_PS1_NACTIVE                                           0x04F4
+#define mmCRTC_GCLK_EXT                                              0x04F8
+#define mmCRTC_ALW                                                   0x04FC
+#define mmCRTC_ALW_VPOS                                              0x0500
+#define mmCRTC_PSK                                                   0x0504
+#define mmCRTC_PSK_HPOS                                              0x0508
+#define mmCRTC_CV4_START                                             0x050C
+#define mmCRTC_CV4_END                                               0x0510
+#define mmCRTC_CV4_HPOS                                              0x0514
+#define mmCRTC_ECK                                                   0x051C
+#define mmREFRESH_CNTL                                               0x0520
+#define mmGENLCD_CNTL3                                               0x0524
+#define mmGPIO_DATA2                                                 0x0528
+#define mmGPIO_CNTL3                                                 0x052C
+#define mmGPIO_CNTL4                                                 0x0530
+#define mmCHIP_STRAP                                                 0x0534
+#define mmDISP_DEBUG2                                                0x0538
+#define mmDEBUG_BUS_CNTL                                             0x053C
+#define mmGAMMA_VALUE1                                               0x0540
+#define mmGAMMA_VALUE2                                               0x0544
+#define mmGAMMA_SLOPE                                                0x0548
+#define mmGEN_STATUS                                                 0x054C
+#define mmHW_INT                                                     0x0550
+/* Block DISPLAY End: */
+
+/* Block GFX Start: */
+#define mmDST_OFFSET                                                 0x1004
+#define mmDST_PITCH                                                  0x1008
+#define mmDST_PITCH_OFFSET                                           0x102C
+#define mmDST_X                                                      0x101C
+#define mmDST_Y                                                      0x1020
+#define mmDST_X_Y                                                    0x1194
+#define mmDST_Y_X                                                    0x1038
+#define mmDST_WIDTH                                                  0x100C
+#define mmDST_HEIGHT                                                 0x1010
+#define mmDST_WIDTH_HEIGHT                                           0x1198
+#define mmDST_HEIGHT_WIDTH                                           0x103C
+#define mmDST_HEIGHT_WIDTH_8                                         0x118C
+#define mmDST_HEIGHT_Y                                               0x11A0
+#define mmDST_WIDTH_X                                                0x1188
+#define mmDST_WIDTH_X_INCY                                           0x119C
+#define mmDST_LINE_START                                             0x1090
+#define mmDST_LINE_END                                               0x1094
+#define mmBRUSH_OFFSET                                               0x108C
+#define mmBRUSH_Y_X                                                  0x1074
+#define mmDP_BRUSH_FRGD_CLR                                          0x107C
+#define mmDP_BRUSH_BKGD_CLR                                          0x1078
+#define mmSRC2_OFFSET                                                0x1060
+#define mmSRC2_PITCH                                                 0x1064
+#define mmSRC2_PITCH_OFFSET                                          0x1068
+#define mmSRC2_X                                                     0x1050
+#define mmSRC2_Y                                                     0x1054
+#define mmSRC2_X_Y                                                   0x1058
+#define mmSRC2_WIDTH                                                 0x1080
+#define mmSRC2_HEIGHT                                                0x1084
+#define mmSRC2_INC                                                   0x1088
+#define mmSRC_OFFSET                                                 0x11AC
+#define mmSRC_PITCH                                                  0x11B0
+#define mmSRC_PITCH_OFFSET                                           0x1028
+#define mmSRC_X                                                      0x1014
+#define mmSRC_Y                                                      0x1018
+#define mmSRC_X_Y                                                    0x1190
+#define mmSRC_Y_X                                                    0x1034
+#define mmSRC_WIDTH                                                  0x1040
+#define mmSRC_HEIGHT                                                 0x1044
+#define mmSRC_INC                                                    0x1048
+#define mmHOST_DATA0                                                 0x13C0
+#define mmHOST_DATA1                                                 0x13C4
+#define mmHOST_DATA2                                                 0x13C8
+#define mmHOST_DATA3                                                 0x13CC
+#define mmHOST_DATA4                                                 0x13D0
+#define mmHOST_DATA5                                                 0x13D4
+#define mmHOST_DATA6                                                 0x13D8
+#define mmHOST_DATA7                                                 0x13DC
+#define mmHOST_DATA_LAST                                             0x13E0
+#define mmDP_SRC_FRGD_CLR                                            0x1240
+#define mmDP_SRC_BKGD_CLR                                            0x1244
+#define mmSC_LEFT                                                    0x1140
+#define mmSC_RIGHT                                                   0x1144
+#define mmSC_TOP                                                     0x1148
+#define mmSC_BOTTOM                                                  0x114C
+#define mmSRC_SC_RIGHT                                               0x1154
+#define mmSRC_SC_BOTTOM                                              0x115C
+#define mmDP_CNTL                                                    0x11C8
+#define mmDP_CNTL_DST_DIR                                            0x11CC
+#define mmDP_DATATYPE                                                0x12C4
+#define mmDP_MIX                                                     0x12C8
+#define mmDP_WRITE_MSK                                               0x12CC
+#define mmCLR_CMP_CLR_SRC                                            0x1234
+#define mmCLR_CMP_CLR_DST                                            0x1238
+#define mmCLR_CMP_CNTL                                               0x1230
+#define mmCLR_CMP_MSK                                                0x123C
+#define mmDEFAULT_PITCH_OFFSET                                       0x10A0
+#define mmDEFAULT_SC_BOTTOM_RIGHT                                    0x10A8
+#define mmDEFAULT2_SC_BOTTOM_RIGHT                                   0x10AC
+#define mmREF1_PITCH_OFFSET                                          0x10B8
+#define mmREF2_PITCH_OFFSET                                          0x10BC
+#define mmREF3_PITCH_OFFSET                                          0x10C0
+#define mmREF4_PITCH_OFFSET                                          0x10C4
+#define mmREF5_PITCH_OFFSET                                          0x10C8
+#define mmREF6_PITCH_OFFSET                                          0x10CC
+#define mmDP_GUI_MASTER_CNTL                                         0x106C
+#define mmSC_TOP_LEFT                                                0x11BC
+#define mmSC_BOTTOM_RIGHT                                            0x11C0
+#define mmSRC_SC_BOTTOM_RIGHT                                        0x11C4
+#define mmGLOBAL_ALPHA                                               0x1210
+#define mmFILTER_COEF                                                0x1214
+#define mmMVC_CNTL_START                                             0x11E0
+#define mmE2_ARITHMETIC_CNTL                                         0x1220
+#define mmDEBUG0                                                     0x1280
+#define mmDEBUG1                                                     0x1284
+#define mmDEBUG2                                                     0x1288
+#define mmDEBUG3                                                     0x128C
+#define mmDEBUG4                                                     0x1290
+#define mmDEBUG5                                                     0x1294
+#define mmDEBUG6                                                     0x1298
+#define mmDEBUG7                                                     0x129C
+#define mmDEBUG8                                                     0x12A0
+#define mmDEBUG9                                                     0x12A4
+#define mmDEBUG10                                                    0x12A8
+#define mmDEBUG11                                                    0x12AC
+#define mmDEBUG12                                                    0x12B0
+#define mmDEBUG13                                                    0x12B4
+#define mmDEBUG14                                                    0x12B8
+#define mmDEBUG15                                                    0x12BC
+#define mmENG_CNTL                                                   0x13E8
+#define mmENG_PERF_CNT                                               0x13F0
+/* Block GFX End: */
+
+/* Block IDCT Start: */
+#define mmIDCT_RUNS                                                  0x0C00
+#define mmIDCT_LEVELS                                                0x0C04
+#define mmIDCT_CONTROL                                               0x0C3C
+#define mmIDCT_AUTH_CONTROL                                          0x0C08
+#define mmIDCT_AUTH                                                  0x0C0C
+/* Block IDCT End: */
+
+/* Block MC Start: */
+#define mmMEM_CNTL                                                   0x0180
+#define mmMEM_ARB                                                    0x0184
+#define mmMC_FB_LOCATION                                             0x0188
+#define mmMEM_EXT_CNTL                                               0x018C
+#define mmMC_EXT_MEM_LOCATION                                        0x0190
+#define mmMEM_EXT_TIMING_CNTL                                        0x0194
+#define mmMEM_SDRAM_MODE_REG                                         0x0198
+#define mmMEM_IO_CNTL                                                0x019C
+#define mmMC_DEBUG                                                   0x01A0
+#define mmMC_BIST_CTRL                                               0x01A4
+#define mmMC_BIST_COLLAR_READ                                        0x01A8
+#define mmTC_MISMATCH                                                0x01AC
+#define mmMC_PERF_MON_CNTL                                           0x01B0
+#define mmMC_PERF_COUNTERS                                           0x01B4
+/* Block MC End: */
+
+/* Block RBBM Start: */
+#define mmWAIT_UNTIL                                                 0x1400
+#define mmISYNC_CNTL                                                 0x1404
+#define mmRBBM_GUICNTL                                               0x1408
+#define mmRBBM_STATUS                                                0x0140
+#define mmRBBM_STATUS_alt_1                                          0x140C
+#define mmRBBM_CNTL                                                  0x0144
+#define mmRBBM_SOFT_RESET                                            0x0148
+#define mmNQWAIT_UNTIL                                               0x0150
+#define mmRBBM_DEBUG                                                 0x016C
+#define mmRBBM_CMDFIFO_ADDR                                          0x0170
+#define mmRBBM_CMDFIFO_DATAL                                         0x0174
+#define mmRBBM_CMDFIFO_DATAH                                         0x0178
+#define mmRBBM_CMDFIFO_STAT                                          0x017C
+/* Block RBBM End: */
+
+/* Block CG Start: */
+#define mmCLK_PIN_CNTL                                               0x0080
+#define mmPLL_REF_FB_DIV                                             0x0084
+#define mmPLL_CNTL                                                   0x0088
+#define mmSCLK_CNTL                                                  0x008C
+#define mmPCLK_CNTL                                                  0x0090
+#define mmCLK_TEST_CNTL                                              0x0094
+#define mmPWRMGT_CNTL                                                0x0098
+#define mmPWRMGT_STATUS                                              0x009C
+/* Block CG End: */
+
+/* default value definitions */
+#define defCHIP_ID                        0x00001002
+#define defREVISION_ID                    0x00000000
+#define defWRAP_BUF_A                     0x01000000
+#define defWRAP_BUF_B                     0x01000000
+#define defWRAP_TOP_DIR                   0x00000000
+#define defWRAP_START_DIR                 0x00000000
+//#define defCIF_CNTL                       0x00082900
+#define defCIF_CNTL                       0x00182d00		// updated by Tobey Z. for Sharp, Oct 11, 2002 (previous value kept above)
+#define defCFGREG_BASE                    0x00000000
+//#define defCIF_IO                         0x000c0800
+#define defCIF_IO                         0x000C0902		// updated by Tobey Z. for Sharp, Oct 11, 2002 (previous value kept above)
+#define defCIF_READ_DBG                   0x00018223
+#define defCIF_WRITE_DBG                  0x00002100
+#define defIND_ADDR_A_0                   0x00000000
+#define defIND_ADDR_A_1                   0x00000000
+#define defIND_ADDR_A_2                   0x00000000
+#define defIND_DATA_A                     0x00000000
+#define defREG_BASE                       0x00000001
+#define defINTF_CNTL                      0x00000011
+#define defSTATUS                         0x00000000
+#define defCPU_DEFAULTS                   0x00000006
+#define defIND_ADDR_B_0                   0x00000000
+#define defIND_ADDR_B_1                   0x00000000
+#define defIND_ADDR_B_2                   0x00000000
+#define defIND_DATA_B                     0x00000000
+#define defPM4_RPTR                       0x00000000
+#define defSCRATCH                        0x00000000
+#define defPM4_WRPTR_0                    0x00000000
+#define defPM4_WRPTR_1                    0x00000000
+#define defCP_RB_CNTL                     0x00000000 /* from here through defCP_STAT every default is zero except defCP_ME_CNTL */
+#define defCP_RB_BASE                     0x00000000
+#define defCP_RB_RPTR_ADDR                0x00000000
+#define defCP_RB_RPTR                     0x00000000
+#define defCP_RB_RPTR_WR                  0x00000000
+#define defCP_RB_WPTR                     0x00000000
+#define defCP_IB_BASE                     0x00000000
+#define defCP_IB_BUFSZ                    0x00000000
+#define defCP_CSQ_CNTL                    0x00000000
+#define defCP_CSQ_APER_PRIMARY            0x00000000
+#define defCP_CSQ_APER_INDIRECT           0x00000000
+#define defCP_ME_CNTL                     0x40000000 /* only bit 30 set */
+#define defCP_ME_RAM_ADDR                 0x00000000
+#define defCP_ME_RAM_RADDR                0x00000000
+#define defCP_ME_RAM_DATAH                0x00000000
+#define defCP_ME_RAM_DATAL                0x00000000
+#define defCP_DEBUG                       0x00000000
+#define defSCRATCH_REG0                   0x00000000
+#define defSCRATCH_REG1                   0x00000000
+#define defSCRATCH_REG2                   0x00000000
+#define defSCRATCH_REG3                   0x00000000
+#define defSCRATCH_REG4                   0x00000000
+#define defSCRATCH_REG5                   0x00000000
+#define defSCRATCH_UMSK                   0x00000000
+#define defSCRATCH_ADDR                   0x00000000
+#define defCP_CSQ_ADDR                    0x00000000
+#define defCP_CSQ_DATA                    0x00000000
+#define defCP_CSQ_STAT                    0x00000000
+#define defCP_STAT                        0x00000000
+#define defGEN_INT_CNTL                   0x00000000
+#define defGEN_INT_STATUS_rd              0x00080000 /* only bit 19 set; this register has separate _rd and _wr defaults */
+#define defGEN_INT_STATUS_wr              0x00000000
+#define defLCD_FORMAT                     0x00000000 /* from here through defHW_INT the only non-zero defaults are GPIO_CNTL1, LCDD_CNTL1 and GENLCD_CNTL1/2/3 */
+#define defGRAPHIC_CTRL                   0x00000000
+#define defGRAPHIC_OFFSET                 0x00000000
+#define defGRAPHIC_PITCH                  0x00000000
+#define defCRTC_TOTAL                     0x00000000
+#define defACTIVE_H_DISP                  0x00000000
+#define defACTIVE_V_DISP                  0x00000000
+#define defGRAPHIC_H_DISP                 0x00000000
+#define defGRAPHIC_V_DISP                 0x00000000
+#define defVIDEO_CTRL                     0x00000000
+#define defGRAPHIC_KEY                    0x00000000
+#define defVIDEO_Y_OFFSET                 0x00000000
+#define defVIDEO_Y_PITCH                  0x00000000
+#define defVIDEO_U_OFFSET                 0x00000000
+#define defVIDEO_U_PITCH                  0x00000000
+#define defVIDEO_V_OFFSET                 0x00000000
+#define defVIDEO_V_PITCH                  0x00000000
+#define defVIDEO_H_POS                    0x00000000
+#define defVIDEO_V_POS                    0x00000000
+#define defBRIGHTNESS_CNTL                0x00000000
+#define defCURSOR1_OFFSET                 0x00000000
+#define defCURSOR1_H_POS                  0x00000000
+#define defCURSOR1_V_POS                  0x00000000
+#define defCURSOR1_COLOR0                 0x00000000
+#define defCURSOR1_COLOR1                 0x00000000
+#define defCURSOR2_OFFSET                 0x00000000
+#define defCURSOR2_H_POS                  0x00000000
+#define defCURSOR2_V_POS                  0x00000000
+#define defCURSOR2_COLOR0                 0x00000000
+#define defCURSOR2_COLOR1                 0x00000000
+#define defDISP_INT_CNTL                  0x00000000
+#define defCRTC_SS                        0x00000000
+#define defCRTC_LS                        0x00000000
+#define defCRTC_REV                       0x00000000
+#define defCRTC_DCLK                      0x00000000
+#define defCRTC_GS                        0x00000000
+#define defCRTC_VPOS_GS                   0x00000000
+#define defCRTC_GCLK                      0x00000000
+#define defCRTC_GOE                       0x00000000
+#define defCRTC_FRAME                     0x00000000
+#define defCRTC_FRAME_VPOS                0x00000000
+#define defGPIO_DATA                      0x00000000
+#define defGPIO_CNTL1                     0xff00ff00 /* bytes 1 and 3 all ones, bytes 0 and 2 zero */
+#define defGPIO_CNTL2                     0x00000000
+#define defLCDD_CNTL1                     0x0000ffff /* low 16 bits all ones */
+#define defLCDD_CNTL2                     0x00000000
+#define defGENLCD_CNTL1                   0x00aaa002
+#define defGENLCD_CNTL2                   0x00000002
+#define defDISP_DEBUG                     0x00000000
+#define defDISP_DB_BUF_CNTL_rd            0x00000000 /* this register has separate _rd and _wr defaults */
+#define defDISP_DB_BUF_CNTL_wr            0x00000000
+#define defDISP_CRC_SIG                   0x00000000
+#define defCRTC_DEFAULT_COUNT             0x00000000
+#define defLCD_BACKGROUND_COLOR           0x00000000
+#define defCRTC_PS2                       0x00000000
+#define defCRTC_PS2_VPOS                  0x00000000
+#define defCRTC_PS1_ACTIVE                0x00000000
+#define defCRTC_PS1_NACTIVE               0x00000000
+#define defCRTC_GCLK_EXT                  0x00000000
+#define defCRTC_ALW                       0x00000000
+#define defCRTC_ALW_VPOS                  0x00000000
+#define defCRTC_PSK                       0x00000000
+#define defCRTC_PSK_HPOS                  0x00000000
+#define defCRTC_CV4_START                 0x00000000
+#define defCRTC_CV4_END                   0x00000000
+#define defCRTC_CV4_HPOS                  0x00000000
+#define defCRTC_ECK                       0x00000000
+#define defREFRESH_CNTL                   0x00000000
+#define defGENLCD_CNTL3                   0x000002aa
+#define defGPIO_DATA2                     0x00000000
+#define defGPIO_CNTL3                     0x00000000
+#define defGPIO_CNTL4                     0x00000000
+#define defCHIP_STRAP                     0x00000000
+#define defDISP_DEBUG2                    0x00000000
+#define defDEBUG_BUS_CNTL                 0x00000000
+#define defGAMMA_VALUE1                   0x00000000
+#define defGAMMA_VALUE2                   0x00000000
+#define defGAMMA_SLOPE                    0x00000000
+#define defGEN_STATUS                     0x00000000
+#define defHW_INT                         0x00000000
+#define defDST_OFFSET                     0x00000000 /* from here through defIDCT_AUTH every default is zero except defENG_CNTL */
+#define defDST_PITCH                      0x00000000
+#define defDST_PITCH_OFFSET               0x00000000
+#define defDST_X                          0x00000000
+#define defDST_Y                          0x00000000
+#define defDST_X_Y                        0x00000000
+#define defDST_Y_X                        0x00000000
+#define defDST_WIDTH                      0x00000000
+#define defDST_HEIGHT                     0x00000000
+#define defDST_WIDTH_HEIGHT               0x00000000
+#define defDST_HEIGHT_WIDTH               0x00000000
+#define defDST_HEIGHT_WIDTH_8             0x00000000
+#define defDST_HEIGHT_Y                   0x00000000
+#define defDST_WIDTH_X                    0x00000000
+#define defDST_WIDTH_X_INCY               0x00000000
+#define defDST_LINE_START                 0x00000000
+#define defDST_LINE_END                   0x00000000
+#define defBRUSH_OFFSET                   0x00000000
+#define defBRUSH_Y_X                      0x00000000
+#define defDP_BRUSH_FRGD_CLR              0x00000000
+#define defDP_BRUSH_BKGD_CLR              0x00000000
+#define defSRC2_OFFSET                    0x00000000
+#define defSRC2_PITCH                     0x00000000
+#define defSRC2_PITCH_OFFSET              0x00000000
+#define defSRC2_X                         0x00000000
+#define defSRC2_Y                         0x00000000
+#define defSRC2_X_Y                       0x00000000
+#define defSRC2_WIDTH                     0x00000000
+#define defSRC2_HEIGHT                    0x00000000
+#define defSRC2_INC                       0x00000000
+#define defSRC_OFFSET                     0x00000000
+#define defSRC_PITCH                      0x00000000
+#define defSRC_PITCH_OFFSET               0x00000000
+#define defSRC_X                          0x00000000
+#define defSRC_Y                          0x00000000
+#define defSRC_X_Y                        0x00000000
+#define defSRC_Y_X                        0x00000000
+#define defSRC_WIDTH                      0x00000000
+#define defSRC_HEIGHT                     0x00000000
+#define defSRC_INC                        0x00000000
+#define defHOST_DATA0                     0x00000000
+#define defHOST_DATA1                     0x00000000
+#define defHOST_DATA2                     0x00000000
+#define defHOST_DATA3                     0x00000000
+#define defHOST_DATA4                     0x00000000
+#define defHOST_DATA5                     0x00000000
+#define defHOST_DATA6                     0x00000000
+#define defHOST_DATA7                     0x00000000
+#define defHOST_DATA_LAST                 0x00000000
+#define defDP_SRC_FRGD_CLR                0x00000000
+#define defDP_SRC_BKGD_CLR                0x00000000
+#define defSC_LEFT                        0x00000000
+#define defSC_RIGHT                       0x00000000
+#define defSC_TOP                         0x00000000
+#define defSC_BOTTOM                      0x00000000
+#define defSRC_SC_RIGHT                   0x00000000
+#define defSRC_SC_BOTTOM                  0x00000000
+#define defDP_CNTL                        0x00000000
+#define defDP_CNTL_DST_DIR                0x00000000
+#define defDP_DATATYPE                    0x00000000
+#define defDP_MIX                         0x00000000
+#define defDP_WRITE_MSK                   0x00000000
+#define defCLR_CMP_CLR_SRC                0x00000000
+#define defCLR_CMP_CLR_DST                0x00000000
+#define defCLR_CMP_CNTL                   0x00000000
+#define defCLR_CMP_MSK                    0x00000000
+#define defDEFAULT_PITCH_OFFSET           0x00000000
+#define defDEFAULT_SC_BOTTOM_RIGHT        0x00000000
+#define defDEFAULT2_SC_BOTTOM_RIGHT       0x00000000
+#define defREF1_PITCH_OFFSET              0x00000000
+#define defREF2_PITCH_OFFSET              0x00000000
+#define defREF3_PITCH_OFFSET              0x00000000
+#define defREF4_PITCH_OFFSET              0x00000000
+#define defREF5_PITCH_OFFSET              0x00000000
+#define defREF6_PITCH_OFFSET              0x00000000
+#define defDP_GUI_MASTER_CNTL             0x00000000
+#define defSC_TOP_LEFT                    0x00000000
+#define defSC_BOTTOM_RIGHT                0x00000000
+#define defSRC_SC_BOTTOM_RIGHT            0x00000000
+#define defGLOBAL_ALPHA                   0x00000000
+#define defFILTER_COEF                    0x00000000
+#define defMVC_CNTL_START                 0x00000000
+#define defE2_ARITHMETIC_CNTL             0x00000000
+#define defDEBUG0                         0x00000000
+#define defDEBUG1                         0x00000000
+#define defDEBUG2                         0x00000000
+#define defDEBUG3                         0x00000000
+#define defDEBUG4                         0x00000000
+#define defDEBUG5                         0x00000000
+#define defDEBUG6                         0x00000000
+#define defDEBUG7                         0x00000000
+#define defDEBUG8                         0x00000000
+#define defDEBUG9                         0x00000000
+#define defDEBUG10                        0x00000000
+#define defDEBUG11                        0x00000000
+#define defDEBUG12                        0x00000000
+#define defDEBUG13                        0x00000000
+#define defDEBUG14                        0x00000000
+#define defDEBUG15                        0x00000000
+#define defENG_CNTL                       0x00000003 /* bits 0 and 1 set */
+#define defENG_PERF_CNT                   0x00000000
+#define defIDCT_RUNS                      0x00000000
+#define defIDCT_LEVELS                    0x00000000
+#define defIDCT_CONTROL                   0x00000000
+#define defIDCT_AUTH_CONTROL              0x00000000
+#define defIDCT_AUTH                      0x00000000
+#define defMEM_CNTL                       0x00000006 /* bits 1 and 2 set */
+#define defMEM_ARB                        0x00000000
+#define defMC_FB_LOCATION                 0x00ff0000
+#define defMEM_EXT_CNTL                   0x00040010
+#define defMC_EXT_MEM_LOCATION            0x07ff0000
+#define defMEM_EXT_TIMING_CNTL            0x00140c73
+#define defMEM_SDRAM_MODE_REG             0x00050000
+#define defMEM_IO_CNTL                    0x00ff00ff
+#define defMC_DEBUG                       0x00000000
+#define defMC_BIST_CTRL                   0x00000000
+#define defMC_BIST_COLLAR_READ            0x00000000
+#define defTC_MISMATCH                    0x00000000
+#define defMC_PERF_MON_CNTL               0x00000000
+#define defMC_PERF_COUNTERS               0x00000000
+#define defWAIT_UNTIL                     0xc5cdcdcd /* NOTE(review): this and the other 0xcd/0x0d-laden values below resemble a memory fill pattern rather than real reset values - confirm */
+#define defISYNC_CNTL                     0x00000000
+#define defRBBM_GUICNTL                   0x00000000
+#define defRBBM_STATUS                    0x81cdcd40
+#define defRBBM_CNTL                      0x0000000f /* low 4 bits set */
+#define defRBBM_SOFT_RESET                0x00000000
+#define defNQWAIT_UNTIL                   0x00000001
+#define defRBBM_DEBUG                     0x00000000
+#define defRBBM_CMDFIFO_ADDR              0x0000000d
+#define defRBBM_CMDFIFO_DATAL             0xcdcdcdcd
+#define defRBBM_CMDFIFO_DATAH             0x00000dcd
+#define defRBBM_CMDFIFO_STAT              0x00000d0d
+#define defCLK_PIN_CNTL                   0x0000003f /* the last eight defaults follow the same order as the mm* offsets in "Block CG" */
+#define defPLL_REF_FB_DIV                 0x5a500000
+#define defPLL_CNTL                       0x4b000203
+#define defSCLK_CNTL                      0x00ff0300
+#define defPCLK_CNTL                      0x00010000
+#define defCLK_TEST_CNTL                  0x00000000
+#define defPWRMGT_CNTL                    0x00000004 /* only bit 2 set */
+#define defPWRMGT_STATUS                  0x00000001 /* only bit 0 set */
+
+#define CFG_BASE_BOOT_DEFAULT  0x0      /* same value as defCFGREG_BASE */
+#define CFG_BASE_VALUE         0x0
+#define REG_BASE_BOOT_DEFAULT  0x01     /* same value as defREG_BASE */
+#define REG_BASE_VALUE         0x10000
+#define MEM_INT_BASE_VALUE     0x100000
+#define MEM_INT_TOP_VALUE_W100 0x15ffff /* 0x100000..0x15ffff inclusive spans 0x60000 (384 Ki) addresses */
+#define MEM_EXT_BASE_VALUE     0x800000
+#define MEM_EXT_TOP_VALUE      0x9fffff /* 0x800000..0x9fffff inclusive spans 0x200000 (2 Mi) addresses */
+#define WRAP_BUF_BASE_VALUE    0x80000
+#define WRAP_BUF_TOP_VALUE     0xbffff  /* 0x80000..0xbffff inclusive spans 0x40000 (256 Ki) addresses */
+
+//----------------------------------------------------------------------------
+// Register field value definitions; each group is headed by the REGISTER.field name(s) it applies to
+
+// DP_GUI_MASTER_CNTL.GMC_Brush_DataType
+// DP_DATATYPE.Brush_DataType
+#define DP_BRUSH_8x8MONOOPA			0   //8x8 mono pattern (expanded to frgd, bkgd)
+#define DP_BRUSH_8x8MONOTRA			1   //8x8 mono pattern (expanded to frgd, leave_alone)
+#define DP_PEN_32x1MONOOPA			6   //32x1 mono pattern (expanded to frgd, bkgd)
+#define DP_PEN_32x1MONOTRA			7   //32x1 mono pattern (expanded to frgd, leave_alone)
+#define DP_BRUSH_8x8COLOR			10  //8x8 color pattern
+#define DP_BRUSH_SOLIDCOLOR			13  //solid color pattern (frgd)
+#define DP_BRUSH_NONE				15	//no brush used
+
+#define SIZE_BRUSH_8x8MONO			2   // NOTE(review): sizes look like counts of 32-bit words (8x8 mono = 64 bits = 2) - confirm
+#define SIZE_PEN_32x1MONO			1
+#define SIZE_BRUSH_8x8COLOR_8		16
+#define SIZE_BRUSH_8x8COLOR_16		32
+#define MAX_BRUSH_SIZE				SIZE_BRUSH_8x8COLOR_16   // largest of the SIZE_* values above
+
+// DP_GUI_MASTER_CNTL.GMC_Dst_DataType
+// DP_DATATYPE.Dp_Dst_DataType
+#define DP_DST_8BPP					2   // 8 bpp grey scale
+#define DP_DST_16BPP_1555			3   //16 bpp aRGB 1555
+#define DP_DST_16BPP_444			5   //16 bpp aRGB 4444 (NOTE(review): macro name says 444, comment says 4444 - confirm which is intended)
+
+// DP_GUI_MASTER_CNTL.GMC_Src_DataType
+// DP_DATATYPE.Dp_Src_DataType
+#define DP_SRC_1BPP_OPA				0   //mono (expanded to frgd, bkgd)
+#define DP_SRC_1BPP_TRA				1   //mono (expanded to frgd, leave_alone)
+#define DP_SRC_COLOR_SAME_AS_DST				3   //color (same as DST)
+#define	DP_SRC_SOLID_COLOR_BLT		4	//solid color for Blt (use frgd)
+#define	DP_SRC_4BPP					5	//4 bpp
+#define	DP_SRC_12BPP_PACKED			6	//12 bpp packed
+
+// DP_GUI_MASTER_CNTL.GMC_Byte_Pix_Order
+// DP_DATATYPE.Dp_Byte_Pix_Order
+#define DP_PIX_ORDER_MSB2LSB		0   //monochrome pixel order from MSBit to LSBit
+#define DP_PIX_ORDER_LSB2MSB		1   //monochrome pixel order from LSBit to MSBit
+
+// DP_GUI_MASTER_CNTL.GMC_Dp_Src_Source (the values defined here start at 1; 0 is not given a name)
+#define DP_SRC_MEM_LINEAR			1	//loaded from memory (linear trajectory)
+#define DP_SRC_MEM_RECTANGULAR		2   //loaded from memory (rectangular trajectory)
+#define DP_SRC_HOSTDATA_BIT			3   //loaded from hostdata (linear trajectory)
+#define DP_SRC_HOSTDATA_BYTE		4   //loaded from hostdata (linear trajectory & byte-aligned)
+
+// DP_GUI_MASTER_CNTL.GMC_Dp_Op
+#define	DP_OP_ROP					0
+#define	DP_OP_ARITHMETIC			1
+
+// E2_ARITHMETIC_CNTL.opcode (sixteen values, 0 through 15)
+#define	E2_OPC_GLBALP_ADD_SRC2		0
+#define	E2_OPC_GLBALP_SUB_SRC2		1
+#define	E2_OPC_SRC1_ADD_SRC2		2
+#define	E2_OPC_SRC1_SUB_SRC2		3
+#define	E2_OPC_DST_SADDBLEND_SRC2	4
+#define	E2_OPC_DST_CADDBLEND_SRC2	5
+#define	E2_OPC_DST_CSUBBLEND_SRC2	6
+#define	E2_OPC_LF_SRC2				7
+#define	E2_OPC_SCALE_SRC2			8
+#define	E2_OPC_STRETCH_SRC2			9
+#define	E2_OPC_SRC1_4BPPCPYWEXP		10
+#define	E2_OPC_MC1					11
+#define	E2_OPC_MC2					12
+#define E2_OPC_MC1_IDCT				13
+#define	E2_OPC_MC2_IDCT				14
+#define	E2_OPC_IDCT_ONLY_IFRAME		15
+
+// E2_ARITHMETIC_CNTL.clamp
+#define	E2_CLAMP_OFF				0
+#define	E2_CLAMP_ON					1
+
+// E2_ARITHMETIC_CNTL.rounding
+#define	E2_ROUNDING_TRUNCATE		0
+#define	E2_ROUNDING_TO_INFINITY		1
+
+// E2_ARITHMETIC_CNTL.srcblend (same five selector values as destblend below)
+#define	E2_SRCBLEND_GLOBALALPHA		0
+#define	E2_SRCBLEND_ZERO			1
+#define	E2_SRCBLEND_SRC2ALPHA		2
+#define	E2_SRCBLEND_DSTALPHA		3
+#define	E2_SRCBLEND_ALPHA1PLANE		4
+
+// E2_ARITHMETIC_CNTL.destblend (same five selector values as srcblend above)
+#define	E2_DSTBLEND_GLOBALALPHA		0
+#define	E2_DSTBLEND_ZERO			1
+#define	E2_DSTBLEND_SRC2ALPHA		2
+#define	E2_DSTBLEND_DSTALPHA		3
+#define	E2_DSTBLEND_ALPHA1PLANE		4
+
+// LCD_FORMAT.lcd_type (ten values, 0 through 9)
+#define	LCDTYPE_TFT333				0
+#define	LCDTYPE_TFT444				1
+#define	LCDTYPE_TFT555				2
+#define	LCDTYPE_TFT666				3
+#define	LCDTYPE_COLSTNPACK4			4
+#define	LCDTYPE_COLSTNPACK8F1		5
+#define	LCDTYPE_COLSTNPACK8F2		6
+#define	LCDTYPE_COLSTNPACK16		7
+#define	LCDTYPE_MONSTNPACK4			8
+#define	LCDTYPE_MONSTNPACK8			9
+
+// CP_RB_CNTL.rb_bufsz (a 6-bit field in cp_rb_cntl_t); the code goes up by one each time the named size doubles
+#define	RB_SIZE_2K					8
+#define	RB_SIZE_4K					9
+#define	RB_SIZE_8K					10
+#define	RB_SIZE_16K					11
+#define	RB_SIZE_32K					12
+#define	RB_SIZE_64K					13
+
+// GRAPHIC_CTRL.color_depth
+#define	COLOR_DEPTH_1BPP			0
+#define	COLOR_DEPTH_2BPP			1
+#define	COLOR_DEPTH_4BPP			2
+#define	COLOR_DEPTH_8BPP			3
+#define	COLOR_DEPTH_332				4
+#define	COLOR_DEPTH_A444			5
+#define	COLOR_DEPTH_A555			6
+
+// VIDEO_CTRL.video_mode
+#define	VIDEO_MODE_422				0
+#define	VIDEO_MODE_420				1
+
+/* data structure definitions */
+
+typedef struct _chip_id_t {   /* 32 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm for the target compiler */
+     unsigned long vendor_id                      : 16;   /* bits 0-15; defCHIP_ID (0x00001002) would put 0x1002 here */
+     unsigned long device_id                      : 16;   /* bits 16-31 */
+     } chip_id_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;   /* NOTE(review): 'unsigned long' is wider than 32 bits on LP64 targets - confirm this header is only built for 32-bit targets */
+     chip_id_t f;
+} chip_id_u;
+
+typedef struct _revision_id_t {   /* 32 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long minor_rev_id                   : 4;    /* bits 0-3 */
+     unsigned long major_rev_id                   : 4;    /* bits 4-7 */
+     unsigned long                                : 24;   /* unnamed padding, bits 8-31 */
+     } revision_id_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     revision_id_t f;
+} revision_id_u;
+
+typedef struct _wrap_buf_a_t {   /* 32 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long offset_addr_a                  : 24;   /* bits 0-23 */
+     unsigned long block_size_a                   : 3;    /* bits 24-26; defWRAP_BUF_A (0x01000000) would set this to 1 */
+     unsigned long                                : 5;    /* unnamed padding, bits 27-31 */
+     } wrap_buf_a_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     wrap_buf_a_t f;
+} wrap_buf_a_u;
+
+typedef struct _wrap_buf_b_t {   /* same layout as wrap_buf_a_t; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long offset_addr_b                  : 24;   /* bits 0-23 */
+     unsigned long block_size_b                   : 3;    /* bits 24-26; defWRAP_BUF_B (0x01000000) would set this to 1 */
+     unsigned long                                : 5;    /* unnamed padding, bits 27-31 */
+     } wrap_buf_b_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     wrap_buf_b_t f;
+} wrap_buf_b_u;
+
+typedef struct _wrap_top_dir_t {   /* 32 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long top_addr                       : 23;   /* bits 0-22 */
+     unsigned long                                : 9;    /* unnamed padding, bits 23-31 */
+     } wrap_top_dir_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     wrap_top_dir_t f;
+} wrap_top_dir_u;
+
+typedef struct _wrap_start_dir_t {   /* 32 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long start_addr                     : 23;   /* bits 0-22 */
+     unsigned long                                : 9;    /* unnamed padding, bits 23-31 */
+     } wrap_start_dir_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     wrap_start_dir_t f;
+} wrap_start_dir_u;
+
+typedef struct _cif_cntl_t {   /* 21 fields, 32 bits total, no padding; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long swap_reg                       : 2;   /* bits 0-1 */
+     unsigned long swap_fbuf_1                    : 2;   /* bits 2-3 */
+     unsigned long swap_fbuf_2                    : 2;   /* bits 4-5 */
+     unsigned long swap_fbuf_3                    : 2;   /* bits 6-7 */
+     unsigned long pmi_int_disable                : 1;   /* bit 8 */
+     unsigned long pmi_schmen_disable             : 1;   /* bit 9 */
+     unsigned long intb_oe                        : 1;   /* bit 10 */
+     unsigned long en_wait_to_compensate_dq_prop_dly : 1;   /* bit 11 */
+     unsigned long compensate_wait_rd_size        : 2;   /* bits 12-13 */
+     unsigned long wait_asserted_timeout_val      : 2;   /* bits 14-15 */
+     unsigned long wait_masked_val                : 2;   /* bits 16-17 */
+     unsigned long en_wait_timeout                : 1;   /* bit 18 */
+     unsigned long en_one_clk_setup_before_wait   : 1;   /* bit 19 */
+     unsigned long interrupt_active_high          : 1;   /* bit 20 */
+     unsigned long en_overwrite_straps            : 1;   /* bit 21 */
+     unsigned long strap_wait_active_hi           : 1;   /* bit 22 */
+     unsigned long lat_busy_count                 : 2;   /* bits 23-24 */
+     unsigned long lat_rd_pm4_sclk_busy           : 1;   /* bit 25 */
+     unsigned long dis_system_bits                : 1;   /* bit 26 */
+     unsigned long dis_mr                         : 1;   /* bit 27 */
+     unsigned long cif_spare_1                    : 4;   /* bits 28-31 */
+     } cif_cntl_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     cif_cntl_t f;
+} cif_cntl_u;
+
+typedef struct _cfgreg_base_t {   /* 32 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long cfgreg_base                    : 24;   /* bits 0-23 */
+     unsigned long                                : 8;    /* unnamed padding, bits 24-31 */
+     } cfgreg_base_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     cfgreg_base_t f;
+} cfgreg_base_u;
+
+typedef struct _cif_io_t {   /* three identical 10-bit groups (dq, waitb, intb) plus 2 padding bits; bit ranges assume LSB-first allocation - TODO confirm */
+     unsigned long dq_srp                         : 1;   /* bit 0 */
+     unsigned long dq_srn                         : 1;   /* bit 1 */
+     unsigned long dq_sp                          : 4;   /* bits 2-5 */
+     unsigned long dq_sn                          : 4;   /* bits 6-9 */
+     unsigned long waitb_srp                      : 1;   /* bit 10 */
+     unsigned long waitb_srn                      : 1;   /* bit 11 */
+     unsigned long waitb_sp                       : 4;   /* bits 12-15 */
+     unsigned long waitb_sn                       : 4;   /* bits 16-19 */
+     unsigned long intb_srp                       : 1;   /* bit 20 */
+     unsigned long intb_srn                       : 1;   /* bit 21 */
+     unsigned long intb_sp                        : 4;   /* bits 22-25 */
+     unsigned long intb_sn                        : 4;   /* bits 26-29 */
+     unsigned long                                : 2;   /* unnamed padding, bits 30-31 */
+     } cif_io_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     cif_io_t f;
+} cif_io_u;
+
+typedef struct _cif_read_dbg_t {   /* 25 named bits plus 7 padding bits = 32; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long unpacker_pre_fetch_trig_gen    : 2;   /* bits 0-1 */
+     unsigned long dly_second_rd_fetch_trig       : 1;   /* bit 2 */
+     unsigned long rst_rd_burst_id                : 1;   /* bit 3 */
+     unsigned long dis_rd_burst_id                : 1;   /* bit 4 */
+     unsigned long en_block_rd_when_packer_is_not_emp : 1;   /* bit 5 */
+     unsigned long dis_pre_fetch_cntl_sm          : 1;   /* bit 6 */
+     unsigned long rbbm_chrncy_dis                : 1;   /* bit 7 */
+     unsigned long rbbm_rd_after_wr_lat           : 2;   /* bits 8-9 */
+     unsigned long dis_be_during_rd               : 1;   /* bit 10 */
+     unsigned long one_clk_invalidate_pulse       : 1;   /* bit 11 */
+     unsigned long dis_chnl_priority              : 1;   /* bit 12 */
+     unsigned long rst_read_path_a_pls            : 1;   /* bit 13 */
+     unsigned long rst_read_path_b_pls            : 1;   /* bit 14 */
+     unsigned long dis_reg_rd_fetch_trig          : 1;   /* bit 15 */
+     unsigned long dis_rd_fetch_trig_from_ind_addr : 1;   /* bit 16 */
+     unsigned long dis_rd_same_byte_to_trig_fetch : 1;   /* bit 17 */
+     unsigned long dis_dir_wrap                   : 1;   /* bit 18 */
+     unsigned long dis_ring_buf_to_force_dec      : 1;   /* bit 19 */
+     unsigned long dis_addr_comp_in_16bit         : 1;   /* bit 20 */
+     unsigned long clr_w                          : 1;   /* bit 21 */
+     unsigned long err_rd_tag_is_3                : 1;   /* bit 22 */
+     unsigned long err_load_when_ful_a            : 1;   /* bit 23 */
+     unsigned long err_load_when_ful_b            : 1;   /* bit 24 */
+     unsigned long                                : 7;   /* unnamed padding, bits 25-31 */
+     } cif_read_dbg_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     cif_read_dbg_t f;
+} cif_read_dbg_u;
+
+typedef struct _cif_write_dbg_t {   /* 31 named bits plus 1 padding bit = 32; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long packer_timeout_count           : 2;   /* bits 0-1 */
+     unsigned long en_upper_load_cond             : 1;   /* bit 2 */
+     unsigned long en_chnl_change_cond            : 1;   /* bit 3 */
+     unsigned long dis_addr_comp_cond             : 1;   /* bit 4 */
+     unsigned long dis_load_same_byte_addr_cond   : 1;   /* bit 5 */
+     unsigned long dis_timeout_cond               : 1;   /* bit 6 */
+     unsigned long dis_timeout_during_rbbm        : 1;   /* bit 7 */
+     unsigned long dis_packer_ful_during_rbbm_timeout : 1;   /* bit 8 */
+     unsigned long en_dword_split_to_rbbm         : 1;   /* bit 9 */
+     unsigned long en_dummy_val                   : 1;   /* bit 10 */
+     unsigned long dummy_val_sel                  : 1;   /* bit 11 */
+     unsigned long mask_pm4_wrptr_dec             : 1;   /* bit 12 */
+     unsigned long dis_mc_clean_cond              : 1;   /* bit 13 */
+     unsigned long err_two_reqi_during_ful        : 1;   /* bit 14 */
+     unsigned long err_reqi_during_idle_clk       : 1;   /* bit 15 */
+     unsigned long err_global                     : 1;   /* bit 16 */
+     unsigned long en_wr_buf_dbg_load             : 1;   /* bit 17 */
+     unsigned long en_wr_buf_dbg_path             : 1;   /* bit 18 */
+     unsigned long sel_wr_buf_byte                : 3;   /* bits 19-21 */
+     unsigned long dis_rd_flush_wr                : 1;   /* bit 22 */
+     unsigned long dis_packer_ful_cond            : 1;   /* bit 23 */
+     unsigned long dis_invalidate_by_ops_chnl     : 1;   /* bit 24 */
+     unsigned long en_halt_when_reqi_err          : 1;   /* bit 25 */
+     unsigned long cif_spare_2                    : 5;   /* bits 26-30 */
+     unsigned long                                : 1;   /* unnamed padding, bit 31 */
+     } cif_write_dbg_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     cif_write_dbg_t f;
+} cif_write_dbg_u;
+
+typedef struct _ind_addr_a_0_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char ind_addr_a_0                   : 8;
+     } ind_addr_a_0_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;   /* NOTE(review): bit-fields of type 'unsigned char' are implementation-defined in ISO C (accepted by GCC) */
+     ind_addr_a_0_t f;
+} ind_addr_a_0_u;
+
+typedef struct _ind_addr_a_1_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char ind_addr_a_1                   : 8;
+     } ind_addr_a_1_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     ind_addr_a_1_t f;
+} ind_addr_a_1_u;
+
+typedef struct _ind_addr_a_2_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char ind_addr_a_2                   : 8;
+     } ind_addr_a_2_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     ind_addr_a_2_t f;
+} ind_addr_a_2_u;
+
+typedef struct _ind_data_a_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char ind_data_a                     : 8;
+     } ind_data_a_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     ind_data_a_t f;
+} ind_data_a_u;
+
+typedef struct _reg_base_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char reg_base                       : 8;   /* defREG_BASE and REG_BASE_BOOT_DEFAULT are both 0x01 */
+     } reg_base_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     reg_base_t f;
+} reg_base_u;
+
+typedef struct _intf_cntl_t {   /* eight 1-bit flags: the same four for channel a, then channel b; bit numbers assume LSB-first allocation - TODO confirm */
+     unsigned char ad_inc_a                       : 1;   /* bit 0; set in defINTF_CNTL (0x11) under that assumption */
+     unsigned char ring_buf_a                     : 1;   /* bit 1 */
+     unsigned char rd_fetch_trigger_a             : 1;   /* bit 2 */
+     unsigned char rd_data_rdy_a                  : 1;   /* bit 3 */
+     unsigned char ad_inc_b                       : 1;   /* bit 4; set in defINTF_CNTL (0x11) under that assumption */
+     unsigned char ring_buf_b                     : 1;   /* bit 5 */
+     unsigned char rd_fetch_trigger_b             : 1;   /* bit 6 */
+     unsigned char rd_data_rdy_b                  : 1;   /* bit 7 */
+     } intf_cntl_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     intf_cntl_t f;
+} intf_cntl_u;
+
+typedef struct _status_t {   /* 8 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned char wr_fifo_available_space        : 2;   /* bits 0-1 */
+     unsigned char fbuf_wr_pipe_emp               : 1;   /* bit 2 */
+     unsigned char soft_reset                     : 1;   /* bit 3 */
+     unsigned char system_pwm_mode                : 2;   /* bits 4-5 */
+     unsigned char mem_access_dis                 : 1;   /* bit 6 */
+     unsigned char en_pre_fetch                   : 1;   /* bit 7 */
+     } status_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     status_t f;
+} status_u;
+
+typedef struct _cpu_defaults_t {   /* eight 1-bit flags; bit numbers below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned char unpack_rd_data                 : 1;   /* bit 0 */
+     unsigned char access_ind_addr_a              : 1;   /* bit 1; set in defCPU_DEFAULTS (0x06) under that assumption */
+     unsigned char access_ind_addr_b              : 1;   /* bit 2; set in defCPU_DEFAULTS (0x06) under that assumption */
+     unsigned char access_scratch_reg             : 1;   /* bit 3 */
+     unsigned char pack_wr_data                   : 1;   /* bit 4 */
+     unsigned char transition_size                : 1;   /* bit 5 */
+     unsigned char en_read_buf_mode               : 1;   /* bit 6 */
+     unsigned char rd_fetch_scratch               : 1;   /* bit 7 */
+     } cpu_defaults_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     cpu_defaults_t f;
+} cpu_defaults_u;
+
+typedef struct _ind_addr_b_0_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char ind_addr_b_0                   : 8;
+     } ind_addr_b_0_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     ind_addr_b_0_t f;
+} ind_addr_b_0_u;
+
+typedef struct _ind_addr_b_1_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char ind_addr_b_1                   : 8;
+     } ind_addr_b_1_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     ind_addr_b_1_t f;
+} ind_addr_b_1_u;
+
+typedef struct _ind_addr_b_2_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char ind_addr_b_2                   : 8;
+     } ind_addr_b_2_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     ind_addr_b_2_t f;
+} ind_addr_b_2_u;
+
+typedef struct _ind_data_b_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char ind_data_b                     : 8;
+     } ind_data_b_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     ind_data_b_t f;
+} ind_data_b_u;
+
+typedef struct _pm4_rptr_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char pm4_rptr                       : 8;
+     } pm4_rptr_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     pm4_rptr_t f;
+} pm4_rptr_u;
+
+typedef struct _scratch_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char scratch                        : 8;
+     } scratch_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     scratch_t f;
+} scratch_u;
+
+typedef struct _pm4_wrptr_0_t {   /* 8-bit layout with a single field covering all 8 bits */
+     unsigned char pm4_wrptr_0                    : 8;
+     } pm4_wrptr_0_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     pm4_wrptr_0_t f;
+} pm4_wrptr_0_u;
+
+typedef struct _pm4_wrptr_1_t {   /* 8 bits total: a 6-bit value plus two 1-bit flags; bit ranges assume LSB-first allocation - TODO confirm */
+     unsigned char pm4_wrptr_1                    : 6;   /* bits 0-5 */
+     unsigned char rd_fetch_pm4_rptr              : 1;   /* bit 6 */
+     unsigned char wrptr_atomic_update_w          : 1;   /* bit 7 */
+     } pm4_wrptr_1_t;
+
+typedef union {   /* raw 8-bit value 'val' overlaid with the field view 'f' */
+     unsigned char val : 8;
+     pm4_wrptr_1_t f;
+} pm4_wrptr_1_u;
+
+typedef struct _cp_rb_cntl_t {   /* 32 bits total with four padding gaps; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long rb_bufsz                       : 6;   /* bits 0-5; the RB_SIZE_* constants are values for this field */
+     unsigned long                                : 2;   /* unnamed padding, bits 6-7 */
+     unsigned long rb_blksz                       : 6;   /* bits 8-13 */
+     unsigned long                                : 2;   /* unnamed padding, bits 14-15 */
+     unsigned long buf_swap                       : 2;   /* bits 16-17 */
+     unsigned long max_fetch                      : 2;   /* bits 18-19 */
+     unsigned long                                : 7;   /* unnamed padding, bits 20-26 */
+     unsigned long rb_no_update                   : 1;   /* bit 27 */
+     unsigned long                                : 3;   /* unnamed padding, bits 28-30 */
+     unsigned long rb_rptr_wr_ena                 : 1;   /* bit 31 */
+     } cp_rb_cntl_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     cp_rb_cntl_t f;
+} cp_rb_cntl_u;
+
+typedef struct _cp_rb_base_t {   /* 32 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long                                : 2;    /* unnamed padding, bits 0-1 */
+     unsigned long rb_base                        : 22;   /* bits 2-23 */
+     unsigned long                                : 8;    /* unnamed padding, bits 24-31 */
+     } cp_rb_base_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     cp_rb_base_t f;
+} cp_rb_base_u;
+
+typedef struct _cp_rb_rptr_addr_t {   /* 32 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long rb_rptr_swap                   : 2;    /* bits 0-1 */
+     unsigned long rb_rptr_addr                   : 22;   /* bits 2-23 */
+     unsigned long                                : 8;    /* unnamed padding, bits 24-31 */
+     } cp_rb_rptr_addr_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     cp_rb_rptr_addr_t f;
+} cp_rb_rptr_addr_u;
+
+typedef struct _cp_rb_rptr_t {   /* 32 bits total; bit ranges below assume LSB-first bit-field allocation - TODO confirm */
+     unsigned long rb_rptr                        : 23;   /* bits 0-22 */
+     unsigned long                                : 9;    /* unnamed padding, bits 23-31 */
+     } cp_rb_rptr_t;
+
+typedef union {   /* raw value 'val' overlaid with the field view 'f' */
+     unsigned long val : 32;
+     cp_rb_rptr_t f;
+} cp_rb_rptr_u;
+
+typedef struct _cp_rb_rptr_wr_t {  /* field view; declared widths total 32 bits */
+     unsigned long rb_rptr_wr                     : 23;
+     unsigned long                                : 9;  /* unnamed gap */
+     } cp_rb_rptr_wr_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_rb_rptr_wr_t f;
+} cp_rb_rptr_wr_u;
+
+typedef struct _cp_rb_wptr_t {  /* field view; declared widths total 32 bits */
+     unsigned long rb_wptr                        : 23;
+     unsigned long                                : 9;  /* unnamed gap */
+     } cp_rb_wptr_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_rb_wptr_t f;
+} cp_rb_wptr_u;
+
+typedef struct _cp_ib_base_t {  /* field view; declared widths total 32 bits */
+     unsigned long                                : 2;  /* unnamed gap before ib_base */
+     unsigned long ib_base                        : 22;
+     unsigned long                                : 8;  /* unnamed gap */
+     } cp_ib_base_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_ib_base_t f;
+} cp_ib_base_u;
+
+typedef struct _cp_ib_bufsz_t {  /* field view; declared widths total 32 bits */
+     unsigned long ib_bufsz                       : 23;
+     unsigned long                                : 9;  /* unnamed gap */
+     } cp_ib_bufsz_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_ib_bufsz_t f;
+} cp_ib_bufsz_u;
+
+typedef struct _cp_csq_cntl_t {  /* field view; declared widths total 32 bits */
+     unsigned long csq_cnt_primary                : 8;
+     unsigned long csq_cnt_indirect               : 8;
+     unsigned long                                : 12;  /* unnamed gap */
+     unsigned long csq_mode                       : 4;
+     } cp_csq_cntl_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_csq_cntl_t f;
+} cp_csq_cntl_u;
+
+typedef struct _cp_csq_aper_primary_t {  /* field view: one full-width 32-bit field */
+     unsigned long cp_csq_aper_primary            : 32;
+     } cp_csq_aper_primary_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_csq_aper_primary_t f;
+} cp_csq_aper_primary_u;
+
+typedef struct _cp_csq_aper_indirect_t {  /* field view: one full-width 32-bit field */
+     unsigned long cp_csq_aper_indirect           : 32;
+     } cp_csq_aper_indirect_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_csq_aper_indirect_t f;
+} cp_csq_aper_indirect_u;
+
+typedef struct _cp_me_cntl_t {  /* field view; declared widths total 32 bits */
+     unsigned long me_stat                        : 16;
+     unsigned long me_statmux                     : 5;
+     unsigned long                                : 8;  /* unnamed gap */
+     unsigned long me_busy                        : 1;
+     unsigned long me_mode                        : 1;
+     unsigned long me_step                        : 1;
+     } cp_me_cntl_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_me_cntl_t f;
+} cp_me_cntl_u;
+
+typedef struct _cp_me_ram_addr_t {  /* field view; declared widths total 32 bits */
+     unsigned long me_ram_addr                    : 8;
+     unsigned long                                : 24;  /* unnamed gap */
+     } cp_me_ram_addr_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_me_ram_addr_t f;
+} cp_me_ram_addr_u;
+
+typedef struct _cp_me_ram_raddr_t {  /* field view; declared widths total 32 bits */
+     unsigned long me_ram_raddr                   : 8;
+     unsigned long                                : 24;  /* unnamed gap */
+     } cp_me_ram_raddr_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_me_ram_raddr_t f;
+} cp_me_ram_raddr_u;
+
+typedef struct _cp_me_ram_datah_t {  /* field view; declared widths total 32 bits */
+     unsigned long me_ram_datah                   : 6;
+     unsigned long                                : 26;  /* unnamed gap */
+     } cp_me_ram_datah_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_me_ram_datah_t f;
+} cp_me_ram_datah_u;
+
+typedef struct _cp_me_ram_datal_t {  /* field view: one full-width 32-bit field */
+     unsigned long me_ram_datal                   : 32;
+     } cp_me_ram_datal_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_me_ram_datal_t f;
+} cp_me_ram_datal_u;
+
+typedef struct _cp_debug_t {  /* field view: one full-width 32-bit field */
+     unsigned long cp_debug                       : 32;
+     } cp_debug_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_debug_t f;
+} cp_debug_u;
+
+typedef struct _scratch_reg0_t {  /* field view: one full-width 32-bit field */
+     unsigned long scratch_reg0                   : 32;
+     } scratch_reg0_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     scratch_reg0_t f;
+} scratch_reg0_u;
+
+typedef struct _scratch_reg1_t {  /* field view: one full-width 32-bit field */
+     unsigned long scratch_reg1                   : 32;
+     } scratch_reg1_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     scratch_reg1_t f;
+} scratch_reg1_u;
+
+typedef struct _scratch_reg2_t {  /* field view: one full-width 32-bit field */
+     unsigned long scratch_reg2                   : 32;
+     } scratch_reg2_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     scratch_reg2_t f;
+} scratch_reg2_u;
+
+typedef struct _scratch_reg3_t {  /* field view: one full-width 32-bit field */
+     unsigned long scratch_reg3                   : 32;
+     } scratch_reg3_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     scratch_reg3_t f;
+} scratch_reg3_u;
+
+typedef struct _scratch_reg4_t {  /* field view: one full-width 32-bit field */
+     unsigned long scratch_reg4                   : 32;
+     } scratch_reg4_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     scratch_reg4_t f;
+} scratch_reg4_u;
+
+typedef struct _scratch_reg5_t {  /* field view: one full-width 32-bit field */
+     unsigned long scratch_reg5                   : 32;
+     } scratch_reg5_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     scratch_reg5_t f;
+} scratch_reg5_u;
+
+typedef struct _scratch_umsk_t {  /* field view; declared widths total 32 bits */
+     unsigned long scratch_umsk                   : 6;
+     unsigned long                                : 10;  /* unnamed gap */
+     unsigned long scratch_swap                   : 2;
+     unsigned long                                : 14;  /* unnamed gap */
+     } scratch_umsk_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     scratch_umsk_t f;
+} scratch_umsk_u;
+
+typedef struct _scratch_addr_t {  /* field view; declared widths total 32 bits */
+     unsigned long                                : 5;  /* unnamed gap before scratch_addr */
+     unsigned long scratch_addr                   : 27;
+     } scratch_addr_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     scratch_addr_t f;
+} scratch_addr_u;
+
+typedef struct _cp_csq_addr_t {  /* field view; declared widths total 32 bits */
+     unsigned long                                : 2;  /* unnamed gap before csq_addr */
+     unsigned long csq_addr                       : 8;
+     unsigned long                                : 22;  /* unnamed gap */
+     } cp_csq_addr_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_csq_addr_t f;
+} cp_csq_addr_u;
+
+typedef struct _cp_csq_data_t {  /* field view: one full-width 32-bit field */
+     unsigned long csq_data                       : 32;
+     } cp_csq_data_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_csq_data_t f;
+} cp_csq_data_u;
+
+typedef struct _cp_csq_stat_t {  /* field view: four 8-bit fields, 32 bits in total */
+     unsigned long csq_rptr_primary               : 8;
+     unsigned long csq_wptr_primary               : 8;
+     unsigned long csq_rptr_indirect              : 8;
+     unsigned long csq_wptr_indirect              : 8;
+     } cp_csq_stat_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_csq_stat_t f;
+} cp_csq_stat_u;
+
+typedef struct _cp_stat_t {  /* field view of 1-bit flags; declared widths total 32 bits */
+     unsigned long mru_busy                       : 1;
+     unsigned long mwu_busy                       : 1;
+     unsigned long rsiu_busy                      : 1;
+     unsigned long rciu_busy                      : 1;
+     unsigned long                                : 5;  /* unnamed gap */
+     unsigned long csf_primary_busy               : 1;
+     unsigned long csf_indirect_busy              : 1;
+     unsigned long csq_primary_busy               : 1;
+     unsigned long csq_indirect_busy              : 1;
+     unsigned long csi_busy                       : 1;
+     unsigned long                                : 14;  /* unnamed gap */
+     unsigned long guidma_busy                    : 1;
+     unsigned long viddma_busy                    : 1;
+     unsigned long cmdstrm_busy                   : 1;
+     unsigned long cp_busy                        : 1;
+     } cp_stat_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cp_stat_t f;
+} cp_stat_u;
+
+typedef struct _gen_int_cntl_t {  /* field view of 1-bit *_mask flags; widths total 32 bits */
+     unsigned long crtc_vblank_mask               : 1;
+     unsigned long crtc_vline_mask                : 1;
+     unsigned long crtc_hwint1_mask               : 1;
+     unsigned long crtc_hwint2_mask               : 1;
+     unsigned long                                : 15;  /* unnamed gap */
+     unsigned long gui_idle_mask                  : 1;
+     unsigned long                                : 8;  /* unnamed gap */
+     unsigned long pm4_idle_int_mask              : 1;
+     unsigned long dvi_i2c_int_mask               : 1;
+     unsigned long                                : 2;  /* unnamed gap */
+     } gen_int_cntl_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     gen_int_cntl_t f;
+} gen_int_cntl_u;
+
+typedef struct _gen_int_status_rd_t {  /* field view of 1-bit *_stat flags; same widths and order as gen_int_cntl_t */
+     unsigned long crtc_vblank_stat               : 1;
+     unsigned long crtc_vline_stat                : 1;
+     unsigned long crtc_hwint1_stat               : 1;
+     unsigned long crtc_hwint2_stat               : 1;
+     unsigned long                                : 15;  /* unnamed gap */
+     unsigned long gui_idle_stat                  : 1;
+     unsigned long                                : 8;  /* unnamed gap */
+     unsigned long pm4_idle_int_stat              : 1;
+     unsigned long dvi_i2c_int_stat               : 1;
+     unsigned long                                : 2;  /* unnamed gap */
+     } gen_int_status_rd_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     gen_int_status_rd_t f;
+} gen_int_status_rd_u;
+
+typedef struct _gen_int_status_wr_t {  /* field view of 1-bit *_ak flags; same widths and order as gen_int_status_rd_t */
+     unsigned long crtc_vblank_stat_ak            : 1;
+     unsigned long crtc_vline_stat_ak             : 1;
+     unsigned long crtc_hwint1_stat_ak            : 1;
+     unsigned long crtc_hwint2_stat_ak            : 1;
+     unsigned long                                : 15;  /* unnamed gap */
+     unsigned long gui_idle_stat_ak               : 1;
+     unsigned long                                : 8;  /* unnamed gap */
+     unsigned long pm4_idle_int_ak                : 1;
+     unsigned long dvi_i2c_int_ak                 : 1;
+     unsigned long                                : 2;  /* unnamed gap */
+     } gen_int_status_wr_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     gen_int_status_wr_t f;
+} gen_int_status_wr_u;
+
+typedef struct _lcd_format_t {  /* field view; declared widths total 32 bits */
+     unsigned long lcd_type                       : 4;
+     unsigned long color_to_mono                  : 1;
+     unsigned long data_inv                       : 1;
+     unsigned long stn_fm                         : 2;
+     unsigned long tft_fm                         : 2;
+     unsigned long scan_lr_en                     : 1;
+     unsigned long scan_ud_en                     : 1;
+     unsigned long pol_inv                        : 1;
+     unsigned long rst_fm                         : 1;
+     unsigned long yuv_to_rgb                     : 1;
+     unsigned long hr_tft                         : 1;
+     unsigned long ulc_panel                      : 1;
+     unsigned long                                : 15;  /* unnamed gap */
+     } lcd_format_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     lcd_format_t f;
+} lcd_format_u;
+
+typedef struct _graphic_ctrl_t {  /* field view; widths total 32 bits. NOTE(review): trailing numbers are unexplained here - presumably sample/default field values, TODO confirm */
+     unsigned long color_depth                    : 3; // 6
+     unsigned long portrait_mode                  : 2; // 0
+     unsigned long low_power_on                   : 1; // 1
+     unsigned long req_freq                       : 4; // 5
+     unsigned long en_crtc                        : 1; // 1
+     unsigned long en_graphic_req                 : 1; // 1
+     unsigned long en_graphic_crtc                : 1; // 1
+     unsigned long total_req_graphic              : 9; // 240
+     unsigned long lcd_pclk_on                    : 1; // 1
+     unsigned long lcd_sclk_on                    : 1; // 1
+     unsigned long pclk_running                   : 1; // 1
+     unsigned long sclk_running                   : 1; // 1
+     unsigned long                                : 6;  /* unnamed gap */
+     } graphic_ctrl_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     graphic_ctrl_t f;
+} graphic_ctrl_u;
+
+typedef struct _graphic_offset_t {  /* field view; declared widths total 32 bits */
+     unsigned long graphic_offset                 : 24;
+     unsigned long                                : 8;  /* unnamed gap */
+     } graphic_offset_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     graphic_offset_t f;
+} graphic_offset_u;
+
+typedef struct _graphic_pitch_t {  /* field view; declared widths total 32 bits */
+     unsigned long graphic_pitch                  : 11;
+     unsigned long                                : 21;  /* unnamed gap */
+     } graphic_pitch_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     graphic_pitch_t f;
+} graphic_pitch_u;
+
+typedef struct _crtc_total_t {  /* field view: two 10-bit fields, each followed by a 6-bit gap */
+     unsigned long crtc_h_total                   : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long crtc_v_total                   : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } crtc_total_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_total_t f;
+} crtc_total_u;
+
+typedef struct _active_h_disp_t {  /* field view: 10-bit start and end, each followed by a 6-bit gap */
+     unsigned long active_h_start                 : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long active_h_end                   : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } active_h_disp_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     active_h_disp_t f;
+} active_h_disp_u;
+
+typedef struct _active_v_disp_t {  /* field view: 10-bit start and end, each followed by a 6-bit gap */
+     unsigned long active_v_start                 : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long active_v_end                   : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } active_v_disp_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     active_v_disp_t f;
+} active_v_disp_u;
+
+typedef struct _graphic_h_disp_t {  /* field view: 10-bit start and end, each followed by a 6-bit gap */
+     unsigned long graphic_h_start                : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long graphic_h_end                  : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } graphic_h_disp_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     graphic_h_disp_t f;
+} graphic_h_disp_u;
+
+typedef struct _graphic_v_disp_t {  /* field view: 10-bit start and end, each followed by a 6-bit gap */
+     unsigned long graphic_v_start                : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long graphic_v_end                  : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } graphic_v_disp_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     graphic_v_disp_t f;
+} graphic_v_disp_u;
+
+typedef struct _video_ctrl_t {  /* field view, no gaps; each trailing hex value is that field's mask within val when fields are allocated from bit 0 upward */
+     unsigned long video_mode                     : 1; // 00000001
+     unsigned long keyer_en                       : 1; // 00000002
+     unsigned long en_video_req                   : 1; // 00000004
+     unsigned long en_graphic_req_video           : 1; // 00000008
+     unsigned long en_video_crtc                  : 1; // 00000010
+     unsigned long video_hor_exp                  : 2; // 00000060
+     unsigned long video_ver_exp                  : 2; // 00000180
+     unsigned long uv_combine                     : 1; // 00000200
+     unsigned long total_req_video                : 9; // 0007fc00
+     unsigned long video_ch_sel                   : 1; // 00080000
+     unsigned long video_portrait                 : 2; // 00300000
+     unsigned long yuv2rgb_en                     : 1; // 00400000
+     unsigned long yuv2rgb_option                 : 1; // 00800000
+     unsigned long video_inv_hor                  : 1; // 01000000
+     unsigned long video_inv_ver                  : 1; // 02000000
+     unsigned long gamma_sel                      : 2; // 0c000000
+     unsigned long dis_limit                      : 1; // 10000000
+     unsigned long en_uv_hblend                   : 1; // 20000000
+     unsigned long rgb_gamma_sel                  : 2; // c0000000
+     } video_ctrl_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     video_ctrl_t f;
+} video_ctrl_u;
+
+typedef struct _graphic_key_t {  /* field view: two 16-bit fields, 32 bits in total */
+     unsigned long keyer_color                    : 16;
+     unsigned long keyer_mask                     : 16;
+     } graphic_key_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     graphic_key_t f;
+} graphic_key_u;
+
+typedef struct _video_y_offset_t {  /* field view; declared widths total 32 bits */
+     unsigned long y_offset                       : 24;
+     unsigned long                                : 8;  /* unnamed gap */
+     } video_y_offset_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     video_y_offset_t f;
+} video_y_offset_u;
+
+typedef struct _video_y_pitch_t {  /* field view; declared widths total 32 bits */
+     unsigned long y_pitch                        : 11;
+     unsigned long                                : 21;  /* unnamed gap */
+     } video_y_pitch_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     video_y_pitch_t f;
+} video_y_pitch_u;
+
+typedef struct _video_u_offset_t {  /* field view; declared widths total 32 bits */
+     unsigned long u_offset                       : 24;
+     unsigned long                                : 8;  /* unnamed gap */
+     } video_u_offset_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     video_u_offset_t f;
+} video_u_offset_u;
+
+typedef struct _video_u_pitch_t {  /* field view; declared widths total 32 bits */
+     unsigned long u_pitch                        : 11;
+     unsigned long                                : 21;  /* unnamed gap */
+     } video_u_pitch_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     video_u_pitch_t f;
+} video_u_pitch_u;
+
+typedef struct _video_v_offset_t {  /* field view; declared widths total 32 bits */
+     unsigned long v_offset                       : 24;
+     unsigned long                                : 8;  /* unnamed gap */
+     } video_v_offset_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     video_v_offset_t f;
+} video_v_offset_u;
+
+typedef struct _video_v_pitch_t {  /* field view; declared widths total 32 bits */
+     unsigned long v_pitch                        : 11;
+     unsigned long                                : 21;  /* unnamed gap */
+     } video_v_pitch_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     video_v_pitch_t f;
+} video_v_pitch_u;
+
+typedef struct _video_h_pos_t {  /* field view: 10-bit start and end, each followed by a 6-bit gap */
+     unsigned long video_h_start                  : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long video_h_end                    : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } video_h_pos_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     video_h_pos_t f;
+} video_h_pos_u;
+
+typedef struct _video_v_pos_t {  /* field view: 10-bit start and end, each followed by a 6-bit gap */
+     unsigned long video_v_start                  : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long video_v_end                    : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } video_v_pos_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     video_v_pos_t f;
+} video_v_pos_u;
+
+typedef struct _brightness_cntl_t {  /* field view; declared widths total 32 bits */
+     unsigned long brightness                     : 7;
+     unsigned long                                : 25;  /* unnamed gap */
+     } brightness_cntl_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     brightness_cntl_t f;
+} brightness_cntl_u;
+
+typedef struct _cursor1_offset_t {  /* field view, no gaps; declared widths total 32 bits */
+     unsigned long cur1_offset                    : 24;
+     unsigned long cur1_x_offset                  : 4;
+     unsigned long cur1_y_offset                  : 4;
+     } cursor1_offset_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor1_offset_t f;
+} cursor1_offset_u;
+
+typedef struct _cursor1_h_pos_t {  /* field view; cur1_en is the last declared bit; widths total 32 bits */
+     unsigned long cur1_h_start                   : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long cur1_h_end                     : 10;
+     unsigned long                                : 5;  /* unnamed gap */
+     unsigned long cur1_en                        : 1;
+     } cursor1_h_pos_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor1_h_pos_t f;
+} cursor1_h_pos_u;
+
+typedef struct _cursor1_v_pos_t {  /* field view: 10-bit start and end, each followed by a 6-bit gap */
+     unsigned long cur1_v_start                   : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long cur1_v_end                     : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } cursor1_v_pos_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor1_v_pos_t f;
+} cursor1_v_pos_u;
+
+typedef struct _cursor1_color0_t {  /* field view: three 8-bit fields (r, g, b) plus an 8-bit gap */
+     unsigned long cur1_color0_r                  : 8;
+     unsigned long cur1_color0_g                  : 8;
+     unsigned long cur1_color0_b                  : 8;
+     unsigned long                                : 8;  /* unnamed gap */
+     } cursor1_color0_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor1_color0_t f;
+} cursor1_color0_u;
+
+typedef struct _cursor1_color1_t {  /* field view: three 8-bit fields (r, g, b) plus an 8-bit gap */
+     unsigned long cur1_color1_r                  : 8;
+     unsigned long cur1_color1_g                  : 8;
+     unsigned long cur1_color1_b                  : 8;
+     unsigned long                                : 8;  /* unnamed gap */
+     } cursor1_color1_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor1_color1_t f;
+} cursor1_color1_u;
+
+typedef struct _cursor2_offset_t {  /* field view, no gaps; same widths as cursor1_offset_t */
+     unsigned long cur2_offset                    : 24;
+     unsigned long cur2_x_offset                  : 4;
+     unsigned long cur2_y_offset                  : 4;
+     } cursor2_offset_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor2_offset_t f;
+} cursor2_offset_u;
+
+typedef struct _cursor2_h_pos_t {  /* field view; cur2_en is the last declared bit; widths total 32 bits */
+     unsigned long cur2_h_start                   : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long cur2_h_end                     : 10;
+     unsigned long                                : 5;  /* unnamed gap */
+     unsigned long cur2_en                        : 1;
+     } cursor2_h_pos_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor2_h_pos_t f;
+} cursor2_h_pos_u;
+
+typedef struct _cursor2_v_pos_t {  /* field view: 10-bit start and end, each followed by a 6-bit gap */
+     unsigned long cur2_v_start                   : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long cur2_v_end                     : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } cursor2_v_pos_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor2_v_pos_t f;
+} cursor2_v_pos_u;
+
+typedef struct _cursor2_color0_t {  /* field view: three 8-bit fields (r, g, b) plus an 8-bit gap */
+     unsigned long cur2_color0_r                  : 8;
+     unsigned long cur2_color0_g                  : 8;
+     unsigned long cur2_color0_b                  : 8;
+     unsigned long                                : 8;  /* unnamed gap */
+     } cursor2_color0_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor2_color0_t f;
+} cursor2_color0_u;
+
+typedef struct _cursor2_color1_t {  /* field view: three 8-bit fields (r, g, b) plus an 8-bit gap */
+     unsigned long cur2_color1_r                  : 8;
+     unsigned long cur2_color1_g                  : 8;
+     unsigned long cur2_color1_b                  : 8;
+     unsigned long                                : 8;  /* unnamed gap */
+     } cursor2_color1_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     cursor2_color1_t f;
+} cursor2_color1_u;
+
+typedef struct _disp_int_cntl_t {  /* field view; declared widths total 32 bits */
+     unsigned long vline_int_pos                  : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long hpos_int_pos                   : 10;
+     unsigned long                                : 4;  /* unnamed gap */
+     unsigned long vblank_int_pol                 : 1;
+     unsigned long frame_int_pol                  : 1;
+     } disp_int_cntl_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     disp_int_cntl_t f;
+} disp_int_cntl_u;
+
+typedef struct _crtc_ss_t {  /* field view: 10-bit start/end plus four 1-bit flags; widths total 32 bits */
+     unsigned long ss_start                       : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long ss_end                         : 10;
+     unsigned long                                : 2;  /* unnamed gap */
+     unsigned long ss_align                       : 1;
+     unsigned long ss_pol                         : 1;
+     unsigned long ss_run_mode                    : 1;
+     unsigned long ss_en                          : 1;
+     } crtc_ss_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_ss_t f;
+} crtc_ss_u;
+
+typedef struct _crtc_ls_t {  /* field view: 10-bit start/end plus four 1-bit flags; same widths as crtc_ss_t */
+     unsigned long ls_start                       : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long ls_end                         : 10;
+     unsigned long                                : 2;  /* unnamed gap */
+     unsigned long ls_align                       : 1;
+     unsigned long ls_pol                         : 1;
+     unsigned long ls_run_mode                    : 1;
+     unsigned long ls_en                          : 1;
+     } crtc_ls_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_ls_t f;
+} crtc_ls_u;
+
+typedef struct _crtc_rev_t {  /* field view; declared widths total 32 bits */
+     unsigned long rev_pos                        : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long rev_align                      : 1;
+     unsigned long rev_freq_nref                  : 5;
+     unsigned long rev_en                         : 1;
+     unsigned long                                : 9;  /* unnamed gap */
+     } crtc_rev_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_rev_t f;
+} crtc_rev_u;
+
+typedef struct _crtc_dclk_t {  /* field view; note dclk_run_mode is 2 bits wide; widths total 32 bits */
+     unsigned long dclk_start                     : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long dclk_end                       : 10;
+     unsigned long                                : 1;  /* unnamed gap */
+     unsigned long dclk_run_mode                  : 2;
+     unsigned long dclk_pol                       : 1;
+     unsigned long dclk_align                     : 1;
+     unsigned long dclk_en                        : 1;
+     } crtc_dclk_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_dclk_t f;
+} crtc_dclk_u;
+
+typedef struct _crtc_gs_t {  /* field view: 10-bit start/end plus three 1-bit flags; widths total 32 bits */
+     unsigned long gs_start                       : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long gs_end                         : 10;
+     unsigned long                                : 3;  /* unnamed gap */
+     unsigned long gs_align                       : 1;
+     unsigned long gs_pol                         : 1;
+     unsigned long gs_en                          : 1;
+     } crtc_gs_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_gs_t f;
+} crtc_gs_u;
+
+typedef struct _crtc_vpos_gs_t {  /* field view: 10-bit start and end, each followed by a 6-bit gap */
+     unsigned long gs_vpos_start                  : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long gs_vpos_end                    : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     } crtc_vpos_gs_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_vpos_gs_t f;
+} crtc_vpos_gs_u;
+
+typedef struct _crtc_gclk_t {  /* field view: 10-bit start/end plus three 1-bit flags; same widths as crtc_gs_t */
+     unsigned long gclk_start                     : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long gclk_end                       : 10;
+     unsigned long                                : 3;  /* unnamed gap */
+     unsigned long gclk_align                     : 1;
+     unsigned long gclk_pol                       : 1;
+     unsigned long gclk_en                        : 1;
+     } crtc_gclk_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_gclk_t f;
+} crtc_gclk_u;
+
+typedef struct _crtc_goe_t {  /* field view: 10-bit start/end plus three 1-bit flags; same widths as crtc_gs_t */
+     unsigned long goe_start                      : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long goe_end                        : 10;
+     unsigned long                                : 3;  /* unnamed gap */
+     unsigned long goe_align                      : 1;
+     unsigned long goe_pol                        : 1;
+     unsigned long goe_en                         : 1;
+     } crtc_goe_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_goe_t f;
+} crtc_goe_u;
+
+typedef struct _crtc_frame_t {  /* field view; declared widths total 32 bits */
+     unsigned long crtc_fr_start                  : 10;
+     unsigned long                                : 6;  /* unnamed gap */
+     unsigned long crtc_fr_end                    : 10;
+     unsigned long                                : 4;  /* unnamed gap */
+     unsigned long crtc_frame_en                  : 1;
+     unsigned long crtc_frame_align               : 1;
+     } crtc_frame_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_frame_t f;
+} crtc_frame_u;
+
+typedef struct _crtc_frame_vpos_t {  /* field view; declared widths total 32 bits */
+     unsigned long crtc_fr_vpos                   : 10;
+     unsigned long                                : 22;  /* unnamed gap */
+     } crtc_frame_vpos_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     crtc_frame_vpos_t f;
+} crtc_frame_vpos_u;
+
+typedef struct _gpio_data_t {  /* field view: two 16-bit fields, 32 bits in total */
+     unsigned long gio_out                        : 16;
+     unsigned long gio_in                         : 16;
+     } gpio_data_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     gpio_data_t f;
+} gpio_data_u;
+
+typedef struct _gpio_cntl1_t {  /* field view: two 16-bit fields, 32 bits in total */
+     unsigned long gio_pd                         : 16;
+     unsigned long gio_schmen                     : 16;
+     } gpio_cntl1_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     gpio_cntl1_t f;
+} gpio_cntl1_u;
+
+typedef struct _gpio_cntl2_t {  /* field view; declared widths total 32 bits */
+     unsigned long gio_oe                         : 16;
+     unsigned long gio_srp                        : 1;
+     unsigned long gio_srn                        : 1;
+     unsigned long gio_sp                         : 4;
+     unsigned long gio_sn                         : 4;
+     unsigned long                                : 6;  /* unnamed gap */
+     } gpio_cntl2_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     gpio_cntl2_t f;
+} gpio_cntl2_u;
+
+typedef struct _lcdd_cntl1_t {  /* field view; declared widths total 32 bits */
+     unsigned long lcdd_pd                        : 18;
+     unsigned long lcdd_srp                       : 1;
+     unsigned long lcdd_srn                       : 1;
+     unsigned long lcdd_sp                        : 4;
+     unsigned long lcdd_sn                        : 4;
+     unsigned long lcdd_align                     : 1;
+     unsigned long                                : 3;  /* unnamed gap */
+     } lcdd_cntl1_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     lcdd_cntl1_t f;
+} lcdd_cntl1_u;
+
+typedef struct _lcdd_cntl2_t {  /* field view; lcdd_oe is 18 bits, the same width as lcdd_pd in lcdd_cntl1_t */
+     unsigned long lcdd_oe                        : 18;
+     unsigned long                                : 14;  /* unnamed gap */
+     } lcdd_cntl2_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     lcdd_cntl2_t f;
+} lcdd_cntl2_u;
+
+typedef struct _genlcd_cntl1_t {  /* field view: dclk fields, then 1-bit *_oe/*_pd pairs per signal; widths total 32 bits */
+     unsigned long dclk_oe                        : 1;
+     unsigned long dclk_pd                        : 1;
+     unsigned long dclk_srp                       : 1;
+     unsigned long dclk_srn                       : 1;
+     unsigned long dclk_sp                        : 4;
+     unsigned long dclk_sn                        : 4;
+     unsigned long ss_oe                          : 1;
+     unsigned long ss_pd                          : 1;
+     unsigned long ls_oe                          : 1;
+     unsigned long ls_pd                          : 1;
+     unsigned long gs_oe                          : 1;
+     unsigned long gs_pd                          : 1;
+     unsigned long goe_oe                         : 1;
+     unsigned long goe_pd                         : 1;
+     unsigned long rev_oe                         : 1;
+     unsigned long rev_pd                         : 1;
+     unsigned long frame_oe                       : 1;
+     unsigned long frame_pd                       : 1;
+     unsigned long                                : 8;  /* unnamed gap */
+     } genlcd_cntl1_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     genlcd_cntl1_t f;
+} genlcd_cntl1_u;
+
+typedef struct _genlcd_cntl2_t {  /* field view: gclk_* fields followed by genlcd_* fields; widths total 32 bits */
+     unsigned long gclk_oe                        : 1;
+     unsigned long gclk_pd                        : 1;
+     unsigned long gclk_srp                       : 1;
+     unsigned long gclk_srn                       : 1;
+     unsigned long gclk_sp                        : 4;
+     unsigned long gclk_sn                        : 4;
+     unsigned long genlcd_srp                     : 1;
+     unsigned long genlcd_srn                     : 1;
+     unsigned long genlcd_sp                      : 4;
+     unsigned long genlcd_sn                      : 4;
+     unsigned long                                : 10;  /* unnamed gap */
+     } genlcd_cntl2_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     genlcd_cntl2_t f;
+} genlcd_cntl2_u;
+
+typedef struct _disp_debug_t {  /* field view: one full-width 32-bit field */
+     unsigned long disp_debug                     : 32;
+     } disp_debug_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     disp_debug_t f;
+} disp_debug_u;
+
+typedef struct _disp_db_buf_cntl_rd_t {  /* field view; second field is update_db_buf_done here, update_db_buf in the _wr variant */
+     unsigned long en_db_buf                      : 1;
+     unsigned long update_db_buf_done             : 1;
+     unsigned long db_buf_cntl                    : 6;
+     unsigned long                                : 24;  /* unnamed gap */
+     } disp_db_buf_cntl_rd_t;
+
+typedef union {  /* raw 32-bit val overlaid with the field view f */
+     unsigned long val : 32;
+     disp_db_buf_cntl_rd_t f;
+} disp_db_buf_cntl_rd_u;
+
+typedef struct _disp_db_buf_cntl_wr_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long en_db_buf                      : 1; /* [0] */
+     unsigned long update_db_buf                  : 1; /* [1]; the _rd variant names this slot update_db_buf_done */
+     unsigned long db_buf_cntl                    : 6; /* [7:2] */
+     unsigned long                                : 24; /* [31:8] unnamed padding */
+     } disp_db_buf_cntl_wr_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     disp_db_buf_cntl_wr_t f; /* field view of the same storage */
+} disp_db_buf_cntl_wr_u;
+
+typedef struct _disp_crc_sig_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long crc_sig_r                      : 6; /* [5:0] */
+     unsigned long crc_sig_g                      : 6; /* [11:6] */
+     unsigned long crc_sig_b                      : 6; /* [17:12] */
+     unsigned long crc_cont_en                    : 1; /* [18] */
+     unsigned long crc_en                         : 1; /* [19] */
+     unsigned long crc_mask_en                    : 1; /* [20] */
+     unsigned long crc_sig_cntl                   : 6; /* [26:21] */
+     unsigned long                                : 5; /* [31:27] unnamed padding */
+     } disp_crc_sig_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     disp_crc_sig_t f; /* field view of the same storage */
+} disp_crc_sig_u;
+
+typedef struct _crtc_default_count_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long crtc_hcount_def                : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long crtc_vcount_def                : 10; /* [25:16] */
+     unsigned long                                : 6; /* [31:26] unnamed padding */
+     } crtc_default_count_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_default_count_t f; /* field view of the same storage */
+} crtc_default_count_u;
+
+typedef struct _lcd_background_color_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long lcd_bg_red                     : 8; /* [7:0] */
+     unsigned long lcd_bg_green                   : 8; /* [15:8] */
+     unsigned long lcd_bg_blue                    : 8; /* [23:16] */
+     unsigned long                                : 8; /* [31:24] unnamed padding */
+     } lcd_background_color_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     lcd_background_color_t f; /* field view of the same storage */
+} lcd_background_color_u;
+
+typedef struct _crtc_ps2_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long ps2_start                      : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long ps2_end                        : 10; /* [25:16] */
+     unsigned long                                : 4; /* [29:26] unnamed padding */
+     unsigned long ps2_pol                        : 1; /* [30] */
+     unsigned long ps2_en                         : 1; /* [31] */
+     } crtc_ps2_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_ps2_t f; /* field view of the same storage */
+} crtc_ps2_u;
+
+typedef struct _crtc_ps2_vpos_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long ps2_vpos_start                 : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long ps2_vpos_end                   : 10; /* [25:16] */
+     unsigned long                                : 6; /* [31:26] unnamed padding */
+     } crtc_ps2_vpos_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_ps2_vpos_t f; /* field view of the same storage */
+} crtc_ps2_vpos_u;
+
+typedef struct _crtc_ps1_active_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long ps1_h_start                    : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long ps1_h_end                      : 10; /* [25:16] */
+     unsigned long                                : 3; /* [28:26] unnamed padding */
+     unsigned long ps1_pol                        : 1; /* [29] */
+     unsigned long ps1_en                         : 1; /* [30] */
+     unsigned long ps1_use_nactive                : 1; /* [31] */
+     } crtc_ps1_active_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_ps1_active_t f; /* field view of the same storage */
+} crtc_ps1_active_u;
+
+typedef struct _crtc_ps1_nactive_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long ps1_h_start_na                 : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long ps1_h_end_na                   : 10; /* [25:16] */
+     unsigned long                                : 5; /* [30:26] unnamed padding */
+     unsigned long ps1_en_na                      : 1; /* [31] */
+     } crtc_ps1_nactive_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_ps1_nactive_t f; /* field view of the same storage */
+} crtc_ps1_nactive_u;
+
+typedef struct _crtc_gclk_ext_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long gclk_alter_start               : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long gclk_alter_width               : 2; /* [17:16] */
+     unsigned long gclk_en_alter                  : 1; /* [18] */
+     unsigned long gclk_db_width                  : 2; /* [20:19] */
+     unsigned long                                : 11; /* [31:21] unnamed padding */
+     } crtc_gclk_ext_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_gclk_ext_t f; /* field view of the same storage */
+} crtc_gclk_ext_u;
+
+typedef struct _crtc_alw_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long alw_hstart                     : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long alw_hend                       : 10; /* [25:16] */
+     unsigned long                                : 4; /* [29:26] unnamed padding */
+     unsigned long alw_delay                      : 1; /* [30] */
+     unsigned long alw_en                         : 1; /* [31] */
+     } crtc_alw_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_alw_t f; /* field view of the same storage */
+} crtc_alw_u;
+
+typedef struct _crtc_alw_vpos_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long alw_vstart                     : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long alw_vend                       : 10; /* [25:16] */
+     unsigned long                                : 6; /* [31:26] unnamed padding */
+     } crtc_alw_vpos_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_alw_vpos_t f; /* field view of the same storage */
+} crtc_alw_vpos_u;
+
+typedef struct _crtc_psk_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long psk_vstart                     : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long psk_vend                       : 10; /* [25:16] */
+     unsigned long                                : 4; /* [29:26] unnamed padding */
+     unsigned long psk_pol                        : 1; /* [30] */
+     unsigned long psk_en                         : 1; /* [31] */
+     } crtc_psk_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_psk_t f; /* field view of the same storage */
+} crtc_psk_u;
+
+typedef struct _crtc_psk_hpos_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long psk_hstart                     : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long psk_hend                       : 10; /* [25:16] */
+     unsigned long                                : 6; /* [31:26] unnamed padding */
+     } crtc_psk_hpos_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_psk_hpos_t f; /* field view of the same storage */
+} crtc_psk_hpos_u;
+
+typedef struct _crtc_cv4_start_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long cv4_vstart                     : 10; /* [9:0] */
+     unsigned long                                : 20; /* [29:10] unnamed padding */
+     unsigned long cv4_pol                        : 1; /* [30] */
+     unsigned long cv4_en                         : 1; /* [31] */
+     } crtc_cv4_start_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_cv4_start_t f; /* field view of the same storage */
+} crtc_cv4_start_u;
+
+typedef struct _crtc_cv4_end_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long cv4_vend1                      : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long cv4_vend2                      : 10; /* [25:16] */
+     unsigned long                                : 6; /* [31:26] unnamed padding */
+     } crtc_cv4_end_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_cv4_end_t f; /* field view of the same storage */
+} crtc_cv4_end_u;
+
+typedef struct _crtc_cv4_hpos_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long cv4_hstart                     : 10; /* [9:0] */
+     unsigned long                                : 6; /* [15:10] unnamed padding */
+     unsigned long cv4_hend                       : 10; /* [25:16] */
+     unsigned long                                : 6; /* [31:26] unnamed padding */
+     } crtc_cv4_hpos_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_cv4_hpos_t f; /* field view of the same storage */
+} crtc_cv4_hpos_u;
+
+typedef struct _crtc_eck_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long eck_freq1                      : 3; /* [2:0] */
+     unsigned long eck_en                         : 1; /* [3] */
+     unsigned long                                : 28; /* [31:4] unnamed padding */
+     } crtc_eck_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     crtc_eck_t f; /* field view of the same storage */
+} crtc_eck_u;
+
+typedef struct _refresh_cntl_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long ref_frame                      : 3; /* [2:0] */
+     unsigned long nref_frame                     : 5; /* [7:3] */
+     unsigned long ref_cntl                       : 1; /* [8] */
+     unsigned long stop_sm_nref                   : 1; /* [9] */
+     unsigned long stop_req_nref                  : 1; /* [10] */
+     unsigned long                                : 21; /* [31:11] unnamed padding */
+     } refresh_cntl_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     refresh_cntl_t f; /* field view of the same storage */
+} refresh_cntl_u;
+
+typedef struct _genlcd_cntl3_t { /* 18 one-bit fields plus padding, 32 bits total; [n] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long ps1_oe                         : 1; /* [0] */
+     unsigned long ps1_pd                         : 1; /* [1] */
+     unsigned long ps2_oe                         : 1; /* [2] */
+     unsigned long ps2_pd                         : 1; /* [3] */
+     unsigned long rev2_oe                        : 1; /* [4] */
+     unsigned long rev2_pd                        : 1; /* [5] */
+     unsigned long awl_oe                         : 1; /* [6]; NOTE(review): spelled "awl" here but "alw" in crtc_alw_t -- confirm which is intended */
+     unsigned long awl_pd                         : 1; /* [7] */
+     unsigned long dinv_oe                        : 1; /* [8] */
+     unsigned long dinv_pd                        : 1; /* [9] */
+     unsigned long psk_out                        : 1; /* [10] */
+     unsigned long psd_out                        : 1; /* [11] */
+     unsigned long eck_out                        : 1; /* [12] */
+     unsigned long cv4_out                        : 1; /* [13] */
+     unsigned long ps1_out                        : 1; /* [14] */
+     unsigned long ps2_out                        : 1; /* [15] */
+     unsigned long rev_out                        : 1; /* [16] */
+     unsigned long rev2_out                       : 1; /* [17] */
+     unsigned long                                : 14; /* [31:18] unnamed padding */
+     } genlcd_cntl3_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     genlcd_cntl3_t f; /* field view of the same storage */
+} genlcd_cntl3_u;
+
+typedef struct _gpio_data2_t { /* two 16-bit fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long gio2_out                       : 16; /* [15:0] */
+     unsigned long gio2_in                        : 16; /* [31:16] */
+     } gpio_data2_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     gpio_data2_t f; /* field view of the same storage */
+} gpio_data2_u;
+
+typedef struct _gpio_cntl3_t { /* two 16-bit fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long gio2_pd                        : 16; /* [15:0] */
+     unsigned long gio2_schmen                    : 16; /* [31:16] */
+     } gpio_cntl3_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     gpio_cntl3_t f; /* field view of the same storage */
+} gpio_cntl3_u;
+
+typedef struct _gpio_cntl4_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long gio2_oe                        : 16; /* [15:0] */
+     unsigned long                                : 16; /* [31:16] unnamed padding */
+     } gpio_cntl4_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     gpio_cntl4_t f; /* field view of the same storage */
+} gpio_cntl4_u;
+
+typedef struct _chip_strap_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long config_strap                   : 8; /* [7:0] */
+     unsigned long pkg_strap                      : 1; /* [8] */
+     unsigned long                                : 23; /* [31:9] unnamed padding */
+     } chip_strap_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     chip_strap_t f; /* field view of the same storage */
+} chip_strap_u;
+
+typedef struct _disp_debug2_t { /* a single field spanning all 32 bits */
+     unsigned long disp_debug2                    : 32; /* [31:0] */
+     } disp_debug2_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     disp_debug2_t f; /* field view of the same storage */
+} disp_debug2_u;
+
+typedef struct _debug_bus_cntl_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long debug_testmux                  : 4; /* [3:0] */
+     unsigned long debug_testsel                  : 4; /* [7:4] */
+     unsigned long debug_gioa_sel                 : 2; /* [9:8] */
+     unsigned long debug_giob_sel                 : 2; /* [11:10] */
+     unsigned long debug_clk_sel                  : 1; /* [12] */
+     unsigned long debug_clk_inv                  : 1; /* [13] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long debug_bus                      : 16; /* [31:16] */
+     } debug_bus_cntl_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     debug_bus_cntl_t f; /* field view of the same storage */
+} debug_bus_cntl_u;
+
+typedef struct _gamma_value1_t { /* four 8-bit fields (gamma1..gamma4); [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long gamma1                         : 8; /* [7:0] */
+     unsigned long gamma2                         : 8; /* [15:8] */
+     unsigned long gamma3                         : 8; /* [23:16] */
+     unsigned long gamma4                         : 8; /* [31:24] */
+     } gamma_value1_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     gamma_value1_t f; /* field view of the same storage */
+} gamma_value1_u;
+
+typedef struct _gamma_value2_t { /* four 8-bit fields (gamma5..gamma8); [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long gamma5                         : 8; /* [7:0] */
+     unsigned long gamma6                         : 8; /* [15:8] */
+     unsigned long gamma7                         : 8; /* [23:16] */
+     unsigned long gamma8                         : 8; /* [31:24] */
+     } gamma_value2_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     gamma_value2_t f; /* field view of the same storage */
+} gamma_value2_u;
+
+typedef struct _gamma_slope_t { /* eight 3-bit fields plus padding, 32 bits total; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long slope1                         : 3; /* [2:0] */
+     unsigned long slope2                         : 3; /* [5:3] */
+     unsigned long slope3                         : 3; /* [8:6] */
+     unsigned long slope4                         : 3; /* [11:9] */
+     unsigned long slope5                         : 3; /* [14:12] */
+     unsigned long slope6                         : 3; /* [17:15] */
+     unsigned long slope7                         : 3; /* [20:18] */
+     unsigned long slope8                         : 3; /* [23:21] */
+     unsigned long                                : 8; /* [31:24] unnamed padding */
+     } gamma_slope_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     gamma_slope_t f; /* field view of the same storage */
+} gamma_slope_u;
+
+typedef struct _gen_status_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long status                         : 16; /* [15:0] */
+     unsigned long                                : 16; /* [31:16] unnamed padding */
+     } gen_status_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     gen_status_t f; /* field view of the same storage */
+} gen_status_u;
+
+typedef struct _hw_int_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long hwint1_pos                     : 5; /* [4:0] */
+     unsigned long hwint2_pos                     : 5; /* [9:5] */
+     unsigned long hwint1_pol                     : 1; /* [10] */
+     unsigned long hwint2_pol                     : 1; /* [11] */
+     unsigned long hwint1_en_db                   : 1; /* [12] */
+     unsigned long hwint2_en_db                   : 1; /* [13] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } hw_int_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     hw_int_t f; /* field view of the same storage */
+} hw_int_u;
+
+typedef struct _dst_offset_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_offset                     : 24; /* [23:0] */
+     unsigned long                                : 8; /* [31:24] unnamed padding */
+     } dst_offset_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_offset_t f; /* field view of the same storage */
+} dst_offset_u;
+
+typedef struct _dst_pitch_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_pitch                      : 14; /* [13:0] */
+     unsigned long mc_dst_pitch_mul               : 2; /* [15:14] */
+     unsigned long                                : 16; /* [31:16] unnamed padding */
+     } dst_pitch_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_pitch_t f; /* field view of the same storage */
+} dst_pitch_u;
+
+typedef struct _dst_pitch_offset_t { /* combined offset/pitch word, no padding; field widths differ from dst_offset_t (24) and dst_pitch_t (14); [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_offset                     : 20; /* [19:0] */
+     unsigned long dst_pitch                      : 10; /* [29:20] */
+     unsigned long mc_dst_pitch_mul               : 2; /* [31:30] */
+     } dst_pitch_offset_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_pitch_offset_t f; /* field view of the same storage */
+} dst_pitch_offset_u;
+
+typedef struct _dst_x_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_x                          : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } dst_x_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_x_t f; /* field view of the same storage */
+} dst_x_u;
+
+typedef struct _dst_y_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_y                          : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } dst_y_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_y_t f; /* field view of the same storage */
+} dst_y_u;
+
+typedef struct _dst_x_y_t { /* y declared first, then x (reverse of dst_y_x_t); [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_y                          : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dst_x                          : 14; /* [29:16] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } dst_x_y_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_x_y_t f; /* field view of the same storage */
+} dst_x_y_u;
+
+typedef struct _dst_y_x_t { /* x declared first, then y (reverse of dst_x_y_t); [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_x                          : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dst_y                          : 14; /* [29:16] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } dst_y_x_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_y_x_t f; /* field view of the same storage */
+} dst_y_x_u;
+
+typedef struct _dst_width_t { /* width split into an 8-bit and a 6-bit field (14 bits together); [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_width_b0                   : 8; /* [7:0] */
+     unsigned long dst_width_b1                   : 6; /* [13:8] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } dst_width_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_width_t f; /* field view of the same storage */
+} dst_width_u;
+
+typedef struct _dst_height_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_height                     : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } dst_height_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_height_t f; /* field view of the same storage */
+} dst_height_u;
+
+typedef struct _dst_width_height_t { /* height declared first, then the split width; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_height                     : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dst_width_b0                   : 8; /* [23:16] */
+     unsigned long dst_width_b1                   : 6; /* [29:24] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } dst_width_height_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_width_height_t f; /* field view of the same storage */
+} dst_width_height_u;
+
+typedef struct _dst_height_width_t { /* split width declared first, then height; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_width_b0                   : 8; /* [7:0] */
+     unsigned long dst_width_b1                   : 6; /* [13:8] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dst_height                     : 14; /* [29:16] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } dst_height_width_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_height_width_t f; /* field view of the same storage */
+} dst_height_width_u;
+
+typedef struct _dst_height_width_8_t { /* 8-bit width and height after 16 padding bits; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long                                : 16; /* [15:0] unnamed padding */
+     unsigned long dst_width_b0                   : 8; /* [23:16] */
+     unsigned long dst_height                     : 8; /* [31:24] */
+     } dst_height_width_8_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_height_width_8_t f; /* field view of the same storage */
+} dst_height_width_8_u;
+
+typedef struct _dst_height_y_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_y                          : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dst_height                     : 14; /* [29:16] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } dst_height_y_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_height_y_t f; /* field view of the same storage */
+} dst_height_y_u;
+
+typedef struct _dst_width_x_t { /* same field list as dst_width_x_incy_t; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_x                          : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dst_width_b0                   : 8; /* [23:16] */
+     unsigned long dst_width_b1                   : 6; /* [29:24] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } dst_width_x_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_width_x_t f; /* field view of the same storage */
+} dst_width_x_u;
+
+typedef struct _dst_width_x_incy_t { /* same field list as dst_width_x_t; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_x                          : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dst_width_b0                   : 8; /* [23:16] */
+     unsigned long dst_width_b1                   : 6; /* [29:24] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } dst_width_x_incy_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_width_x_incy_t f; /* field view of the same storage */
+} dst_width_x_incy_u;
+
+typedef struct _dst_line_start_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_start_x                    : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dst_start_y                    : 14; /* [29:16] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } dst_line_start_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_line_start_t f; /* field view of the same storage */
+} dst_line_start_u;
+
+typedef struct _dst_line_end_t { /* end y is split into 8-bit and 6-bit fields, unlike dst_line_start_t; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_end_x                      : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dst_end_y_b0                   : 8; /* [23:16] */
+     unsigned long dst_end_y_b1                   : 6; /* [29:24] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } dst_line_end_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dst_line_end_t f; /* field view of the same storage */
+} dst_line_end_u;
+
+typedef struct _brush_offset_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long brush_offset                   : 24; /* [23:0] */
+     unsigned long                                : 8; /* [31:24] unnamed padding */
+     } brush_offset_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     brush_offset_t f; /* field view of the same storage */
+} brush_offset_u;
+
+typedef struct _brush_y_x_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long brush_x                        : 5; /* [4:0] */
+     unsigned long                                : 3; /* [7:5] unnamed padding */
+     unsigned long brush_y                        : 3; /* [10:8] */
+     unsigned long                                : 21; /* [31:11] unnamed padding */
+     } brush_y_x_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     brush_y_x_t f; /* field view of the same storage */
+} brush_y_x_u;
+
+typedef struct _dp_brush_frgd_clr_t { /* a single field spanning all 32 bits */
+     unsigned long dp_brush_frgd_clr              : 32; /* [31:0] */
+     } dp_brush_frgd_clr_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dp_brush_frgd_clr_t f; /* field view of the same storage */
+} dp_brush_frgd_clr_u;
+
+typedef struct _dp_brush_bkgd_clr_t { /* a single field spanning all 32 bits */
+     unsigned long dp_brush_bkgd_clr              : 32; /* [31:0] */
+     } dp_brush_bkgd_clr_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dp_brush_bkgd_clr_t f; /* field view of the same storage */
+} dp_brush_bkgd_clr_u;
+
+typedef struct _src2_offset_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src2_offset                    : 24; /* [23:0] */
+     unsigned long                                : 8; /* [31:24] unnamed padding */
+     } src2_offset_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src2_offset_t f; /* field view of the same storage */
+} src2_offset_u;
+
+typedef struct _src2_pitch_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src2_pitch                     : 14; /* [13:0] */
+     unsigned long src2_pitch_mul                 : 2; /* [15:14] */
+     unsigned long                                : 16; /* [31:16] unnamed padding */
+     } src2_pitch_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src2_pitch_t f; /* field view of the same storage */
+} src2_pitch_u;
+
+typedef struct _src2_pitch_offset_t { /* differs from src_pitch_offset_t/dst_pitch_offset_t: 2 padding bits and an 8-bit pitch instead of a 10-bit pitch; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src2_offset                    : 20; /* [19:0] */
+     unsigned long                                : 2; /* [21:20] unnamed padding */
+     unsigned long src2_pitch                     : 8; /* [29:22] */
+     unsigned long src2_pitch_mul                 : 2; /* [31:30] */
+     } src2_pitch_offset_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src2_pitch_offset_t f; /* field view of the same storage */
+} src2_pitch_offset_u;
+
+typedef struct _src2_x_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_x                          : 14; /* [13:0]; NOTE(review): named src_x, not src2_x, in a src2_* type -- confirm intended */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src2_x_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src2_x_t f; /* field view of the same storage */
+} src2_x_u;
+
+typedef struct _src2_y_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_y                          : 14; /* [13:0]; NOTE(review): named src_y, not src2_y, in a src2_* type -- confirm intended */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src2_y_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src2_y_t f; /* field view of the same storage */
+} src2_y_u;
+
+typedef struct _src2_x_y_t { /* same field names and widths as src_x_y_t (fields are src_*, not src2_*); [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_y                          : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long src_x                          : 14; /* [29:16] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } src2_x_y_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src2_x_y_t f; /* field view of the same storage */
+} src2_x_y_u;
+
+typedef struct _src2_width_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src2_width                     : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src2_width_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src2_width_t f; /* field view of the same storage */
+} src2_width_u;
+
+typedef struct _src2_height_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src2_height                    : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src2_height_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src2_height_t f; /* field view of the same storage */
+} src2_height_u;
+
+typedef struct _src2_inc_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src2_xinc                      : 6; /* [5:0] */
+     unsigned long                                : 2; /* [7:6] unnamed padding */
+     unsigned long src2_yinc                      : 6; /* [13:8] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src2_inc_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src2_inc_t f; /* field view of the same storage */
+} src2_inc_u;
+
+typedef struct _src_offset_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_offset                     : 24; /* [23:0] */
+     unsigned long                                : 8; /* [31:24] unnamed padding */
+     } src_offset_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_offset_t f; /* field view of the same storage */
+} src_offset_u;
+
+typedef struct _src_pitch_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_pitch                      : 14; /* [13:0] */
+     unsigned long src_pitch_mul                  : 2; /* [15:14] */
+     unsigned long                                : 16; /* [31:16] unnamed padding */
+     } src_pitch_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_pitch_t f; /* field view of the same storage */
+} src_pitch_u;
+
+typedef struct _src_pitch_offset_t { /* combined offset/pitch word, no padding; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_offset                     : 20; /* [19:0] */
+     unsigned long src_pitch                      : 10; /* [29:20] */
+     unsigned long src_pitch_mul                  : 2; /* [31:30] */
+     } src_pitch_offset_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_pitch_offset_t f; /* field view of the same storage */
+} src_pitch_offset_u;
+
+typedef struct _src_x_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_x                          : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src_x_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_x_t f; /* field view of the same storage */
+} src_x_u;
+
+typedef struct _src_y_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_y                          : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src_y_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_y_t f; /* field view of the same storage */
+} src_y_u;
+
+typedef struct _src_x_y_t { /* y declared first, then x (reverse of src_y_x_t); [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_y                          : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long src_x                          : 14; /* [29:16] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } src_x_y_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_x_y_t f; /* field view of the same storage */
+} src_x_y_u;
+
+typedef struct _src_y_x_t { /* x declared first, then y (reverse of src_x_y_t); [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_x                          : 14; /* [13:0] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long src_y                          : 14; /* [29:16] */
+     unsigned long                                : 2; /* [31:30] unnamed padding */
+     } src_y_x_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_y_x_t f; /* field view of the same storage */
+} src_y_x_u;
+
+typedef struct _src_width_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_width                      : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src_width_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_width_t f; /* field view of the same storage */
+} src_width_u;
+
+typedef struct _src_height_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_height                     : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src_height_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_height_t f; /* field view of the same storage */
+} src_height_u;
+
+typedef struct _src_inc_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long src_xinc                       : 6; /* [5:0] */
+     unsigned long                                : 2; /* [7:6] unnamed padding */
+     unsigned long src_yinc                       : 6; /* [13:8] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src_inc_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_inc_t f; /* field view of the same storage */
+} src_inc_u;
+
+typedef struct _host_data0_t { /* a single field spanning all 32 bits; host_data0_t..host_data7_t share this shape */
+     unsigned long host_data                      : 32; /* [31:0] */
+     } host_data0_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     host_data0_t f; /* field view of the same storage */
+} host_data0_u;
+
+typedef struct _host_data1_t { /* a single field spanning all 32 bits; host_data0_t..host_data7_t share this shape */
+     unsigned long host_data                      : 32; /* [31:0] */
+     } host_data1_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     host_data1_t f; /* field view of the same storage */
+} host_data1_u;
+
+typedef struct _host_data2_t { /* a single field spanning all 32 bits; host_data0_t..host_data7_t share this shape */
+     unsigned long host_data                      : 32; /* [31:0] */
+     } host_data2_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     host_data2_t f; /* field view of the same storage */
+} host_data2_u;
+
+typedef struct _host_data3_t { /* a single field spanning all 32 bits; host_data0_t..host_data7_t share this shape */
+     unsigned long host_data                      : 32; /* [31:0] */
+     } host_data3_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     host_data3_t f; /* field view of the same storage */
+} host_data3_u;
+
+typedef struct _host_data4_t { /* a single field spanning all 32 bits; host_data0_t..host_data7_t share this shape */
+     unsigned long host_data                      : 32; /* [31:0] */
+     } host_data4_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     host_data4_t f; /* field view of the same storage */
+} host_data4_u;
+
+typedef struct _host_data5_t { /* a single field spanning all 32 bits; host_data0_t..host_data7_t share this shape */
+     unsigned long host_data                      : 32; /* [31:0] */
+     } host_data5_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     host_data5_t f; /* field view of the same storage */
+} host_data5_u;
+
+typedef struct _host_data6_t { /* a single field spanning all 32 bits; host_data0_t..host_data7_t share this shape */
+     unsigned long host_data                      : 32; /* [31:0] */
+     } host_data6_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     host_data6_t f; /* field view of the same storage */
+} host_data6_u;
+
+typedef struct _host_data7_t { /* a single field spanning all 32 bits; host_data0_t..host_data7_t share this shape */
+     unsigned long host_data                      : 32; /* [31:0] */
+     } host_data7_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     host_data7_t f; /* field view of the same storage */
+} host_data7_u;
+
+typedef struct _host_data_last_t { /* a single field spanning all 32 bits; field is host_data_last, unlike host_data in host_data0_t..host_data7_t */
+     unsigned long host_data_last                 : 32; /* [31:0] */
+     } host_data_last_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     host_data_last_t f; /* field view of the same storage */
+} host_data_last_u;
+
+typedef struct _dp_src_frgd_clr_t { /* a single field spanning all 32 bits */
+     unsigned long dp_src_frgd_clr                : 32; /* [31:0] */
+     } dp_src_frgd_clr_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dp_src_frgd_clr_t f; /* field view of the same storage */
+} dp_src_frgd_clr_u;
+
+typedef struct _dp_src_bkgd_clr_t { /* a single field spanning all 32 bits */
+     unsigned long dp_src_bkgd_clr                : 32; /* [31:0] */
+     } dp_src_bkgd_clr_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dp_src_bkgd_clr_t f; /* field view of the same storage */
+} dp_src_bkgd_clr_u;
+
+typedef struct _sc_left_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long sc_left                        : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } sc_left_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     sc_left_t f; /* field view of the same storage */
+} sc_left_u;
+
+typedef struct _sc_right_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long sc_right                       : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } sc_right_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     sc_right_t f; /* field view of the same storage */
+} sc_right_u;
+
+typedef struct _sc_top_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long sc_top                         : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } sc_top_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     sc_top_t f; /* field view of the same storage */
+} sc_top_u;
+
+typedef struct _sc_bottom_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long sc_bottom                      : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } sc_bottom_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     sc_bottom_t f; /* field view of the same storage */
+} sc_bottom_u;
+
+typedef struct _src_sc_right_t { /* same field name and widths as sc_right_t; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long sc_right                       : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src_sc_right_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_sc_right_t f; /* field view of the same storage */
+} src_sc_right_u;
+
+typedef struct _src_sc_bottom_t { /* same field name and widths as sc_bottom_t; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long sc_bottom                      : 14; /* [13:0] */
+     unsigned long                                : 18; /* [31:14] unnamed padding */
+     } src_sc_bottom_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     src_sc_bottom_t f; /* field view of the same storage */
+} src_sc_bottom_u;
+
+typedef struct _dp_cntl_t { /* six one-bit fields plus padding, 32 bits total; [n] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dst_x_dir                      : 1; /* [0] */
+     unsigned long dst_y_dir                      : 1; /* [1] */
+     unsigned long src_x_dir                      : 1; /* [2] */
+     unsigned long src_y_dir                      : 1; /* [3] */
+     unsigned long dst_major_x                    : 1; /* [4] */
+     unsigned long src_major_x                    : 1; /* [5] */
+     unsigned long                                : 26; /* [31:6] unnamed padding */
+     } dp_cntl_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dp_cntl_t f; /* field view of the same storage */
+} dp_cntl_u;
+
+typedef struct _dp_cntl_dst_dir_t { /* dst_y_dir/dst_x_dir sit at different offsets here than in dp_cntl_t; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long                                : 15; /* [14:0] unnamed padding */
+     unsigned long dst_y_dir                      : 1; /* [15] */
+     unsigned long                                : 15; /* [30:16] unnamed padding */
+     unsigned long dst_x_dir                      : 1; /* [31] */
+     } dp_cntl_dst_dir_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dp_cntl_dst_dir_t f; /* field view of the same storage */
+} dp_cntl_dst_dir_u;
+
+typedef struct _dp_datatype_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long dp_dst_datatype                : 4; /* [3:0] */
+     unsigned long                                : 4; /* [7:4] unnamed padding */
+     unsigned long dp_brush_datatype              : 4; /* [11:8] */
+     unsigned long dp_src2_type                   : 1; /* [12] */
+     unsigned long dp_src2_datatype               : 3; /* [15:13] */
+     unsigned long dp_src_datatype                : 3; /* [18:16] */
+     unsigned long                                : 11; /* [29:19] unnamed padding */
+     unsigned long dp_byte_pix_order              : 1; /* [30] */
+     unsigned long                                : 1; /* [31] unnamed padding */
+     } dp_datatype_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dp_datatype_t f; /* field view of the same storage */
+} dp_datatype_u;
+
+typedef struct _dp_mix_t { /* 32 bits of fields; [hi:lo] notes assume LSB-first allocation (TODO confirm for target ABI) */
+     unsigned long                                : 8; /* [7:0] unnamed padding */
+     unsigned long dp_src_source                  : 3; /* [10:8] */
+     unsigned long dp_src2_source                 : 3; /* [13:11] */
+     unsigned long                                : 2; /* [15:14] unnamed padding */
+     unsigned long dp_rop3                        : 8; /* [23:16] */
+     unsigned long dp_op                          : 1; /* [24] */
+     unsigned long                                : 7; /* [31:25] unnamed padding */
+     } dp_mix_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dp_mix_t f; /* field view of the same storage */
+} dp_mix_u;
+
+typedef struct _dp_write_msk_t { /* a single field spanning all 32 bits */
+     unsigned long dp_write_msk                   : 32; /* [31:0] */
+     } dp_write_msk_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     dp_write_msk_t f; /* field view of the same storage */
+} dp_write_msk_u;
+
+typedef struct _clr_cmp_clr_src_t { /* a single field spanning all 32 bits */
+     unsigned long clr_cmp_clr_src                : 32; /* [31:0] */
+     } clr_cmp_clr_src_t;
+
+typedef union {
+     unsigned long val : 32; /* raw 32-bit value */
+     clr_cmp_clr_src_t f; /* field view of the same storage */
+} clr_cmp_clr_src_u;
+
+typedef struct _clr_cmp_clr_dst_t {  /* one field spanning all 32 bits */
+     unsigned long clr_cmp_clr_dst                : 32;
+     } clr_cmp_clr_dst_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     clr_cmp_clr_dst_t f;  /* individual fields */
+} clr_cmp_clr_dst_u;
+
+typedef struct _clr_cmp_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long clr_cmp_fcn_src                : 3;
+     unsigned long                                : 5;  /* unnamed padding */
+     unsigned long clr_cmp_fcn_dst                : 3;
+     unsigned long                                : 13;  /* unnamed padding */
+     unsigned long clr_cmp_src                    : 2;
+     unsigned long                                : 6;  /* unnamed padding */
+     } clr_cmp_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     clr_cmp_cntl_t f;  /* individual fields */
+} clr_cmp_cntl_u;
+
+typedef struct _clr_cmp_msk_t {  /* one field spanning all 32 bits */
+     unsigned long clr_cmp_msk                    : 32;
+     } clr_cmp_msk_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     clr_cmp_msk_t f;  /* individual fields */
+} clr_cmp_msk_u;
+
+typedef struct _default_pitch_offset_t {  /* field view: widths total 32 bits */
+     unsigned long default_offset                 : 20;
+     unsigned long default_pitch                  : 10;
+     unsigned long                                : 2;  /* unnamed padding */
+     } default_pitch_offset_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     default_pitch_offset_t f;  /* individual fields */
+} default_pitch_offset_u;
+
+typedef struct _default_sc_bottom_right_t {  /* field view: widths total 32 bits */
+     unsigned long default_sc_right               : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long default_sc_bottom              : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     } default_sc_bottom_right_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     default_sc_bottom_right_t f;  /* individual fields */
+} default_sc_bottom_right_u;
+
+typedef struct _default2_sc_bottom_right_t {  /* field view: widths total 32 bits; same member names and widths as default_sc_bottom_right_t */
+     unsigned long default_sc_right               : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long default_sc_bottom              : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     } default2_sc_bottom_right_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     default2_sc_bottom_right_t f;  /* individual fields */
+} default2_sc_bottom_right_u;
+
+typedef struct _ref1_pitch_offset_t {  /* field view: widths total 32 bits */
+     unsigned long offset                         : 20;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long pitch                          : 8;
+     unsigned long                                : 2;  /* unnamed padding */
+     } ref1_pitch_offset_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     ref1_pitch_offset_t f;  /* individual fields */
+} ref1_pitch_offset_u;
+
+typedef struct _ref2_pitch_offset_t {  /* field view: widths total 32 bits */
+     unsigned long offset                         : 20;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long pitch                          : 8;
+     unsigned long                                : 2;  /* unnamed padding */
+     } ref2_pitch_offset_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     ref2_pitch_offset_t f;  /* individual fields */
+} ref2_pitch_offset_u;
+
+typedef struct _ref3_pitch_offset_t {  /* field view: widths total 32 bits */
+     unsigned long offset                         : 20;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long pitch                          : 8;
+     unsigned long                                : 2;  /* unnamed padding */
+     } ref3_pitch_offset_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     ref3_pitch_offset_t f;  /* individual fields */
+} ref3_pitch_offset_u;
+
+typedef struct _ref4_pitch_offset_t {  /* field view: widths total 32 bits */
+     unsigned long offset                         : 20;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long pitch                          : 8;
+     unsigned long                                : 2;  /* unnamed padding */
+     } ref4_pitch_offset_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     ref4_pitch_offset_t f;  /* individual fields */
+} ref4_pitch_offset_u;
+
+typedef struct _ref5_pitch_offset_t {  /* field view: widths total 32 bits */
+     unsigned long offset                         : 20;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long pitch                          : 8;
+     unsigned long                                : 2;  /* unnamed padding */
+     } ref5_pitch_offset_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     ref5_pitch_offset_t f;  /* individual fields */
+} ref5_pitch_offset_u;
+
+typedef struct _ref6_pitch_offset_t {  /* field view: widths total 32 bits */
+     unsigned long offset                         : 20;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long pitch                          : 8;
+     unsigned long                                : 2;  /* unnamed padding */
+     } ref6_pitch_offset_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     ref6_pitch_offset_t f;  /* individual fields */
+} ref6_pitch_offset_u;
+
+typedef struct _dp_gui_master_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long gmc_src_pitch_offset_cntl      : 1;
+     unsigned long gmc_dst_pitch_offset_cntl      : 1;
+     unsigned long gmc_src_clipping               : 1;
+     unsigned long gmc_dst_clipping               : 1;
+     unsigned long gmc_brush_datatype             : 4;
+     unsigned long gmc_dst_datatype               : 4;
+     unsigned long gmc_src_datatype               : 3;
+     unsigned long gmc_byte_pix_order             : 1;
+     unsigned long gmc_default_sel                : 1;
+     unsigned long gmc_rop3                       : 8;
+     unsigned long gmc_dp_src_source              : 3;
+     unsigned long gmc_clr_cmp_fcn_dis            : 1;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long gmc_wr_msk_dis                 : 1;
+     unsigned long gmc_dp_op                      : 1;
+     } dp_gui_master_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     dp_gui_master_cntl_t f;  /* individual fields */
+} dp_gui_master_cntl_u;
+
+typedef struct _sc_top_left_t {  /* field view: widths total 32 bits */
+     unsigned long sc_left                        : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long sc_top                         : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     } sc_top_left_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     sc_top_left_t f;  /* individual fields */
+} sc_top_left_u;
+
+typedef struct _sc_bottom_right_t {  /* field view: widths total 32 bits */
+     unsigned long sc_right                       : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long sc_bottom                      : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     } sc_bottom_right_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     sc_bottom_right_t f;  /* individual fields */
+} sc_bottom_right_u;
+
+typedef struct _src_sc_bottom_right_t {  /* field view: widths total 32 bits; same member names and widths as sc_bottom_right_t */
+     unsigned long sc_right                       : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long sc_bottom                      : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     } src_sc_bottom_right_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     src_sc_bottom_right_t f;  /* individual fields */
+} src_sc_bottom_right_u;
+
+typedef struct _global_alpha_t {  /* four 8-bit fields, 32 bits total */
+     unsigned long alpha_r                        : 8;
+     unsigned long alpha_g                        : 8;
+     unsigned long alpha_b                        : 8;
+     unsigned long alpha_a                        : 8;
+     } global_alpha_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     global_alpha_t f;  /* individual fields */
+} global_alpha_u;
+
+typedef struct _filter_coef_t {  /* eight 4-bit fields, 32 bits total */
+     unsigned long c_4                            : 4;
+     unsigned long c_3                            : 4;
+     unsigned long c_2                            : 4;
+     unsigned long c_1                            : 4;
+     unsigned long c1                             : 4;
+     unsigned long c2                             : 4;
+     unsigned long c3                             : 4;
+     unsigned long c4                             : 4;
+     } filter_coef_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     filter_coef_t f;  /* individual fields */
+} filter_coef_u;
+
+typedef struct _mvc_cntl_start_t {  /* field view: widths total 32 bits, no padding */
+     unsigned long mc_cntl_src_1_index            : 4;
+     unsigned long mc_cntl_dst_offset             : 20;
+     unsigned long mc_dst_pitch_mul               : 2;
+     unsigned long mc_cntl_src_2_index            : 3;
+     unsigned long mc_cntl_width_height_sel       : 3;
+     } mvc_cntl_start_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mvc_cntl_start_t f;  /* individual fields */
+} mvc_cntl_start_u;
+
+typedef struct _e2_arithmetic_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long opcode                         : 5;
+     unsigned long shiftright                     : 4;
+     unsigned long clamp                          : 1;
+     unsigned long rounding                       : 2;
+     unsigned long filter_n                       : 3;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long srcblend_inv                   : 1;
+     unsigned long srcblend                       : 4;
+     unsigned long                                : 3;  /* unnamed padding */
+     unsigned long dstblend_inv                   : 1;
+     unsigned long dstblend                       : 4;
+     unsigned long dst_signed                     : 1;
+     unsigned long autoinc                        : 1;
+     unsigned long                                : 1;  /* unnamed padding */
+     } e2_arithmetic_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     e2_arithmetic_cntl_t f;  /* individual fields */
+} e2_arithmetic_cntl_u;
+
+typedef struct _debug0_t {  /* field view: widths total 32 bits */
+     unsigned long debug0_r                       : 8;
+     unsigned long                                : 8;  /* unnamed padding */
+     unsigned long debug0_rw                      : 8;
+     unsigned long                                : 8;  /* unnamed padding */
+     } debug0_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     debug0_t f;  /* individual fields */
+} debug0_u;
+
+typedef struct _debug1_t {  /* field view: widths total 32 bits */
+     unsigned long debug1_r                       : 8;
+     unsigned long                                : 8;  /* unnamed padding */
+     unsigned long debug1_rw                      : 8;
+     unsigned long                                : 8;  /* unnamed padding */
+     } debug1_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     debug1_t f;  /* individual fields */
+} debug1_u;
+
+typedef struct _debug2_t {  /* field view: widths total 32 bits */
+     unsigned long debug2_r                       : 8;
+     unsigned long                                : 8;  /* unnamed padding */
+     unsigned long debug2_rw                      : 8;
+     unsigned long                                : 8;  /* unnamed padding */
+     } debug2_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     debug2_t f;  /* individual fields */
+} debug2_u;
+
+typedef struct _debug3_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug3_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug3_t f;  /* exposes no named fields */
+} debug3_u;
+
+typedef struct _debug4_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug4_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug4_t f;  /* exposes no named fields */
+} debug4_u;
+
+typedef struct _debug5_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug5_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug5_t f;  /* exposes no named fields */
+} debug5_u;
+
+typedef struct _debug6_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug6_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug6_t f;  /* exposes no named fields */
+} debug6_u;
+
+typedef struct _debug7_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug7_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug7_t f;  /* exposes no named fields */
+} debug7_u;
+
+typedef struct _debug8_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug8_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug8_t f;  /* exposes no named fields */
+} debug8_u;
+
+typedef struct _debug9_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug9_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug9_t f;  /* exposes no named fields */
+} debug9_u;
+
+typedef struct _debug10_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug10_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug10_t f;  /* exposes no named fields */
+} debug10_u;
+
+typedef struct _debug11_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug11_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug11_t f;  /* exposes no named fields */
+} debug11_u;
+
+typedef struct _debug12_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug12_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug12_t f;  /* exposes no named fields */
+} debug12_u;
+
+typedef struct _debug13_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug13_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug13_t f;  /* exposes no named fields */
+} debug13_u;
+
+typedef struct _debug14_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug14_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug14_t f;  /* exposes no named fields */
+} debug14_u;
+
+typedef struct _debug15_t {  /* NOTE(review): no named members - ISO C leaves this undefined (C99 6.7.2.1); relies on the compiler accepting it - verify */
+     unsigned long                                : 32;  /* unnamed, all 32 bits */
+     } debug15_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value; the only nameable access path */
+     debug15_t f;  /* exposes no named fields */
+} debug15_u;
+
+typedef struct _eng_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long erc_reg_rd_ws                  : 1;
+     unsigned long erc_reg_wr_ws                  : 1;
+     unsigned long erc_idle_reg_wr                : 1;
+     unsigned long dis_engine_triggers            : 1;
+     unsigned long dis_rop_src_uses_dst_w_h       : 1;
+     unsigned long dis_src_uses_dst_dirmaj        : 1;
+     unsigned long                                : 6;  /* unnamed padding */
+     unsigned long force_3dclk_when_2dclk         : 1;
+     unsigned long                                : 19;  /* unnamed padding */
+     } eng_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     eng_cntl_t f;  /* individual fields */
+} eng_cntl_u;
+
+typedef struct _eng_perf_cnt_t {  /* field view: widths total 32 bits */
+     unsigned long perf_cnt                       : 20;
+     unsigned long perf_sel                       : 4;
+     unsigned long perf_en                        : 1;
+     unsigned long                                : 3;  /* unnamed padding */
+     unsigned long perf_clr                       : 1;
+     unsigned long                                : 3;  /* unnamed padding */
+     } eng_perf_cnt_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     eng_perf_cnt_t f;  /* individual fields */
+} eng_perf_cnt_u;
+
+typedef struct _idct_runs_t {  /* four 8-bit fields, 32 bits total; declared from index 3 down to 0 */
+     unsigned long idct_runs_3                    : 8;
+     unsigned long idct_runs_2                    : 8;
+     unsigned long idct_runs_1                    : 8;
+     unsigned long idct_runs_0                    : 8;
+     } idct_runs_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     idct_runs_t f;  /* individual fields */
+} idct_runs_u;
+
+typedef struct _idct_levels_t {  /* two 16-bit fields, 32 bits total; "hi" is declared before "lo" */
+     unsigned long idct_level_hi                  : 16;
+     unsigned long idct_level_lo                  : 16;
+     } idct_levels_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     idct_levels_t f;  /* individual fields */
+} idct_levels_u;
+
+typedef struct _idct_control_t {  /* field view: widths total 32 bits */
+     unsigned long idct_ctl_luma_rd_format        : 2;
+     unsigned long idct_ctl_chroma_rd_format      : 2;
+     unsigned long idct_ctl_scan_pattern          : 1;
+     unsigned long idct_ctl_intra                 : 1;
+     unsigned long idct_ctl_flush                 : 1;
+     unsigned long idct_ctl_passthru              : 1;
+     unsigned long idct_ctl_sw_reset              : 1;
+     unsigned long idct_ctl_constreq              : 1;
+     unsigned long idct_ctl_scramble              : 1;
+     unsigned long idct_ctl_alt_scan              : 1;
+     unsigned long                                : 20;  /* unnamed padding */
+     } idct_control_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     idct_control_t f;  /* individual fields */
+} idct_control_u;
+
+typedef struct _idct_auth_control_t {  /* one field spanning all 32 bits */
+     unsigned long control_bits                   : 32;
+     } idct_auth_control_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     idct_auth_control_t f;  /* individual fields */
+} idct_auth_control_u;
+
+typedef struct _idct_auth_t {  /* one field spanning all 32 bits */
+     unsigned long auth                           : 32;
+     } idct_auth_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     idct_auth_t f;  /* individual fields */
+} idct_auth_u;
+
+typedef struct _mem_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long en_mem_ch1                     : 1;
+     unsigned long en_mem_ch2                     : 1;
+     unsigned long int_mem_mapping                : 1;
+     unsigned long                                : 28;  /* unnamed padding */
+     } mem_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mem_cntl_t f;  /* individual fields */
+} mem_cntl_u;
+
+typedef struct _mem_arb_t {  /* field view: widths total 32 bits */
+     unsigned long disp_time_slot                 : 4;
+     unsigned long disp_timer                     : 4;
+     unsigned long arb_option                     : 1;
+     unsigned long                                : 23;  /* unnamed padding */
+     } mem_arb_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mem_arb_t f;  /* individual fields */
+} mem_arb_u;
+
+typedef struct _mc_fb_location_t {  /* two 16-bit fields, 32 bits total */
+     unsigned long mc_fb_start                    : 16;
+     unsigned long mc_fb_top                      : 16;
+     } mc_fb_location_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mc_fb_location_t f;  /* individual fields */
+} mc_fb_location_u;
+
+typedef struct _mem_ext_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long mem_ext_enable                 : 1;
+     unsigned long mem_ap_enable                  : 1;
+     unsigned long mem_addr_mapping               : 2;
+     unsigned long mem_wdoe_cntl                  : 2;
+     unsigned long mem_wdoe_extend                : 1;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long mem_page_timer                 : 8;
+     unsigned long mem_dynamic_cke                : 1;
+     unsigned long mem_sdram_tri_en               : 1;
+     unsigned long mem_self_refresh_en            : 1;
+     unsigned long mem_power_down                 : 1;
+     unsigned long mem_hw_power_down_en           : 1;
+     unsigned long mem_power_down_stat            : 1;
+     unsigned long                                : 3;  /* unnamed padding */
+     unsigned long mem_pd_mck                     : 1;
+     unsigned long mem_pd_ma                      : 1;
+     unsigned long mem_pd_mdq                     : 1;
+     unsigned long mem_tristate_mck               : 1;
+     unsigned long mem_tristate_ma                : 1;
+     unsigned long mem_tristate_mcke              : 1;
+     unsigned long mem_invert_mck                 : 1;
+     } mem_ext_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mem_ext_cntl_t f;  /* individual fields */
+} mem_ext_cntl_u;
+
+typedef struct _mc_ext_mem_location_t {  /* two 16-bit fields, 32 bits total */
+     unsigned long mc_ext_mem_start               : 16;
+     unsigned long mc_ext_mem_top                 : 16;
+     } mc_ext_mem_location_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mc_ext_mem_location_t f;  /* individual fields */
+} mc_ext_mem_location_u;
+
+typedef struct _mem_ext_timing_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long mem_trp                        : 2;
+     unsigned long mem_trcd                       : 2;
+     unsigned long mem_tras                       : 3;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long mem_trrd                       : 2;
+     unsigned long mem_tr2w                       : 2;
+     unsigned long mem_twr                        : 2;
+     unsigned long                                : 4;  /* unnamed padding */
+     unsigned long mem_twr_mode                   : 1;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long mem_refresh_dis                : 1;
+     unsigned long                                : 3;  /* unnamed padding */
+     unsigned long mem_refresh_rate               : 8;
+     } mem_ext_timing_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mem_ext_timing_cntl_t f;  /* individual fields */
+} mem_ext_timing_cntl_u;
+
+typedef struct _mem_sdram_mode_reg_t {  /* field view: widths total 32 bits */
+     unsigned long mem_mode_reg                   : 14;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long mem_read_latency               : 2;
+     unsigned long mem_schmen_latency             : 2;
+     unsigned long mem_cas_latency                : 2;
+     unsigned long mem_schmen_extend              : 1;
+     unsigned long                                : 8;  /* unnamed padding */
+     unsigned long mem_sdram_reset                : 1;
+     } mem_sdram_mode_reg_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mem_sdram_mode_reg_t f;  /* individual fields */
+} mem_sdram_mode_reg_u;
+
+typedef struct _mem_io_cntl_t {  /* field view: widths total 32 bits; the sn and sp groups each occupy 16 bits */
+     unsigned long mem_sn_mck                     : 4;
+     unsigned long mem_sn_ma                      : 4;
+     unsigned long mem_sn_mdq                     : 4;
+     unsigned long mem_srn_mck                    : 1;
+     unsigned long mem_srn_ma                     : 1;
+     unsigned long mem_srn_mdq                    : 1;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long mem_sp_mck                     : 4;
+     unsigned long mem_sp_ma                      : 4;
+     unsigned long mem_sp_mdq                     : 4;
+     unsigned long mem_srp_mck                    : 1;
+     unsigned long mem_srp_ma                     : 1;
+     unsigned long mem_srp_mdq                    : 1;
+     unsigned long                                : 1;  /* unnamed padding */
+     } mem_io_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mem_io_cntl_t f;  /* individual fields */
+} mem_io_cntl_u;
+
+typedef struct _mc_debug_t {  /* one field spanning all 32 bits */
+     unsigned long mc_debug                       : 32;
+     } mc_debug_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mc_debug_t f;  /* individual fields */
+} mc_debug_u;
+
+typedef struct _mc_bist_ctrl_t {  /* one field spanning all 32 bits */
+     unsigned long mc_bist_ctrl                   : 32;
+     } mc_bist_ctrl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mc_bist_ctrl_t f;  /* individual fields */
+} mc_bist_ctrl_u;
+
+typedef struct _mc_bist_collar_read_t {  /* one field spanning all 32 bits */
+     unsigned long mc_bist_collar_read            : 32;
+     } mc_bist_collar_read_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mc_bist_collar_read_t f;  /* individual fields */
+} mc_bist_collar_read_u;
+
+typedef struct _tc_mismatch_t {  /* field view: widths total 32 bits */
+     unsigned long tc_mismatch                    : 24;
+     unsigned long                                : 8;  /* unnamed padding */
+     } tc_mismatch_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     tc_mismatch_t f;  /* individual fields */
+} tc_mismatch_u;
+
+typedef struct _mc_perf_mon_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long clr_perf                       : 1;
+     unsigned long en_perf                        : 1;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long perf_op_a                      : 2;
+     unsigned long perf_op_b                      : 2;
+     unsigned long                                : 8;  /* unnamed padding */
+     unsigned long monitor_period                 : 8;
+     unsigned long perf_count_a_overflow          : 1;
+     unsigned long perf_count_b_overflow          : 1;
+     unsigned long                                : 6;  /* unnamed padding */
+     } mc_perf_mon_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mc_perf_mon_cntl_t f;  /* individual fields */
+} mc_perf_mon_cntl_u;
+
+typedef struct _mc_perf_counters_t {  /* two 16-bit fields, 32 bits total */
+     unsigned long mc_perf_counter_a              : 16;
+     unsigned long mc_perf_counter_b              : 16;
+     } mc_perf_counters_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     mc_perf_counters_t f;  /* individual fields */
+} mc_perf_counters_u;
+
+typedef struct _wait_until_t {  /* field view: widths total 32 bits */
+     unsigned long wait_crtc_pflip                : 1;
+     unsigned long wait_re_crtc_vline             : 1;
+     unsigned long wait_fe_crtc_vline             : 1;
+     unsigned long wait_crtc_vline                : 1;
+     unsigned long wait_dma_viph0_idle            : 1;
+     unsigned long wait_dma_viph1_idle            : 1;
+     unsigned long wait_dma_viph2_idle            : 1;
+     unsigned long wait_dma_viph3_idle            : 1;
+     unsigned long wait_dma_vid_idle              : 1;
+     unsigned long wait_dma_gui_idle              : 1;
+     unsigned long wait_cmdfifo                   : 1;
+     unsigned long wait_ov0_flip                  : 1;
+     unsigned long wait_ov0_slicedone             : 1;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long wait_2d_idle                   : 1;
+     unsigned long wait_3d_idle                   : 1;
+     unsigned long wait_2d_idleclean              : 1;
+     unsigned long wait_3d_idleclean              : 1;
+     unsigned long wait_host_idleclean            : 1;
+     unsigned long wait_extern_sig                : 1;
+     unsigned long cmdfifo_entries                : 7;  /* the only multi-bit named field here */
+     unsigned long                                : 3;  /* unnamed padding */
+     unsigned long wait_both_crtc_pflip           : 1;
+     unsigned long eng_display_select             : 1;
+     } wait_until_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     wait_until_t f;  /* individual fields */
+} wait_until_u;
+
+typedef struct _isync_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long isync_any2d_idle3d             : 1;
+     unsigned long isync_any3d_idle2d             : 1;
+     unsigned long isync_trig2d_idle3d            : 1;
+     unsigned long isync_trig3d_idle2d            : 1;
+     unsigned long isync_wait_idlegui             : 1;
+     unsigned long isync_cpscratch_idlegui        : 1;
+     unsigned long                                : 26;  /* unnamed padding */
+     } isync_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     isync_cntl_t f;  /* individual fields */
+} isync_cntl_u;
+
+typedef struct _rbbm_guicntl_t {  /* field view: widths total 32 bits */
+     unsigned long host_data_swap                 : 2;
+     unsigned long                                : 30;  /* unnamed padding */
+     } rbbm_guicntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     rbbm_guicntl_t f;  /* individual fields */
+} rbbm_guicntl_u;
+
+typedef struct _rbbm_status_t {  /* field view: widths total 32 bits */
+     unsigned long cmdfifo_avail                  : 7;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long hirq_on_rbb                    : 1;
+     unsigned long cprq_on_rbb                    : 1;
+     unsigned long cfrq_on_rbb                    : 1;
+     unsigned long hirq_in_rtbuf                  : 1;
+     unsigned long cprq_in_rtbuf                  : 1;
+     unsigned long cfrq_in_rtbuf                  : 1;
+     unsigned long cf_pipe_busy                   : 1;
+     unsigned long eng_ev_busy                    : 1;
+     unsigned long cp_cmdstrm_busy                : 1;
+     unsigned long e2_busy                        : 1;
+     unsigned long rb2d_busy                      : 1;
+     unsigned long rb3d_busy                      : 1;
+     unsigned long se_busy                        : 1;
+     unsigned long re_busy                        : 1;
+     unsigned long tam_busy                       : 1;
+     unsigned long tdm_busy                       : 1;
+     unsigned long pb_busy                        : 1;
+     unsigned long                                : 6;  /* unnamed padding */
+     unsigned long gui_active                     : 1;
+     } rbbm_status_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     rbbm_status_t f;  /* individual fields */
+} rbbm_status_u;
+
+typedef struct _rbbm_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long rb_settle                      : 4;
+     unsigned long abortclks_hi                   : 3;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long abortclks_cp                   : 3;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long abortclks_cfifo                : 3;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long cpq_data_swap                  : 1;
+     unsigned long                                : 3;  /* unnamed padding */
+     unsigned long no_abort_idct                  : 1;
+     unsigned long no_abort_bios                  : 1;
+     unsigned long no_abort_fb                    : 1;
+     unsigned long no_abort_cp                    : 1;
+     unsigned long no_abort_hi                    : 1;
+     unsigned long no_abort_hdp                   : 1;
+     unsigned long no_abort_mc                    : 1;
+     unsigned long no_abort_aic                   : 1;
+     unsigned long no_abort_vip                   : 1;
+     unsigned long no_abort_disp                  : 1;
+     unsigned long no_abort_cg                    : 1;
+     } rbbm_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     rbbm_cntl_t f;  /* individual fields */
+} rbbm_cntl_u;
+
+typedef struct _rbbm_soft_reset_t {  /* field view: widths total 32 bits; gaps between reset bits are named reservedN here rather than left unnamed */
+     unsigned long soft_reset_cp                  : 1;
+     unsigned long soft_reset_hi                  : 1;
+     unsigned long reserved3                      : 3;
+     unsigned long soft_reset_e2                  : 1;
+     unsigned long reserved2                      : 2;
+     unsigned long soft_reset_mc                  : 1;
+     unsigned long reserved1                      : 2;
+     unsigned long soft_reset_disp                : 1;
+     unsigned long soft_reset_cg                  : 1;
+     unsigned long                                : 19;  /* unnamed padding */
+     } rbbm_soft_reset_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     rbbm_soft_reset_t f;  /* individual fields */
+} rbbm_soft_reset_u;
+
+typedef struct _nqwait_until_t {  /* field view: widths total 32 bits */
+     unsigned long wait_gui_idle                  : 1;
+     unsigned long                                : 31;  /* unnamed padding */
+     } nqwait_until_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     nqwait_until_t f;  /* individual fields */
+} nqwait_until_u;
+
+typedef struct _rbbm_debug_t {  /* one field spanning all 32 bits */
+     unsigned long rbbm_debug                     : 32;
+     } rbbm_debug_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     rbbm_debug_t f;  /* individual fields */
+} rbbm_debug_u;
+
+typedef struct _rbbm_cmdfifo_addr_t {  /* field view: widths total 32 bits */
+     unsigned long cmdfifo_addr                   : 6;
+     unsigned long                                : 26;  /* unnamed padding */
+     } rbbm_cmdfifo_addr_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     rbbm_cmdfifo_addr_t f;  /* individual fields */
+} rbbm_cmdfifo_addr_u;
+
+typedef struct _rbbm_cmdfifo_datal_t {  /* one field spanning all 32 bits */
+     unsigned long cmdfifo_datal                  : 32;
+     } rbbm_cmdfifo_datal_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     rbbm_cmdfifo_datal_t f;  /* individual fields */
+} rbbm_cmdfifo_datal_u;
+
+typedef struct _rbbm_cmdfifo_datah_t {  /* field view: widths total 32 bits */
+     unsigned long cmdfifo_datah                  : 12;
+     unsigned long                                : 20;  /* unnamed padding */
+     } rbbm_cmdfifo_datah_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     rbbm_cmdfifo_datah_t f;  /* individual fields */
+} rbbm_cmdfifo_datah_u;
+
+typedef struct _rbbm_cmdfifo_stat_t {  /* field view: widths total 32 bits */
+     unsigned long cmdfifo_rptr                   : 6;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long cmdfifo_wptr                   : 6;
+     unsigned long                                : 18;  /* unnamed padding */
+     } rbbm_cmdfifo_stat_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     rbbm_cmdfifo_stat_t f;  /* individual fields */
+} rbbm_cmdfifo_stat_u;
+
+typedef struct _clk_pin_cntl_t {  /* field view: widths total 32 bits */
+     unsigned long osc_en                         : 1;
+     unsigned long osc_gain                       : 5;
+     unsigned long dont_use_xtalin                : 1;
+     unsigned long xtalin_pm_en                   : 1;
+     unsigned long xtalin_dbl_en                  : 1;
+     unsigned long                                : 7;  /* unnamed padding */
+     unsigned long cg_debug                       : 16;
+     } clk_pin_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     clk_pin_cntl_t f;  /* individual fields */
+} clk_pin_cntl_u;
+
+typedef struct _pll_ref_fb_div_t {  /* field view: widths total 32 bits */
+     unsigned long pll_ref_div                    : 4;
+     unsigned long                                : 4;  /* unnamed padding */
+     unsigned long pll_fb_div_int                 : 6;
+     unsigned long                                : 2;  /* unnamed padding */
+     unsigned long pll_fb_div_frac                : 3;
+     unsigned long                                : 1;  /* unnamed padding */
+     unsigned long pll_reset_time                 : 4;
+     unsigned long pll_lock_time                  : 8;
+     } pll_ref_fb_div_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     pll_ref_fb_div_t f;  /* individual fields */
+} pll_ref_fb_div_u;
+
+typedef struct _pll_cntl_t {  /* field view: widths total 32 bits, no padding */
+     unsigned long pll_pwdn                       : 1;
+     unsigned long pll_reset                      : 1;
+     unsigned long pll_pm_en                      : 1;
+     unsigned long pll_mode                       : 1;
+     unsigned long pll_refclk_sel                 : 1;
+     unsigned long pll_fbclk_sel                  : 1;
+     unsigned long pll_tcpoff                     : 1;
+     unsigned long pll_pcp                        : 3;
+     unsigned long pll_pvg                        : 3;
+     unsigned long pll_vcofr                      : 1;
+     unsigned long pll_ioffset                    : 2;
+     unsigned long pll_pecc_mode                  : 2;
+     unsigned long pll_pecc_scon                  : 2;
+     unsigned long pll_dactal                     : 4;
+     unsigned long pll_cp_clip                    : 2;
+     unsigned long pll_conf                       : 3;
+     unsigned long pll_mbctrl                     : 2;
+     unsigned long pll_ring_off                   : 1;
+     } pll_cntl_t;
+
+typedef union {  /* one storage, two views */
+     unsigned long val : 32;  /* whole 32-bit value */
+     pll_cntl_t f;  /* individual fields */
+} pll_cntl_u;
+
+typedef struct _sclk_cntl_t {                      /* bit-field widths sum to 32; unnamed members are padding */
+     unsigned long sclk_src_sel                   : 2;
+     unsigned long                                : 2;
+     unsigned long sclk_post_div_fast             : 4;
+     unsigned long sclk_clkon_hys                 : 3;
+     unsigned long sclk_post_div_slow             : 4;
+     unsigned long disp_cg_ok2switch_en           : 1;
+     unsigned long sclk_force_reg                 : 1;
+     unsigned long sclk_force_disp                : 1;
+     unsigned long sclk_force_mc                  : 1;
+     unsigned long sclk_force_extmc               : 1;
+     unsigned long sclk_force_cp                  : 1;
+     unsigned long sclk_force_e2                  : 1;
+     unsigned long sclk_force_e3                  : 1;
+     unsigned long sclk_force_idct                : 1;
+     unsigned long sclk_force_bist                : 1;
+     unsigned long busy_extend_cp                 : 1;
+     unsigned long busy_extend_e2                 : 1;
+     unsigned long busy_extend_e3                 : 1;
+     unsigned long busy_extend_idct               : 1;
+     unsigned long                                : 3;
+     } sclk_cntl_t;
+
+typedef union {                 /* two views of the same storage */
+     unsigned long val : 32;    /* the whole value as one 32-bit field */
+     sclk_cntl_t f;             /* the same bits as named bit-fields */
+} sclk_cntl_u;
+
+typedef struct _pclk_cntl_t {                      /* bit-field widths sum to 32; unnamed members are padding */
+     unsigned long pclk_src_sel                   : 2;
+     unsigned long                                : 2;
+     unsigned long pclk_post_div                  : 4;
+     unsigned long                                : 8;
+     unsigned long pclk_force_disp                : 1;
+     unsigned long                                : 15;
+     } pclk_cntl_t;
+
+typedef union {                 /* two views of the same storage */
+     unsigned long val : 32;    /* the whole value as one 32-bit field */
+     pclk_cntl_t f;             /* the same bits as named bit-fields */
+} pclk_cntl_u;
+
+typedef struct _clk_test_cntl_t {                  /* bit-field widths sum to 32; unnamed members are padding */
+     unsigned long testclk_sel                    : 4;
+     unsigned long                                : 3;
+     unsigned long start_check_freq               : 1;
+     unsigned long tstcount_rst                   : 1;
+     unsigned long                                : 15;
+     unsigned long test_count                     : 8;
+     } clk_test_cntl_t;
+
+typedef union {                 /* two views of the same storage */
+     unsigned long val : 32;    /* the whole value as one 32-bit field */
+     clk_test_cntl_t f;         /* the same bits as named bit-fields */
+} clk_test_cntl_u;
+
+typedef struct _pwrmgt_cntl_t {                    /* bit-field widths sum to 32; unnamed members are padding */
+     unsigned long pwm_enable                     : 1;
+     unsigned long                                : 1;
+     unsigned long pwm_mode_req                   : 2;
+     unsigned long pwm_wakeup_cond                : 2;
+     unsigned long pwm_fast_noml_hw_en            : 1;
+     unsigned long pwm_noml_fast_hw_en            : 1;
+     unsigned long pwm_fast_noml_cond             : 4;
+     unsigned long pwm_noml_fast_cond             : 4;
+     unsigned long pwm_idle_timer                 : 8;
+     unsigned long pwm_busy_timer                 : 8;
+     } pwrmgt_cntl_t;
+
+typedef union {                 /* two views of the same storage */
+     unsigned long val : 32;    /* the whole value as one 32-bit field */
+     pwrmgt_cntl_t f;           /* the same bits as named bit-fields */
+} pwrmgt_cntl_u;
+
+typedef struct _pwrmgt_status_t {                  /* one 2-bit field followed by 30 bits of padding */
+     unsigned long pwm_mode                       : 2;
+     unsigned long                                : 30;
+     } pwrmgt_status_t;
+
+typedef union {                 /* two views of the same storage */
+     unsigned long val : 32;    /* the whole value as one 32-bit field */
+     pwrmgt_status_t f;         /* the same bits as named bit-fields */
+} pwrmgt_status_u;
+
+typedef struct tagDISPLAYSTATE {   /* groups one value of each LCD / CRTC / brightness *_u type */
+lcd_format_u	       LcdFormat;
+crtc_total_u	       CrtcTotal;
+active_h_disp_u	       ActiveHDisp;
+active_v_disp_u	       ActiveVDisp;
+crtc_ss_u	       CrtcSS;
+crtc_ls_u	       CrtcLS;
+crtc_gs_u	       CrtcGS;
+crtc_vpos_gs_u	       CrtcVPosGS;
+crtc_gclk_u	       CrtcGClk;
+crtc_goe_u	       CrtcGOE;
+crtc_rev_u	       CrtcRev;
+crtc_dclk_u	       CrtcDClk;
+crtc_default_count_u   CrtcDefaultCount;
+crtc_frame_u	       CrtcFrame;
+crtc_frame_vpos_u      CrtcFrameVPos;
+lcdd_cntl1_u	       LcddCntl1;
+lcdd_cntl2_u	       LcddCntl2;
+genlcd_cntl1_u	       GenlcdCntl1;
+genlcd_cntl2_u	       GenlcdCntl2;
+lcd_background_color_u LcdBackgroundColor;
+brightness_cntl_u      Brightness_Cntl;
+} DISPLAYSTATE;
+
+typedef struct {   /* rectangle described by its top-left and bottom-right corners (s16 coordinates) */
+s16		X_Top_Left;      	// x coordinate of top left corner
+s16		Y_Top_Left;	   	// y coordinate of top left corner
+s16		X_Bottom_Right;		// x coordinate of bottom right corner
+s16		Y_Bottom_Right;		// y coordinate of bottom right corner
+} ATI_CLIPRECT;
+
+typedef struct tagGUISTATE {   /* GUI state bundle: control unions, src/dst pitch and offset, colours, clip rects, brush fields */
+dp_cntl_u            DpCntl;
+dp_gui_master_cntl_u GMC;
+e2_arithmetic_cntl_u E2AC;
+global_alpha_u	     GlobalAlpha;
+dst_pitch_u          dstPitch;
+dst_offset_u         dstOffset;
+src_pitch_u          srcPitch;
+src_offset_u         srcOffset;
+u32		     FrgrdColour;
+u32		     BkgrdColour;
+ATI_CLIPRECT	     SrcClipRect;
+ATI_CLIPRECT	     DstClipRect;
+u32 		     BrushOffset;
+u16		     BrushHandle;
+// for 16bpp, SRC must be the same type as DST, can't go from 1555->565
+s8                   TurnOnDst565ForNon2D; 
+} GUISTATE;
+
+typedef struct tagGFXWINSTATE {   /* graphics-window state: graphic_* unions plus Grp_* values kept as plain u32 */
+graphic_ctrl_u		GraphicCtrl;
+graphic_offset_u        GraphicOffset;
+graphic_pitch_u		GraphicPitch;   // byte-based
+graphic_h_disp_u	GraphicHDisp;
+graphic_v_disp_u	GraphicVDisp;
+s8                      TurnOnDisp565;
+// These memory offsets need to be translated before writing to registers
+u32                     Grp_Offset;
+u32			Grp_W;
+u32			Grp_H;
+u32			Grp_Src_X;
+u32			Grp_Src_Y;
+u32			Grp_Src_W;      // pixel-based
+} GFXWINSTATE;
+
+typedef struct tagPREVSTATE {   /* per the field names: previous overlay and graphics-window X/Y, each with a was-on flag */
+u16              PrevOverlayX;
+u16              PrevOverlayY;
+u8               bOverlayWasOn;
+u16              PrevGfxWinX;
+u16              PrevGfxWinY;
+u8               bGfxWinWasOn;
+} PREVSTATE;
+
+typedef struct tagPOWERSTATE {   /* one value of each clock / PLL / power-management union, scalar fields, and a PREVSTATE */
+clk_pin_cntl_u   ClkPinCntl;
+pll_ref_fb_div_u PllRefFbDiv;
+pll_cntl_u       PllCntl;
+sclk_cntl_u      SclkCntl;
+pclk_cntl_u      PclkCntl;
+clk_test_cntl_u  ClkTestCntl;
+pwrmgt_cntl_u    PwrmgtCntl;
+u32              Freq;
+u8               tf100;
+u8		 tf80;
+u8               tf20;
+u8               M;
+u8               N_int;
+u8               N_fac;
+u8               lock_time;
+u8               tfgoal;
+u8               AutoMode;
+u8               PWMMode;
+u16              FastSclk;
+u16              NormSclk;
+PREVSTATE        PrevState;
+} POWERSTATE;
+
+typedef struct tagAPERTURE {   /* eight u32 values; names suggest base addresses and start/top bounds - TODO confirm */
+u32 MMRegBase;
+u32 CfgRegBase;
+u32 McFbStart;
+u32 McFbTop;
+u32 McExtMemStart;
+u32 McExtMemTop;
+u32 WrapStart;
+u32 WrapTop;
+} APERTURE;
+
+#endif
+
diff --git a/recipes/mplayer/files/w100-Makefile.patch b/recipes/mplayer/files/w100-Makefile.patch
new file mode 100644
index 0000000000..01899556d3
--- /dev/null
+++ b/recipes/mplayer/files/w100-Makefile.patch
@@ -0,0 +1,10 @@
+--- mplayer_20060519/Makefile.orig	2006-05-30 10:29:18.000000000 +0100
++++ mplayer_20060519/Makefile	2006-05-30 10:29:53.000000000 +0100
+@@ -74,6 +74,7 @@
+           $(DIRECTFB_LIB) \
+           $(CACA_LIB) \
+ 	  $(VESA_LIB) \
++	  $(W100_LIB)
+ 
+ ifeq ($(EXTERNAL_VIDIX),yes)
+ VO_LIBS += $(EXTERNAL_VIDIX_LIB)
diff --git a/recipes/mplayer/files/w100-configure-svn.patch b/recipes/mplayer/files/w100-configure-svn.patch
new file mode 100644
index 0000000000..e3067a4724
--- /dev/null
+++ b/recipes/mplayer/files/w100-configure-svn.patch
@@ -0,0 +1,48 @@
+Index: trunk/configure
+===================================================================
+--- trunk.orig/configure
++++ trunk/configure
+@@ -1623,6 +1623,7 @@ _caca=auto
+ _svga=auto
+ _vesa=auto
+ _fbdev=auto
++_w100=no
+ _dvb=auto
+ _dvbhead=auto
+ _dxr2=auto
+@@ -1822,6 +1823,8 @@ for ac_option do
+   --disable-vesa)	_vesa=no	;;
+   --enable-fbdev)	_fbdev=yes	;;
+   --disable-fbdev)	_fbdev=no	;;
++  --enable-w100)        _w100=yes       ;;
++  --disable-w100)       _w100=no        ;;
+   --enable-dvb)		_dvb=yes	;;
+   --disable-dvb)        _dvb=no		;;
+   --enable-dvbhead)	_dvbhead=yes	;;
+@@ -4280,6 +4283,18 @@ else
+ fi
+ echores "$_fbdev"
+ 
++echocheck "ATI Imageon 100 (w100)"
++if test "$_w100" = yes ; then
++  _def_w100='#define HAVE_W100 1'
++  _ld_w100='-laticore'
++  _libs_mplayer="$_libs_mplayer $_ld_w100"
++  _vosrc="$_vosrc vo_w100.c"
++  _vomodules="w100 $_vomodules"
++else
++  _def_w100='#undef HAVE_W100'
++  _novomodules="w100 $_novomodules"
++fi
++echores "$_w100"
+ 
+ 
+ echocheck "DVB"
+@@ -8227,6 +8242,7 @@ $_def_mga
+ $_def_xmga
+ $_def_syncfb
+ $_def_fbdev
++$_def_w100
+ $_def_dxr2
+ $_def_dxr3
+ $_def_ivtv
diff --git a/recipes/mplayer/files/w100-configure.patch b/recipes/mplayer/files/w100-configure.patch
new file mode 100644
index 0000000000..03610610e4
--- /dev/null
+++ b/recipes/mplayer/files/w100-configure.patch
@@ -0,0 +1,53 @@
+--- mplayer_20060519/configure.orig	2006-05-30 10:23:24.000000000 +0100
++++ mplayer_20060519/configure	2006-05-30 10:27:24.000000000 +0100
+@@ -1585,6 +1585,7 @@
+ _svga=auto
+ _vesa=auto
+ _fbdev=auto
++_w100=no
+ _dvb=auto
+ _dvbhead=auto
+ _dxr2=auto
+@@ -1767,6 +1768,8 @@
+   --disable-vesa)	_vesa=no	;;
+   --enable-fbdev)	_fbdev=yes	;;
+   --disable-fbdev)	_fbdev=no	;;
++  --enable-w100)        _w100=yes       ;;
++  --disable-w100)       _w100=no        ;;
+   --enable-dvb)		_dvb=yes	;;
+   --disable-dvb)        _dvb=no		;;
+   --enable-dvbhead)	_dvbhead=yes	;;
+@@ -4200,6 +4203,17 @@
+ fi
+ echores "$_fbdev"
+ 
++echocheck "ATI Imageon 100 (w100)"
++if test "$_w100" = yes ; then
++  _def_w100='#define HAVE_W100 1'
++  _ld_w100='-laticore'
++  _vosrc="$_vosrc vo_w100.c"
++  _vomodules="w100 $_vomodules"
++else
++  _def_w100='#undef HAVE_W100'
++  _novomodules="w100 $_novomodules"
++fi
++echores "$_w100"
+ 
+ 
+ echocheck "DVB"
+@@ -7441,6 +7455,7 @@
+ AA_LIB = $_ld_aa
+ CACA_INC = $_inc_caca
+ CACA_LIB = $_ld_caca
++W100_LIB = $_ld_w100
+ 
+ # audio output
+ ALSA_LIB = $_ld_alsa
+@@ -8238,6 +8253,7 @@
+ $_def_xmga
+ $_def_syncfb
+ $_def_fbdev
++$_def_w100
+ $_def_dxr2
+ $_def_dxr3
+ $_def_dvb
diff --git a/recipes/mplayer/files/w100-mplayer.patch b/recipes/mplayer/files/w100-mplayer.patch
new file mode 100644
index 0000000000..8ce37d014c
--- /dev/null
+++ b/recipes/mplayer/files/w100-mplayer.patch
@@ -0,0 +1,32 @@
+Index: MPlayer-1.0rc1/mplayer.c
+===================================================================
+--- MPlayer-1.0rc1.orig/mplayer.c
++++ MPlayer-1.0rc1/mplayer.c
+@@ -807,6 +807,17 @@ static void exit_sighandler(int x){
+   exit_player(NULL);
+ }
+ 
++//w100 driver additions
++int g_sigcont = 0;   // SIGCONT counter; NOTE(review): written from a signal handler but not volatile sig_atomic_t - confirm readers tolerate this
++
++static void misc_sighandler(int x){   // signal handler; x is the delivered signal number
++  switch(x){
++    case SIGCONT:
++      ++ g_sigcont;   // only SIGCONT is counted; any other signal falls through untouched
++      break;
++  }
++}
++
+ extern void mp_input_register_options(m_config_t* cfg);
+ 
+ #include "mixer.h"
+@@ -3216,6 +3227,9 @@ current_module = NULL;
+ #endif
+ #endif
+ 
++// w100 driver additions
++  signal(SIGCONT,misc_sighandler);
++    
+ #ifdef HAVE_NEW_GUI
+   if(use_gui){
+        guiInit();
diff --git a/recipes/mplayer/files/w100-video_out.patch b/recipes/mplayer/files/w100-video_out.patch
new file mode 100644
index 0000000000..9855853fe6
--- /dev/null
+++ b/recipes/mplayer/files/w100-video_out.patch
@@ -0,0 +1,20 @@
+--- mplayer_20060519/libvo/video_out.c.orig	2006-05-30 11:25:57.000000000 +0100
++++ mplayer_20060519/libvo/video_out.c	2006-05-30 11:26:49.000000000 +0100
+@@ -86,6 +86,7 @@
+ extern vo_functions_t video_out_syncfb;
+ extern vo_functions_t video_out_fbdev;
+ extern vo_functions_t video_out_fbdev2;
++extern vo_functions_t video_out_w100;
+ extern vo_functions_t video_out_svga;
+ extern vo_functions_t video_out_png;
+ extern vo_functions_t video_out_ggi;
+@@ -196,6 +197,9 @@
+ 	&video_out_fbdev,
+ 	&video_out_fbdev2,
+ #endif
++#ifdef HAVE_W100
++	&video_out_w100,
++#endif
+ #ifdef HAVE_SVGALIB
+ 	&video_out_svga,
+ #endif
diff --git a/recipes/mplayer/files/yuv.S b/recipes/mplayer/files/yuv.S
new file mode 100644
index 0000000000..3eaf284a61
--- /dev/null
+++ b/recipes/mplayer/files/yuv.S
@@ -0,0 +1,119 @@
+/*
+    Copyright (C) 2008 Mans Rullgard
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+ */
+
+        .fpu neon
+        .text
+
+@ yuv420_to_yuv422(uint8_t *yuv, uint8_t *y, uint8_t *u, uint8_t *v,
+@                  int w, int h, int yw, int cw, int dw)
+@ Packs planar Y/U/V into interleaved Y-U-Y-V rows; the aliases below name the argument registers.
+#define yuv  r0
+#define y    r1
+#define u    r2
+#define v    r3
+#define w    r4
+#define h    r5
+#define yw   r6
+#define cw   r7
+#define dw   r8
+@ Scratch registers: block-start copies of the four pointers, plus the row counter.
+#define tyuv r9
+#define ty   r10
+#define tu   r11
+#define tv   r12
+#define i    lr
+@ The three stride arguments are byte post-increments; loads and stores carry :64 / :128 alignment hints.
+        .global yuv420_to_yuv422
+        .func   yuv420_to_yuv422
+yuv420_to_yuv422:
+        push            {r4-r11,lr}                     @ 9 registers saved = 36 bytes
+        add             r4,  sp,  #36                   @ r4 -> first stack argument
+        ldm             r4, {r4-r8}                     @ load the five stack arguments into r4-r8
+        dmb                                             @ NOTE(review): purpose of this barrier is not evident from this file
+1:                                                      @ one block: 16 pixels wide, 16 luma rows tall
+        mov             tu,   u                         @ remember block-start chroma pointers
+        mov             tv,   v
+        vld1.64         {d2}, [u,:64], cw               @ u0
+        vld1.64         {d3}, [v,:64], cw               @ v0
+        mov             tyuv, yuv                       @ remember block-start output pointer
+        mov             ty,   y                         @ remember block-start luma pointer
+        vzip.8          d2,   d3                        @ u0v0
+        mov             i,    #16                       @ 16 luma rows per block
+2:                      @ each pass: 4 luma rows in, 2 new chroma rows in, 4 packed rows out
+        pld             [y, #64]
+        vld1.64         {d0, d1},   [y,:128], yw        @ y0
+        pld             [u, #64]
+        subs            i,    i,    #4                  @ flags set here are the ones tested by bgt 2b
+        vld1.64         {d6},       [u,:64],  cw        @ u2
+        pld             [y, #64]
+        vld1.64         {d4, d5},   [y,:128], yw        @ y1
+        pld             [v, #64]
+        vld1.64         {d7},       [v,:64],  cw        @ v2
+        pld             [y, #64]
+        vld1.64         {d16,d17},  [y,:128], yw        @ y2
+        vzip.8          d6,   d7                        @ u2v2
+        pld             [u, #64]
+        vld1.64         {d22},      [u,:64],  cw        @ u4
+        pld             [v, #64]
+        vld1.64         {d23},      [v,:64],  cw        @ v4
+        pld             [y, #64]
+        vld1.64         {d20,d21},  [y,:128], yw        @ y3
+        vmov            q9,   q3                        @ u2v2
+        vzip.8          d22,  d23                       @ u4v4
+        vrhadd.u8       q3,   q1,   q3                  @ u1v1 = rounded average of u0v0 and u2v2
+        vzip.8          q0,   q1                        @ y0u0y0v0
+        vmov            q12,  q11                       @ u4v4
+        vzip.8          q2,   q3                        @ y1u1y1v1
+        vrhadd.u8       q11,  q9,   q11                 @ u3v3 = rounded average of u2v2 and u4v4
+        vst1.64         {d0-d3},    [yuv,:128], dw      @ y0u0y0v0
+        vzip.8          q8,   q9                        @ y2u2y2v2
+        vst1.64         {d4-d7},    [yuv,:128], dw      @ y1u1y1v1
+        vzip.8          q10,  q11                       @ y3u3y3v3
+        vst1.64         {d16-d19},  [yuv,:128], dw      @ y2u2y2v2
+        vmov            q1,   q12                       @ u4v4 becomes u0v0 of the next pass
+        vst1.64         {d20-d23},  [yuv,:128], dw      @ y3u3y3v3
+        bgt             2b
+        @ Block finished: move every pointer to the next 16-pixel column of the same rows.
+        subs            w,    w,    #16
+        add             yuv,  tyuv, #32                 @ 16 pixels = 32 output bytes
+        add             y,    ty,   #16
+        add             u,    tu,   #8
+        add             v,    tv,   #8
+        bgt             1b
+        @ NOTE(review): a block reads 9 chroma rows (1 up front + 2 per pass); the bottom block reads one row past the last - confirm buffers allow it.
+        ldr             w,    [sp, #36]                 @ reload the width counted down above
+        subs            h,    h,    #16
+        add             yuv,  yuv,  dw, lsl #4          @ down 16 output rows ...
+        sub             yuv,  yuv,  w,  lsl #1          @ ... and back to the row start (exact when width is a multiple of 16)
+        add             y,    y,    yw, lsl #4
+        sub             y,    y,    w
+        add             u,    u,    cw, lsl #3          @ chroma advances 8 rows per 16 luma rows
+        sub             u,    u,    w,  asr #1
+        add             v,    v,    cw, lsl #3
+        sub             v,    v,    w,  asr #1
+        bgt             1b                              @ still testing the flags from the subs on the row count
+
+        pop             {r4-r11,pc}                     @ restore saved registers and return
+        .endfunc
+