 Makefile               |    4 +
 arm/dsputil_neon.c     |   16 ++++
 arm/dsputil_neon_s.S   |  178 +++++++++++++++++++++++++++++++++++++------------
 arm/simple_idct_neon.S |   17 ++++
 arm/vp3dsp_neon.S      |   94 +++++++++++++++++++++++++
 5 files changed, 265 insertions(+), 44 deletions(-)
diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon.c ffmpeg-0.5/libavcodec/arm/dsputil_neon.c
--- ffmpeg.old/libavcodec/arm/dsputil_neon.c	2009-01-31 00:13:19.000000000 +0100
+++ ffmpeg-0.5/libavcodec/arm/dsputil_neon.c	2009-05-30 11:27:54.000000000 +0200
@@ -41,6 +41,10 @@
 
 void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
 
+void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
+void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
+void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
+
 void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
 void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
 void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
@@ -146,6 +150,9 @@
                             DCTELEM *block, int stride,
                             const uint8_t nnzc[6*8]);
 
+void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
+void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
+
 void ff_vector_fmul_neon(float *dst, const float *src, int len);
 void ff_vector_fmul_window_neon(float *dst, const float *src0,
                                 const float *src1, const float *win,
@@ -176,6 +183,10 @@
 
     c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
 
+    c->add_pixels_clamped = ff_add_pixels_clamped_neon;
+    c->put_pixels_clamped = ff_put_pixels_clamped_neon;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+
     c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
     c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
 
@@ -247,6 +258,11 @@
     c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
     c->h264_idct_add8       = ff_h264_idct_add8_neon;
 
+    if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
+        c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
+        c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
+    }
+
     c->vector_fmul = ff_vector_fmul_neon;
     c->vector_fmul_window = ff_vector_fmul_window_neon;
 
diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon_s.S ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S
--- ffmpeg.old/libavcodec/arm/dsputil_neon_s.S	2009-01-31 00:13:19.000000000 +0100
+++ ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S	2009-05-30 11:27:54.000000000 +0200
@@ -38,13 +38,13 @@
         pld             [r1, r2]
         pld             [r1, r2, lsl #1]
 .if \avg
-        vld1.64         {d16,d17}, [ip], r2
+        vld1.64         {d16,d17}, [ip,:128], r2
         vrhadd.u8       q0,  q0,  q8
-        vld1.64         {d18,d19}, [ip], r2
+        vld1.64         {d18,d19}, [ip,:128], r2
         vrhadd.u8       q1,  q1,  q9
-        vld1.64         {d20,d21}, [ip], r2
+        vld1.64         {d20,d21}, [ip,:128], r2
         vrhadd.u8       q2,  q2,  q10
-        vld1.64         {d22,d23}, [ip], r2
+        vld1.64         {d22,d23}, [ip,:128], r2
         vrhadd.u8       q3,  q3,  q11
 .endif
         subs            r3,  r3,  #4
@@ -73,35 +73,29 @@
         .endm
 
         .macro pixels16_y2 vhadd=vrhadd.u8
-        push            {lr}
-        add             ip,  r1,  r2
-        lsl             lr,  r2,  #1
-        vld1.64         {d0, d1},  [r1], lr
-        vld1.64         {d2, d3},  [ip], lr
+        vld1.64         {d0, d1},  [r1], r2
+        vld1.64         {d2, d3},  [r1], r2
 1:      subs            r3,  r3,  #2
         \vhadd          q2,  q0,  q1
-        vld1.64         {d0, d1},  [r1],      lr
+        vld1.64         {d0, d1},  [r1], r2
         \vhadd          q3,  q0,  q1
-        vld1.64         {d2, d3},  [ip],      lr
+        vld1.64         {d2, d3},  [r1], r2
         pld             [r1]
-        pld             [ip]
+        pld             [r1, r2]
         vst1.64         {d4, d5},  [r0,:128], r2
         vst1.64         {d6, d7},  [r0,:128], r2
         bne             1b
-        pop             {pc}
+        bx              lr
         .endm
 
         .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
-        push            {lr}
-        lsl             lr,  r2,  #1
-        add             ip,  r1,  r2
-        vld1.64         {d0-d2},   [r1], lr
-        vld1.64         {d4-d6},   [ip], lr
+        vld1.64         {d0-d2},   [r1], r2
+        vld1.64         {d4-d6},   [r1], r2
 .if \no_rnd
         vmov.i16        q13, #1
 .endif
         pld             [r1]
-        pld             [ip]
+        pld             [r1, r2]
         vext.8          q1,  q0,  q1,  #1
         vext.8          q3,  q2,  q3,  #1
         vaddl.u8        q8,  d0,  d2
@@ -109,7 +103,7 @@
         vaddl.u8        q9,  d4,  d6
         vaddl.u8        q11, d5,  d7
 1:      subs            r3,  r3,  #2
-        vld1.64         {d0-d2},   [r1], lr
+        vld1.64         {d0-d2},   [r1], r2
         vadd.u16        q12, q8,  q9
         pld             [r1]
 .if \no_rnd
@@ -123,11 +117,11 @@
 .endif
         \vshrn          d29, q1,  #2
         vaddl.u8        q8,  d0,  d30
-        vld1.64         {d2-d4},   [ip], lr
+        vld1.64         {d2-d4},   [r1], r2
         vaddl.u8        q10, d1,  d31
         vst1.64         {d28,d29}, [r0,:128], r2
         vadd.u16        q12, q8,  q9
-        pld             [ip]
+        pld             [r1, r2]
 .if \no_rnd
         vadd.u16        q12, q12, q13
 .endif
@@ -142,7 +136,7 @@
         vaddl.u8        q11, d3,  d5
         vst1.64         {d30,d31}, [r0,:128], r2
         bgt             1b
-        pop             {pc}
+        bx              lr
         .endm
 
         .macro pixels8
@@ -180,41 +174,35 @@
         .endm
 
         .macro pixels8_y2 vhadd=vrhadd.u8
-        push            {lr}
-        add             ip,  r1,  r2
-        lsl             lr,  r2,  #1
-        vld1.64         {d0},      [r1], lr
-        vld1.64         {d1},      [ip], lr
+        vld1.64         {d0},      [r1], r2
+        vld1.64         {d1},      [r1], r2
 1:      subs            r3,  r3,  #2
         \vhadd          d4,  d0,  d1
-        vld1.64         {d0},      [r1],     lr
+        vld1.64         {d0},      [r1], r2
         \vhadd          d5,  d0,  d1
-        vld1.64         {d1},      [ip],     lr
+        vld1.64         {d1},      [r1], r2
         pld             [r1]
-        pld             [ip]
+        pld             [r1, r2]
         vst1.64         {d4},      [r0,:64], r2
         vst1.64         {d5},      [r0,:64], r2
         bne             1b
-        pop             {pc}
+        bx              lr
         .endm
 
         .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
-        push            {lr}
-        lsl             lr,  r2,  #1
-        add             ip,  r1,  r2
-        vld1.64         {d0, d1},  [r1], lr
-        vld1.64         {d2, d3},  [ip], lr
+        vld1.64         {d0, d1},  [r1], r2
+        vld1.64         {d2, d3},  [r1], r2
 .if \no_rnd
         vmov.i16        q11, #1
 .endif
         pld             [r1]
-        pld             [ip]
+        pld             [r1, r2]
         vext.8          d4,  d0,  d1,  #1
         vext.8          d6,  d2,  d3,  #1
         vaddl.u8        q8,  d0,  d4
         vaddl.u8        q9,  d2,  d6
 1:      subs            r3,  r3,  #2
-        vld1.64         {d0, d1},  [r1], lr
+        vld1.64         {d0, d1},  [r1], r2
         pld             [r1]
         vadd.u16        q10, q8,  q9
         vext.8          d4,  d0,  d1,  #1
@@ -223,9 +211,9 @@
 .endif
         vaddl.u8        q8,  d0,  d4
         \vshrn          d5,  q10, #2
-        vld1.64         {d2, d3},  [ip], lr
+        vld1.64         {d2, d3},  [r1], r2
         vadd.u16        q10, q8,  q9
-        pld             [ip]
+        pld             [r1, r2]
 .if \no_rnd
         vadd.u16        q10, q10, q11
 .endif
@@ -235,7 +223,7 @@
         vaddl.u8        q9,  d2,  d6
         vst1.64         {d7},      [r0,:64], r2
         bgt             1b
-        pop             {pc}
+        bx              lr
         .endm
 
         .macro pixfunc pfx name suf rnd_op args:vararg
@@ -273,6 +261,112 @@
         pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
         pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1
 
+function ff_put_pixels_clamped_neon, export=1
+        vld1.64         {d16-d19}, [r0,:128]!
+        vqmovun.s16     d0, q8
+        vld1.64         {d20-d23}, [r0,:128]!
+        vqmovun.s16     d1, q9
+        vld1.64         {d24-d27}, [r0,:128]!
+        vqmovun.s16     d2, q10
+        vld1.64         {d28-d31}, [r0,:128]!
+        vqmovun.s16     d3, q11
+        vst1.64         {d0},      [r1,:64], r2
+        vqmovun.s16     d4, q12
+        vst1.64         {d1},      [r1,:64], r2
+        vqmovun.s16     d5, q13
+        vst1.64         {d2},      [r1,:64], r2
+        vqmovun.s16     d6, q14
+        vst1.64         {d3},      [r1,:64], r2
+        vqmovun.s16     d7, q15
+        vst1.64         {d4},      [r1,:64], r2
+        vst1.64         {d5},      [r1,:64], r2
+        vst1.64         {d6},      [r1,:64], r2
+        vst1.64         {d7},      [r1,:64], r2
+        bx              lr
+        .endfunc
+
+function ff_put_signed_pixels_clamped_neon, export=1
+        vmov.u8         d31, #128
+        vld1.64         {d16-d17}, [r0,:128]!
+        vqmovn.s16      d0, q8
+        vld1.64         {d18-d19}, [r0,:128]!
+        vqmovn.s16      d1, q9
+        vld1.64         {d16-d17}, [r0,:128]!
+        vqmovn.s16      d2, q8
+        vld1.64         {d18-d19}, [r0,:128]!
+        vadd.u8         d0, d0, d31
+        vld1.64         {d20-d21}, [r0,:128]!
+        vadd.u8         d1, d1, d31
+        vld1.64         {d22-d23}, [r0,:128]!
+        vadd.u8         d2, d2, d31
+        vst1.64         {d0},      [r1,:64], r2
+        vqmovn.s16      d3, q9
+        vst1.64         {d1},      [r1,:64], r2
+        vqmovn.s16      d4, q10
+        vst1.64         {d2},      [r1,:64], r2
+        vqmovn.s16      d5, q11
+        vld1.64         {d24-d25}, [r0,:128]!
+        vadd.u8         d3, d3, d31
+        vld1.64         {d26-d27}, [r0,:128]!
+        vadd.u8         d4, d4, d31
+        vadd.u8         d5, d5, d31
+        vst1.64         {d3},      [r1,:64], r2
+        vqmovn.s16      d6, q12
+        vst1.64         {d4},      [r1,:64], r2
+        vqmovn.s16      d7, q13
+        vst1.64         {d5},      [r1,:64], r2
+        vadd.u8         d6, d6, d31
+        vadd.u8         d7, d7, d31
+        vst1.64         {d6},      [r1,:64], r2
+        vst1.64         {d7},      [r1,:64], r2
+        bx              lr
+        .endfunc
+
+function ff_add_pixels_clamped_neon, export=1
+        mov             r3, r1
+        vld1.64         {d16},   [r1,:64], r2
+        vld1.64         {d0-d1}, [r0,:128]!
+        vaddw.u8        q0, q0, d16
+        vld1.64         {d17},   [r1,:64], r2
+        vld1.64         {d2-d3}, [r0,:128]!
+        vqmovun.s16     d0, q0
+        vld1.64         {d18},   [r1,:64], r2
+        vaddw.u8        q1, q1, d17
+        vld1.64         {d4-d5}, [r0,:128]!
+        vaddw.u8        q2, q2, d18
+        vst1.64         {d0},    [r3,:64], r2
+        vqmovun.s16     d2, q1
+        vld1.64         {d19},   [r1,:64], r2
+        vld1.64         {d6-d7}, [r0,:128]!
+        vaddw.u8        q3, q3, d19
+        vqmovun.s16     d4, q2
+        vst1.64         {d2},    [r3,:64], r2
+        vld1.64         {d16},   [r1,:64], r2
+        vqmovun.s16     d6, q3
+        vld1.64         {d0-d1}, [r0,:128]!
+        vaddw.u8        q0, q0, d16
+        vst1.64         {d4},    [r3,:64], r2
+        vld1.64         {d17},   [r1,:64], r2
+        vld1.64         {d2-d3}, [r0,:128]!
+        vaddw.u8        q1, q1, d17
+        vst1.64         {d6},    [r3,:64], r2
+        vqmovun.s16     d0, q0
+        vld1.64         {d18},   [r1,:64], r2
+        vld1.64         {d4-d5}, [r0,:128]!
+        vaddw.u8        q2, q2, d18
+        vst1.64         {d0},    [r3,:64], r2
+        vqmovun.s16     d2, q1
+        vld1.64         {d19},   [r1,:64], r2
+        vqmovun.s16     d4, q2
+        vld1.64         {d6-d7}, [r0,:128]!
+        vaddw.u8        q3, q3, d19
+        vst1.64         {d2},    [r3,:64], r2
+        vqmovun.s16     d6, q3
+        vst1.64         {d4},    [r3,:64], r2
+        vst1.64         {d6},    [r3,:64], r2
+        bx              lr
+        .endfunc
+
 function ff_float_to_int16_neon, export=1
         subs            r2,  r2,  #8
         vld1.64         {d0-d1},  [r1,:128]!
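
[Note, not part of the patch: the three clamped-pixel routines added above each
consume one 8x8 block of DCTELEMs (128 bytes, hence the four 32-byte load
groups): put_ saturates to [0,255] via vqmovun, put_signed_ narrows with
vqmovn and biases by 128, and add_ widens the destination rows with vaddw
before saturating.  A minimal scalar cross-check for the first of them is
sketched below; it assumes an ARMv7/NEON build of this tree linked against
arm/dsputil_neon_s.o, with DCTELEM being int16_t.  The :128/:64 alignment
hints in the loads and stores require a 16-byte-aligned block and
8-byte-aligned destination rows, hence the aligned attributes and the
16-byte line size.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef int16_t DCTELEM;

void ff_put_pixels_clamped_neon(const DCTELEM *block, uint8_t *pixels,
                                int line_size);

/* scalar reference: one 8x8 coefficient block, saturated to [0,255] */
static void put_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++, pixels += line_size)
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}

int main(void)
{
    DCTELEM block[64]    __attribute__((aligned(16)));
    uint8_t ref[8 * 16]  __attribute__((aligned(16)));
    uint8_t neon[8 * 16] __attribute__((aligned(16)));
    int i;

    for (i = 0; i < 64; i++)              /* exercises both clip directions */
        block[i] = (DCTELEM)(i * 37 - 512);

    memset(ref,  0, sizeof ref);
    memset(neon, 0, sizeof neon);
    put_pixels_clamped_ref(block, ref, 16);
    ff_put_pixels_clamped_neon(block, neon, 16);

    puts(memcmp(ref, neon, sizeof ref) ? "MISMATCH" : "ok");
    return 0;
}
]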
diff -Nurd ffmpeg.old/libavcodec/arm/simple_idct_neon.S ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S
--- ffmpeg.old/libavcodec/arm/simple_idct_neon.S	2008-12-30 04:13:52.000000000 +0100
+++ ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S	2009-05-30 11:27:54.000000000 +0200
@@ -68,6 +68,19 @@
         .text
         .align 6
 
+function idct_row4_pld_neon
+        pld             [r0]
+        add             r3,  r0,  r1,  lsl #2
+        pld             [r0, r1]
+        pld             [r0, r1, lsl #1]
+        pld             [r3, -r1]
+        pld             [r3]
+        pld             [r3, r1]
+        add             r3,  r3,  r1,  lsl #1
+        pld             [r3]
+        pld             [r3, r1]
+        .endfunc
+
 function idct_row4_neon
         vmov.i32        q15, #(1<<(ROW_SHIFT-1))
         vld1.64         {d2-d5},  [r2,:128]!
@@ -252,7 +265,7 @@
 function ff_simple_idct_put_neon, export=1
         idct_start      r2
 
-        bl              idct_row4_neon
+        bl              idct_row4_pld_neon
         bl              idct_row4_neon
         add             r2,  r2,  #-128
         bl              idct_col4_neon
@@ -307,7 +320,7 @@
 function ff_simple_idct_add_neon, export=1
         idct_start      r2
 
-        bl              idct_row4_neon
+        bl              idct_row4_pld_neon
         bl              idct_row4_neon
         add             r2,  r2,  #-128
         bl              idct_col4_neon
diff -Nurd ffmpeg.old/libavcodec/arm/vp3dsp_neon.S ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S
--- ffmpeg.old/libavcodec/arm/vp3dsp_neon.S	1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S	2009-05-30 11:27:54.000000000 +0200
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+.macro vp3_loop_filter
+    vsubl.u8        q3,  d18, d17
+    vsubl.u8        q2,  d16, d19
+    vadd.i16        q1,  q3,  q3
+    vadd.i16        q2,  q2,  q3
+    vadd.i16        q0,  q1,  q2
+    vrshr.s16       q0,  q0,  #3
+    vmovl.u8        q9,  d18
+    vdup.u16        q15, r2
+
+    vabs.s16        q1,  q0
+    vshr.s16        q0,  q0,  #15
+    vqsub.u16       q2,  q15, q1
+    vqsub.u16       q3,  q2,  q1
+    vsub.i16        q1,  q2,  q3
+    veor            q1,  q1,  q0
+    vsub.i16        q0,  q1,  q0
+
+    vaddw.u8        q2,  q0,  d17
+    vsub.i16        q3,  q9,  q0
+    vqmovun.s16     d0,  q2
+    vqmovun.s16     d1,  q3
+.endm
+
+function ff_vp3_v_loop_filter_neon, export=1
+    sub             ip,  r0,  r1
+    sub             r0,  r0,  r1,  lsl #1
+    vld1.64         {d16}, [r0,:64], r1
+    vld1.64         {d17}, [r0,:64], r1
+    vld1.64         {d18}, [r0,:64], r1
+    vld1.64         {d19}, [r0,:64], r1
+    ldrb            r2,    [r2, #129*4]
+
+    vp3_loop_filter
+
+    vst1.64         {d0},  [ip,:64], r1
+    vst1.64         {d1},  [ip,:64], r1
+    bx              lr
+.endfunc
+
+function ff_vp3_h_loop_filter_neon, export=1
+    sub             ip,  r0,  #1
+    sub             r0,  r0,  #2
+    vld1.32         {d16[]},  [r0], r1
+    vld1.32         {d17[]},  [r0], r1
+    vld1.32         {d18[]},  [r0], r1
+    vld1.32         {d19[]},  [r0], r1
+    vld1.32         {d16[1]}, [r0], r1
+    vld1.32         {d17[1]}, [r0], r1
+    vld1.32         {d18[1]}, [r0], r1
+    vld1.32         {d19[1]}, [r0], r1
+    ldrb            r2,  [r2, #129*4]
+
+    vtrn.8          d16, d17
+    vtrn.8          d18, d19
+    vtrn.16         d16, d18
+    vtrn.16         d17, d19
+
+    vp3_loop_filter
+
+    vtrn.8          d0,  d1
+
+    vst1.16         {d0[0]}, [ip], r1
+    vst1.16         {d1[0]}, [ip], r1
+    vst1.16         {d0[1]}, [ip], r1
+    vst1.16         {d1[1]}, [ip], r1
+    vst1.16         {d0[2]}, [ip], r1
+    vst1.16         {d1[2]}, [ip], r1
+    vst1.16         {d0[3]}, [ip], r1
+    vst1.16         {d1[3]}, [ip], r1
+    bx              lr
+.endfunc
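
[Note, not part of the patch: unlike the C loop filter, the macro above never
indexes the bounding_values table per pixel.  The ldrb pulls just the low
byte of the int at index 129 (little-endian), and the pair of vqsub.u16
instructions reproduces the table's shape arithmetically.  Below is a scalar
model of that arithmetic, illustrative only; the helper names are made up,
and the claim that vp3.c stores the doubled filter limit at
bounding_values[129] is an assumption about the surrounding tree.

#include <stdint.h>
#include <stdlib.h>

/* Model of the bounding arithmetic in vp3_loop_filter.  "bound" is the
 * value the functions load from bounding_values[129] (believed to be the
 * doubled filter limit).  The two saturating subtractions give:
 *   |f| <  bound/2          ->  f                     (pass through)
 *   bound/2 <= |f| < bound  ->  sign(f)*(bound-|f|)   (roll-off)
 *   |f| >= bound            ->  0                     */
static int vp3_bound(int f, int bound)
{
    int a  = abs(f);
    int t1 = bound - a     > 0 ? bound - a     : 0;  /* vqsub.u16 q2 */
    int t2 = bound - 2 * a > 0 ? bound - 2 * a : 0;  /* vqsub.u16 q3 */
    return f < 0 ? -(t1 - t2) : t1 - t2;    /* veor/vsub sign restore */
}

static uint8_t clamp_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;   /* vqmovun.s16 */
}

/* One position across the edge; p[0..3] are the four pixels the v/h
 * functions load into d16..d19 there, out[0..1] the two rewritten ones. */
void vp3_filter_tap(const uint8_t p[4], uint8_t out[2], int bound)
{
    /* vsubl/vadd/vrshr: rounded arithmetic shift, as on ARM */
    int f = vp3_bound((p[0] - p[3] + 3 * (p[2] - p[1]) + 4) >> 3, bound);
    out[0] = clamp_u8(p[1] + f);            /* vaddw.u8 + vqmovun -> d0 */
    out[1] = clamp_u8(p[2] - f);            /* vsub.i16 + vqmovun -> d1 */
}
]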
diff -Nurd ffmpeg.old/libavcodec/Makefile ffmpeg-0.5/libavcodec/Makefile
--- ffmpeg.old/libavcodec/Makefile	2009-02-26 03:29:24.000000000 +0100
+++ ffmpeg-0.5/libavcodec/Makefile	2009-05-30 11:29:51.000000000 +0200
@@ -477,11 +477,15 @@
 OBJS-$(HAVE_IWMMXT)                    += arm/dsputil_iwmmxt.o          \
                                           arm/mpegvideo_iwmmxt.o        \
 
+NEON-OBJS-$(CONFIG_THEORA_DECODER)     += arm/vp3dsp_neon.o
+NEON-OBJS-$(CONFIG_VP3_DECODER)        += arm/vp3dsp_neon.o
+
 OBJS-$(HAVE_NEON)                      += arm/dsputil_neon.o            \
                                           arm/dsputil_neon_s.o          \
                                           arm/h264dsp_neon.o            \
                                           arm/h264idct_neon.o           \
                                           arm/simple_idct_neon.o        \
+                                          $(NEON-OBJS-yes)
 
 OBJS-$(ARCH_BFIN)                      += bfin/dsputil_bfin.o           \
                                           bfin/fdct_bfin.o              \