summaryrefslogtreecommitdiff
path: root/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff
diff options
context:
space:
mode:
author Koen Kooi <koen@openembedded.org> 2009-05-30 11:34:23 +0200
committer Koen Kooi <koen@openembedded.org> 2009-05-30 11:34:23 +0200
commit 381151e490a307050d35273058cb5c6d60472ab6 (patch)
tree 27baee19943156ecb8299f11a372aa0ef916ec47 /recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff
parent 618bf2cd022e97a36a5f321d5f90cff9bb17bd3f (diff)
ffmpeg 0.5: sync arm optimizations with current git
Diffstat (limited to 'recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff')
-rw-r--r-- recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff 495
1 files changed, 495 insertions, 0 deletions
diff --git a/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff b/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff
new file mode 100644
index 0000000000..7c72ccd665
--- /dev/null
+++ b/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff
@@ -0,0 +1,495 @@
+ Makefile | 4 +
+ arm/dsputil_neon.c | 16 ++++
+ arm/dsputil_neon_s.S | 178 +++++++++++++++++++++++++++++++++++++------------
+ arm/simple_idct_neon.S | 17 ++++
+ arm/vp3dsp_neon.S | 94 +++++++++++++++++++++++++
+ 5 files changed, 265 insertions(+), 44 deletions(-)
+diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon.c ffmpeg-0.5/libavcodec/arm/dsputil_neon.c
+--- ffmpeg.old/libavcodec/arm/dsputil_neon.c 2009-01-31 00:13:19.000000000 +0100
++++ ffmpeg-0.5/libavcodec/arm/dsputil_neon.c 2009-05-30 11:27:54.000000000 +0200
+@@ -41,6 +41,10 @@
+
+ void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+
++void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
++void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
++void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
++
+ void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+ void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
+ void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
+@@ -146,6 +150,9 @@
+ DCTELEM *block, int stride,
+ const uint8_t nnzc[6*8]);
+
++void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
++void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
++
+ void ff_vector_fmul_neon(float *dst, const float *src, int len);
+ void ff_vector_fmul_window_neon(float *dst, const float *src0,
+ const float *src1, const float *win,
+@@ -176,6 +183,10 @@
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+
++ c->add_pixels_clamped = ff_add_pixels_clamped_neon;
++ c->put_pixels_clamped = ff_put_pixels_clamped_neon;
++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
++
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+
+@@ -247,6 +258,11 @@
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+ c->h264_idct_add8 = ff_h264_idct_add8_neon;
+
++ if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
++ c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
++ c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
++ }
++
+ c->vector_fmul = ff_vector_fmul_neon;
+ c->vector_fmul_window = ff_vector_fmul_window_neon;
+
+diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon_s.S ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S
+--- ffmpeg.old/libavcodec/arm/dsputil_neon_s.S 2009-01-31 00:13:19.000000000 +0100
++++ ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S 2009-05-30 11:27:54.000000000 +0200
+@@ -38,13 +38,13 @@
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+ .if \avg
+- vld1.64 {d16,d17}, [ip], r2
++ vld1.64 {d16,d17}, [ip,:128], r2
+ vrhadd.u8 q0, q0, q8
+- vld1.64 {d18,d19}, [ip], r2
++ vld1.64 {d18,d19}, [ip,:128], r2
+ vrhadd.u8 q1, q1, q9
+- vld1.64 {d20,d21}, [ip], r2
++ vld1.64 {d20,d21}, [ip,:128], r2
+ vrhadd.u8 q2, q2, q10
+- vld1.64 {d22,d23}, [ip], r2
++ vld1.64 {d22,d23}, [ip,:128], r2
+ vrhadd.u8 q3, q3, q11
+ .endif
+ subs r3, r3, #4
+@@ -73,35 +73,29 @@
+ .endm
+
+ .macro pixels16_y2 vhadd=vrhadd.u8
+- push {lr}
+- add ip, r1, r2
+- lsl lr, r2, #1
+- vld1.64 {d0, d1}, [r1], lr
+- vld1.64 {d2, d3}, [ip], lr
++ vld1.64 {d0, d1}, [r1], r2 /* first 16-byte source row into q0; r1 alone now walks every row with step r2 (the second pointer ip and the doubled step in lr are gone) */
++ vld1.64 {d2, d3}, [r1], r2 /* second source row into q1 */
+1: subs r3, r3, #2
+ \vhadd q2, q0, q1
++ vld1.64 {d0, d1}, [r1], r2 /* next row replaces q0, whose old value was already consumed into q2 */
+ \vhadd q3, q0, q1
++ vld1.64 {d2, d3}, [r1], r2 /* next row replaces q1, whose old value was already consumed into q3 */
+ pld [r1]
+- pld [ip]
++ pld [r1, r2] /* prefetch one row beyond the one r1 points at */
+ vst1.64 {d4, d5}, [r0,:128], r2
+ vst1.64 {d6, d7}, [r0,:128], r2
+ bne 1b
+- pop {pc}
++ bx lr /* lr is no longer overwritten inside the macro, so return through it directly instead of via the stack */
+ .endm
+
+ .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
+- push {lr}
+- lsl lr, r2, #1
+- add ip, r1, r2
+- vld1.64 {d0-d2}, [r1], lr
+- vld1.64 {d4-d6}, [ip], lr
++ vld1.64 {d0-d2}, [r1], r2
++ vld1.64 {d4-d6}, [r1], r2
+ .if \no_rnd
+ vmov.i16 q13, #1
+ .endif
+ pld [r1]
+- pld [ip]
++ pld [r1, r2]
+ vext.8 q1, q0, q1, #1
+ vext.8 q3, q2, q3, #1
+ vaddl.u8 q8, d0, d2
+@@ -109,7 +103,7 @@
+ vaddl.u8 q9, d4, d6
+ vaddl.u8 q11, d5, d7
+ 1: subs r3, r3, #2
+- vld1.64 {d0-d2}, [r1], lr
++ vld1.64 {d0-d2}, [r1], r2
+ vadd.u16 q12, q8, q9
+ pld [r1]
+ .if \no_rnd
+@@ -123,11 +117,11 @@
+ .endif
+ \vshrn d29, q1, #2
+ vaddl.u8 q8, d0, d30
+- vld1.64 {d2-d4}, [ip], lr
++ vld1.64 {d2-d4}, [r1], r2
+ vaddl.u8 q10, d1, d31
+ vst1.64 {d28,d29}, [r0,:128], r2
+ vadd.u16 q12, q8, q9
+- pld [ip]
++ pld [r1, r2]
+ .if \no_rnd
+ vadd.u16 q12, q12, q13
+ .endif
+@@ -142,7 +136,7 @@
+ vaddl.u8 q11, d3, d5
+ vst1.64 {d30,d31}, [r0,:128], r2
+ bgt 1b
+- pop {pc}
++ bx lr
+ .endm
+
+ .macro pixels8
+@@ -180,41 +174,35 @@
+ .endm
+
+ .macro pixels8_y2 vhadd=vrhadd.u8
+- push {lr}
+- add ip, r1, r2
+- lsl lr, r2, #1
+- vld1.64 {d0}, [r1], lr
+- vld1.64 {d1}, [ip], lr
++ vld1.64 {d0}, [r1], r2 /* first 8-byte source row; r1 alone now walks every row with step r2 (the second pointer ip and the doubled step in lr are gone) */
++ vld1.64 {d1}, [r1], r2 /* second source row */
+1: subs r3, r3, #2
+ \vhadd d4, d0, d1
++ vld1.64 {d0}, [r1], r2 /* next row replaces d0, whose old value was already consumed into d4 */
+ \vhadd d5, d0, d1
++ vld1.64 {d1}, [r1], r2 /* next row replaces d1, whose old value was already consumed into d5 */
+ pld [r1]
+- pld [ip]
++ pld [r1, r2] /* prefetch one row beyond the one r1 points at */
+ vst1.64 {d4}, [r0,:64], r2
+ vst1.64 {d5}, [r0,:64], r2
+ bne 1b
+- pop {pc}
++ bx lr /* lr is no longer overwritten inside the macro, so return through it directly instead of via the stack */
+ .endm
+
+ .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
+- push {lr}
+- lsl lr, r2, #1
+- add ip, r1, r2
+- vld1.64 {d0, d1}, [r1], lr
+- vld1.64 {d2, d3}, [ip], lr
++ vld1.64 {d0, d1}, [r1], r2
++ vld1.64 {d2, d3}, [r1], r2
+ .if \no_rnd
+ vmov.i16 q11, #1
+ .endif
+ pld [r1]
+- pld [ip]
++ pld [r1, r2]
+ vext.8 d4, d0, d1, #1
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q8, d0, d4
+ vaddl.u8 q9, d2, d6
+ 1: subs r3, r3, #2
+- vld1.64 {d0, d1}, [r1], lr
++ vld1.64 {d0, d1}, [r1], r2
+ pld [r1]
+ vadd.u16 q10, q8, q9
+ vext.8 d4, d0, d1, #1
+@@ -223,9 +211,9 @@
+ .endif
+ vaddl.u8 q8, d0, d4
+ \vshrn d5, q10, #2
+- vld1.64 {d2, d3}, [ip], lr
++ vld1.64 {d2, d3}, [r1], r2
+ vadd.u16 q10, q8, q9
+- pld [ip]
++ pld [r1, r2]
+ .if \no_rnd
+ vadd.u16 q10, q10, q11
+ .endif
+@@ -235,7 +223,7 @@
+ vaddl.u8 q9, d2, d6
+ vst1.64 {d7}, [r0,:64], r2
+ bgt 1b
+- pop {pc}
++ bx lr
+ .endm
+
+ .macro pixfunc pfx name suf rnd_op args:vararg
+@@ -273,6 +261,112 @@
+ pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
+
++function ff_put_pixels_clamped_neon, export=1 /* Narrow the 64 signed 16-bit values at r0 to bytes with unsigned saturation and store them as 8 rows of 8 bytes at r1, stepping r1 by r2 after each row. */
++ vld1.64 {d16-d19}, [r0,:128]! /* q8,q9 = values 0-15; the :128 qualifier requires r0 to be 16-byte aligned */
++ vqmovun.s16 d0, q8 /* row 0: signed 16-bit to unsigned 8-bit, saturating to 0..255 */
++ vld1.64 {d20-d23}, [r0,:128]! /* q10,q11 = values 16-31 */
++ vqmovun.s16 d1, q9 /* row 1 */
++ vld1.64 {d24-d27}, [r0,:128]! /* q12,q13 = values 32-47 */
++ vqmovun.s16 d2, q10 /* row 2 */
++ vld1.64 {d28-d31}, [r0,:128]! /* q14,q15 = values 48-63 */
++ vqmovun.s16 d3, q11 /* row 3 */
++ vst1.64 {d0}, [r1,:64], r2 /* write row 0; the :64 qualifier requires every row address to be 8-byte aligned */
++ vqmovun.s16 d4, q12 /* row 4 */
++ vst1.64 {d1}, [r1,:64], r2 /* write row 1 */
++ vqmovun.s16 d5, q13 /* row 5 */
++ vst1.64 {d2}, [r1,:64], r2 /* write row 2 */
++ vqmovun.s16 d6, q14 /* row 6 */
++ vst1.64 {d3}, [r1,:64], r2 /* write row 3 */
++ vqmovun.s16 d7, q15 /* row 7 */
++ vst1.64 {d4}, [r1,:64], r2 /* write rows 4-7 */
++ vst1.64 {d5}, [r1,:64], r2
++ vst1.64 {d6}, [r1,:64], r2
++ vst1.64 {d7}, [r1,:64], r2
++ bx lr
++ .endfunc
++
++function ff_put_signed_pixels_clamped_neon, export=1 /* Narrow the 64 signed 16-bit values at r0 to signed bytes with saturation, add 128 to each, and store them as 8 rows of 8 bytes at r1, stepping r1 by r2 after each row. */
++ vmov.u8 d31, #128 /* bias added to every byte; moves -128..127 onto 0..255 */
++ vld1.64 {d16-d17}, [r0,:128]! /* q8 = row 0 input; the :128 qualifier requires r0 to be 16-byte aligned */
++ vqmovn.s16 d0, q8 /* row 0: signed 16-bit to signed 8-bit, saturating */
++ vld1.64 {d18-d19}, [r0,:128]! /* q9 = row 1 input */
++ vqmovn.s16 d1, q9 /* row 1 */
++ vld1.64 {d16-d17}, [r0,:128]! /* q8 reused for row 2 input (row 0 already narrowed into d0) */
++ vqmovn.s16 d2, q8 /* row 2 */
++ vld1.64 {d18-d19}, [r0,:128]! /* q9 reused for row 3 input (row 1 already narrowed into d1) */
++ vadd.u8 d0, d0, d31 /* row 0 += 128, modulo 256 */
++ vld1.64 {d20-d21}, [r0,:128]! /* q10 = row 4 input */
++ vadd.u8 d1, d1, d31 /* row 1 += 128 */
++ vld1.64 {d22-d23}, [r0,:128]! /* q11 = row 5 input */
++ vadd.u8 d2, d2, d31 /* row 2 += 128 */
++ vst1.64 {d0}, [r1,:64], r2 /* write row 0; the :64 qualifier requires every row address to be 8-byte aligned */
++ vqmovn.s16 d3, q9 /* row 3 */
++ vst1.64 {d1}, [r1,:64], r2 /* write row 1 */
++ vqmovn.s16 d4, q10 /* row 4 */
++ vst1.64 {d2}, [r1,:64], r2 /* write row 2 */
++ vqmovn.s16 d5, q11 /* row 5 */
++ vld1.64 {d24-d25}, [r0,:128]! /* q12 = row 6 input */
++ vadd.u8 d3, d3, d31 /* row 3 += 128 */
++ vld1.64 {d26-d27}, [r0,:128]! /* q13 = row 7 input */
++ vadd.u8 d4, d4, d31 /* row 4 += 128 */
++ vadd.u8 d5, d5, d31 /* row 5 += 128 */
++ vst1.64 {d3}, [r1,:64], r2 /* write row 3 */
++ vqmovn.s16 d6, q12 /* row 6 */
++ vst1.64 {d4}, [r1,:64], r2 /* write row 4 */
++ vqmovn.s16 d7, q13 /* row 7 */
++ vst1.64 {d5}, [r1,:64], r2 /* write row 5 */
++ vadd.u8 d6, d6, d31 /* row 6 += 128 */
++ vadd.u8 d7, d7, d31 /* row 7 += 128 */
++ vst1.64 {d6}, [r1,:64], r2 /* write rows 6-7 */
++ vst1.64 {d7}, [r1,:64], r2
++ bx lr
++ .endfunc
++
++function ff_add_pixels_clamped_neon, export=1 /* For 8 rows of 8 bytes at r1 (row step r2): add the matching signed 16-bit values from r0 to the existing bytes, saturate the sums to 0..255 and write them back in place. */
++ mov r3, r1 /* r3 = write pointer; r1 keeps reading ahead of it */
++ vld1.64 {d16}, [r1,:64], r2 /* existing row 0 bytes; the :64 qualifier requires 8-byte aligned rows */
++ vld1.64 {d0-d1}, [r0,:128]! /* q0 = row 0 values; the :128 qualifier requires r0 to be 16-byte aligned */
++ vaddw.u8 q0, q0, d16 /* q0 += zero-extended row 0 bytes */
++ vld1.64 {d17}, [r1,:64], r2 /* existing row 1 bytes */
++ vld1.64 {d2-d3}, [r0,:128]! /* q1 = row 1 values */
++ vqmovun.s16 d0, q0 /* row 0 result: saturate the signed 16-bit sum to 0..255 */
++ vld1.64 {d18}, [r1,:64], r2 /* existing row 2 bytes */
++ vaddw.u8 q1, q1, d17 /* row 1 sum */
++ vld1.64 {d4-d5}, [r0,:128]! /* q2 = row 2 values */
++ vaddw.u8 q2, q2, d18 /* row 2 sum */
++ vst1.64 {d0}, [r3,:64], r2 /* write row 0 */
++ vqmovun.s16 d2, q1 /* row 1 result */
++ vld1.64 {d19}, [r1,:64], r2 /* existing row 3 bytes */
++ vld1.64 {d6-d7}, [r0,:128]! /* q3 = row 3 values */
++ vaddw.u8 q3, q3, d19 /* row 3 sum */
++ vqmovun.s16 d4, q2 /* row 2 result */
++ vst1.64 {d2}, [r3,:64], r2 /* write row 1 */
++ vld1.64 {d16}, [r1,:64], r2 /* existing row 4 bytes (d16 reused) */
++ vqmovun.s16 d6, q3 /* row 3 result */
++ vld1.64 {d0-d1}, [r0,:128]! /* q0 reused for row 4 values; row 0 was already written from d0 */
++ vaddw.u8 q0, q0, d16 /* row 4 sum */
++ vst1.64 {d4}, [r3,:64], r2 /* write row 2 */
++ vld1.64 {d17}, [r1,:64], r2 /* existing row 5 bytes */
++ vld1.64 {d2-d3}, [r0,:128]! /* q1 reused for row 5 values; row 1 was already written from d2 */
++ vaddw.u8 q1, q1, d17 /* row 5 sum */
++ vst1.64 {d6}, [r3,:64], r2 /* write row 3 */
++ vqmovun.s16 d0, q0 /* row 4 result */
++ vld1.64 {d18}, [r1,:64], r2 /* existing row 6 bytes */
++ vld1.64 {d4-d5}, [r0,:128]! /* q2 reused for row 6 values; row 2 was already written from d4 */
++ vaddw.u8 q2, q2, d18 /* row 6 sum */
++ vst1.64 {d0}, [r3,:64], r2 /* write row 4 */
++ vqmovun.s16 d2, q1 /* row 5 result */
++ vld1.64 {d19}, [r1,:64], r2 /* existing row 7 bytes */
++ vqmovun.s16 d4, q2 /* row 6 result */
++ vld1.64 {d6-d7}, [r0,:128]! /* q3 reused for row 7 values; row 3 was already written from d6 */
++ vaddw.u8 q3, q3, d19 /* row 7 sum */
++ vst1.64 {d2}, [r3,:64], r2 /* write row 5 */
++ vqmovun.s16 d6, q3 /* row 7 result */
++ vst1.64 {d4}, [r3,:64], r2 /* write row 6 */
++ vst1.64 {d6}, [r3,:64], r2 /* write row 7 */
++ bx lr
++ .endfunc
++
+ function ff_float_to_int16_neon, export=1
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r1,:128]!
+diff -Nurd ffmpeg.old/libavcodec/arm/simple_idct_neon.S ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S
+--- ffmpeg.old/libavcodec/arm/simple_idct_neon.S 2008-12-30 04:13:52.000000000 +0100
++++ ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S 2009-05-30 11:27:54.000000000 +0200
+@@ -68,6 +68,19 @@
+ .text
+ .align 6
+
++function idct_row4_pld_neon /* Issue prefetches for eight lines starting at r0 and spaced r1 bytes apart, using r3 as scratch. */
++ pld [r0] /* line 0 */
++ add r3, r0, r1, lsl #2 /* r3 = r0 + 4*r1 */
++ pld [r0, r1] /* line 1 */
++ pld [r0, r1, lsl #1] /* line 2 */
++ pld [r3, -r1] /* line 3 */
++ pld [r3] /* line 4 */
++ pld [r3, r1] /* line 5 */
++ add r3, r3, r1, lsl #1 /* r3 = r0 + 6*r1 */
++ pld [r3] /* line 6 */
++ pld [r3, r1] /* line 7 */
++ .endfunc /* There is no return above: control runs on into whatever is assembled next, which in this hunk is idct_row4_neon. NOTE(review): this relies on the function macro emitting nothing between the two bodies - confirm in asm.S. */
++
+ function idct_row4_neon
+ vmov.i32 q15, #(1<<(ROW_SHIFT-1))
+ vld1.64 {d2-d5}, [r2,:128]!
+@@ -252,7 +265,7 @@
+ function ff_simple_idct_put_neon, export=1
+ idct_start r2
+
+- bl idct_row4_neon
++ bl idct_row4_pld_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+@@ -307,7 +320,7 @@
+ function ff_simple_idct_add_neon, export=1
+ idct_start r2
+
+- bl idct_row4_neon
++ bl idct_row4_pld_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+diff -Nurd ffmpeg.old/libavcodec/arm/vp3dsp_neon.S ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S
+--- ffmpeg.old/libavcodec/arm/vp3dsp_neon.S 1970-01-01 01:00:00.000000000 +0100
++++ ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S 2009-05-30 11:27:54.000000000 +0200
+@@ -0,0 +1,94 @@
++/*
++ * Copyright (c) 2009 David Conrad
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "asm.S"
++
++.macro vp3_loop_filter /* In: d16-d19 = four lines of 8 bytes straddling the edge (in order), r2 = bound. Out: d0 and d1 = replacement bytes for d17 and d18. Overwrites q0-q3, q9 (d18/d19) and q15. */
++ vsubl.u8 q3, d18, d17 /* q3 = d18 - d17, widened to 16 bits */
++ vsubl.u8 q2, d16, d19 /* q2 = d16 - d19, widened to 16 bits */
++ vadd.i16 q1, q3, q3 /* q1 = 2*(d18 - d17) */
++ vadd.i16 q2, q2, q3 /* q2 = (d16 - d19) + (d18 - d17) */
++ vadd.i16 q0, q1, q2 /* q0 = (d16 - d19) + 3*(d18 - d17) */
++ vrshr.s16 q0, q0, #3 /* f = (q0 + 4) >> 3, arithmetic shift with rounding */
++ vmovl.u8 q9, d18 /* q9 = d18 zero-extended to 16 bits, kept for the final subtract */
++ vdup.u16 q15, r2 /* copy the bound into every 16-bit lane */
++
++ vabs.s16 q1, q0 /* q1 = |f| */
++ vshr.s16 q0, q0, #15 /* q0 = sign mask of f: 0 where f >= 0, -1 where f < 0 */
++ vqsub.u16 q2, q15, q1 /* q2 = max(bound - |f|, 0) via unsigned saturation */
++ vqsub.u16 q3, q2, q1 /* q3 = max(q2 - |f|, 0) */
++ vsub.i16 q1, q2, q3 /* q1 = min(|f|, q2): the bounded magnitude */
++ veor q1, q1, q0 /* together with the next line: negate q1 in the lanes where f was negative */
++ vsub.i16 q0, q1, q0 /* q0 = bounded f with its original sign */
++
++ vaddw.u8 q2, q0, d17 /* q2 = d17 + q0 */
++ vsub.i16 q3, q9, q0 /* q3 = d18 - q0 */
++ vqmovun.s16 d0, q2 /* saturate d17 + q0 to 0..255 */
++ vqmovun.s16 d1, q3 /* saturate d18 - q0 to 0..255 */
++.endm
++
++function ff_vp3_v_loop_filter_neon, export=1 /* Filter 8 bytes across the boundary between the line at r0 - r1 and the line at r0 (line step r1): reads four lines from r0 - 2*r1 onward and rewrites the middle two. */
++ sub ip, r0, r1 /* ip = write pointer, one line above r0 */
++ sub r0, r0, r1, lsl #1 /* start reading two lines above r0 */
++ vld1.64 {d16}, [r0,:64], r1 /* line at r0 - 2*r1; the :64 qualifier requires 8-byte aligned lines */
++ vld1.64 {d17}, [r0,:64], r1 /* line at r0 - r1 */
++ vld1.64 {d18}, [r0,:64], r1 /* line at r0 */
++ vld1.64 {d19}, [r0,:64], r1 /* line at r0 + r1 */
++ ldrb r2, [r2, #129*4] /* r2 = one byte read 129*4 bytes into the table passed as third argument. NOTE(review): presumably the low byte of int element 129 on a little-endian target - confirm against the table layout. */
++
++ vp3_loop_filter
++
++ vst1.64 {d0}, [ip,:64], r1 /* new line at r0 - r1 */
++ vst1.64 {d1}, [ip,:64], r1 /* new line at r0 */
++ bx lr
++.endfunc
++
++function ff_vp3_h_loop_filter_neon, export=1 /* Filter across the boundary between byte r0 - 1 and byte r0 on 8 consecutive lines (line step r1): reads 4 bytes per line from r0 - 2 and rewrites the middle two. */
++ sub ip, r0, #1 /* ip = write pointer, at the byte just before r0 */
++ sub r0, r0, #2 /* read 4 bytes per line starting two bytes before r0 */
++ vld1.32 {d16[]}, [r0], r1 /* line 0: 4 bytes replicated into both 32-bit lanes of d16 */
++ vld1.32 {d17[]}, [r0], r1 /* line 1 */
++ vld1.32 {d18[]}, [r0], r1 /* line 2 */
++ vld1.32 {d19[]}, [r0], r1 /* line 3 */
++ vld1.32 {d16[1]}, [r0], r1 /* line 4 into the upper lane, so d16 = line 0 then line 4 */
++ vld1.32 {d17[1]}, [r0], r1 /* line 5 */
++ vld1.32 {d18[1]}, [r0], r1 /* line 6 */
++ vld1.32 {d19[1]}, [r0], r1 /* line 7 */
++ ldrb r2, [r2, #129*4] /* r2 = one byte read 129*4 bytes into the table passed as third argument. NOTE(review): presumably the low byte of int element 129 on a little-endian target - confirm against the table layout. */
++
++ vtrn.8 d16, d17 /* these four vtrn steps transpose each 4x4 byte group, leaving d16..d19 holding byte columns 0..3 of the 8 lines */
++ vtrn.8 d18, d19
++ vtrn.16 d16, d18
++ vtrn.16 d17, d19
++
++ vp3_loop_filter
++
++ vtrn.8 d0, d1 /* interleave the two result columns so each 16-bit lane holds the byte pair for one line: even lines in d0, odd lines in d1 */
++
++ vst1.16 {d0[0]}, [ip], r1 /* line 0 */
++ vst1.16 {d1[0]}, [ip], r1 /* line 1 */
++ vst1.16 {d0[1]}, [ip], r1 /* line 2 */
++ vst1.16 {d1[1]}, [ip], r1 /* line 3 */
++ vst1.16 {d0[2]}, [ip], r1 /* line 4 */
++ vst1.16 {d1[2]}, [ip], r1 /* line 5 */
++ vst1.16 {d0[3]}, [ip], r1 /* line 6 */
++ vst1.16 {d1[3]}, [ip], r1 /* line 7 */
++ bx lr
++.endfunc
+diff -Nurd ffmpeg.old/libavcodec/Makefile ffmpeg-0.5/libavcodec/Makefile
+--- ffmpeg.old/libavcodec/Makefile 2009-02-26 03:29:24.000000000 +0100
++++ ffmpeg-0.5/libavcodec/Makefile 2009-05-30 11:29:51.000000000 +0200
+@@ -477,11 +477,15 @@
+ OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
+ arm/mpegvideo_iwmmxt.o \
+
++NEON-OBJS-$(CONFIG_THEORA_DECODER) += arm/vp3dsp_neon.o
++NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
++
+ OBJS-$(HAVE_NEON) += arm/dsputil_neon.o \
+ arm/dsputil_neon_s.o \
+ arm/h264dsp_neon.o \
+ arm/h264idct_neon.o \
+ arm/simple_idct_neon.o \
++ $(NEON-OBJS-yes)
+
+ OBJS-$(ARCH_BFIN) += bfin/dsputil_bfin.o \
+ bfin/fdct_bfin.o \