diff options
author | Koen Kooi <koen@openembedded.org> | 2009-05-30 11:34:23 +0200 |
---|---|---|
committer | Koen Kooi <koen@openembedded.org> | 2009-05-30 11:34:23 +0200 |
commit | 381151e490a307050d35273058cb5c6d60472ab6 (patch) | |
tree | 27baee19943156ecb8299f11a372aa0ef916ec47 /recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff | |
parent | 618bf2cd022e97a36a5f321d5f90cff9bb17bd3f (diff) |
ffmpeg 0.5: sync arm optimizations with current git
Diffstat (limited to 'recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff')
-rw-r--r-- | recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff | 495 |
1 file changed, 495 insertions, 0 deletions
diff --git a/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff b/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff new file mode 100644 index 0000000000..7c72ccd665 --- /dev/null +++ b/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff @@ -0,0 +1,495 @@ + Makefile | 4 + + arm/dsputil_neon.c | 16 ++++ + arm/dsputil_neon_s.S | 178 +++++++++++++++++++++++++++++++++++++------------ + arm/simple_idct_neon.S | 17 ++++ + arm/vp3dsp_neon.S | 94 +++++++++++++++++++++++++ + 5 files changed, 265 insertions(+), 44 deletions(-) +diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon.c ffmpeg-0.5/libavcodec/arm/dsputil_neon.c +--- ffmpeg.old/libavcodec/arm/dsputil_neon.c 2009-01-31 00:13:19.000000000 +0100 ++++ ffmpeg-0.5/libavcodec/arm/dsputil_neon.c 2009-05-30 11:27:54.000000000 +0200 +@@ -41,6 +41,10 @@ + + void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); + ++void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); ++void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); ++void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); ++ + void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); + void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); + void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); +@@ -146,6 +150,9 @@ + DCTELEM *block, int stride, + const uint8_t nnzc[6*8]); + ++void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); ++void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); ++ + void ff_vector_fmul_neon(float *dst, const float *src, int len); + void ff_vector_fmul_window_neon(float *dst, const float *src0, + const float *src1, const float *win, +@@ -176,6 +183,10 @@ + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; + ++ c->add_pixels_clamped = ff_add_pixels_clamped_neon; ++ c->put_pixels_clamped = ff_put_pixels_clamped_neon; ++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; ++ + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; + c->put_h264_chroma_pixels_tab[1] 
= ff_put_h264_chroma_mc4_neon; + +@@ -247,6 +258,11 @@ + c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; + c->h264_idct_add8 = ff_h264_idct_add8_neon; + ++ if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) { ++ c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon; ++ c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon; ++ } ++ + c->vector_fmul = ff_vector_fmul_neon; + c->vector_fmul_window = ff_vector_fmul_window_neon; + +diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon_s.S ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S +--- ffmpeg.old/libavcodec/arm/dsputil_neon_s.S 2009-01-31 00:13:19.000000000 +0100 ++++ ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S 2009-05-30 11:27:54.000000000 +0200 +@@ -38,13 +38,13 @@ + pld [r1, r2] + pld [r1, r2, lsl #1] + .if \avg +- vld1.64 {d16,d17}, [ip], r2 ++ vld1.64 {d16,d17}, [ip,:128], r2 + vrhadd.u8 q0, q0, q8 +- vld1.64 {d18,d19}, [ip], r2 ++ vld1.64 {d18,d19}, [ip,:128], r2 + vrhadd.u8 q1, q1, q9 +- vld1.64 {d20,d21}, [ip], r2 ++ vld1.64 {d20,d21}, [ip,:128], r2 + vrhadd.u8 q2, q2, q10 +- vld1.64 {d22,d23}, [ip], r2 ++ vld1.64 {d22,d23}, [ip,:128], r2 + vrhadd.u8 q3, q3, q11 + .endif + subs r3, r3, #4 +@@ -73,35 +73,29 @@ + .endm + + .macro pixels16_y2 vhadd=vrhadd.u8 +- push {lr} +- add ip, r1, r2 +- lsl lr, r2, #1 +- vld1.64 {d0, d1}, [r1], lr +- vld1.64 {d2, d3}, [ip], lr ++ vld1.64 {d0, d1}, [r1], r2 ++ vld1.64 {d2, d3}, [r1], r2 + 1: subs r3, r3, #2 + \vhadd q2, q0, q1 +- vld1.64 {d0, d1}, [r1], lr ++ vld1.64 {d0, d1}, [r1], r2 + \vhadd q3, q0, q1 +- vld1.64 {d2, d3}, [ip], lr ++ vld1.64 {d2, d3}, [r1], r2 + pld [r1] +- pld [ip] ++ pld [r1, r2] + vst1.64 {d4, d5}, [r0,:128], r2 + vst1.64 {d6, d7}, [r0,:128], r2 + bne 1b +- pop {pc} ++ bx lr + .endm + + .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 +- push {lr} +- lsl lr, r2, #1 +- add ip, r1, r2 +- vld1.64 {d0-d2}, [r1], lr +- vld1.64 {d4-d6}, [ip], lr ++ vld1.64 {d0-d2}, [r1], r2 ++ vld1.64 {d4-d6}, [r1], r2 + .if \no_rnd + vmov.i16 q13, #1 + .endif + pld [r1] +- pld [ip] ++ pld 
[r1, r2] + vext.8 q1, q0, q1, #1 + vext.8 q3, q2, q3, #1 + vaddl.u8 q8, d0, d2 +@@ -109,7 +103,7 @@ + vaddl.u8 q9, d4, d6 + vaddl.u8 q11, d5, d7 + 1: subs r3, r3, #2 +- vld1.64 {d0-d2}, [r1], lr ++ vld1.64 {d0-d2}, [r1], r2 + vadd.u16 q12, q8, q9 + pld [r1] + .if \no_rnd +@@ -123,11 +117,11 @@ + .endif + \vshrn d29, q1, #2 + vaddl.u8 q8, d0, d30 +- vld1.64 {d2-d4}, [ip], lr ++ vld1.64 {d2-d4}, [r1], r2 + vaddl.u8 q10, d1, d31 + vst1.64 {d28,d29}, [r0,:128], r2 + vadd.u16 q12, q8, q9 +- pld [ip] ++ pld [r1, r2] + .if \no_rnd + vadd.u16 q12, q12, q13 + .endif +@@ -142,7 +136,7 @@ + vaddl.u8 q11, d3, d5 + vst1.64 {d30,d31}, [r0,:128], r2 + bgt 1b +- pop {pc} ++ bx lr + .endm + + .macro pixels8 +@@ -180,41 +174,35 @@ + .endm + + .macro pixels8_y2 vhadd=vrhadd.u8 +- push {lr} +- add ip, r1, r2 +- lsl lr, r2, #1 +- vld1.64 {d0}, [r1], lr +- vld1.64 {d1}, [ip], lr ++ vld1.64 {d0}, [r1], r2 ++ vld1.64 {d1}, [r1], r2 + 1: subs r3, r3, #2 + \vhadd d4, d0, d1 +- vld1.64 {d0}, [r1], lr ++ vld1.64 {d0}, [r1], r2 + \vhadd d5, d0, d1 +- vld1.64 {d1}, [ip], lr ++ vld1.64 {d1}, [r1], r2 + pld [r1] +- pld [ip] ++ pld [r1, r2] + vst1.64 {d4}, [r0,:64], r2 + vst1.64 {d5}, [r0,:64], r2 + bne 1b +- pop {pc} ++ bx lr + .endm + + .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 +- push {lr} +- lsl lr, r2, #1 +- add ip, r1, r2 +- vld1.64 {d0, d1}, [r1], lr +- vld1.64 {d2, d3}, [ip], lr ++ vld1.64 {d0, d1}, [r1], r2 ++ vld1.64 {d2, d3}, [r1], r2 + .if \no_rnd + vmov.i16 q11, #1 + .endif + pld [r1] +- pld [ip] ++ pld [r1, r2] + vext.8 d4, d0, d1, #1 + vext.8 d6, d2, d3, #1 + vaddl.u8 q8, d0, d4 + vaddl.u8 q9, d2, d6 + 1: subs r3, r3, #2 +- vld1.64 {d0, d1}, [r1], lr ++ vld1.64 {d0, d1}, [r1], r2 + pld [r1] + vadd.u16 q10, q8, q9 + vext.8 d4, d0, d1, #1 +@@ -223,9 +211,9 @@ + .endif + vaddl.u8 q8, d0, d4 + \vshrn d5, q10, #2 +- vld1.64 {d2, d3}, [ip], lr ++ vld1.64 {d2, d3}, [r1], r2 + vadd.u16 q10, q8, q9 +- pld [ip] ++ pld [r1, r2] + .if \no_rnd + vadd.u16 q10, q10, q11 + .endif +@@ -235,7 
+223,7 @@ + vaddl.u8 q9, d2, d6 + vst1.64 {d7}, [r0,:64], r2 + bgt 1b +- pop {pc} ++ bx lr + .endm + + .macro pixfunc pfx name suf rnd_op args:vararg +@@ -273,6 +261,112 @@ + pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 + ++function ff_put_pixels_clamped_neon, export=1 ++ vld1.64 {d16-d19}, [r0,:128]! ++ vqmovun.s16 d0, q8 ++ vld1.64 {d20-d23}, [r0,:128]! ++ vqmovun.s16 d1, q9 ++ vld1.64 {d24-d27}, [r0,:128]! ++ vqmovun.s16 d2, q10 ++ vld1.64 {d28-d31}, [r0,:128]! ++ vqmovun.s16 d3, q11 ++ vst1.64 {d0}, [r1,:64], r2 ++ vqmovun.s16 d4, q12 ++ vst1.64 {d1}, [r1,:64], r2 ++ vqmovun.s16 d5, q13 ++ vst1.64 {d2}, [r1,:64], r2 ++ vqmovun.s16 d6, q14 ++ vst1.64 {d3}, [r1,:64], r2 ++ vqmovun.s16 d7, q15 ++ vst1.64 {d4}, [r1,:64], r2 ++ vst1.64 {d5}, [r1,:64], r2 ++ vst1.64 {d6}, [r1,:64], r2 ++ vst1.64 {d7}, [r1,:64], r2 ++ bx lr ++ .endfunc ++ ++function ff_put_signed_pixels_clamped_neon, export=1 ++ vmov.u8 d31, #128 ++ vld1.64 {d16-d17}, [r0,:128]! ++ vqmovn.s16 d0, q8 ++ vld1.64 {d18-d19}, [r0,:128]! ++ vqmovn.s16 d1, q9 ++ vld1.64 {d16-d17}, [r0,:128]! ++ vqmovn.s16 d2, q8 ++ vld1.64 {d18-d19}, [r0,:128]! ++ vadd.u8 d0, d0, d31 ++ vld1.64 {d20-d21}, [r0,:128]! ++ vadd.u8 d1, d1, d31 ++ vld1.64 {d22-d23}, [r0,:128]! ++ vadd.u8 d2, d2, d31 ++ vst1.64 {d0}, [r1,:64], r2 ++ vqmovn.s16 d3, q9 ++ vst1.64 {d1}, [r1,:64], r2 ++ vqmovn.s16 d4, q10 ++ vst1.64 {d2}, [r1,:64], r2 ++ vqmovn.s16 d5, q11 ++ vld1.64 {d24-d25}, [r0,:128]! ++ vadd.u8 d3, d3, d31 ++ vld1.64 {d26-d27}, [r0,:128]! ++ vadd.u8 d4, d4, d31 ++ vadd.u8 d5, d5, d31 ++ vst1.64 {d3}, [r1,:64], r2 ++ vqmovn.s16 d6, q12 ++ vst1.64 {d4}, [r1,:64], r2 ++ vqmovn.s16 d7, q13 ++ vst1.64 {d5}, [r1,:64], r2 ++ vadd.u8 d6, d6, d31 ++ vadd.u8 d7, d7, d31 ++ vst1.64 {d6}, [r1,:64], r2 ++ vst1.64 {d7}, [r1,:64], r2 ++ bx lr ++ .endfunc ++ ++function ff_add_pixels_clamped_neon, export=1 ++ mov r3, r1 ++ vld1.64 {d16}, [r1,:64], r2 ++ vld1.64 {d0-d1}, [r0,:128]! 
++ vaddw.u8 q0, q0, d16 ++ vld1.64 {d17}, [r1,:64], r2 ++ vld1.64 {d2-d3}, [r0,:128]! ++ vqmovun.s16 d0, q0 ++ vld1.64 {d18}, [r1,:64], r2 ++ vaddw.u8 q1, q1, d17 ++ vld1.64 {d4-d5}, [r0,:128]! ++ vaddw.u8 q2, q2, d18 ++ vst1.64 {d0}, [r3,:64], r2 ++ vqmovun.s16 d2, q1 ++ vld1.64 {d19}, [r1,:64], r2 ++ vld1.64 {d6-d7}, [r0,:128]! ++ vaddw.u8 q3, q3, d19 ++ vqmovun.s16 d4, q2 ++ vst1.64 {d2}, [r3,:64], r2 ++ vld1.64 {d16}, [r1,:64], r2 ++ vqmovun.s16 d6, q3 ++ vld1.64 {d0-d1}, [r0,:128]! ++ vaddw.u8 q0, q0, d16 ++ vst1.64 {d4}, [r3,:64], r2 ++ vld1.64 {d17}, [r1,:64], r2 ++ vld1.64 {d2-d3}, [r0,:128]! ++ vaddw.u8 q1, q1, d17 ++ vst1.64 {d6}, [r3,:64], r2 ++ vqmovun.s16 d0, q0 ++ vld1.64 {d18}, [r1,:64], r2 ++ vld1.64 {d4-d5}, [r0,:128]! ++ vaddw.u8 q2, q2, d18 ++ vst1.64 {d0}, [r3,:64], r2 ++ vqmovun.s16 d2, q1 ++ vld1.64 {d19}, [r1,:64], r2 ++ vqmovun.s16 d4, q2 ++ vld1.64 {d6-d7}, [r0,:128]! ++ vaddw.u8 q3, q3, d19 ++ vst1.64 {d2}, [r3,:64], r2 ++ vqmovun.s16 d6, q3 ++ vst1.64 {d4}, [r3,:64], r2 ++ vst1.64 {d6}, [r3,:64], r2 ++ bx lr ++ .endfunc ++ + function ff_float_to_int16_neon, export=1 + subs r2, r2, #8 + vld1.64 {d0-d1}, [r1,:128]! +diff -Nurd ffmpeg.old/libavcodec/arm/simple_idct_neon.S ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S +--- ffmpeg.old/libavcodec/arm/simple_idct_neon.S 2008-12-30 04:13:52.000000000 +0100 ++++ ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S 2009-05-30 11:27:54.000000000 +0200 +@@ -68,6 +68,19 @@ + .text + .align 6 + ++function idct_row4_pld_neon ++ pld [r0] ++ add r3, r0, r1, lsl #2 ++ pld [r0, r1] ++ pld [r0, r1, lsl #1] ++ pld [r3, -r1] ++ pld [r3] ++ pld [r3, r1] ++ add r3, r3, r1, lsl #1 ++ pld [r3] ++ pld [r3, r1] ++ .endfunc ++ + function idct_row4_neon + vmov.i32 q15, #(1<<(ROW_SHIFT-1)) + vld1.64 {d2-d5}, [r2,:128]! 
+@@ -252,7 +265,7 @@ + function ff_simple_idct_put_neon, export=1 + idct_start r2 + +- bl idct_row4_neon ++ bl idct_row4_pld_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon +@@ -307,7 +320,7 @@ + function ff_simple_idct_add_neon, export=1 + idct_start r2 + +- bl idct_row4_neon ++ bl idct_row4_pld_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon +diff -Nurd ffmpeg.old/libavcodec/arm/vp3dsp_neon.S ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S +--- ffmpeg.old/libavcodec/arm/vp3dsp_neon.S 1970-01-01 01:00:00.000000000 +0100 ++++ ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S 2009-05-30 11:27:54.000000000 +0200 +@@ -0,0 +1,94 @@ ++/* ++ * Copyright (c) 2009 David Conrad ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "asm.S" ++ ++.macro vp3_loop_filter ++ vsubl.u8 q3, d18, d17 ++ vsubl.u8 q2, d16, d19 ++ vadd.i16 q1, q3, q3 ++ vadd.i16 q2, q2, q3 ++ vadd.i16 q0, q1, q2 ++ vrshr.s16 q0, q0, #3 ++ vmovl.u8 q9, d18 ++ vdup.u16 q15, r2 ++ ++ vabs.s16 q1, q0 ++ vshr.s16 q0, q0, #15 ++ vqsub.u16 q2, q15, q1 ++ vqsub.u16 q3, q2, q1 ++ vsub.i16 q1, q2, q3 ++ veor q1, q1, q0 ++ vsub.i16 q0, q1, q0 ++ ++ vaddw.u8 q2, q0, d17 ++ vsub.i16 q3, q9, q0 ++ vqmovun.s16 d0, q2 ++ vqmovun.s16 d1, q3 ++.endm ++ ++function ff_vp3_v_loop_filter_neon, export=1 ++ sub ip, r0, r1 ++ sub r0, r0, r1, lsl #1 ++ vld1.64 {d16}, [r0,:64], r1 ++ vld1.64 {d17}, [r0,:64], r1 ++ vld1.64 {d18}, [r0,:64], r1 ++ vld1.64 {d19}, [r0,:64], r1 ++ ldrb r2, [r2, #129*4] ++ ++ vp3_loop_filter ++ ++ vst1.64 {d0}, [ip,:64], r1 ++ vst1.64 {d1}, [ip,:64], r1 ++ bx lr ++.endfunc ++ ++function ff_vp3_h_loop_filter_neon, export=1 ++ sub ip, r0, #1 ++ sub r0, r0, #2 ++ vld1.32 {d16[]}, [r0], r1 ++ vld1.32 {d17[]}, [r0], r1 ++ vld1.32 {d18[]}, [r0], r1 ++ vld1.32 {d19[]}, [r0], r1 ++ vld1.32 {d16[1]}, [r0], r1 ++ vld1.32 {d17[1]}, [r0], r1 ++ vld1.32 {d18[1]}, [r0], r1 ++ vld1.32 {d19[1]}, [r0], r1 ++ ldrb r2, [r2, #129*4] ++ ++ vtrn.8 d16, d17 ++ vtrn.8 d18, d19 ++ vtrn.16 d16, d18 ++ vtrn.16 d17, d19 ++ ++ vp3_loop_filter ++ ++ vtrn.8 d0, d1 ++ ++ vst1.16 {d0[0]}, [ip], r1 ++ vst1.16 {d1[0]}, [ip], r1 ++ vst1.16 {d0[1]}, [ip], r1 ++ vst1.16 {d1[1]}, [ip], r1 ++ vst1.16 {d0[2]}, [ip], r1 ++ vst1.16 {d1[2]}, [ip], r1 ++ vst1.16 {d0[3]}, [ip], r1 ++ vst1.16 {d1[3]}, [ip], r1 ++ bx lr ++.endfunc +diff -Nurd ffmpeg.old/libavcodec/Makefile ffmpeg-0.5/libavcodec/Makefile +--- ffmpeg.old/libavcodec/Makefile 2009-02-26 03:29:24.000000000 +0100 ++++ ffmpeg-0.5/libavcodec/Makefile 2009-05-30 
11:29:51.000000000 +0200 +@@ -477,11 +477,15 @@ + OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ + arm/mpegvideo_iwmmxt.o \ + ++NEON-OBJS-$(CONFIG_THEORA_DECODER) += arm/vp3dsp_neon.o ++NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o ++ + OBJS-$(HAVE_NEON) += arm/dsputil_neon.o \ + arm/dsputil_neon_s.o \ + arm/h264dsp_neon.o \ + arm/h264idct_neon.o \ + arm/simple_idct_neon.o \ ++ $(NEON-OBJS-yes) + + OBJS-$(ARCH_BFIN) += bfin/dsputil_bfin.o \ + bfin/fdct_bfin.o \ |