From: Mans Rullgard
Date: Sun, 3 Aug 2008 16:46:43 +0000 (+0100)
Subject: ARM: NEON optimised vector_fmul_window
X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=98feb31064dccfd16ce189ff4aec9ccedddf6b04

ARM: NEON optimised vector_fmul_window
---

diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
index f9d32c0..6c44940 100644
--- a/libavcodec/armv4l/dsputil_neon.c
+++ b/libavcodec/armv4l/dsputil_neon.c
@@ -91,6 +91,10 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 
+void ff_vector_fmul_window_neon(float *dst, const float *src0,
+                                const float *src1, const float *win,
+                                float add_bias, int len); /* implemented in dsputil_neon_s.S */
+
 void ff_float_to_int16_neon(int16_t *, const float *, long);
 void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
 
@@ -164,6 +168,8 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     c->h264_idct_add = ff_h264_idct_add_neon;
     c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
 
+    c->vector_fmul_window = ff_vector_fmul_window_neon; /* route vector_fmul_window to the NEON routine */
+
     c->float_to_int16 = ff_float_to_int16_neon;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
 
diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S
index 6a54803..49a09b8 100644
--- a/libavcodec/armv4l/dsputil_neon_s.S
+++ b/libavcodec/armv4l/dsputil_neon_s.S
@@ -324,6 +324,49 @@ extern ff_float_to_int16_interleave_neon
         pop             {r4,r5,pc}
         .endfunc
 
+extern ff_vector_fmul_window_neon /* r0=dst r1=src0 r2=src1 r3=win, add_bias and len read from the stack; each pass writes 4 floats at the head and 4 at the tail of dst[0..2*len-1] */
+        vld1.32         {d16[],d17[]}, [sp,:32]     /* q8 = first stack word (add_bias) replicated into all four lanes */
+        push            {r4,r5,lr}                  /* 12 bytes pushed, so the stack arguments are now 12 bytes further up */
+        ldr             lr, [sp, #16]               /* lr = second stack word (len), used as the remaining-element counter */
+        sub             r2, r2, #8                  /* combined with the add two lines below: r2 = &src1[len - 4] */
+        sub             r5, lr, #2                  /* r5 = len - 2 */
+        add             r2, r2, r5, lsl #2          /* r2 = &src1[len - 4], the last four floats of src1 */
+        add             r4, r3, r5, lsl #3          /* r4 = &win[2*len - 4], the last four floats of win */
+        add             ip, r0, r5, lsl #3          /* ip = &dst[2*len - 4], the last four floats of dst */
+        mov             r5, #-16                    /* post-index step for the descending pointers: back four floats */
+        dmb                                         /* data memory barrier; its purpose is not evident from this patch -- TODO confirm it is needed */
+        vld1.64         {d0,d1}, [r1,:128]!         /* q0 = four src0 values, pointer ascending; :128 requires 16-byte alignment */
+        vld1.64         {d2,d3}, [r2,:128], r5      /* q1 = four src1 values from the tail, pointer descending */
+        vld1.64         {d4,d5}, [r3,:128]!         /* q2 = four win values from the head, pointer ascending */
+        vld1.64         {d6,d7}, [r4,:128], r5      /* q3 = four win values from the tail, pointer descending */
+1:      vmov            q10, q8                     /* q10 = add_bias, accumulator for the head of dst */
+        vmov            q11, q8                     /* q11 = add_bias, accumulator for the tail of dst */
+        vmla.f32        q11, q0, q2                 /* q11 += src0[k] * win[k] */
+        vrev64.32       q3, q3                      /* vrev64 followed by vswp reverses all four lanes of q3 */
+        vswp            d6, d7                      /* q3 lane j is now win[2*len-1-k] */
+        vmla.f32        q10, q0, q3                 /* q10 += src0[k] * win[2*len-1-k] */
+        vrev64.32       q1, q1                      /* same four-lane reversal for the src1 values */
+        vswp            d2, d3                      /* q1 lane j is now src1[len-1-k] */
+        subs            lr, lr, #4                  /* four elements done; exit test is for exactly zero, so len is assumed to be a positive multiple of 4 */
+        vmla.f32        q11, q1, q3                 /* q11 += src1[len-1-k] * win[2*len-1-k] */
+        vmls.f32        q10, q1, q2                 /* q10 -= src1[len-1-k] * win[k] */
+        beq             2f                          /* last group: skip loading operands for another pass */
+        vld1.64         {d0,d1}, [r1,:128]!         /* load the next pass's operands before storing this pass's results */
+        vld1.64         {d2,d3}, [r2,:128], r5
+        vld1.64         {d4,d5}, [r3,:128]!
+        vld1.64         {d6,d7}, [r4,:128], r5
+        vrev64.32       q11, q11                    /* reverse q11 so the tail results land in ascending address order */
+        vswp            d22, d23
+        vst1.64         {d20,d21}, [r0,:128]!       /* store dst[k..k+3], head pointer ascending */
+        vst1.64         {d22,d23}, [ip,:128], r5    /* store dst[2*len-4-k..2*len-1-k], tail pointer descending */
+        b               1b
+2:      vrev64.32       q11, q11                    /* final group: same reverse-and-store sequence as in the loop */
+        vswp            d22, d23
+        vst1.64         {d20,d21}, [r0,:128]!
+        vst1.64         {d22,d23}, [ip,:128], r5
+        pop             {r4,r5,pc}                  /* restore r4/r5 and return */
+        .endfunc
+
 #ifdef CONFIG_VORBIS_DECODER
 extern ff_vorbis_inverse_coupling_neon
         vmov.i32        q10, #(1<<31)