diff options
author | Koen Kooi <koen@openembedded.org> | 2008-09-24 18:14:11 +0000 |
---|---|---|
committer | Koen Kooi <koen@openembedded.org> | 2008-09-24 18:14:11 +0000 |
commit | 05bc62adb05c2821ec7608f0513c149edc30952b (patch) | |
tree | e1197178f004928b0d11ed6e2e43fad4b7ee8f98 /packages/mplayer/files | |
parent | f4da94042bddb70a89c709aa43589256130ef0b2 (diff) |
mplayer svn: update SRCREV, remove w100 and pxafb patches that don't apply anymore, add patches for NEON
Diffstat (limited to 'packages/mplayer/files')
23 files changed, 3171 insertions, 7 deletions
diff --git a/packages/mplayer/files/Makefile-codec-cfg.patch b/packages/mplayer/files/Makefile-codec-cfg.patch index 84c17a9017..9ce22a8839 100644 --- a/packages/mplayer/files/Makefile-codec-cfg.patch +++ b/packages/mplayer/files/Makefile-codec-cfg.patch @@ -1,11 +1,11 @@ ---- /tmp/Makefile 2008-06-10 20:55:43.100403024 +0200 -+++ trunk/Makefile 2008-06-10 20:56:10.881647093 +0200 -@@ -731,7 +731,7 @@ - $(CC) -o $@ $^ $(LDFLAGS_MENCODER) +--- /tmp/Makefile 2008-09-24 19:24:26.000000000 +0200 ++++ trunk/Makefile 2008-09-24 19:25:01.683198000 +0200 +@@ -752,7 +752,7 @@ + $(CC) -o $@ $^ $(LDFLAGS_MPLAYER) codec-cfg$(EXESUF): codec-cfg.c codec-cfg.h help_mp.h -- $(HOST_CC) -O -I. -DCODECS2HTML -o $@ $< -+ $(BUILD_CC) -O -I. -DCODECS2HTML -o $@ $< +- $(HOST_CC) -O -DCODECS2HTML $(EXTRA_INC) -o $@ $< ++ $(BUILD_CC) -O -DCODECS2HTML $(EXTRA_INC) -o $@ $< codecs.conf.h: codec-cfg$(EXESUF) etc/codecs.conf - ./codec-cfg$(EXESUF) ./etc/codecs.conf > $@ + ./$^ > $@ diff --git a/packages/mplayer/files/armv5te/.mtn2git_empty b/packages/mplayer/files/armv5te/.mtn2git_empty new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/packages/mplayer/files/armv5te/.mtn2git_empty diff --git a/packages/mplayer/files/armv5te/configh b/packages/mplayer/files/armv5te/configh new file mode 100644 index 0000000000..46c647e2d5 --- /dev/null +++ b/packages/mplayer/files/armv5te/configh @@ -0,0 +1,6 @@ +#define HAVE_LLRINT 1 +#define HAVE_ROUNDF 1 +#define ARCH_ARMV4L 1 +#define ENABLE_ARMV4L 1 +#define HAVE_ARMV5TE 1 +#define ENABLE_ARMV5TE 1 diff --git a/packages/mplayer/files/armv5te/configmak b/packages/mplayer/files/armv5te/configmak new file mode 100644 index 0000000000..aa9978515d --- /dev/null +++ b/packages/mplayer/files/armv5te/configmak @@ -0,0 +1,3 @@ +ARCH_ARMV4L=yes +HAVE_ARMV5TE=yes + diff --git a/packages/mplayer/files/armv6/.mtn2git_empty b/packages/mplayer/files/armv6/.mtn2git_empty new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/packages/mplayer/files/armv6/.mtn2git_empty diff --git a/packages/mplayer/files/armv6/configh b/packages/mplayer/files/armv6/configh new file mode 100644 index 0000000000..2301e723d6 --- /dev/null +++ b/packages/mplayer/files/armv6/configh @@ -0,0 +1,8 @@ +#define HAVE_LLRINT 1 +#define HAVE_ROUNDF 1 +#define ARCH_ARMV4L 1 +#define ENABLE_ARMV4L 1 +#define HAVE_ARMV5TE 1 +#define ENABLE_ARMV5TE 1 +#define HAVE_ARMV6 1 +#define ENABLE_ARMV6 1 diff --git a/packages/mplayer/files/armv6/configmak b/packages/mplayer/files/armv6/configmak new file mode 100644 index 0000000000..4db5dc0dfd --- /dev/null +++ b/packages/mplayer/files/armv6/configmak @@ -0,0 +1,3 @@ +ARCH_ARMV4L=yes +HAVE_ARMV5TE=yes +HAVE_ARMV6=yes diff --git a/packages/mplayer/files/armv7a/.mtn2git_empty b/packages/mplayer/files/armv7a/.mtn2git_empty new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/packages/mplayer/files/armv7a/.mtn2git_empty diff --git a/packages/mplayer/files/armv7a/configh b/packages/mplayer/files/armv7a/configh new file mode 100644 index 0000000000..245e40f56a --- /dev/null +++ b/packages/mplayer/files/armv7a/configh @@ -0,0 +1,14 @@ +#define HAVE_LLRINT 1 +#define HAVE_ROUNDF 1 +#define ARCH_ARMV4L 1 +#define ENABLE_ARMV4L 1 +#define HAVE_ARMV5TE 1 +#define ENABLE_ARMV5TE 1 +#define HAVE_ARMV6 1 +#define ENABLE_ARMV6 1 +#define HAVE_ARMV6T2 1 +#define ENABLE_ARMV6T2 1 +#define HAVE_ARMVFP 1 +#define ENABLE_ARMVFP 1 +#define HAVE_NEON 1 +#define ENABLE_NEON 1 diff --git a/packages/mplayer/files/armv7a/configmak b/packages/mplayer/files/armv7a/configmak new file mode 100644 index 0000000000..50d549f794 --- /dev/null +++ b/packages/mplayer/files/armv7a/configmak @@ -0,0 +1,6 @@ +ARCH_ARMV4L=yes +HAVE_ARMV5TE=yes +HAVE_ARMV6=yes +HAVE_ARMV6T2=yes +HAVE_ARMVFP=yes +HAVE_NEON=yes diff --git a/packages/mplayer/files/configh b/packages/mplayer/files/configh new file mode 100644 index 0000000000..2fe7658383 --- /dev/null +++ b/packages/mplayer/files/configh @@ -0,0 +1,2 @@ +#define HAVE_LLRINT 1 +#define HAVE_ROUNDF 1 diff --git a/packages/mplayer/files/configmak b/packages/mplayer/files/configmak new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/packages/mplayer/files/configmak diff --git a/packages/mplayer/files/mru-neon-float-to-int16.diff b/packages/mplayer/files/mru-neon-float-to-int16.diff new file mode 100644 index 0000000000..7a874cab30 --- /dev/null +++ b/packages/mplayer/files/mru-neon-float-to-int16.diff @@ -0,0 +1,107 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Thu, 31 Jul 2008 02:35:42 +0000 (+0100) +Subject: ARM: NEON optimised float_to_int16 +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=f16a738cfc3307cbcba2f9c8aff4b5aa43144731 + +ARM: NEON optimised float_to_int16 +--- + +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +index 6dbe835..b584e5b 100644 +--- a/libavcodec/armv4l/dsputil_neon.c ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -91,6 +91,9 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); + void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); + ++void ff_float_to_int16_neon(int16_t *, const float *, long); ++void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); ++ + void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + { + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; +@@ -158,4 +161,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + + c->h264_idct_add = ff_h264_idct_add_neon; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; ++ ++ c->float_to_int16 = ff_float_to_int16_neon; ++ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; + } +diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S +index fc5e401..44f75ba 100644 +--- a/libavcodec/armv4l/dsputil_neon_s.S ++++ b/libavcodec/armv4l/dsputil_neon_s.S +@@ -252,3 +252,72 @@ + defun2 put_pixels8_x2, _no_rnd, vhadd.u8 + defun2 put_pixels8_y2, _no_rnd, vhadd.u8 + defun2 put_pixels8_xy2, _no_rnd, vshrn.u16, 1 ++ ++extern ff_float_to_int16_neon ++ dmb ++1: vld1.64 {d0-d3}, [r1,:128]! ++ vcvt.s32.f32 q2, q0 ++ vcvt.s32.f32 q3, q1 ++ subs r2, r2, #8 ++ vqmovn.s32 d4, q2 ++ vqmovn.s32 d5, q3 ++ vst1.64 {d4-d5}, [r0,:128]! ++ bgt 1b ++ bx lr ++ .endfunc ++ ++extern ff_float_to_int16_interleave_neon ++ cmp r3, #2 ++ ldrlt r1, [r1] ++ blt ff_float_to_int16_neon ++ bne 2f ++ ++ ldr ip, [r1] ++ ldr r1, [r1, #4] ++ vld1.64 {d0-d3}, [ip,:128]! ++ vld1.64 {d4-d7}, [r1,:128]! ++ dmb ++1: vcvt.s32.f32 q8, q0 ++ vcvt.s32.f32 q9, q1 ++ vcvt.s32.f32 q10, q2 ++ vcvt.s32.f32 q11, q3 ++ subs r2, r2, #8 ++ vqmovn.s32 d16, q8 ++ vqmovn.s32 d17, q9 ++ vqmovn.s32 d18, q10 ++ vqmovn.s32 d19, q11 ++ beq 1f ++ vld1.64 {d0-d3}, [ip,:128]! ++ vld1.64 {d4-d7}, [r1,:128]! ++ vst2.16 {d16-d19}, [r0,:64]! ++ b 1b ++1: vst2.16 {d16-d19}, [r0,:64]! ++ bx lr ++ ++2: push {r4,r5,lr} ++ lsls r4, r3, #1 ++ dmb ++ b 4f ++3: vld1.64 {d0-d3}, [ip,:128]! ++ vcvt.s32.f32 q2, q0 ++ vcvt.s32.f32 q3, q1 ++ subs lr, lr, #8 ++ vqmovn.s32 d4, q2 ++ vqmovn.s32 d5, q3 ++ vst1.16 {d4[0]}, [r5,:16], r4 ++ vst1.16 {d4[1]}, [r5,:16], r4 ++ vst1.16 {d4[2]}, [r5,:16], r4 ++ vst1.16 {d4[3]}, [r5,:16], r4 ++ vst1.16 {d5[0]}, [r5,:16], r4 ++ vst1.16 {d5[1]}, [r5,:16], r4 ++ vst1.16 {d5[2]}, [r5,:16], r4 ++ vst1.16 {d5[3]}, [r5,:16], r4 ++ bgt 3b ++ subs r3, r3, #1 ++4: ldr ip, [r1], #4 ++ mov lr, r2 ++ mov r5, r0 ++ add r0, r0, #2 ++ bne 3b ++ pop {r4,r5,pc} ++ .endfunc diff --git a/packages/mplayer/files/mru-neon-h264-chrome.diff b/packages/mplayer/files/mru-neon-h264-chrome.diff new file mode 100644 index 0000000000..cb6c4ff991 --- /dev/null +++ b/packages/mplayer/files/mru-neon-h264-chrome.diff @@ -0,0 +1,364 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Fri, 11 Jul 2008 01:20:07 +0000 (+0100) +Subject: ARM: NEON optimised {put,avg}_h264_chroma_mc[48] +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=d3aa8f93b8a0061e0c3ac12aeed055961abfc113 + +ARM: NEON optimised {put,avg}_h264_chroma_mc[48] +--- + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 7fa02fa..36ba158 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -437,6 +437,7 @@ OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \ + + ASM_OBJS-$(HAVE_NEON) += armv4l/dsputil_neon_s.o \ + armv4l/simple_idct_neon.o \ ++ armv4l/h264dsp_neon.o \ + + OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \ + sparc/simple_idct_vis.o \ +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +index 8a10dde..a6d86cd 100644 +--- a/libavcodec/armv4l/dsputil_neon.c ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -42,6 +42,12 @@ void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); + void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); + void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); + ++void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); ++void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); ++ ++void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); ++void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); ++ + void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + { + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; +@@ -62,6 +68,12 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; + ++ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; ++ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; ++ ++ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; ++ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; ++ + c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_neon; + c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_neon; + } +diff --git a/libavcodec/armv4l/h264dsp_neon.S b/libavcodec/armv4l/h264dsp_neon.S +new file mode 100644 +index 0000000..28d9aa7 +--- /dev/null ++++ b/libavcodec/armv4l/h264dsp_neon.S +@@ -0,0 +1,308 @@ ++/* ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ .fpu neon ++ ++/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ ++ .macro h264_chroma_mc8 avg=0 ++ push {r4-r7, lr} ++ ldrd r4, [sp, #20] ++.if \avg ++ mov lr, r0 ++.endif ++ pld [r1] ++ pld [r1, r2] ++ ++ muls r7, r4, r5 ++ rsb r6, r7, r5, lsl #3 ++ rsb ip, r7, r4, lsl #3 ++ sub r4, r7, r4, lsl #3 ++ sub r4, r4, r5, lsl #3 ++ add r4, r4, #64 ++ ++ dmb ++ ++ beq 2f ++ ++ add r5, r1, r2 ++ ++ vdup.8 d0, r4 ++ lsl r4, r2, #1 ++ vdup.8 d1, ip ++ vld1.64 {d4, d5}, [r1], r4 ++ vdup.8 d2, r6 ++ vld1.64 {d6, d7}, [r5], r4 ++ vdup.8 d3, r7 ++ ++ vext.8 d5, d4, d5, #1 ++ vext.8 d7, d6, d7, #1 ++ ++1: pld [r5] ++ vmull.u8 q8, d4, d0 ++ vmlal.u8 q8, d5, d1 ++ vld1.64 {d4, d5}, [r1], r4 ++ vmlal.u8 q8, d6, d2 ++ vext.8 d5, d4, d5, #1 ++ vmlal.u8 q8, d7, d3 ++ vmull.u8 q9, d6, d0 ++ subs r3, r3, #2 ++ vmlal.u8 q9, d7, d1 ++ vmlal.u8 q9, d4, d2 ++ vmlal.u8 q9, d5, d3 ++ vrshrn.u16 d16, q8, #6 ++ vld1.64 {d6, d7}, [r5], r4 ++ pld [r1] ++ vrshrn.u16 d17, q9, #6 ++.if \avg ++ vld1.64 {d20}, [lr,:64], r2 ++ vld1.64 {d21}, [lr,:64], r2 ++ vrhadd.u8 q8, q8, q10 ++.endif ++ vext.8 d7, d6, d7, #1 ++ vst1.64 {d16}, [r0,:64], r2 ++ vst1.64 {d17}, [r0,:64], r2 ++ bgt 1b ++ ++ pop {r4-r7, pc} ++ ++2: tst r6, r6 ++ add ip, ip, r6 ++ vdup.8 d0, r4 ++ vdup.8 d1, ip ++ ++ beq 4f ++ ++ add r5, r1, r2 ++ lsl r4, r2, #1 ++ vld1.64 {d4}, [r1], r4 ++ vld1.64 {d6}, [r5], r4 ++ ++3: pld [r5] ++ vmull.u8 q8, d4, d0 ++ vmlal.u8 q8, d6, d1 ++ vld1.64 {d4}, [r1], r4 ++ vmull.u8 q9, d6, d0 ++ vmlal.u8 q9, d4, d1 ++ vld1.64 {d6}, [r5], r4 ++ vrshrn.u16 d16, q8, #6 ++ vrshrn.u16 d17, q9, #6 ++.if \avg ++ vld1.64 {d20}, [lr,:64], r2 ++ vld1.64 {d21}, [lr,:64], r2 ++ vrhadd.u8 q8, q8, q10 ++.endif ++ subs r3, r3, #2 ++ pld [r1] ++ vst1.64 {d16}, [r0,:64], r2 ++ vst1.64 {d17}, [r0,:64], r2 ++ bgt 3b ++ ++ pop {r4-r7, pc} ++ ++4: vld1.64 {d4, d5}, [r1], r2 ++ vld1.64 {d6, d7}, [r1], r2 ++ vext.8 d5, d4, d5, #1 ++ vext.8 d7, d6, d7, #1 ++ ++5: pld [r1] ++ subs r3, r3, #2 ++ vmull.u8 q8, d4, d0 ++ vmlal.u8 q8, d5, d1 ++ vld1.64 {d4, d5}, [r1], r2 ++ vmull.u8 q9, d6, d0 ++ vmlal.u8 q9, d7, d1 ++ pld [r1] ++ vext.8 d5, d4, d5, #1 ++ vrshrn.u16 d16, q8, #6 ++ vrshrn.u16 d17, q9, #6 ++.if \avg ++ vld1.64 {d20}, [lr,:64], r2 ++ vld1.64 {d21}, [lr,:64], r2 ++ vrhadd.u8 q8, q8, q10 ++.endif ++ vld1.64 {d6, d7}, [r1], r2 ++ vext.8 d7, d6, d7, #1 ++ vst1.64 {d16}, [r0,:64], r2 ++ vst1.64 {d17}, [r0,:64], r2 ++ bgt 5b ++ ++ pop {r4-r7, pc} ++ .endm ++ ++/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ ++ .macro h264_chroma_mc4 avg=0 ++ push {r4-r7, lr} ++ ldrd r4, [sp, #20] ++.if \avg ++ mov lr, r0 ++.endif ++ pld [r1] ++ pld [r1, r2] ++ ++ muls r7, r4, r5 ++ rsb r6, r7, r5, lsl #3 ++ rsb ip, r7, r4, lsl #3 ++ sub r4, r7, r4, lsl #3 ++ sub r4, r4, r5, lsl #3 ++ add r4, r4, #64 ++ ++ dmb ++ ++ beq 2f ++ ++ add r5, r1, r2 ++ ++ vdup.8 d0, r4 ++ lsl r4, r2, #1 ++ vdup.8 d1, ip ++ vld1.64 {d4}, [r1], r4 ++ vdup.8 d2, r6 ++ vld1.64 {d6}, [r5], r4 ++ vdup.8 d3, r7 ++ ++ vext.8 d5, d4, d5, #1 ++ vext.8 d7, d6, d7, #1 ++ vtrn.32 d4, d5 ++ vtrn.32 d6, d7 ++ ++ vtrn.32 d0, d1 ++ vtrn.32 d2, d3 ++ ++1: pld [r5] ++ vmull.u8 q8, d4, d0 ++ vmlal.u8 q8, d6, d2 ++ vld1.64 {d4}, [r1], r4 ++ vext.8 d5, d4, d5, #1 ++ vtrn.32 d4, d5 ++ vmull.u8 q9, d6, d0 ++ vmlal.u8 q9, d4, d2 ++ vld1.64 {d6}, [r5], r4 ++ vadd.i16 d16, d16, d17 ++ vadd.i16 d17, d18, d19 ++ vrshrn.u16 d16, q8, #6 ++ subs r3, r3, #2 ++ pld [r1] ++.if \avg ++ vld1.32 {d20[0]}, [lr,:32], r2 ++ vld1.32 {d20[1]}, [lr,:32], r2 ++ vrhadd.u8 d16, d16, d20 ++.endif ++ vext.8 d7, d6, d7, #1 ++ vtrn.32 d6, d7 ++ vst1.32 {d16[0]}, [r0,:32], r2 ++ vst1.32 {d16[1]}, [r0,:32], r2 ++ bgt 1b ++ ++ pop {r4-r7, pc} ++ ++2: tst r6, r6 ++ add ip, ip, r6 ++ vdup.8 d0, r4 ++ vdup.8 d1, ip ++ vtrn.32 d0, d1 ++ ++ beq 4f ++ ++ vext.32 d1, d0, d1, #1 ++ add r5, r1, r2 ++ lsl r4, r2, #1 ++ vld1.32 {d4[0]}, [r1], r4 ++ vld1.32 {d4[1]}, [r5], r4 ++ ++3: pld [r5] ++ vmull.u8 q8, d4, d0 ++ vld1.32 {d4[0]}, [r1], r4 ++ vmull.u8 q9, d4, d1 ++ vld1.32 {d4[1]}, [r5], r4 ++ vadd.i16 d16, d16, d17 ++ vadd.i16 d17, d18, d19 ++ vrshrn.u16 d16, q8, #6 ++.if \avg ++ vld1.32 {d20[0]}, [lr,:32], r2 ++ vld1.32 {d20[1]}, [lr,:32], r2 ++ vrhadd.u8 d16, d16, d20 ++.endif ++ subs r3, r3, #2 ++ pld [r1] ++ vst1.32 {d16[0]}, [r0,:32], r2 ++ vst1.32 {d16[1]}, [r0,:32], r2 ++ bgt 3b ++ ++ pop {r4-r7, pc} ++ ++4: vld1.64 {d4}, [r1], r2 ++ vld1.64 {d6}, [r1], r2 ++ vext.8 d5, d4, d5, #1 ++ vext.8 d7, d6, d7, #1 ++ vtrn.32 d4, d5 ++ vtrn.32 d6, d7 ++ ++5: vmull.u8 q8, d4, d0 ++ vmull.u8 q9, d6, d0 ++ subs r3, r3, #2 ++ vld1.64 {d4}, [r1], r2 ++ vext.8 d5, d4, d5, #1 ++ vtrn.32 d4, d5 ++ vadd.i16 d16, d16, d17 ++ vadd.i16 d17, d18, d19 ++ pld [r1] ++ vrshrn.u16 d16, q8, #6 ++.if \avg ++ vld1.32 {d20[0]}, [lr,:32], r2 ++ vld1.32 {d20[1]}, [lr,:32], r2 ++ vrhadd.u8 d16, d16, d20 ++.endif ++ vld1.64 {d6}, [r1], r2 ++ vext.8 d7, d6, d7, #1 ++ vtrn.32 d6, d7 ++ pld [r1] ++ vst1.32 {d16[0]}, [r0,:32], r2 ++ vst1.32 {d16[1]}, [r0,:32], r2 ++ bgt 5b ++ ++ pop {r4-r7, pc} ++ .endm ++ ++ .text ++ .align ++ ++ .global ff_put_h264_chroma_mc8_neon ++ .func ff_put_h264_chroma_mc8_neon ++ff_put_h264_chroma_mc8_neon: ++ h264_chroma_mc8 ++ .endfunc ++ ++ .global ff_avg_h264_chroma_mc8_neon ++ .func ff_avg_h264_chroma_mc8_neon ++ff_avg_h264_chroma_mc8_neon: ++ h264_chroma_mc8 avg=1 ++ .endfunc ++ ++ .global ff_put_h264_chroma_mc4_neon ++ .func ff_put_h264_chroma_mc4_neon ++ff_put_h264_chroma_mc4_neon: ++ h264_chroma_mc4 ++ .endfunc ++ ++ .global ff_avg_h264_chroma_mc4_neon ++ .func ff_avg_h264_chroma_mc4_neon ++ff_avg_h264_chroma_mc4_neon: ++ h264_chroma_mc4 avg=1 ++ .endfunc diff --git a/packages/mplayer/files/mru-neon-h264-loopfilter.diff b/packages/mplayer/files/mru-neon-h264-loopfilter.diff new file mode 100644 index 0000000000..056702517b --- /dev/null +++ b/packages/mplayer/files/mru-neon-h264-loopfilter.diff @@ -0,0 +1,346 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Fri, 15 Aug 2008 00:02:55 +0000 (+0100) +Subject: ARM: NEON optimised H.264 loop filter +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=0c1b6bb0814587bd4c8a895c6d7dc2dd4cc2841a + +ARM: NEON optimised H.264 loop filter +--- + +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +index a6d86cd..68ecbe8 100644 +--- a/libavcodec/armv4l/dsputil_neon.c ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -48,6 +48,15 @@ void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); + void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + ++void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, ++ int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, ++ int beta, int8_t *tc0); ++void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, ++ int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, ++ int beta, int8_t *tc0); ++ + void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + { + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; +@@ -76,4 +85,9 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + + c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_neon; + c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_neon; ++ ++ c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; ++ c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; ++ c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; ++ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; + } +diff --git a/libavcodec/armv4l/h264dsp_neon.S b/libavcodec/armv4l/h264dsp_neon.S +index 28d9aa7..ac793b2 100644 +--- a/libavcodec/armv4l/h264dsp_neon.S ++++ b/libavcodec/armv4l/h264dsp_neon.S +@@ -306,3 +306,303 @@ ff_put_h264_chroma_mc4_neon: + ff_avg_h264_chroma_mc4_neon: + h264_chroma_mc4 avg=1 + .endfunc ++ ++ /* H.264 loop filter */ ++ ++ .macro h264_loop_filter_start ++ ldr ip, [sp] ++ tst r2, r2 ++ ldr ip, [ip] ++ tstne r3, r3 ++ vmov.32 d24[0], ip ++ and ip, ip, ip, lsl #16 ++ bxeq lr ++ ands ip, ip, ip, lsl #8 ++ bxlt lr ++ .endm ++ ++ .macro align_push_regs ++ and ip, sp, #15 ++ add ip, ip, #32 ++ sub sp, sp, ip ++ dmb ++ vst1.64 {d12-d15}, [sp,:128] ++ sub sp, sp, #32 ++ vst1.64 {d8-d11}, [sp,:128] ++ .endm ++ ++ .macro align_pop_regs ++ vld1.64 {d8-d11}, [sp,:128]! ++ vld1.64 {d12-d15}, [sp,:128], ip ++ .endm ++ ++ .macro h264_loop_filter_luma ++ vdup.8 q11, r2 @ alpha ++ vmovl.u8 q12, d24 ++ vabd.u8 q6, q8, q0 @ abs(p0 - q0) ++ vmovl.u16 q12, d24 ++ vabd.u8 q14, q9, q8 @ abs(p1 - p0) ++ vsli.16 q12, q12, #8 ++ vabd.u8 q15, q1, q0 @ abs(q1 - q0) ++ vsli.32 q12, q12, #16 ++ vclt.u8 q6, q6, q11 @ < alpha ++ vdup.8 q11, r3 @ beta ++ vclt.s8 q7, q12, #0 ++ vclt.u8 q14, q14, q11 @ < beta ++ vclt.u8 q15, q15, q11 @ < beta ++ vbic q6, q6, q7 ++ vabd.u8 q4, q10, q8 @ abs(p2 - p0) ++ vand q6, q6, q14 ++ vabd.u8 q5, q2, q0 @ abs(q2 - q0) ++ vclt.u8 q4, q4, q11 @ < beta ++ vand q6, q6, q15 ++ vclt.u8 q5, q5, q11 @ < beta ++ vand q4, q4, q6 ++ vand q5, q5, q6 ++ vand q12, q12, q6 ++ vrhadd.u8 q14, q8, q0 ++ vsub.i8 q6, q12, q4 ++ vqadd.u8 q7, q9, q12 ++ vhadd.u8 q10, q10, q14 ++ vsub.i8 q6, q6, q5 ++ vhadd.u8 q14, q2, q14 ++ vmin.u8 q7, q7, q10 ++ vqsub.u8 q11, q9, q12 ++ vqadd.u8 q2, q1, q12 ++ vmax.u8 q7, q7, q11 ++ vqsub.u8 q11, q1, q12 ++ vmin.u8 q14, q2, q14 ++ vmovl.u8 q2, d0 ++ vmax.u8 q14, q14, q11 ++ vmovl.u8 q10, d1 ++ vsubw.u8 q2, q2, d16 ++ vsubw.u8 q10, q10, d17 ++ vshl.i16 q2, q2, #2 ++ vshl.i16 q10, q10, #2 ++ vaddw.u8 q2, q2, d18 ++ vaddw.u8 q10, q10, d19 ++ vsubw.u8 q2, q2, d2 ++ vsubw.u8 q10, q10, d3 ++ vrshrn.i16 d4, q2, #3 ++ vrshrn.i16 d5, q10, #3 ++ vbsl q4, q7, q9 ++ vbsl q5, q14, q1 ++ vneg.s8 q7, q6 ++ vmovl.u8 q14, d16 ++ vmin.s8 q2, q2, q6 ++ vmovl.u8 q6, d17 ++ vmax.s8 q2, q2, q7 ++ vmovl.u8 q11, d0 ++ vmovl.u8 q12, d1 ++ vaddw.s8 q14, q14, d4 ++ vaddw.s8 q6, q6, d5 ++ vsubw.s8 q11, q11, d4 ++ vsubw.s8 q12, q12, d5 ++ vqmovun.s16 d16, q14 ++ vqmovun.s16 d17, q6 ++ vqmovun.s16 d0, q11 ++ vqmovun.s16 d1, q12 ++ .endm ++ ++ .global ff_h264_v_loop_filter_luma_neon ++ .func ff_h264_v_loop_filter_luma_neon ++ff_h264_v_loop_filter_luma_neon: ++ h264_loop_filter_start ++ ++ vld1.64 {d0, d1}, [r0,:128], r1 ++ vld1.64 {d2, d3}, [r0,:128], r1 ++ vld1.64 {d4, d5}, [r0,:128], r1 ++ sub r0, r0, r1, lsl #2 ++ sub r0, r0, r1, lsl #1 ++ vld1.64 {d20,d21}, [r0,:128], r1 ++ vld1.64 {d18,d19}, [r0,:128], r1 ++ vld1.64 {d16,d17}, [r0,:128], r1 ++ ++ align_push_regs ++ ++ h264_loop_filter_luma ++ ++ sub r0, r0, r1, lsl #1 ++ vst1.64 {d8, d9}, [r0,:128], r1 ++ vst1.64 {d16,d17}, [r0,:128], r1 ++ vst1.64 {d0, d1}, [r0,:128], r1 ++ vst1.64 {d10,d11}, [r0,:128] ++ ++ align_pop_regs ++ bx lr ++ .endfunc ++ ++ .global ff_h264_h_loop_filter_luma_neon ++ .func ff_h264_h_loop_filter_luma_neon ++ff_h264_h_loop_filter_luma_neon: ++ h264_loop_filter_start ++ ++ sub r0, r0, #4 ++ vld1.64 {d6}, [r0], r1 ++ vld1.64 {d20}, [r0], r1 ++ vld1.64 {d18}, [r0], r1 ++ vld1.64 {d16}, [r0], r1 ++ vld1.64 {d0}, [r0], r1 ++ vld1.64 {d2}, [r0], r1 ++ vld1.64 {d4}, [r0], r1 ++ vld1.64 {d26}, [r0], r1 ++ vld1.64 {d7}, [r0], r1 ++ vld1.64 {d21}, [r0], r1 ++ vld1.64 {d19}, [r0], r1 ++ vld1.64 {d17}, [r0], r1 ++ vld1.64 {d1}, [r0], r1 ++ vld1.64 {d3}, [r0], r1 ++ vld1.64 {d5}, [r0], r1 ++ vld1.64 {d27}, [r0], r1 ++ ++ vtrn.32 q3, q0 ++ vtrn.32 q10, q1 ++ vtrn.32 q9, q2 ++ vtrn.32 q8, q13 ++ vtrn.16 q3, q9 ++ vtrn.16 q10, q8 ++ vtrn.16 q0, q2 ++ vtrn.16 q1, q13 ++ vtrn.8 q3, q10 ++ vtrn.8 q9, q8 ++ vtrn.8 q0, q1 ++ vtrn.8 q2, q13 ++ ++ align_push_regs ++ sub sp, sp, #16 ++ vst1.64 {d4, d5}, [sp,:128] ++ sub sp, sp, #16 ++ vst1.64 {d20,d21}, [sp,:128] ++ ++ h264_loop_filter_luma ++ ++ vld1.64 {d20,d21}, [sp,:128]! ++ vld1.64 {d4, d5}, [sp,:128]! ++ ++ vtrn.32 q3, q0 ++ vtrn.32 q10, q5 ++ vtrn.32 q4, q2 ++ vtrn.32 q8, q13 ++ vtrn.16 q3, q4 ++ vtrn.16 q10, q8 ++ vtrn.16 q0, q2 ++ vtrn.16 q5, q13 ++ vtrn.8 q3, q10 ++ vtrn.8 q4, q8 ++ vtrn.8 q0, q5 ++ vtrn.8 q2, q13 ++ ++ sub r0, r0, r1, lsl #4 ++ vst1.64 {d6}, [r0], r1 ++ vst1.64 {d20}, [r0], r1 ++ vst1.64 {d8}, [r0], r1 ++ vst1.64 {d16}, [r0], r1 ++ vst1.64 {d0}, [r0], r1 ++ vst1.64 {d10}, [r0], r1 ++ vst1.64 {d4}, [r0], r1 ++ vst1.64 {d26}, [r0], r1 ++ vst1.64 {d7}, [r0], r1 ++ vst1.64 {d21}, [r0], r1 ++ vst1.64 {d9}, [r0], r1 ++ vst1.64 {d17}, [r0], r1 ++ vst1.64 {d1}, [r0], r1 ++ vst1.64 {d11}, [r0], r1 ++ vst1.64 {d5}, [r0], r1 ++ vst1.64 {d27}, [r0], r1 ++ ++ align_pop_regs ++ bx lr ++ .endfunc ++ ++ .macro h264_loop_filter_chroma ++ vdup.8 d22, r2 @ alpha ++ vmovl.u8 q12, d24 ++ vabd.u8 d26, d16, d0 @ abs(p0 - q0) ++ vmovl.u8 q2, d0 ++ vabd.u8 d28, d18, d16 @ abs(p1 - p0) ++ vsubw.u8 q2, q2, d16 ++ vsli.16 d24, d24, #8 ++ vshl.i16 q2, q2, #2 ++ vabd.u8 d30, d2, d0 @ abs(q1 - q0) ++ vaddw.u8 q2, q2, d18 ++ vclt.u8 d26, d26, d22 @ < alpha ++ vsubw.u8 q2, q2, d2 ++ vdup.8 d22, r3 @ beta ++ vclt.s8 d25, d24, #0 ++ vrshrn.i16 d4, q2, #3 ++ vclt.u8 d28, d28, d22 @ < beta ++ vbic d26, d26, d25 ++ vclt.u8 d30, d30, d22 @ < beta ++ vand d26, d26, d28 ++ vneg.s8 d25, d24 ++ vand d26, d26, d30 ++ vmin.s8 d4, d4, d24 ++ vmovl.u8 q14, d16 ++ vand d4, d4, d26 ++ vmax.s8 d4, d4, d25 ++ vmovl.u8 q11, d0 ++ vaddw.s8 q14, q14, d4 ++ vsubw.s8 q11, q11, d4 ++ vqmovun.s16 d16, q14 ++ vqmovun.s16 d0, q11 ++ .endm ++ ++ .global ff_h264_v_loop_filter_chroma_neon ++ .func ff_h264_v_loop_filter_chroma_neon ++ff_h264_v_loop_filter_chroma_neon: ++ h264_loop_filter_start ++ ++ sub r0, r0, r1, lsl #1 ++ vld1.64 {d18}, [r0,:64], r1 ++ vld1.64 {d16}, [r0,:64], r1 ++ vld1.64 {d0}, [r0,:64], r1 ++ vld1.64 {d2}, [r0,:64] ++ ++ h264_loop_filter_chroma ++ ++ sub r0, r0, r1, lsl #1 ++ vst1.64 {d16}, [r0,:64], r1 ++ vst1.64 {d0}, [r0,:64], r1 ++ ++ bx lr ++ .endfunc ++ ++ .global ff_h264_h_loop_filter_chroma_neon ++ .func ff_h264_h_loop_filter_chroma_neon ++ff_h264_h_loop_filter_chroma_neon: ++ h264_loop_filter_start ++ ++ sub r0, r0, #2 ++ vld1.32 {d18[0]}, [r0], r1 ++ vld1.32 {d16[0]}, [r0], r1 ++ vld1.32 {d0[0]}, [r0], r1 ++ vld1.32 {d2[0]}, [r0], r1 ++ vld1.32 {d18[1]}, [r0], r1 ++ vld1.32 {d16[1]}, [r0], r1 ++ vld1.32 {d0[1]}, [r0], r1 ++ vld1.32 {d2[1]}, [r0], r1 ++ ++ vtrn.16 d18, d0 ++ vtrn.16 d16, d2 ++ vtrn.8 d18, d16 ++ vtrn.8 d0, d2 ++ ++ h264_loop_filter_chroma ++ ++ vtrn.16 d18, d0 ++ vtrn.16 d16, d2 ++ vtrn.8 d18, d16 ++ vtrn.8 d0, d2 ++ ++ sub r0, r0, r1, lsl #3 ++ vst1.32 {d18[0]}, [r0], r1 ++ vst1.32 {d16[0]}, [r0], r1 ++ vst1.32 {d0[0]}, [r0], r1 ++ vst1.32 {d2[0]}, [r0], r1 ++ vst1.32 {d18[1]}, [r0], r1 ++ vst1.32 {d16[1]}, [r0], r1 ++ vst1.32 {d0[1]}, [r0], r1 ++ vst1.32 {d2[1]}, [r0], r1 ++ ++ bx lr ++ .endfunc diff --git a/packages/mplayer/files/mru-neon-h264-qpel.diff b/packages/mplayer/files/mru-neon-h264-qpel.diff new file mode 100644 index 0000000000..6ed479b19b --- /dev/null +++ b/packages/mplayer/files/mru-neon-h264-qpel.diff @@ -0,0 +1,1040 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Sat, 23 Aug 2008 00:24:04 +0000 (+0100) +Subject: ARM: NEON optimised H.264 8x8 and 16x16 qpel MC +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=55661fd933572f67248c0730f6c75a6db0f0eb6a + +ARM: NEON optimised H.264 8x8 and 16x16 qpel MC +--- + +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +index 68ecbe8..a932aa9 100644 +--- a/libavcodec/armv4l/dsputil_neon.c ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -40,7 +40,38 @@ void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); + void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); + + void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); ++ + void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); + + void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); + void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); +@@ -83,8 +114,39 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; + +- c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_neon; +- c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_neon; ++ c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; ++ c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; ++ c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; ++ c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; ++ c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; ++ c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; ++ c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; ++ c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; ++ c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; ++ c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; ++ c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; ++ c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; ++ c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; ++ c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; ++ c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; ++ c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; ++ ++ c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; ++ c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; ++ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; ++ c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; ++ c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; ++ c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; ++ c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; ++ c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; ++ c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; ++ c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; ++ c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; ++ c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; ++ c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; ++ c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; ++ c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon; ++ c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; + + c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; + c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; +diff --git a/libavcodec/armv4l/h264dsp_neon.S b/libavcodec/armv4l/h264dsp_neon.S +index ac793b2..398e9c8 100644 +--- a/libavcodec/armv4l/h264dsp_neon.S ++++ b/libavcodec/armv4l/h264dsp_neon.S +@@ -20,6 +20,39 @@ + + .fpu neon + ++ .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 ++ vtrn.32 \r0, \r4 ++ vtrn.32 \r1, \r5 ++ vtrn.32 \r2, \r6 ++ vtrn.32 \r3, \r7 ++ vtrn.16 \r0, \r2 ++ vtrn.16 \r1, \r3 ++ vtrn.16 \r4, \r6 ++ vtrn.16 \r5, \r7 ++ vtrn.8 \r0, \r1 ++ vtrn.8 \r2, \r3 ++ vtrn.8 \r4, \r5 ++ vtrn.8 \r6, \r7 ++ .endm ++ ++ .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 ++ vswp \r0, \r4 ++ vswp \r1, \r5 ++ vswp \r2, \r6 ++ vswp \r3, \r7 ++ .endm ++ ++ .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 ++ vtrn.32 \r0, \r2 ++ vtrn.32 \r1, \r3 ++ vtrn.32 \r4, \r6 ++ vtrn.32 \r5, \r7 ++ vtrn.16 \r0, \r1 ++ vtrn.16 \r2, \r3 ++ vtrn.16 \r4, \r5 ++ vtrn.16 \r6, \r7 ++ .endm ++ + /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ + .macro h264_chroma_mc8 avg=0 + push {r4-r7, lr} +@@ -455,18 +488,7 @@ ff_h264_h_loop_filter_luma_neon: + vld1.64 {d5}, [r0], r1 + vld1.64 {d27}, [r0], r1 + +- vtrn.32 q3, q0 +- vtrn.32 q10, q1 +- vtrn.32 q9, q2 +- vtrn.32 q8, q13 +- vtrn.16 q3, q9 +- vtrn.16 q10, q8 +- vtrn.16 q0, q2 +- vtrn.16 q1, q13 +- vtrn.8 q3, q10 +- vtrn.8 q9, q8 +- vtrn.8 q0, q1 +- vtrn.8 q2, q13 ++ transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 + + align_push_regs + sub sp, sp, #16 +@@ -479,18 +501,7 @@ ff_h264_h_loop_filter_luma_neon: + vld1.64 {d20,d21}, [sp,:128]! + vld1.64 {d4, d5}, [sp,:128]! + +- vtrn.32 q3, q0 +- vtrn.32 q10, q5 +- vtrn.32 q4, q2 +- vtrn.32 q8, q13 +- vtrn.16 q3, q4 +- vtrn.16 q10, q8 +- vtrn.16 q0, q2 +- vtrn.16 q5, q13 +- vtrn.8 q3, q10 +- vtrn.8 q4, q8 +- vtrn.8 q0, q5 +- vtrn.8 q2, q13 ++ transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13 + + sub r0, r0, r1, lsl #4 + vst1.64 {d6}, [r0], r1 +@@ -606,3 +617,862 @@ ff_h264_h_loop_filter_chroma_neon: + + bx lr + .endfunc ++ ++ /* H.264 qpel MC */ ++ ++ .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 ++ vext.8 d4, \r0, \r1, #1 ++ vext.8 d2, \r0, \r1, #2 ++ vext.8 d3, \r0, \r1, #3 ++ vext.8 d5, \r0, \r1, #4 ++ vext.8 d6, \r0, \r1, #5 ++ ++ vext.8 d20, \r2, \r3, #1 ++ vext.8 d18, \r2, \r3, #2 ++ vext.8 d19, \r2, \r3, #3 ++ vext.8 d21, \r2, \r3, #4 ++ vext.8 d7, \r2, \r3, #5 ++ ++ vaddl.u8 q1, d2, d3 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q0, \r0, d6 ++ vaddl.u8 q9, d18, d19 ++ vaddl.u8 q10, d20, d21 ++ vaddl.u8 q8, \r2, d7 ++ ++ vshl.i16 q3, q1, #4 ++ vshl.i16 q1, q1, #2 ++ vshl.i16 q15, q2, #2 ++ vadd.i16 q1, q1, q3 ++ vadd.i16 q2, q2, q15 ++ ++ vshl.i16 q3, q9, #4 ++ vshl.i16 q9, q9, #2 ++ vshl.i16 q15, q10, #2 ++ vadd.i16 q9, q9, q3 ++ vadd.i16 q10, q10, q15 ++ ++ vsub.i16 q1, q1, q2 ++ vsub.i16 q9, q9, q10 ++.if \narrow ++ vadd.i16 q1, q1, q0 ++ vadd.i16 q9, q9, q8 ++ vqrshrun.s16 \d0, q1, #5 ++ vqrshrun.s16 \d1, q9, #5 ++.else ++ vadd.i16 \d0, q1, q0 ++ vadd.i16 \d1, q9, q8 ++.endif ++ .endm ++ ++ .macro lowpass_8_1 r0, r1, d0, narrow=1 ++ vext.8 d4, \r0, \r1, #1 ++ vext.8 d2, \r0, \r1, #2 ++ vext.8 d3, \r0, \r1, #3 ++ vext.8 d5, \r0, \r1, #4 ++ vext.8 d6, \r0, \r1, #5 ++ ++ vaddl.u8 q1, d2, d3 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q0, \r0, d6 ++ ++ vshl.i16 q3, q1, #4 ++ vshl.i16 q1, q1, #2 ++ vshl.i16 q15, q2, #2 ++ vadd.i16 q1, q1, q3 ++ vadd.i16 q2, q2, q15 ++ ++ vadd.i16 q1, q1, q0 ++.if \narrow ++ vsub.i16 q1, q1, q2 ++ vqrshrun.s16 \d0, q1, #5 ++.else ++ vsub.i16 \d0, q1, q2 ++.endif ++ .endm ++ ++ .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d ++ vext.16 q2, \r0, \r1, #1 ++ vext.16 q1, \r0, \r1, #2 ++ vext.16 q0, \r0, \r1, #3 ++ vext.16 q3, \r0, \r1, #4 ++ vext.16 \r1, \r0, \r1, #5 ++ ++ vaddl.s16 q9, d2, d0 ++ vaddl.s16 q1, d3, d1 ++ vaddl.s16 q10, d4, d6 ++ vaddl.s16 q2, d5, d7 ++ vaddl.s16 q0, \h0, \h1 ++ vaddl.s16 q8, \l0, \l1 ++ ++ vshl.i32 q3, q9, #4 ++ vshl.i32 q9, q9, #2 ++ vshl.i32 q15, q10, #2 ++ vadd.i32 q9, q9, q3 ++ vadd.i32 q10, q10, q15 ++ ++ vshl.i32 q3, q1, #4 ++ vshl.i32 q1, q1, #2 ++ vshl.i32 q15, q2, #2 ++ vadd.i32 q1, q1, q3 ++ vadd.i32 q2, q2, q15 ++ ++ vadd.i32 q9, q9, q8 ++ vsub.i32 q9, q9, q10 ++ ++ vadd.i32 q1, q1, q0 ++ vsub.i32 q1, q1, q2 ++ ++ vrshrn.s32 d18, q9, #10 ++ vrshrn.s32 d19, q1, #10 ++ ++ vqmovun.s16 \d, q9 ++ .endm ++ ++ .func put_h264_qpel16_h_lowpass_neon_packed ++put_h264_qpel16_h_lowpass_neon_packed: ++ mov r4, lr ++ mov ip, #16 ++ mov r3, #8 ++ bl put_h264_qpel8_h_lowpass_neon ++ sub r1, r1, r2, lsl #4 ++ add r1, r1, #8 ++ mov ip, #16 ++ mov lr, r4 ++ b put_h264_qpel8_h_lowpass_neon ++ .endfunc ++ ++ .func put_h264_qpel16_h_lowpass_neon ++put_h264_qpel16_h_lowpass_neon: ++ push {lr} ++ mov ip, #16 ++ dmb ++ bl put_h264_qpel8_h_lowpass_neon ++ sub r0, r0, r3, lsl #4 ++ sub r1, r1, r2, lsl #4 ++ add r0, r0, #8 ++ add r1, r1, #8 ++ mov ip, #16 ++ pop {lr} ++ .endfunc ++ ++ .func put_h264_qpel8_h_lowpass_neon ++put_h264_qpel8_h_lowpass_neon: ++1: vld1.64 {d0, d1}, [r1], r2 ++ vld1.64 {d16,d17}, [r1], r2 ++ subs ip, ip, #2 ++ lowpass_8 d0, d1, d16, d17, d0, d16 ++ vst1.64 {d0}, [r0,:64], r3 ++ vst1.64 {d16}, [r0,:64], r3 ++ bne 1b ++ bx lr ++ .endfunc ++ ++ .func put_h264_qpel16_h_lowpass_l2_neon ++put_h264_qpel16_h_lowpass_l2_neon: ++ push {lr} ++ mov ip, #16 ++ dmb ++ bl put_h264_qpel8_h_lowpass_l2_neon ++ sub r0, r0, r2, lsl #4 ++ sub r1, r1, r2, lsl #4 ++ sub r3, r3, r2, lsl #4 ++ add r0, r0, #8 ++ add r1, r1, #8 ++ add r3, r3, #8 ++ mov ip, #16 ++ pop {lr} ++ .endfunc ++ ++ .func put_h264_qpel8_h_lowpass_l2_neon ++put_h264_qpel8_h_lowpass_l2_neon: ++1: vld1.64 {d0, d1}, [r1], r2 ++ vld1.64 {d16,d17}, [r1], r2 ++ vld1.64 {d28}, [r3], r2 ++ vld1.64 {d29}, [r3], r2 ++ subs ip, ip, #2 ++ lowpass_8 d0, d1, d16, d17, d0, d1 ++ vrhadd.u8 q0, q0, q14 ++ vst1.64 {d0}, [r0,:64], r2 ++ vst1.64 {d1}, [r0,:64], r2 ++ bne 1b ++ bx lr ++ .endfunc ++ ++ .func put_h264_qpel16_v_lowpass_neon_packed ++put_h264_qpel16_v_lowpass_neon_packed: ++ mov r4, lr ++ mov r2, #8 ++ bl put_h264_qpel8_v_lowpass_neon ++ sub r1, r1, r3, lsl #2 ++ bl put_h264_qpel8_v_lowpass_neon ++ sub r1, r1, r3, lsl #4 ++ sub r1, r1, r3, lsl #2 ++ add r1, r1, #8 ++ bl put_h264_qpel8_v_lowpass_neon ++ sub r1, r1, r3, lsl #2 ++ mov lr, r4 ++ b put_h264_qpel8_v_lowpass_neon ++ .endfunc ++ ++ .func put_h264_qpel16_v_lowpass_neon ++put_h264_qpel16_v_lowpass_neon: ++ mov r4, lr ++ bl put_h264_qpel8_v_lowpass_neon ++ sub r1, r1, r3, lsl #2 ++ bl put_h264_qpel8_v_lowpass_neon ++ sub r0, r0, r2, lsl #4 ++ add r0, r0, #8 ++ sub r1, r1, r3, lsl #4 ++ sub r1, r1, r3, lsl #2 ++ add r1, r1, #8 ++ bl put_h264_qpel8_v_lowpass_neon ++ sub r1, r1, r3, lsl #2 ++ mov lr, r4 ++ .endfunc ++ ++ .func put_h264_qpel8_v_lowpass_neon ++put_h264_qpel8_v_lowpass_neon: ++ vld1.64 {d8}, [r1], r3 ++ vld1.64 {d10}, [r1], r3 ++ vld1.64 {d12}, [r1], r3 ++ vld1.64 {d14}, [r1], r3 ++ vld1.64 {d22}, [r1], r3 ++ vld1.64 {d24}, [r1], r3 ++ vld1.64 {d26}, [r1], r3 ++ vld1.64 {d28}, [r1], r3 ++ vld1.64 {d9}, [r1], r3 ++ vld1.64 {d11}, [r1], r3 ++ vld1.64 {d13}, [r1], r3 ++ vld1.64 {d15}, [r1], r3 ++ vld1.64 {d23}, [r1] ++ ++ transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 ++ lowpass_8 d8, d9, d10, d11, d8, d10 ++ lowpass_8 d12, d13, d14, d15, d12, d14 ++ lowpass_8 d22, d23, d24, d25, d22, d24 ++ lowpass_8 d26, d27, d28, d29, d26, d28 ++ transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 ++ ++ vst1.64 {d8}, [r0,:64], r2 ++ vst1.64 {d10}, [r0,:64], r2 ++ vst1.64 {d12}, [r0,:64], r2 ++ vst1.64 {d14}, [r0,:64], r2 ++ vst1.64 {d22}, [r0,:64], r2 ++ vst1.64 {d24}, [r0,:64], r2 ++ vst1.64 {d26}, [r0,:64], r2 ++ vst1.64 {d28}, [r0,:64], r2 ++ ++ bx lr ++ .endfunc ++ ++ .func put_h264_qpel16_v_lowpass_l2_neon ++put_h264_qpel16_v_lowpass_l2_neon: ++ mov r4, lr ++ bl put_h264_qpel8_v_lowpass_l2_neon ++ sub r1, r1, r3, lsl #2 ++ bl put_h264_qpel8_v_lowpass_l2_neon ++ sub r0, r0, r3, lsl #4 ++ sub ip, ip, r2, lsl #4 ++ add r0, r0, #8 ++ add ip, ip, #8 ++ sub r1, r1, r3, lsl #4 ++ sub r1, r1, r3, lsl #2 ++ add r1, r1, #8 ++ bl put_h264_qpel8_v_lowpass_l2_neon ++ sub r1, r1, r3, lsl #2 ++ mov lr, r4 ++ .endfunc ++ ++ .func put_h264_qpel8_v_lowpass_l2_neon ++put_h264_qpel8_v_lowpass_l2_neon: ++ vld1.64 {d8}, [r1], r3 ++ vld1.64 {d10}, [r1], r3 ++ vld1.64 {d12}, [r1], r3 ++ vld1.64 {d14}, [r1], r3 ++ vld1.64 {d22}, [r1], r3 ++ vld1.64 {d24}, [r1], r3 ++ vld1.64 {d26}, [r1], r3 ++ vld1.64 {d28}, [r1], r3 ++ vld1.64 {d9}, [r1], r3 ++ vld1.64 {d11}, [r1], r3 ++ vld1.64 {d13}, [r1], r3 ++ vld1.64 {d15}, [r1], r3 ++ vld1.64 {d23}, [r1] ++ ++ transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 ++ lowpass_8 d8, d9, d10, d11, d8, d9 ++ lowpass_8 d12, d13, d14, d15, d12, d13 ++ lowpass_8 d22, d23, d24, d25, d22, d23 ++ lowpass_8 d26, d27, d28, d29, d26, d27 ++ transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 ++ ++ vld1.64 {d0}, [ip], r2 ++ vld1.64 {d1}, [ip], r2 ++ vld1.64 {d2}, [ip], r2 ++ vld1.64 {d3}, [ip], r2 ++ vld1.64 {d4}, [ip], r2 ++ vrhadd.u8 q0, q0, q4 ++ vld1.64 {d5}, [ip], r2 ++ vrhadd.u8 q1, q1, q6 ++ vld1.64 {d6}, [ip], r2 ++ vrhadd.u8 q2, q2, q11 ++ vld1.64 {d7}, [ip], r2 ++ ++ vst1.64 {d0}, [r0,:64], r3 ++ vst1.64 {d1}, [r0,:64], r3 ++ vrhadd.u8 q3, q3, q13 ++ vst1.64 {d2}, [r0,:64], r3 ++ vst1.64 {d3}, [r0,:64], r3 ++ vst1.64 {d4}, [r0,:64], r3 ++ vst1.64 {d5}, [r0,:64], r3 ++ vst1.64 {d6}, [r0,:64], r3 ++ vst1.64 {d7}, [r0,:64], r3 ++ ++ bx lr ++ .endfunc ++ ++ .func put_h264_qpel8_hv_lowpass_neon_top ++put_h264_qpel8_hv_lowpass_neon_top: ++ mov ip, #12 ++1: vld1.64 {d0, d1}, [r1], r3 ++ vld1.64 {d16,d17}, [r1], r3 ++ subs ip, ip, #2 ++ lowpass_8 d0, d1, d16, d17, q0, q1, narrow=0 ++ vst1.64 {d0-d3}, [r4,:128]! ++ bne 1b ++ ++ vld1.64 {d0, d1}, [r1] ++ lowpass_8_1 d0, d1, q12, narrow=0 ++ ++ mov ip, #-16 ++ add r4, r4, ip ++ vld1.64 {d30,d31}, [r4,:128], ip ++ vld1.64 {d20,d21}, [r4,:128], ip ++ vld1.64 {d18,d19}, [r4,:128], ip ++ vld1.64 {d16,d17}, [r4,:128], ip ++ vld1.64 {d14,d15}, [r4,:128], ip ++ vld1.64 {d12,d13}, [r4,:128], ip ++ vld1.64 {d10,d11}, [r4,:128], ip ++ vld1.64 {d8, d9}, [r4,:128], ip ++ vld1.64 {d6, d7}, [r4,:128], ip ++ vld1.64 {d4, d5}, [r4,:128], ip ++ vld1.64 {d2, d3}, [r4,:128], ip ++ vld1.64 {d0, d1}, [r4,:128] ++ ++ swap4 d1, d3, d5, d7, d8, d10, d12, d14 ++ transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 ++ ++ swap4 d17, d19, d21, d31, d24, d26, d28, d22 ++ transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 ++ ++ vst1.64 {d30,d31}, [r4,:128]! ++ vst1.64 {d6, d7}, [r4,:128]! ++ vst1.64 {d20,d21}, [r4,:128]! ++ vst1.64 {d4, d5}, [r4,:128]! ++ vst1.64 {d18,d19}, [r4,:128]! ++ vst1.64 {d2, d3}, [r4,:128]! ++ vst1.64 {d16,d17}, [r4,:128]! ++ vst1.64 {d0, d1}, [r4,:128] ++ ++ lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 ++ lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 ++ lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 ++ lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 ++ ++ vld1.64 {d16,d17}, [r4,:128], ip ++ vld1.64 {d30,d31}, [r4,:128], ip ++ lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 ++ vld1.64 {d16,d17}, [r4,:128], ip ++ vld1.64 {d30,d31}, [r4,:128], ip ++ lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 ++ vld1.64 {d16,d17}, [r4,:128], ip ++ vld1.64 {d30,d31}, [r4,:128], ip ++ lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 ++ vld1.64 {d16,d17}, [r4,:128], ip ++ vld1.64 {d30,d31}, [r4,:128] ++ lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 ++ ++ transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 ++ ++ bx lr ++ .endfunc ++ ++ .func put_h264_qpel8_hv_lowpass_neon ++put_h264_qpel8_hv_lowpass_neon: ++ mov r10, lr ++ bl put_h264_qpel8_hv_lowpass_neon_top ++ vst1.64 {d12}, [r0,:64], r2 ++ vst1.64 {d13}, [r0,:64], r2 ++ vst1.64 {d14}, [r0,:64], r2 ++ vst1.64 {d15}, [r0,:64], r2 ++ vst1.64 {d8}, [r0,:64], r2 ++ vst1.64 {d9}, [r0,:64], r2 ++ vst1.64 {d10}, [r0,:64], r2 ++ vst1.64 {d11}, [r0,:64], r2 ++ ++ mov lr, r10 ++ bx lr ++ .endfunc ++ ++ .func put_h264_qpel8_hv_lowpass_l2_neon ++put_h264_qpel8_hv_lowpass_l2_neon: ++ mov r10, lr ++ bl put_h264_qpel8_hv_lowpass_neon_top ++ ++ vld1.64 {d0, d1}, [r2,:128]! ++ vld1.64 {d2, d3}, [r2,:128]! ++ vrhadd.u8 q0, q0, q6 ++ vld1.64 {d4, d5}, [r2,:128]! ++ vrhadd.u8 q1, q1, q7 ++ vld1.64 {d6, d7}, [r2,:128]! ++ vrhadd.u8 q2, q2, q4 ++ ++ vst1.64 {d0}, [r0,:64], r3 ++ vrhadd.u8 q3, q3, q5 ++ vst1.64 {d1}, [r0,:64], r3 ++ vst1.64 {d2}, [r0,:64], r3 ++ vst1.64 {d3}, [r0,:64], r3 ++ vst1.64 {d4}, [r0,:64], r3 ++ vst1.64 {d5}, [r0,:64], r3 ++ vst1.64 {d6}, [r0,:64], r3 ++ vst1.64 {d7}, [r0,:64], r3 ++ ++ mov lr, r10 ++ bx lr ++ .endfunc ++ ++ .func put_h264_qpel16_hv_lowpass_neon ++put_h264_qpel16_hv_lowpass_neon: ++ mov r9, lr ++ bl put_h264_qpel8_hv_lowpass_neon ++ sub r1, r1, r3, lsl #2 ++ bl put_h264_qpel8_hv_lowpass_neon ++ sub r1, r1, r3, lsl #4 ++ sub r1, r1, r3, lsl #2 ++ add r1, r1, #8 ++ sub r0, r0, r2, lsl #4 ++ add r0, r0, #8 ++ bl put_h264_qpel8_hv_lowpass_neon ++ sub r1, r1, r3, lsl #2 ++ mov lr, r9 ++ b put_h264_qpel8_hv_lowpass_neon ++ .endfunc ++ ++ .func put_h264_qpel16_hv_lowpass_l2_neon ++put_h264_qpel16_hv_lowpass_l2_neon: ++ mov r9, lr ++ sub r2, r4, #256 ++ bl put_h264_qpel8_hv_lowpass_l2_neon ++ sub r1, r1, r3, lsl #2 ++ bl put_h264_qpel8_hv_lowpass_l2_neon ++ sub r1, r1, r3, lsl #4 ++ sub r1, r1, r3, lsl #2 ++ add r1, r1, #8 ++ sub r0, r0, r3, lsl #4 ++ add r0, r0, #8 ++ bl put_h264_qpel8_hv_lowpass_l2_neon ++ sub r1, r1, r3, lsl #2 ++ mov lr, r9 ++ b put_h264_qpel8_hv_lowpass_l2_neon ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc10_neon ++ .func ff_put_h264_qpel8_mc10_neon ++ff_put_h264_qpel8_mc10_neon: ++ mov r3, r1 ++ sub r1, r1, #2 ++ mov ip, #8 ++ dmb ++ b put_h264_qpel8_h_lowpass_l2_neon ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc20_neon ++ .func ff_put_h264_qpel8_mc20_neon ++ff_put_h264_qpel8_mc20_neon: ++ sub r1, r1, #2 ++ mov r3, r2 ++ mov ip, #8 ++ dmb ++ b put_h264_qpel8_h_lowpass_neon ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc30_neon ++ .func ff_put_h264_qpel8_mc30_neon ++ff_put_h264_qpel8_mc30_neon: ++ add r3, r1, #1 ++ sub r1, r1, #2 ++ mov ip, #8 ++ dmb ++ b put_h264_qpel8_h_lowpass_l2_neon ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc01_neon ++ .func ff_put_h264_qpel8_mc01_neon ++ff_put_h264_qpel8_mc01_neon: ++ push {lr} ++ mov ip, r1 ++put_h264_qpel8_mc01: ++ mov r3, r2 ++ sub r1, r1, r2, lsl #1 ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel8_v_lowpass_l2_neon ++ vpop {d8-d15} ++ pop {pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc11_neon ++ .func ff_put_h264_qpel8_mc11_neon ++ff_put_h264_qpel8_mc11_neon: ++ push {r0, r1, r2, lr} ++put_h264_qpel8_mc11: ++ sub sp, sp, #64 ++ mov r0, sp ++ sub r1, r1, #2 ++ mov r3, #8 ++ mov ip, #8 ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel8_h_lowpass_neon ++ ldrd r0, [sp, #128] ++ mov r3, r2 ++ add ip, sp, #64 ++ sub r1, r1, r2, lsl #1 ++ mov r2, #8 ++ bl put_h264_qpel8_v_lowpass_l2_neon ++ vpop {d8-d15} ++ add sp, sp, #76 ++ pop {pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc21_neon ++ .func ff_put_h264_qpel8_mc21_neon ++ff_put_h264_qpel8_mc21_neon: ++ push {r0, r1, r4, r10, r11, lr} ++put_h264_qpel8_mc21: ++ mov r11, sp ++ bic sp, sp, #15 ++ sub sp, sp, #(8*8+16*12) ++ sub r1, r1, #2 ++ mov r3, #8 ++ mov r0, sp ++ mov ip, #8 ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel8_h_lowpass_neon ++ mov r4, r0 ++ ldrd r0, [r11] ++ sub r1, r1, r2, lsl #1 ++ sub r1, r1, #2 ++ mov r3, r2 ++ sub r2, r4, #64 ++ bl put_h264_qpel8_hv_lowpass_l2_neon ++ vpop {d8-d15} ++ add sp, r11, #8 ++ pop {r4, r10, r11, pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc31_neon ++ .func ff_put_h264_qpel8_mc31_neon ++ff_put_h264_qpel8_mc31_neon: ++ add r1, r1, #1 ++ push {r0, r1, r2, lr} ++ sub r1, r1, #1 ++ b put_h264_qpel8_mc11 ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc02_neon ++ .func ff_put_h264_qpel8_mc02_neon ++ff_put_h264_qpel8_mc02_neon: ++ push {lr} ++ sub r1, r1, r2, lsl #1 ++ mov r3, r2 ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel8_v_lowpass_neon ++ vpop {d8-d15} ++ pop {pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc12_neon ++ .func ff_put_h264_qpel8_mc12_neon ++ff_put_h264_qpel8_mc12_neon: ++ push {r0, r1, r4, r10, r11, lr} ++put_h264_qpel8_mc12: ++ mov r11, sp ++ bic sp, sp, #15 ++ sub sp, sp, #(8*8+16*12) ++ sub r1, r1, r2, lsl #1 ++ mov r3, r2 ++ mov r2, #8 ++ mov r0, sp ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel8_v_lowpass_neon ++ mov r4, r0 ++ ldrd r0, [r11] ++ sub r1, r1, r3, lsl #1 ++ sub r1, r1, #2 ++ sub r2, r4, #64 ++ bl put_h264_qpel8_hv_lowpass_l2_neon ++ vpop {d8-d15} ++ add sp, r11, #8 ++ pop {r4, r10, r11, pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc22_neon ++ .func ff_put_h264_qpel8_mc22_neon ++ff_put_h264_qpel8_mc22_neon: ++ push {r4, r10, r11, lr} ++ mov r11, sp ++ bic sp, sp, #15 ++ sub r1, r1, r2, lsl #1 ++ sub r1, r1, #2 ++ mov r3, r2 ++ sub sp, sp, #(16*12) ++ mov r4, sp ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel8_hv_lowpass_neon ++ vpop {d8-d15} ++ mov sp, r11 ++ pop {r4, r10, r11, pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc32_neon ++ .func ff_put_h264_qpel8_mc32_neon ++ff_put_h264_qpel8_mc32_neon: ++ push {r0, r1, r4, r10, r11, lr} ++ add r1, r1, #1 ++ b put_h264_qpel8_mc12 ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc03_neon ++ .func ff_put_h264_qpel8_mc03_neon ++ff_put_h264_qpel8_mc03_neon: ++ push {lr} ++ add ip, r1, r2 ++ b put_h264_qpel8_mc01 ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc13_neon ++ .func ff_put_h264_qpel8_mc13_neon ++ff_put_h264_qpel8_mc13_neon: ++ push {r0, r1, r2, lr} ++ add r1, r1, r2 ++ b put_h264_qpel8_mc11 ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc23_neon ++ .func ff_put_h264_qpel8_mc23_neon ++ff_put_h264_qpel8_mc23_neon: ++ push {r0, r1, r4, r10, r11, lr} ++ add r1, r1, r2 ++ b put_h264_qpel8_mc21 ++ .endfunc ++ ++ .global ff_put_h264_qpel8_mc33_neon ++ .func ff_put_h264_qpel8_mc33_neon ++ff_put_h264_qpel8_mc33_neon: ++ add r1, r1, #1 ++ push {r0, r1, r2, lr} ++ add r1, r1, r2 ++ sub r1, r1, #1 ++ b put_h264_qpel8_mc11 ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc10_neon ++ .func ff_put_h264_qpel16_mc10_neon ++ff_put_h264_qpel16_mc10_neon: ++ mov r3, r1 ++ sub r1, r1, #2 ++ b put_h264_qpel16_h_lowpass_l2_neon ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc20_neon ++ .func ff_put_h264_qpel16_mc20_neon ++ff_put_h264_qpel16_mc20_neon: ++ sub r1, r1, #2 ++ mov r3, r2 ++ b put_h264_qpel16_h_lowpass_neon ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc30_neon ++ .func ff_put_h264_qpel16_mc30_neon ++ff_put_h264_qpel16_mc30_neon: ++ add r3, r1, #1 ++ sub r1, r1, #2 ++ b put_h264_qpel16_h_lowpass_l2_neon ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc01_neon ++ .func ff_put_h264_qpel16_mc01_neon ++ff_put_h264_qpel16_mc01_neon: ++ push {r4, lr} ++ mov ip, r1 ++put_h264_qpel16_mc01: ++ mov r3, r2 ++ sub r1, r1, r2, lsl #1 ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel16_v_lowpass_l2_neon ++ vpop {d8-d15} ++ pop {r4, pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc11_neon ++ .func ff_put_h264_qpel16_mc11_neon ++ff_put_h264_qpel16_mc11_neon: ++ push {r0, r1, r4, lr} ++put_h264_qpel16_mc11: ++ sub sp, sp, #256 ++ mov r0, sp ++ sub r1, r1, #2 ++ mov r3, #16 ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel16_h_lowpass_neon ++ add r0, sp, #256 ++ ldrd r0, [r0, #64] ++ mov r3, r2 ++ add ip, sp, #64 ++ sub r1, r1, r2, lsl #1 ++ mov r2, #16 ++ bl put_h264_qpel16_v_lowpass_l2_neon ++ vpop {d8-d15} ++ add sp, sp, #(256+8) ++ pop {r4, pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc21_neon ++ .func ff_put_h264_qpel16_mc21_neon ++ff_put_h264_qpel16_mc21_neon: ++ push {r0, r1, r4-r5, r9-r11, lr} ++put_h264_qpel16_mc21: ++ mov r11, sp ++ bic sp, sp, #15 ++ sub sp, sp, #(16*16+16*12) ++ sub r1, r1, #2 ++ mov r0, sp ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel16_h_lowpass_neon_packed ++ mov r4, r0 ++ ldrd r0, [r11] ++ sub r1, r1, r2, lsl #1 ++ sub r1, r1, #2 ++ mov r3, r2 ++ bl put_h264_qpel16_hv_lowpass_l2_neon ++ vpop {d8-d15} ++ add sp, r11, #8 ++ pop {r4-r5, r9-r11, pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc31_neon ++ .func ff_put_h264_qpel16_mc31_neon ++ff_put_h264_qpel16_mc31_neon: ++ add r1, r1, #1 ++ push {r0, r1, r4, lr} ++ sub r1, r1, #1 ++ b put_h264_qpel16_mc11 ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc02_neon ++ .func ff_put_h264_qpel16_mc02_neon ++ff_put_h264_qpel16_mc02_neon: ++ push {r4, lr} ++ sub r1, r1, r2, lsl #1 ++ mov r3, r2 ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel16_v_lowpass_neon ++ vpop {d8-d15} ++ pop {r4, pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc12_neon ++ .func ff_put_h264_qpel16_mc12_neon ++ff_put_h264_qpel16_mc12_neon: ++ push {r0, r1, r4-r5, r9-r11, lr} ++put_h264_qpel16_mc12: ++ mov r11, sp ++ bic sp, sp, #15 ++ sub sp, sp, #(16*16+16*12) ++ sub r1, r1, r2, lsl #1 ++ mov r0, sp ++ mov r3, r2 ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel16_v_lowpass_neon_packed ++ mov r4, r0 ++ ldrd r0, [r11] ++ sub r1, r1, r3, lsl #1 ++ sub r1, r1, #2 ++ mov r2, r3 ++ bl put_h264_qpel16_hv_lowpass_l2_neon ++ vpop {d8-d15} ++ add sp, r11, #8 ++ pop {r4-r5, r9-r11, pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc22_neon ++ .func ff_put_h264_qpel16_mc22_neon ++ff_put_h264_qpel16_mc22_neon: ++ push {r4, r9-r11, lr} ++ mov r11, sp ++ bic sp, sp, #15 ++ sub r1, r1, r2, lsl #1 ++ sub r1, r1, #2 ++ mov r3, r2 ++ sub sp, sp, #(16*12) ++ mov r4, sp ++ dmb ++ vpush {d8-d15} ++ bl put_h264_qpel16_hv_lowpass_neon ++ vpop {d8-d15} ++ mov sp, r11 ++ pop {r4, r9-r11, pc} ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc32_neon ++ .func ff_put_h264_qpel16_mc32_neon ++ff_put_h264_qpel16_mc32_neon: ++ push {r0, r1, r4-r5, r9-r11, lr} ++ add r1, r1, #1 ++ b put_h264_qpel16_mc12 ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc03_neon ++ .func ff_put_h264_qpel16_mc03_neon ++ff_put_h264_qpel16_mc03_neon: ++ push {r4, lr} ++ add ip, r1, r2 ++ b put_h264_qpel16_mc01 ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc13_neon ++ .func ff_put_h264_qpel16_mc13_neon ++ff_put_h264_qpel16_mc13_neon: ++ push {r0, r1, r4, lr} ++ add r1, r1, r2 ++ b put_h264_qpel16_mc11 ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc23_neon ++ .func ff_put_h264_qpel16_mc23_neon ++ff_put_h264_qpel16_mc23_neon: ++ push {r0, r1, r4-r5, r9-r11, lr} ++ add r1, r1, r2 ++ b put_h264_qpel16_mc21 ++ .endfunc ++ ++ .global ff_put_h264_qpel16_mc33_neon ++ .func ff_put_h264_qpel16_mc33_neon ++ff_put_h264_qpel16_mc33_neon: ++ add r1, r1, #1 ++ push {r0, r1, r4, lr} ++ add r1, r1, r2 ++ sub r1, r1, #1 ++ b put_h264_qpel16_mc11 ++ .endfunc diff --git a/packages/mplayer/files/mru-neon-h264idct-dc.diff b/packages/mplayer/files/mru-neon-h264idct-dc.diff new file mode 100644 index 0000000000..9f316b1b5b --- /dev/null +++ b/packages/mplayer/files/mru-neon-h264idct-dc.diff @@ -0,0 +1,55 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Mon, 25 Aug 2008 00:05:54 +0000 (+0100) +Subject: ARM: NEON optimised h264_idct_dc_add +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=1097c36b47b5019b2a8668f82796ffe76f482408 + +ARM: NEON optimised h264_idct_dc_add +--- + +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +index 74f9b4d..6dbe835 100644 +--- a/libavcodec/armv4l/dsputil_neon.c ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -89,6 +89,7 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); + + void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); ++void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); + + void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + { +@@ -156,4 +157,5 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; + + c->h264_idct_add = ff_h264_idct_add_neon; ++ c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; + } +diff --git a/libavcodec/armv4l/h264idct_neon.S b/libavcodec/armv4l/h264idct_neon.S +index 8f456f3..34e217f 100644 +--- a/libavcodec/armv4l/h264idct_neon.S ++++ b/libavcodec/armv4l/h264idct_neon.S +@@ -75,3 +75,24 @@ ff_h264_idct_add_neon: + + bx lr + .endfunc ++ ++ .global ff_h264_idct_dc_add_neon ++ .func ff_h264_idct_dc_add_neon ++ff_h264_idct_dc_add_neon: ++ vld1.16 {d2[],d3[]}, [r1,:16] ++ vrshr.s16 q1, q1, #6 ++ vld1.32 {d0[0]}, [r0,:32], r2 ++ vld1.32 {d0[1]}, [r0,:32], r2 ++ vaddw.u8 q2, q1, d0 ++ vld1.32 {d1[0]}, [r0,:32], r2 ++ vld1.32 {d1[1]}, [r0,:32], r2 ++ vaddw.u8 q1, q1, d1 ++ vqmovun.s16 d0, q2 ++ vqmovun.s16 d1, q1 ++ sub r0, r0, r2, lsl #2 ++ vst1.32 {d0[0]}, [r0,:32], r2 ++ vst1.32 {d0[1]}, [r0,:32], r2 ++ vst1.32 {d1[0]}, [r0,:32], r2 ++ vst1.32 {d1[1]}, [r0,:32], r2 ++ bx lr ++ .endfunc diff --git a/packages/mplayer/files/mru-neon-h264idctadd.diff b/packages/mplayer/files/mru-neon-h264idctadd.diff new file mode 100644 index 0000000000..0f0931fbff --- /dev/null +++ b/packages/mplayer/files/mru-neon-h264idctadd.diff @@ -0,0 +1,123 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Sun, 24 Aug 2008 21:27:49 +0000 (+0100) +Subject: ARM: NEON optimised h264_idct_add +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=ebfab90234268bb35600a06e9982ca1358ea43f3 + +ARM: NEON optimised h264_idct_add +--- + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 36ba158..053e752 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -438,6 +438,7 @@ OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \ + ASM_OBJS-$(HAVE_NEON) += armv4l/dsputil_neon_s.o \ + armv4l/simple_idct_neon.o \ + armv4l/h264dsp_neon.o \ ++ armv4l/h264idct_neon.o \ + + OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \ + sparc/simple_idct_vis.o \ +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +index a932aa9..74f9b4d 100644 +--- a/libavcodec/armv4l/dsputil_neon.c ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -88,6 +88,8 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); + ++void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); ++ + void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + { + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; +@@ -152,4 +154,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; ++ ++ c->h264_idct_add = ff_h264_idct_add_neon; + } +diff --git a/libavcodec/armv4l/h264idct_neon.S b/libavcodec/armv4l/h264idct_neon.S +new file mode 100644 +index 0000000..8f456f3 +--- /dev/null ++++ b/libavcodec/armv4l/h264idct_neon.S +@@ -0,0 +1,77 @@ ++/* ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ .fpu neon ++ ++ .text ++ ++ .global ff_h264_idct_add_neon ++ .func ff_h264_idct_add_neon ++ff_h264_idct_add_neon: ++ mov r3, #(1<<5) ++ vmov.i16 d16, #0 ++ vmov.16 d16[0], r3 ++ vld1.64 {d0-d3}, [r1,:128] ++ vadd.i16 d0, d0, d16 ++ ++ vswp d1, d2 ++ vadd.i16 d4, d0, d1 ++ vshr.s16 q8, q1, #1 ++ vsub.i16 d5, d0, d1 ++ vadd.i16 d6, d2, d17 ++ vsub.i16 d7, d16, d3 ++ vadd.i16 q0, q2, q3 ++ vsub.i16 q1, q2, q3 ++ ++ vtrn.16 d0, d1 ++ vtrn.16 d3, d2 ++ vtrn.32 d0, d3 ++ vtrn.32 d1, d2 ++ ++ vadd.i16 d4, d0, d3 ++ vld1.32 {d18[0]}, [r0,:32], r2 ++ vswp d1, d3 ++ vshr.s16 q8, q1, #1 ++ vld1.32 {d19[1]}, [r0,:32], r2 ++ vsub.i16 d5, d0, d1 ++ vld1.32 {d18[1]}, [r0,:32], r2 ++ vadd.i16 d6, d16, d3 ++ vld1.32 {d19[0]}, [r0,:32], r2 ++ vsub.i16 d7, d2, d17 ++ sub r0, r0, r2, lsl #2 ++ vadd.i16 q0, q2, q3 ++ vsub.i16 q1, q2, q3 ++ ++ vshr.s16 q0, q0, #6 ++ vshr.s16 q1, q1, #6 ++ ++ vaddw.u8 q0, q0, d18 ++ vaddw.u8 q1, q1, d19 ++ ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ ++ vst1.32 {d0[0]}, [r0,:32], r2 ++ vst1.32 {d1[1]}, [r0,:32], r2 ++ vst1.32 {d0[1]}, [r0,:32], r2 ++ vst1.32 {d1[0]}, [r0,:32], r2 ++ ++ bx lr ++ .endfunc diff --git a/packages/mplayer/files/mru-neon-put-pixels.diff b/packages/mplayer/files/mru-neon-put-pixels.diff new file mode 100644 index 0000000000..85650d913b --- /dev/null +++ b/packages/mplayer/files/mru-neon-put-pixels.diff @@ -0,0 +1,376 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Fri, 13 Jun 2008 01:21:58 +0000 (+0100) +Subject: ARM: NEON optimised put_pixels functions +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=86410ed1948118a29c70946d5294df9feb04dfef + +ARM: NEON optimised put_pixels functions +--- + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index d91185e..27746df 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -433,6 +433,10 @@ ASM_OBJS-$(HAVE_ARMV5TE) += armv4l/simple_idct_armv5te.o \ + + ASM_OBJS-$(HAVE_ARMV6) += armv4l/simple_idct_armv6.o \ + ++OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \ ++ ++ASM_OBJS-$(HAVE_NEON) += armv4l/dsputil_neon_s.o \ ++ + OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \ + sparc/simple_idct_vis.o \ + +diff --git a/libavcodec/armv4l/dsputil_arm.c b/libavcodec/armv4l/dsputil_arm.c +index 100b89e..89b51e7 100644 +--- a/libavcodec/armv4l/dsputil_arm.c ++++ b/libavcodec/armv4l/dsputil_arm.c +@@ -26,6 +26,7 @@ + + extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); + extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx); ++extern void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx); + + extern void j_rev_dct_ARM(DCTELEM *data); + extern void simple_idct_ARM(DCTELEM *data); +@@ -302,4 +303,7 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) + #ifdef HAVE_ARMVFP + ff_float_init_arm_vfp(c, avctx); + #endif ++#ifdef HAVE_NEON ++ ff_dsputil_init_neon(c, avctx); ++#endif + } +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +new file mode 100644 +index 0000000..8a10dde +--- /dev/null ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -0,0 +1,67 @@ ++/* ++ * ARM NEON optimised DSP functions ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include <stdint.h> ++ ++#include "libavcodec/avcodec.h" ++#include "libavcodec/dsputil.h" ++ ++void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++ ++void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); ++ ++void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) ++{ ++ c->put_pixels_tab[0][0] = ff_put_pixels16_neon; ++ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; ++ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; ++ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; ++ c->put_pixels_tab[1][0] = ff_put_pixels8_neon; ++ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; ++ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; ++ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; ++ ++ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; ++ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; ++ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; ++ ++ c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_neon; ++ c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_neon; ++} +diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S +new file mode 100644 +index 0000000..fc5e401 +--- /dev/null ++++ b/libavcodec/armv4l/dsputil_neon_s.S +@@ -0,0 +1,254 @@ ++/* ++ * ARM NEON optimised DSP functions ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ .fpu neon ++ .text ++ ++ .macro put_pixels16 ++ dmb ++1: vld1.64 {d0, d1}, [r1], r2 ++ vld1.64 {d2, d3}, [r1], r2 ++ vld1.64 {d4, d5}, [r1], r2 ++ vld1.64 {d6, d7}, [r1], r2 ++ pld [r1] ++ subs r3, r3, #4 ++ vst1.64 {d0, d1}, [r0,:128], r2 ++ vst1.64 {d2, d3}, [r0,:128], r2 ++ vst1.64 {d4, d5}, [r0,:128], r2 ++ vst1.64 {d6, d7}, [r0,:128], r2 ++ bne 1b ++ bx lr ++ .endm ++ ++ .macro put_pixels16_x2 vhadd=vrhadd.u8 ++ dmb ++1: vld1.64 {d0-d2}, [r1], r2 ++ vld1.64 {d4-d6}, [r1], r2 ++ pld [r1] ++ subs r3, r3, #2 ++ vext.8 q1, q0, q1, #1 ++ vext.8 q3, q2, q3, #1 ++ \vhadd q0, q0, q1 ++ \vhadd q2, q2, q3 ++ vst1.64 {d0, d1}, [r0,:128], r2 ++ vst1.64 {d4, d5}, [r0,:128], r2 ++ bne 1b ++ bx lr ++ .endm ++ ++ .macro put_pixels16_y2 vhadd=vrhadd.u8 ++ push {lr} ++ add ip, r1, r2 ++ lsl lr, r2, #1 ++ vld1.64 {d0, d1}, [r1], lr ++ vld1.64 {d2, d3}, [ip], lr ++ dmb ++1: subs r3, r3, #2 ++ \vhadd q2, q0, q1 ++ vld1.64 {d0, d1}, [r1], lr ++ vst1.64 {d4, d5}, [r0,:128], r2 ++ \vhadd q2, q0, q1 ++ vld1.64 {d2, d3}, [ip], lr ++ vst1.64 {d4, d5}, [r0,:128], r2 ++ bne 1b ++ pop {pc} ++ .endm ++ ++ .macro put_pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 ++ push {lr} ++ lsl lr, r2, #1 ++ add ip, r1, r2 ++ vld1.64 {d0-d2}, [r1], lr ++ vld1.64 {d4-d6}, [ip], lr ++ .if \no_rnd ++ vmov.i16 q13, #1 ++ .endif ++ pld [r1] ++ pld [ip] ++ vext.8 q1, q0, q1, #1 ++ vext.8 q3, q2, q3, #1 ++ vaddl.u8 q8, d0, d2 ++ vaddl.u8 q10, d1, d3 ++ vaddl.u8 q9, d4, d6 ++ vaddl.u8 q11, d5, d7 ++ dmb ++1: subs r3, r3, #2 ++ vld1.64 {d0-d2}, [r1], lr ++ vadd.u16 q12, q8, q9 ++ pld [r1] ++ .if \no_rnd ++ vadd.u16 q12, q12, q13 ++ .endif ++ vext.8 q15, q0, q1, #1 ++ vadd.u16 q1 , q10, q11 ++ \vshrn d28, q12, #2 ++ .if \no_rnd ++ vadd.u16 q1, q1, q13 ++ .endif ++ \vshrn d29, q1, #2 ++ vaddl.u8 q8, d0, d30 ++ vld1.64 {d2-d4}, [ip], lr ++ vaddl.u8 q10, d1, d31 ++ vst1.64 {d28,d29}, [r0,:128], r2 ++ vadd.u16 q12, q8, q9 ++ pld [ip] ++ .if \no_rnd ++ vadd.u16 q12, q12, q13 ++ .endif ++ vext.8 q2, q1, q2, #1 ++ vadd.u16 q0, q10, q11 ++ \vshrn d30, q12, #2 ++ .if \no_rnd ++ vadd.u16 q0, q0, q13 ++ .endif ++ \vshrn d31, q0, #2 ++ vaddl.u8 q9, d2, d4 ++ vaddl.u8 q11, d3, d5 ++ vst1.64 {d30,d31}, [r0,:128], r2 ++ bgt 1b ++ pop {pc} ++ .endm ++ ++ .macro put_pixels8 ++ dmb ++1: vld1.64 {d0}, [r1], r2 ++ vld1.64 {d1}, [r1], r2 ++ vld1.64 {d2}, [r1], r2 ++ vld1.64 {d3}, [r1], r2 ++ subs r3, r3, #4 ++ vst1.64 {d0}, [r0,:64], r2 ++ vst1.64 {d1}, [r0,:64], r2 ++ vst1.64 {d2}, [r0,:64], r2 ++ vst1.64 {d3}, [r0,:64], r2 ++ bne 1b ++ bx lr ++ .endm ++ ++ .macro put_pixels8_x2 vhadd=vrhadd.u8 ++ dmb ++1: vld1.64 {d0, d1}, [r1], r2 ++ vld1.64 {d2, d3}, [r1], r2 ++ pld [r1] ++ subs r3, r3, #2 ++ vext.8 d1, d0, d1, #1 ++ vext.8 d3, d2, d3, #1 ++ vswp d1, d2 ++ \vhadd q0, q0, q1 ++ vst1.64 {d0}, [r0,:64], r2 ++ vst1.64 {d1}, [r0,:64], r2 ++ bne 1b ++ bx lr ++ .endm ++ ++ .macro put_pixels8_y2 vhadd=vrhadd.u8 ++ push {lr} ++ add ip, r1, r2 ++ lsl lr, r2, #1 ++ vld1.64 {d0}, [r1], lr ++ vld1.64 {d1}, [ip], lr ++ dmb ++1: subs r3, r3, #2 ++ \vhadd d4, d0, d1 ++ vld1.64 {d0}, [r1], lr ++ vst1.64 {d4}, [r0,:64], r2 ++ \vhadd d4, d0, d1 ++ vld1.64 {d1}, [ip], lr ++ vst1.64 {d4}, [r0,:64], r2 ++ bne 1b ++ pop {pc} ++ .endm ++ ++ .macro put_pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 ++ push {lr} ++ lsl lr, r2, #1 ++ add ip, r1, r2 ++ vld1.64 {d0, d1}, [r1], lr ++ vld1.64 {d2, d3}, [ip], lr ++ .if \no_rnd ++ vmov.i16 q11, #1 ++ .endif ++ pld [r1] ++ pld [ip] ++ vext.8 d4, d0, d1, #1 ++ vext.8 d6, d2, d3, #1 ++ vaddl.u8 q8, d0, d4 ++ vaddl.u8 q9, d2, d6 ++ dmb ++1: subs r3, r3, #2 ++ vld1.64 {d0, d1}, [r1], lr ++ pld [r1] ++ vadd.u16 q10, q8, q9 ++ vext.8 d4, d0, d1, #1 ++ .if \no_rnd ++ vadd.u16 q10, q10, q11 ++ .endif ++ vaddl.u8 q8, d0, d4 ++ \vshrn d5, q10, #2 ++ vld1.64 {d2, d3}, [ip], lr ++ vadd.u16 q10, q8, q9 ++ pld [ip] ++ .if \no_rnd ++ vadd.u16 q10, q10, q11 ++ .endif ++ vst1.64 {d5}, [r0,:64], r2 ++ \vshrn d7, q10, #2 ++ vext.8 d6, d2, d3, #1 ++ vaddl.u8 q9, d2, d6 ++ vst1.64 {d7}, [r0,:64], r2 ++ bgt 1b ++ pop {pc} ++ .endm ++ ++ .macro extern name ++ .global \name ++ .type \name, %function ++ .func \name ++\name: ++ .endm ++ ++ .macro defun name suf rnd_op args:vararg ++ extern ff_\name\suf\()_neon ++ \name \rnd_op \args ++ .endfunc ++ .endm ++ ++ .macro defun2 name args:vararg ++ defun \name ++ defun \name \args ++ .endm ++ ++ extern ff_put_h264_qpel16_mc00_neon ++ mov r3, #16 ++ .endfunc ++ ++ defun put_pixels16 ++ defun2 put_pixels16_x2, _no_rnd, vhadd.u8 ++ defun2 put_pixels16_y2, _no_rnd, vhadd.u8 ++ defun2 put_pixels16_xy2, _no_rnd, vshrn.u16, 1 ++ ++ extern ff_put_h264_qpel8_mc00_neon ++ mov r3, #8 ++ .endfunc ++ ++ defun put_pixels8 ++ defun2 put_pixels8_x2, _no_rnd, vhadd.u8 ++ defun2 put_pixels8_y2, _no_rnd, vhadd.u8 ++ defun2 put_pixels8_xy2, _no_rnd, vshrn.u16, 1 diff --git a/packages/mplayer/files/mru-neon-simple-idct.diff b/packages/mplayer/files/mru-neon-simple-idct.diff new file mode 100644 index 0000000000..772a1fd972 --- /dev/null +++ b/packages/mplayer/files/mru-neon-simple-idct.diff @@ -0,0 +1,501 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Thu, 26 Jun 2008 18:37:40 +0000 (+0100) +Subject: ARM: NEON optimised simple_idct +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=215b9eaa8cf0195908c92f373c018320736ec106 + +ARM: NEON optimised simple_idct +--- + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 27746df..7fa02fa 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -436,6 +436,7 @@ ASM_OBJS-$(HAVE_ARMV6) += armv4l/simple_idct_armv6.o \ + OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \ + + ASM_OBJS-$(HAVE_NEON) += armv4l/dsputil_neon_s.o \ ++ armv4l/simple_idct_neon.o \ + + OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \ + sparc/simple_idct_vis.o \ +diff --git a/libavcodec/armv4l/dsputil_arm.c b/libavcodec/armv4l/dsputil_arm.c +index 89b51e7..942c0de 100644 +--- a/libavcodec/armv4l/dsputil_arm.c ++++ b/libavcodec/armv4l/dsputil_arm.c +@@ -43,6 +43,12 @@ extern void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, + extern void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, + DCTELEM *data); + ++extern void ff_simple_idct_neon(DCTELEM *data); ++extern void ff_simple_idct_put_neon(uint8_t *dest, int line_size, ++ DCTELEM *data); ++extern void ff_simple_idct_add_neon(uint8_t *dest, int line_size, ++ DCTELEM *data); ++ + /* XXX: local hack */ + static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); + static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); +@@ -233,6 +239,8 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) + if(idct_algo == FF_IDCT_AUTO){ + #if defined(HAVE_IPP) + idct_algo = FF_IDCT_IPP; ++#elif defined(HAVE_NEON) ++ idct_algo = FF_IDCT_SIMPLENEON; + #elif defined(HAVE_ARMV6) + idct_algo = FF_IDCT_SIMPLEARMV6; + #elif defined(HAVE_ARMV5TE) +@@ -273,6 +281,13 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) + c->idct = simple_idct_ipp; + c->idct_permutation_type= FF_NO_IDCT_PERM; + #endif ++#ifdef HAVE_NEON ++ } else if (idct_algo==FF_IDCT_SIMPLENEON){ ++ c->idct_put= ff_simple_idct_put_neon; ++ c->idct_add= ff_simple_idct_add_neon; ++ c->idct = ff_simple_idct_neon; ++ c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; ++#endif + } + } + +diff --git a/libavcodec/armv4l/simple_idct_neon.S b/libavcodec/armv4l/simple_idct_neon.S +new file mode 100644 +index 0000000..44701f8 +--- /dev/null ++++ b/libavcodec/armv4l/simple_idct_neon.S +@@ -0,0 +1,411 @@ ++/* ++ * ARM NEON IDCT ++ * ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * Based on Simple IDCT ++ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W4c ((1<<(COL_SHIFT-1))/W4) ++#define ROW_SHIFT 11 ++#define COL_SHIFT 20 ++ ++#define w1 d0[0] ++#define w2 d0[1] ++#define w3 d0[2] ++#define w4 d0[3] ++#define w5 d1[0] ++#define w6 d1[1] ++#define w7 d1[2] ++#define w4c d1[3] ++ ++ .fpu neon ++ ++ .macro idct_col4_top ++ vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ ++ vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ ++ vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ ++ vadd.i32 q11, q15, q7 ++ vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ ++ vadd.i32 q12, q15, q8 ++ vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ ++ vsub.i32 q13, q15, q8 ++ vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ ++ vsub.i32 q14, q15, q7 ++ ++ vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ ++ vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ ++ vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ ++ vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ ++ .endm ++ ++ .text ++ .align ++ .type idct_row4_neon, %function ++ .func idct_row4_neon ++idct_row4_neon: ++ vmov.i32 q15, #(1<<(ROW_SHIFT-1)) ++ vld1.64 {d2-d5}, [a3,:128]! ++ vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ ++ vld1.64 {d6,d7}, [a3,:128]! ++ vorr d10, d3, d5 ++ vld1.64 {d8,d9}, [a3,:128]! ++ add a3, a3, #-64 ++ ++ vorr d11, d7, d9 ++ vorr d10, d10, d11 ++ vmov a4, v1, d10 ++ ++ idct_col4_top ++ ++ orrs a4, a4, v1 ++ beq 1f ++ ++ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ ++ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ ++ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ ++ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ ++ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ ++ vadd.i32 q11, q11, q7 ++ vsub.i32 q12, q12, q7 ++ vsub.i32 q13, q13, q7 ++ vadd.i32 q14, q14, q7 ++ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ ++ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ ++ vmlal.s16 q9, d9, w7 ++ vmlsl.s16 q10, d9, w5 ++ vmlal.s16 q5, d9, w3 ++ vmlsl.s16 q6, d9, w1 ++ vadd.i32 q11, q11, q7 ++ vsub.i32 q12, q12, q8 ++ vadd.i32 q13, q13, q8 ++ vsub.i32 q14, q14, q7 ++ ++1: vadd.i32 q3, q11, q9 ++ vadd.i32 q4, q12, q10 ++ vshrn.i32 d2, q3, #ROW_SHIFT ++ vshrn.i32 d4, q4, #ROW_SHIFT ++ vadd.i32 q7, q13, q5 ++ vadd.i32 q8, q14, q6 ++ vtrn.16 d2, d4 ++ vshrn.i32 d6, q7, #ROW_SHIFT ++ vshrn.i32 d8, q8, #ROW_SHIFT ++ vsub.i32 q14, q14, q6 ++ vsub.i32 q11, q11, q9 ++ vtrn.16 d6, d8 ++ vsub.i32 q13, q13, q5 ++ vshrn.i32 d3, q14, #ROW_SHIFT ++ vtrn.32 d2, d6 ++ vsub.i32 q12, q12, q10 ++ vtrn.32 d4, d8 ++ vshrn.i32 d5, q13, #ROW_SHIFT ++ vshrn.i32 d7, q12, #ROW_SHIFT ++ vshrn.i32 d9, q11, #ROW_SHIFT ++ ++ vtrn.16 d3, d5 ++ vtrn.16 d7, d9 ++ vtrn.32 d3, d7 ++ vtrn.32 d5, d9 ++ ++ vst1.64 {d2-d5}, [a3,:128]! ++ vst1.64 {d6-d9}, [a3,:128]! ++ ++ bx lr ++ .endfunc ++ ++ .align ++ .type idct_col4_neon, %function ++ .func idct_col4_neon ++idct_col4_neon: ++ mov ip, #16 ++ vld1.64 {d2}, [a3,:64], ip /* d2 = col[0] */ ++ vdup.16 d30, w4c ++ vld1.64 {d4}, [a3,:64], ip /* d3 = col[1] */ ++ vadd.i16 d30, d30, d2 ++ vld1.64 {d6}, [a3,:64], ip /* d4 = col[2] */ ++ vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ ++ vld1.64 {d8}, [a3,:64], ip /* d5 = col[3] */ ++ ++ ldrd v1, [a3] ++ ldrd v3, [a3, #16] ++ orrs v1, v1, v2 ++ ++ idct_col4_top ++ addeq a3, a3, #16 ++ beq 1f ++ ++ vld1.64 {d3}, [a3,:64], ip /* d6 = col[4] */ ++ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ ++ vadd.i32 q11, q11, q7 ++ vsub.i32 q12, q12, q7 ++ vsub.i32 q13, q13, q7 ++ vadd.i32 q14, q14, q7 ++ ++1: orrs v3, v3, v4 ++ ldrd v1, [a3, #16] ++ addeq a3, a3, #16 ++ beq 2f ++ ++ vld1.64 {d5}, [a3,:64], ip /* d7 = col[5] */ ++ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ ++ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ ++ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ ++ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ ++ ++2: orrs v1, v1, v2 ++ ldrd v1, [a3, #16] ++ addeq a3, a3, #16 ++ beq 3f ++ ++ vld1.64 {d7}, [a3,:64], ip /* d8 = col[6] */ ++ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ ++ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ ++ vadd.i32 q11, q11, q7 ++ vsub.i32 q14, q14, q7 ++ vsub.i32 q12, q12, q8 ++ vadd.i32 q13, q13, q8 ++ ++3: orrs v1, v1, v2 ++ addeq a3, a3, #16 ++ beq 4f ++ ++ vld1.64 {d9}, [a3,:64], ip /* d9 = col[7] */ ++ vmlal.s16 q9, d9, w7 ++ vmlsl.s16 q10, d9, w5 ++ vmlal.s16 q5, d9, w3 ++ vmlsl.s16 q6, d9, w1 ++ ++4: vadd.i32 q3, q11, q9 ++ vadd.i32 q4, q12, q10 ++ vadd.i32 q7, q13, q5 ++ vadd.i32 q8, q14, q6 ++ vsub.i32 q11, q11, q9 ++ vsub.i32 q12, q12, q10 ++ vsub.i32 q13, q13, q5 ++ vsub.i32 q14, q14, q6 ++ ++ bx lr ++ .endfunc ++ ++ .macro idct_col4_st16 ++ vshr.s32 q2, q3, #COL_SHIFT ++ vshr.s32 q3, q4, #COL_SHIFT ++ vmovn.i32 d2, q2 ++ vshr.s32 q4, q7, #COL_SHIFT ++ vmovn.i32 d3, q3 ++ vshr.s32 q5, q8, #COL_SHIFT ++ vmovn.i32 d4, q4 ++ vshr.s32 q6, q14, #COL_SHIFT ++ vmovn.i32 d5, q5 ++ vshr.s32 q7, q13, #COL_SHIFT ++ vmovn.i32 d6, q6 ++ vshr.s32 q8, q12, #COL_SHIFT ++ vmovn.i32 d7, q7 ++ vshr.s32 q9, q11, #COL_SHIFT ++ vmovn.i32 d8, q8 ++ vmovn.i32 d9, q9 ++ ++ mov ip, #16 ++ vst1.64 {d2}, [a3,:64], ip ++ vst1.64 {d3}, [a3,:64], ip ++ vst1.64 {d4}, [a3,:64], ip ++ vst1.64 {d5}, [a3,:64], ip ++ vst1.64 {d6}, [a3,:64], ip ++ vst1.64 {d7}, [a3,:64], ip ++ vst1.64 {d8}, [a3,:64], ip ++ vst1.64 {d9}, [a3,:64], ip ++ .endm ++ ++ .align ++ .type idct_col4_add8, %function ++ .func idct_col4_add8 ++idct_col4_add8: ++ mov ip, a1 ++ ++ vshr.s32 q2, q3, #COL_SHIFT ++ vshr.s32 q3, q4, #COL_SHIFT ++ vmovn.i32 d2, q2 ++ vshr.s32 q4, q7, #COL_SHIFT ++ vmovn.i32 d3, q3 ++ vshr.s32 q5, q8, #COL_SHIFT ++ vmovn.i32 d4, q4 ++ vshr.s32 q6, q14, #COL_SHIFT ++ vmovn.i32 d5, q5 ++ vld1.32 {d10[0]}, [a1,:32], a2 ++ vshr.s32 q7, q13, #COL_SHIFT ++ vld1.32 {d10[1]}, [a1,:32], a2 ++ vmovn.i32 d6, q6 ++ vld1.32 {d11[0]}, [a1,:32], a2 ++ vshr.s32 q8, q12, #COL_SHIFT ++ vld1.32 {d11[1]}, [a1,:32], a2 ++ vaddw.u8 q1, q1, d10 ++ vld1.32 {d12[0]}, [a1,:32], a2 ++ vmovn.i32 d7, q7 ++ vld1.32 {d12[1]}, [a1,:32], a2 ++ vqmovun.s16 d2, q1 ++ vld1.32 {d13[0]}, [a1,:32], a2 ++ vshr.s32 q9, q11, #COL_SHIFT ++ vaddw.u8 q2, q2, d11 ++ vld1.32 {d13[1]}, [a1,:32], a2 ++ vaddw.u8 q3, q3, d12 ++ vst1.32 {d2[0]}, [ip,:32], a2 ++ vqmovun.s16 d3, q2 ++ vst1.32 {d2[1]}, [ip,:32], a2 ++ vmovn.i32 d8, q8 ++ vmovn.i32 d9, q9 ++ vst1.32 {d3[0]}, [ip,:32], a2 ++ vqmovun.s16 d4, q3 ++ vst1.32 {d3[1]}, [ip,:32], a2 ++ vaddw.u8 q4, q4, d13 ++ vst1.32 {d4[0]}, [ip,:32], a2 ++ vqmovun.s16 d5, q4 ++ vst1.32 {d4[1]}, [ip,:32], a2 ++ vst1.32 {d5[0]}, [ip,:32], a2 ++ vst1.32 {d5[1]}, [ip,:32], a2 ++ ++ bx lr ++ .endfunc ++ ++ .type idct_col4_st8, %function ++ .func idct_col4_st8 ++idct_col4_st8: ++ vshr.s32 q2, q3, #COL_SHIFT ++ vshr.s32 q3, q4, #COL_SHIFT ++ vmovn.i32 d2, q2 ++ vshr.s32 q4, q7, #COL_SHIFT ++ vmovn.i32 d3, q3 ++ vshr.s32 q5, q8, #COL_SHIFT ++ vqmovun.s16 d2, q1 ++ vmovn.i32 d4, q4 ++ vshr.s32 q6, q14, #COL_SHIFT ++ vst1.32 {d2[0]}, [a1,:32], a2 ++ vmovn.i32 d5, q5 ++ vshr.s32 q7, q13, #COL_SHIFT ++ vst1.32 {d2[1]}, [a1,:32], a2 ++ vmovn.i32 d6, q6 ++ vqmovun.s16 d3, q2 ++ vshr.s32 q8, q12, #COL_SHIFT ++ vmovn.i32 d7, q7 ++ vshr.s32 q9, q11, #COL_SHIFT ++ vst1.32 {d3[0]}, [a1,:32], a2 ++ vqmovun.s16 d4, q3 ++ vst1.32 {d3[1]}, [a1,:32], a2 ++ vmovn.i32 d8, q8 ++ vmovn.i32 d9, q9 ++ vst1.32 {d4[0]}, [a1,:32], a2 ++ vst1.32 {d4[1]}, [a1,:32], a2 ++ vqmovun.s16 d5, q4 ++ vst1.32 {d5[0]}, [a1,:32], a2 ++ vst1.32 {d5[1]}, [a1,:32], a2 ++ ++ bx lr ++ .endfunc ++ ++ .align 4 ++const: .short W1, W2, W3, W4, W5, W6, W7, W4c ++ ++ .macro idct_start data ++ push {v1-v4, lr} ++ pld [\data] ++ pld [\data, #64] ++ dmb ++ vpush {d8-d15} ++ adr a4, const ++ vld1.64 {d0,d1}, [a4,:128] ++ .endm ++ ++ .macro idct_end ++ vpop {d8-d15} ++ pop {v1-v4, pc} ++ .endm ++ ++ .align ++ .global ff_simple_idct_neon ++ .type ff_simple_idct_neon, %function ++ .func ff_simple_idct_neon ++/* void ff_simple_idct_neon(DCTELEM *data); */ ++ff_simple_idct_neon: ++ idct_start a1 ++ ++ mov a3, a1 ++ bl idct_row4_neon ++ bl idct_row4_neon ++ add a3, a3, #-128 ++ bl idct_col4_neon ++ add a3, a3, #-128 ++ idct_col4_st16 ++ add a3, a3, #-120 ++ bl idct_col4_neon ++ add a3, a3, #-128 ++ idct_col4_st16 ++ ++ idct_end ++ .endfunc ++ ++ .align ++ .global ff_simple_idct_put_neon ++ .type ff_simple_idct_put_neon, %function ++ .func ff_simple_idct_put_neon ++/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */ ++ff_simple_idct_put_neon: ++ idct_start a3 ++ ++ bl idct_row4_neon ++ bl idct_row4_neon ++ add a3, a3, #-128 ++ bl idct_col4_neon ++ bl idct_col4_st8 ++ sub a1, a1, a2, lsl #3 ++ add a1, a1, #4 ++ add a3, a3, #-120 ++ bl idct_col4_neon ++ bl idct_col4_st8 ++ ++ idct_end ++ .endfunc ++ ++ .align ++ .global ff_simple_idct_add_neon ++ .type ff_simple_idct_add_neon, %function ++ .func ff_simple_idct_add_neon ++/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */ ++ff_simple_idct_add_neon: ++ idct_start a3 ++ ++ bl idct_row4_neon ++ bl idct_row4_neon ++ add a3, a3, #-128 ++ bl idct_col4_neon ++ bl idct_col4_add8 ++ sub a1, a1, a2, lsl #3 ++ add a1, a1, #4 ++ add a3, a3, #-120 ++ bl idct_col4_neon ++ bl idct_col4_add8 ++ ++ idct_end ++ .endfunc +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index 76522c4..43e2ef3 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h +@@ -1352,6 +1352,7 @@ typedef struct AVCodecContext { + #define FF_IDCT_SIMPLEVIS 18 + #define FF_IDCT_WMV2 19 + #define FF_IDCT_FAAN 20 ++#define FF_IDCT_SIMPLENEON 21 + + /** + * slice count +diff --git a/libavcodec/utils.c b/libavcodec/utils.c +index cf00d25..3d1afcf 100644 +--- a/libavcodec/utils.c ++++ b/libavcodec/utils.c +@@ -549,6 +549,7 @@ static const AVOption options[]={ + {"simplearm", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARM, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"simplearmv5te", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV5TE, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"simplearmv6", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV6, INT_MIN, INT_MAX, V|E|D, "idct"}, ++{"simpleneon", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLENEON, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"h264", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_H264, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"vp3", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_VP3, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"ipp", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_IPP, INT_MIN, INT_MAX, V|E|D, "idct"}, diff --git a/packages/mplayer/files/mru-neon-vector-fmul-window.diff b/packages/mplayer/files/mru-neon-vector-fmul-window.diff new file mode 100644 index 0000000000..03ac55bc56 --- /dev/null +++ b/packages/mplayer/files/mru-neon-vector-fmul-window.diff @@ -0,0 +1,86 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Sun, 3 Aug 2008 16:46:43 +0000 (+0100) +Subject: ARM: NEON optimised vector_fmul_window +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=98feb31064dccfd16ce189ff4aec9ccedddf6b04 + +ARM: NEON optimised vector_fmul_window +--- + +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +index f9d32c0..6c44940 100644 +--- a/libavcodec/armv4l/dsputil_neon.c ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -91,6 +91,10 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); + void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); + ++void ff_vector_fmul_window_neon(float *dst, const float *src0, ++ const float *src1, const float *win, ++ float add_bias, int len); ++ + void ff_float_to_int16_neon(int16_t *, const float *, long); + void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); + +@@ -164,6 +168,8 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + c->h264_idct_add = ff_h264_idct_add_neon; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; + ++ c->vector_fmul_window = ff_vector_fmul_window_neon; ++ + c->float_to_int16 = ff_float_to_int16_neon; + c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; + +diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S +index 6a54803..49a09b8 100644 +--- a/libavcodec/armv4l/dsputil_neon_s.S ++++ b/libavcodec/armv4l/dsputil_neon_s.S +@@ -324,6 +324,49 @@ extern ff_float_to_int16_interleave_neon + pop {r4,r5,pc} + .endfunc + ++extern ff_vector_fmul_window_neon ++ vld1.32 {d16[],d17[]}, [sp,:32] ++ push {r4,r5,lr} ++ ldr lr, [sp, #16] ++ sub r2, r2, #8 ++ sub r5, lr, #2 ++ add r2, r2, r5, lsl #2 ++ add r4, r3, r5, lsl #3 ++ add ip, r0, r5, lsl #3 ++ mov r5, #-16 ++ dmb ++ vld1.64 {d0,d1}, [r1,:128]! ++ vld1.64 {d2,d3}, [r2,:128], r5 ++ vld1.64 {d4,d5}, [r3,:128]! ++ vld1.64 {d6,d7}, [r4,:128], r5 ++1: vmov q10, q8 ++ vmov q11, q8 ++ vmla.f32 q11, q0, q2 ++ vrev64.32 q3, q3 ++ vswp d6, d7 ++ vmla.f32 q10, q0, q3 ++ vrev64.32 q1, q1 ++ vswp d2, d3 ++ subs lr, lr, #4 ++ vmla.f32 q11, q1, q3 ++ vmls.f32 q10, q1, q2 ++ beq 2f ++ vld1.64 {d0,d1}, [r1,:128]! ++ vld1.64 {d2,d3}, [r2,:128], r5 ++ vld1.64 {d4,d5}, [r3,:128]! ++ vld1.64 {d6,d7}, [r4,:128], r5 ++ vrev64.32 q11, q11 ++ vswp d22, d23 ++ vst1.64 {d20,d21}, [r0,:128]! ++ vst1.64 {d22,d23}, [ip,:128], r5 ++ b 1b ++2: vrev64.32 q11, q11 ++ vswp d22, d23 ++ vst1.64 {d20,d21}, [r0,:128]! ++ vst1.64 {d22,d23}, [ip,:128], r5 ++ pop {r4,r5,pc} ++ .endfunc ++ + #ifdef CONFIG_VORBIS_DECODER + extern ff_vorbis_inverse_coupling_neon + vmov.i32 q10, #(1<<31) diff --git a/packages/mplayer/files/mru-neon-vector-fmul.diff b/packages/mplayer/files/mru-neon-vector-fmul.diff new file mode 100644 index 0000000000..2710f10443 --- /dev/null +++ b/packages/mplayer/files/mru-neon-vector-fmul.diff @@ -0,0 +1,56 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Sun, 3 Aug 2008 17:13:06 +0000 (+0100) +Subject: ARM: NEON optimised vector_fmul +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=ba46eb14e3be96b627fd096aacaa4dbb2e186281 + +ARM: NEON optimised vector_fmul +--- + +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +index 6c44940..c6fc173 100644 +--- a/libavcodec/armv4l/dsputil_neon.c ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -91,6 +91,7 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); + void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); + ++void ff_vector_fmul_neon(float *dst, const float *src, int len); + void ff_vector_fmul_window_neon(float *dst, const float *src0, + const float *src1, const float *win, + float add_bias, int len); +@@ -168,6 +169,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + c->h264_idct_add = ff_h264_idct_add_neon; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; + ++ c->vector_fmul = ff_vector_fmul_neon; + c->vector_fmul_window = ff_vector_fmul_window_neon; + + c->float_to_int16 = ff_float_to_int16_neon; +diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S +index 49a09b8..7310700 100644 +--- a/libavcodec/armv4l/dsputil_neon_s.S ++++ b/libavcodec/armv4l/dsputil_neon_s.S +@@ -324,6 +324,23 @@ extern ff_float_to_int16_interleave_neon + pop {r4,r5,pc} + .endfunc + ++extern ff_vector_fmul_neon ++ mov r3, r0 ++ vld1.64 {d0-d3}, [r0,:128]! ++ vld1.64 {d4-d7}, [r1,:128]! ++ dmb ++1: subs r2, r2, #8 ++ vmul.f32 q8, q0, q2 ++ vmul.f32 q9, q1, q3 ++ beq 2f ++ vld1.64 {d0-d3}, [r0,:128]! ++ vld1.64 {d4-d7}, [r1,:128]! ++ vst1.64 {d16-d19}, [r3,:128]! ++ b 1b ++2: vst1.64 {d16-d19}, [r3,:128]! ++ bx lr ++ .endfunc ++ + extern ff_vector_fmul_window_neon + vld1.32 {d16[],d17[]}, [sp,:32] + push {r4,r5,lr} diff --git a/packages/mplayer/files/mru-neon-vorbis-inverse.diff b/packages/mplayer/files/mru-neon-vorbis-inverse.diff new file mode 100644 index 0000000000..6cd5dc0134 --- /dev/null +++ b/packages/mplayer/files/mru-neon-vorbis-inverse.diff @@ -0,0 +1,68 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Fri, 1 Aug 2008 02:28:34 +0000 (+0100) +Subject: ARM: NEON optimised vorbis_inverse_coupling +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=ac234c5ad52d8478be5aaa7c276e423873453d8b + +ARM: NEON optimised vorbis_inverse_coupling +--- + +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +index b584e5b..f9d32c0 100644 +--- a/libavcodec/armv4l/dsputil_neon.c ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -94,6 +94,8 @@ void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); + void ff_float_to_int16_neon(int16_t *, const float *, long); + void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); + ++void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); ++ + void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + { + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; +@@ -164,4 +166,8 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) + + c->float_to_int16 = ff_float_to_int16_neon; + c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; ++ ++#ifdef CONFIG_VORBIS_DECODER ++ c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; ++#endif + } +diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S +index 44f75ba..6a54803 100644 +--- a/libavcodec/armv4l/dsputil_neon_s.S ++++ b/libavcodec/armv4l/dsputil_neon_s.S +@@ -19,6 +19,8 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "config.h" ++ + .fpu neon + .text + +@@ -321,3 +323,24 @@ extern ff_float_to_int16_interleave_neon + bne 3b + pop {r4,r5,pc} + .endfunc ++ ++#ifdef CONFIG_VORBIS_DECODER ++extern ff_vorbis_inverse_coupling_neon ++ vmov.i32 q10, #(1<<31) ++ dmb ++1: vld1.64 {d2,d3}, [r1,:128] ++ vld1.64 {d0,d1}, [r0,:128] ++ vcle.f32 q8, q1, #0 ++ vand q9, q0, q10 ++ veor q1, q1, q9 ++ vand q2, q1, q8 ++ vbic q3, q1, q8 ++ vadd.f32 q1, q0, q2 ++ vsub.f32 q0, q0, q3 ++ subs r2, r2, #4 ++ vst1.64 {d0,d1}, [r1,:128]! ++ vst1.64 {d2,d3}, [r0,:128]! ++ bgt 1b ++ bx lr ++ .endfunc ++#endif |