Diffstat (limited to 'packages/mplayer/files/mru-neon-h264-chrome.diff')
-rw-r--r-- | packages/mplayer/files/mru-neon-h264-chrome.diff | 364
1 file changed, 364 insertions, 0 deletions
diff --git a/packages/mplayer/files/mru-neon-h264-chrome.diff b/packages/mplayer/files/mru-neon-h264-chrome.diff
new file mode 100644
index 0000000000..cb6c4ff991
--- /dev/null
+++ b/packages/mplayer/files/mru-neon-h264-chrome.diff
@@ -0,0 +1,364 @@
+From: Mans Rullgard <mans@mansr.com>
+Date: Fri, 11 Jul 2008 01:20:07 +0000 (+0100)
+Subject: ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
+X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=d3aa8f93b8a0061e0c3ac12aeed055961abfc113
+
+ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
+---
+
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index 7fa02fa..36ba158 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -437,6 +437,7 @@ OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \
+
+ ASM_OBJS-$(HAVE_NEON) += armv4l/dsputil_neon_s.o \
+ armv4l/simple_idct_neon.o \
++ armv4l/h264dsp_neon.o \
+
+ OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \
+ sparc/simple_idct_vis.o \
+diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
+index 8a10dde..a6d86cd 100644
+--- a/libavcodec/armv4l/dsputil_neon.c
++++ b/libavcodec/armv4l/dsputil_neon.c
+@@ -42,6 +42,12 @@ void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+ void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+ void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
+
++void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
++void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
++
++void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
++void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
++
+ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+ {
+     c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+@@ -62,6 +68,12 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+     c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+     c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
++    c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
++    c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
++
++    c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
++    c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
++
+     c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_neon;
+     c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_neon;
+ }
+diff --git a/libavcodec/armv4l/h264dsp_neon.S b/libavcodec/armv4l/h264dsp_neon.S
+new file mode 100644
+index 0000000..28d9aa7
+--- /dev/null
++++ b/libavcodec/armv4l/h264dsp_neon.S
+@@ -0,0 +1,308 @@
++/*
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++ .fpu neon
++
++/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
++ .macro h264_chroma_mc8 avg=0
++ push {r4-r7, lr}
++ ldrd r4, [sp, #20]
++.if \avg
++ mov lr, r0
++.endif
++ pld [r1]
++ pld [r1, r2]
++
++ muls r7, r4, r5
++ rsb r6, r7, r5, lsl #3
++ rsb ip, r7, r4, lsl #3
++ sub r4, r7, r4, lsl #3
++ sub r4, r4, r5, lsl #3
++ add r4, r4, #64
++
++ dmb
++
++ beq 2f
++
++ add r5, r1, r2
++
++ vdup.8 d0, r4
++ lsl r4, r2, #1
++ vdup.8 d1, ip
++ vld1.64 {d4, d5}, [r1], r4
++ vdup.8 d2, r6
++ vld1.64 {d6, d7}, [r5], r4
++ vdup.8 d3, r7
++
++ vext.8 d5, d4, d5, #1
++ vext.8 d7, d6, d7, #1
++
++1: pld [r5]
++ vmull.u8 q8, d4, d0
++ vmlal.u8 q8, d5, d1
++ vld1.64 {d4, d5}, [r1], r4
++ vmlal.u8 q8, d6, d2
++ vext.8 d5, d4, d5, #1
++ vmlal.u8 q8, d7, d3
++ vmull.u8 q9, d6, d0
++ subs r3, r3, #2
++ vmlal.u8 q9, d7, d1
++ vmlal.u8 q9, d4, d2
++ vmlal.u8 q9, d5, d3
++ vrshrn.u16 d16, q8, #6
++ vld1.64 {d6, d7}, [r5], r4
++ pld [r1]
++ vrshrn.u16 d17, q9, #6
++.if \avg
++ vld1.64 {d20}, [lr,:64], r2
++ vld1.64 {d21}, [lr,:64], r2
++ vrhadd.u8 q8, q8, q10
++.endif
++ vext.8 d7, d6, d7, #1
++ vst1.64 {d16}, [r0,:64], r2
++ vst1.64 {d17}, [r0,:64], r2
++ bgt 1b
++
++ pop {r4-r7, pc}
++
++2: tst r6, r6
++ add ip, ip, r6
++ vdup.8 d0, r4
++ vdup.8 d1, ip
++
++ beq 4f
++
++ add r5, r1, r2
++ lsl r4, r2, #1
++ vld1.64 {d4}, [r1], r4
++ vld1.64 {d6}, [r5], r4
++
++3: pld [r5]
++ vmull.u8 q8, d4, d0
++ vmlal.u8 q8, d6, d1
++ vld1.64 {d4}, [r1], r4
++ vmull.u8 q9, d6, d0
++ vmlal.u8 q9, d4, d1
++ vld1.64 {d6}, [r5], r4
++ vrshrn.u16 d16, q8, #6
++ vrshrn.u16 d17, q9, #6
++.if \avg
++ vld1.64 {d20}, [lr,:64], r2
++ vld1.64 {d21}, [lr,:64], r2
++ vrhadd.u8 q8, q8, q10
++.endif
++ subs r3, r3, #2
++ pld [r1]
++ vst1.64 {d16}, [r0,:64], r2
++ vst1.64 {d17}, [r0,:64], r2
++ bgt 3b
++
++ pop {r4-r7, pc}
++
++4: vld1.64 {d4, d5}, [r1], r2
++ vld1.64 {d6, d7}, [r1], r2
++ vext.8 d5, d4, d5, #1
++ vext.8 d7, d6, d7, #1
++
++5: pld [r1]
++ subs r3, r3, #2
++ vmull.u8 q8, d4, d0
++ vmlal.u8 q8, d5, d1
++ vld1.64 {d4, d5}, [r1], r2
++ vmull.u8 q9, d6, d0
++ vmlal.u8 q9, d7, d1
++ pld [r1]
++ vext.8 d5, d4, d5, #1
++ vrshrn.u16 d16, q8, #6
++ vrshrn.u16 d17, q9, #6
++.if \avg
++ vld1.64 {d20}, [lr,:64], r2
++ vld1.64 {d21}, [lr,:64], r2
++ vrhadd.u8 q8, q8, q10
++.endif
++ vld1.64 {d6, d7}, [r1], r2
++ vext.8 d7, d6, d7, #1
++ vst1.64 {d16}, [r0,:64], r2
++ vst1.64 {d17}, [r0,:64], r2
++ bgt 5b
++
++ pop {r4-r7, pc}
++ .endm
++
++/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
++ .macro h264_chroma_mc4 avg=0
++ push {r4-r7, lr}
++ ldrd r4, [sp, #20]
++.if \avg
++ mov lr, r0
++.endif
++ pld [r1]
++ pld [r1, r2]
++
++ muls r7, r4, r5
++ rsb r6, r7, r5, lsl #3
++ rsb ip, r7, r4, lsl #3
++ sub r4, r7, r4, lsl #3
++ sub r4, r4, r5, lsl #3
++ add r4, r4, #64
++
++ dmb
++
++ beq 2f
++
++ add r5, r1, r2
++
++ vdup.8 d0, r4
++ lsl r4, r2, #1
++ vdup.8 d1, ip
++ vld1.64 {d4}, [r1], r4
++ vdup.8 d2, r6
++ vld1.64 {d6}, [r5], r4
++ vdup.8 d3, r7
++
++ vext.8 d5, d4, d5, #1
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d4, d5
++ vtrn.32 d6, d7
++
++ vtrn.32 d0, d1
++ vtrn.32 d2, d3
++
++1: pld [r5]
++ vmull.u8 q8, d4, d0
++ vmlal.u8 q8, d6, d2
++ vld1.64 {d4}, [r1], r4
++ vext.8 d5, d4, d5, #1
++ vtrn.32 d4, d5
++ vmull.u8 q9, d6, d0
++ vmlal.u8 q9, d4, d2
++ vld1.64 {d6}, [r5], r4
++ vadd.i16 d16, d16, d17
++ vadd.i16 d17, d18, d19
++ vrshrn.u16 d16, q8, #6
++ subs r3, r3, #2
++ pld [r1]
++.if \avg
++ vld1.32 {d20[0]}, [lr,:32], r2
++ vld1.32 {d20[1]}, [lr,:32], r2
++ vrhadd.u8 d16, d16, d20
++.endif
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d6, d7
++ vst1.32 {d16[0]}, [r0,:32], r2
++ vst1.32 {d16[1]}, [r0,:32], r2
++ bgt 1b
++
++ pop {r4-r7, pc}
++
++2: tst r6, r6
++ add ip, ip, r6
++ vdup.8 d0, r4
++ vdup.8 d1, ip
++ vtrn.32 d0, d1
++
++ beq 4f
++
++ vext.32 d1, d0, d1, #1
++ add r5, r1, r2
++ lsl r4, r2, #1
++ vld1.32 {d4[0]}, [r1], r4
++ vld1.32 {d4[1]}, [r5], r4
++
++3: pld [r5]
++ vmull.u8 q8, d4, d0
++ vld1.32 {d4[0]}, [r1], r4
++ vmull.u8 q9, d4, d1
++ vld1.32 {d4[1]}, [r5], r4
++ vadd.i16 d16, d16, d17
++ vadd.i16 d17, d18, d19
++ vrshrn.u16 d16, q8, #6
++.if \avg
++ vld1.32 {d20[0]}, [lr,:32], r2
++ vld1.32 {d20[1]}, [lr,:32], r2
++ vrhadd.u8 d16, d16, d20
++.endif
++ subs r3, r3, #2
++ pld [r1]
++ vst1.32 {d16[0]}, [r0,:32], r2
++ vst1.32 {d16[1]}, [r0,:32], r2
++ bgt 3b
++
++ pop {r4-r7, pc}
++
++4: vld1.64 {d4}, [r1], r2
++ vld1.64 {d6}, [r1], r2
++ vext.8 d5, d4, d5, #1
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d4, d5
++ vtrn.32 d6, d7
++
++5: vmull.u8 q8, d4, d0
++ vmull.u8 q9, d6, d0
++ subs r3, r3, #2
++ vld1.64 {d4}, [r1], r2
++ vext.8 d5, d4, d5, #1
++ vtrn.32 d4, d5
++ vadd.i16 d16, d16, d17
++ vadd.i16 d17, d18, d19
++ pld [r1]
++ vrshrn.u16 d16, q8, #6
++.if \avg
++ vld1.32 {d20[0]}, [lr,:32], r2
++ vld1.32 {d20[1]}, [lr,:32], r2
++ vrhadd.u8 d16, d16, d20
++.endif
++ vld1.64 {d6}, [r1], r2
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d6, d7
++ pld [r1]
++ vst1.32 {d16[0]}, [r0,:32], r2
++ vst1.32 {d16[1]}, [r0,:32], r2
++ bgt 5b
++
++ pop {r4-r7, pc}
++ .endm
++
++ .text
++ .align
++
++ .global ff_put_h264_chroma_mc8_neon
++ .func ff_put_h264_chroma_mc8_neon
++ff_put_h264_chroma_mc8_neon:
++ h264_chroma_mc8
++ .endfunc
++
++ .global ff_avg_h264_chroma_mc8_neon
++ .func ff_avg_h264_chroma_mc8_neon
++ff_avg_h264_chroma_mc8_neon:
++ h264_chroma_mc8 avg=1
++ .endfunc
++
++ .global ff_put_h264_chroma_mc4_neon
++ .func ff_put_h264_chroma_mc4_neon
++ff_put_h264_chroma_mc4_neon:
++ h264_chroma_mc4
++ .endfunc
++
++ .global ff_avg_h264_chroma_mc4_neon
++ .func ff_avg_h264_chroma_mc4_neon
++ff_avg_h264_chroma_mc4_neon:
++ h264_chroma_mc4 avg=1
++ .endfunc
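
Note on the computation (not part of the patch): the h264_chroma_mc8 and h264_chroma_mc4 macros evaluate the standard H.264 chroma bilinear filter. The scalar setup builds the four tap weights A = (8-x)(8-y) in r4, B = x(8-y) in ip, C = (8-x)y in r6 and D = xy in r7, broadcasts them with vdup.8, accumulates the taps with vmull/vmlal, and the rounding narrowing shift vrshrn.u16 #6 performs the final "(... + 32) >> 6". A minimal scalar sketch of the same operation in C, for illustration only (the function name and the block_w parameter are hypothetical, not taken from the patch):

#include <stdint.h>

/* Reference bilinear chroma MC: block_w is 8 or 4, matching the mc8/mc4 variants. */
static void chroma_mc_ref(uint8_t *dst, const uint8_t *src,
                          int stride, int block_w, int h, int x, int y)
{
    /* Tap weights derived from the 1/8-pel offsets x, y in 0..7; these are
     * the values the assembly places in r4, ip, r6 and r7. */
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < block_w; j++) {
            /* vmull/vmlal accumulate the four taps; vrshrn.u16 #6 is the
             * rounded narrowing shift written out below. */
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        }
        dst += stride;
        src += stride;
    }
}

The avg_ variants additionally round-average this result with the bytes already at dst (the vrhadd.u8 step), i.e. dst[j] = (dst[j] + filtered + 1) >> 1, and the branches to labels 2: and 4: take cheaper two-tap or copy paths when x or y is zero, cases the sketch above handles with zero-valued weights.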