diff options
Diffstat (limited to 'packages/mplayer/files/mru-neon-put-pixels.diff')
-rw-r--r-- | packages/mplayer/files/mru-neon-put-pixels.diff | 376 |
1 files changed, 376 insertions, 0 deletions
diff --git a/packages/mplayer/files/mru-neon-put-pixels.diff b/packages/mplayer/files/mru-neon-put-pixels.diff new file mode 100644 index 0000000000..85650d913b --- /dev/null +++ b/packages/mplayer/files/mru-neon-put-pixels.diff @@ -0,0 +1,376 @@ +From: Mans Rullgard <mans@mansr.com> +Date: Fri, 13 Jun 2008 01:21:58 +0000 (+0100) +Subject: ARM: NEON optimised put_pixels functions +X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=86410ed1948118a29c70946d5294df9feb04dfef + +ARM: NEON optimised put_pixels functions +--- + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index d91185e..27746df 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -433,6 +433,10 @@ ASM_OBJS-$(HAVE_ARMV5TE) += armv4l/simple_idct_armv5te.o \ + + ASM_OBJS-$(HAVE_ARMV6) += armv4l/simple_idct_armv6.o \ + ++OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \ ++ ++ASM_OBJS-$(HAVE_NEON) += armv4l/dsputil_neon_s.o \ ++ + OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \ + sparc/simple_idct_vis.o \ + +diff --git a/libavcodec/armv4l/dsputil_arm.c b/libavcodec/armv4l/dsputil_arm.c +index 100b89e..89b51e7 100644 +--- a/libavcodec/armv4l/dsputil_arm.c ++++ b/libavcodec/armv4l/dsputil_arm.c +@@ -26,6 +26,7 @@ + + extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); + extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx); ++extern void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx); + + extern void j_rev_dct_ARM(DCTELEM *data); + extern void simple_idct_ARM(DCTELEM *data); +@@ -302,4 +303,7 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) + #ifdef HAVE_ARMVFP + ff_float_init_arm_vfp(c, avctx); + #endif ++#ifdef HAVE_NEON ++ ff_dsputil_init_neon(c, avctx); ++#endif + } +diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c +new file mode 100644 +index 0000000..8a10dde +--- /dev/null ++++ b/libavcodec/armv4l/dsputil_neon.c +@@ -0,0 +1,67 @@ ++/* ++ * ARM NEON optimised DSP functions ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include <stdint.h> ++ ++#include "libavcodec/avcodec.h" ++#include "libavcodec/dsputil.h" ++ ++void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); ++ ++void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); ++void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); ++ ++void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) ++{ ++ c->put_pixels_tab[0][0] = ff_put_pixels16_neon; ++ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; ++ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; ++ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; ++ c->put_pixels_tab[1][0] = ff_put_pixels8_neon; ++ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; ++ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; ++ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; ++ ++ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; ++ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; ++ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; ++ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; ++ ++ c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_neon; ++ c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_neon; ++} +diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S +new file mode 100644 +index 0000000..fc5e401 +--- /dev/null ++++ b/libavcodec/armv4l/dsputil_neon_s.S +@@ -0,0 +1,254 @@ ++/* ++ * ARM NEON optimised DSP functions ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ .fpu neon ++ .text ++ ++ .macro put_pixels16 ++ dmb ++1: vld1.64 {d0, d1}, [r1], r2 ++ vld1.64 {d2, d3}, [r1], r2 ++ vld1.64 {d4, d5}, [r1], r2 ++ vld1.64 {d6, d7}, [r1], r2 ++ pld [r1] ++ subs r3, r3, #4 ++ vst1.64 {d0, d1}, [r0,:128], r2 ++ vst1.64 {d2, d3}, [r0,:128], r2 ++ vst1.64 {d4, d5}, [r0,:128], r2 ++ vst1.64 {d6, d7}, [r0,:128], r2 ++ bne 1b ++ bx lr ++ .endm ++ ++ .macro put_pixels16_x2 vhadd=vrhadd.u8 ++ dmb ++1: vld1.64 {d0-d2}, [r1], r2 ++ vld1.64 {d4-d6}, [r1], r2 ++ pld [r1] ++ subs r3, r3, #2 ++ vext.8 q1, q0, q1, #1 ++ vext.8 q3, q2, q3, #1 ++ \vhadd q0, q0, q1 ++ \vhadd q2, q2, q3 ++ vst1.64 {d0, d1}, [r0,:128], r2 ++ vst1.64 {d4, d5}, [r0,:128], r2 ++ bne 1b ++ bx lr ++ .endm ++ ++ .macro put_pixels16_y2 vhadd=vrhadd.u8 ++ push {lr} ++ add ip, r1, r2 ++ lsl lr, r2, #1 ++ vld1.64 {d0, d1}, [r1], lr ++ vld1.64 {d2, d3}, [ip], lr ++ dmb ++1: subs r3, r3, #2 ++ \vhadd q2, q0, q1 ++ vld1.64 {d0, d1}, [r1], lr ++ vst1.64 {d4, d5}, [r0,:128], r2 ++ \vhadd q2, q0, q1 ++ vld1.64 {d2, d3}, [ip], lr ++ vst1.64 {d4, d5}, [r0,:128], r2 ++ bne 1b ++ pop {pc} ++ .endm ++ ++ .macro put_pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 ++ push {lr} ++ lsl lr, r2, #1 ++ add ip, r1, r2 ++ vld1.64 {d0-d2}, [r1], lr ++ vld1.64 {d4-d6}, [ip], lr ++ .if \no_rnd ++ vmov.i16 q13, #1 ++ .endif ++ pld [r1] ++ pld [ip] ++ vext.8 q1, q0, q1, #1 ++ vext.8 q3, q2, q3, #1 ++ vaddl.u8 q8, d0, d2 ++ vaddl.u8 q10, d1, d3 ++ vaddl.u8 q9, d4, d6 ++ vaddl.u8 q11, d5, d7 ++ dmb ++1: subs r3, r3, #2 ++ vld1.64 {d0-d2}, [r1], lr ++ vadd.u16 q12, q8, q9 ++ pld [r1] ++ .if \no_rnd ++ vadd.u16 q12, q12, q13 ++ .endif ++ vext.8 q15, q0, q1, #1 ++ vadd.u16 q1 , q10, q11 ++ \vshrn d28, q12, #2 ++ .if \no_rnd ++ vadd.u16 q1, q1, q13 ++ .endif ++ \vshrn d29, q1, #2 ++ vaddl.u8 q8, d0, d30 ++ vld1.64 {d2-d4}, [ip], lr ++ vaddl.u8 q10, d1, d31 ++ vst1.64 {d28,d29}, [r0,:128], r2 ++ vadd.u16 q12, q8, q9 ++ pld [ip] ++ .if \no_rnd ++ vadd.u16 q12, q12, q13 ++ .endif ++ vext.8 q2, q1, q2, #1 ++ vadd.u16 q0, q10, q11 ++ \vshrn d30, q12, #2 ++ .if \no_rnd ++ vadd.u16 q0, q0, q13 ++ .endif ++ \vshrn d31, q0, #2 ++ vaddl.u8 q9, d2, d4 ++ vaddl.u8 q11, d3, d5 ++ vst1.64 {d30,d31}, [r0,:128], r2 ++ bgt 1b ++ pop {pc} ++ .endm ++ ++ .macro put_pixels8 ++ dmb ++1: vld1.64 {d0}, [r1], r2 ++ vld1.64 {d1}, [r1], r2 ++ vld1.64 {d2}, [r1], r2 ++ vld1.64 {d3}, [r1], r2 ++ subs r3, r3, #4 ++ vst1.64 {d0}, [r0,:64], r2 ++ vst1.64 {d1}, [r0,:64], r2 ++ vst1.64 {d2}, [r0,:64], r2 ++ vst1.64 {d3}, [r0,:64], r2 ++ bne 1b ++ bx lr ++ .endm ++ ++ .macro put_pixels8_x2 vhadd=vrhadd.u8 ++ dmb ++1: vld1.64 {d0, d1}, [r1], r2 ++ vld1.64 {d2, d3}, [r1], r2 ++ pld [r1] ++ subs r3, r3, #2 ++ vext.8 d1, d0, d1, #1 ++ vext.8 d3, d2, d3, #1 ++ vswp d1, d2 ++ \vhadd q0, q0, q1 ++ vst1.64 {d0}, [r0,:64], r2 ++ vst1.64 {d1}, [r0,:64], r2 ++ bne 1b ++ bx lr ++ .endm ++ ++ .macro put_pixels8_y2 vhadd=vrhadd.u8 ++ push {lr} ++ add ip, r1, r2 ++ lsl lr, r2, #1 ++ vld1.64 {d0}, [r1], lr ++ vld1.64 {d1}, [ip], lr ++ dmb ++1: subs r3, r3, #2 ++ \vhadd d4, d0, d1 ++ vld1.64 {d0}, [r1], lr ++ vst1.64 {d4}, [r0,:64], r2 ++ \vhadd d4, d0, d1 ++ vld1.64 {d1}, [ip], lr ++ vst1.64 {d4}, [r0,:64], r2 ++ bne 1b ++ pop {pc} ++ .endm ++ ++ .macro put_pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 ++ push {lr} ++ lsl lr, r2, #1 ++ add ip, r1, r2 ++ vld1.64 {d0, d1}, [r1], lr ++ vld1.64 {d2, d3}, [ip], lr ++ .if \no_rnd ++ vmov.i16 q11, #1 ++ .endif ++ pld [r1] ++ pld [ip] ++ vext.8 d4, d0, d1, #1 ++ vext.8 d6, d2, d3, #1 ++ vaddl.u8 q8, d0, d4 ++ vaddl.u8 q9, d2, d6 ++ dmb ++1: subs r3, r3, #2 ++ vld1.64 {d0, d1}, [r1], lr ++ pld [r1] ++ vadd.u16 q10, q8, q9 ++ vext.8 d4, d0, d1, #1 ++ .if \no_rnd ++ vadd.u16 q10, q10, q11 ++ .endif ++ vaddl.u8 q8, d0, d4 ++ \vshrn d5, q10, #2 ++ vld1.64 {d2, d3}, [ip], lr ++ vadd.u16 q10, q8, q9 ++ pld [ip] ++ .if \no_rnd ++ vadd.u16 q10, q10, q11 ++ .endif ++ vst1.64 {d5}, [r0,:64], r2 ++ \vshrn d7, q10, #2 ++ vext.8 d6, d2, d3, #1 ++ vaddl.u8 q9, d2, d6 ++ vst1.64 {d7}, [r0,:64], r2 ++ bgt 1b ++ pop {pc} ++ .endm ++ ++ .macro extern name ++ .global \name ++ .type \name, %function ++ .func \name ++\name: ++ .endm ++ ++ .macro defun name suf rnd_op args:vararg ++ extern ff_\name\suf\()_neon ++ \name \rnd_op \args ++ .endfunc ++ .endm ++ ++ .macro defun2 name args:vararg ++ defun \name ++ defun \name \args ++ .endm ++ ++ extern ff_put_h264_qpel16_mc00_neon ++ mov r3, #16 ++ .endfunc ++ ++ defun put_pixels16 ++ defun2 put_pixels16_x2, _no_rnd, vhadd.u8 ++ defun2 put_pixels16_y2, _no_rnd, vhadd.u8 ++ defun2 put_pixels16_xy2, _no_rnd, vshrn.u16, 1 ++ ++ extern ff_put_h264_qpel8_mc00_neon ++ mov r3, #8 ++ .endfunc ++ ++ defun put_pixels8 ++ defun2 put_pixels8_x2, _no_rnd, vhadd.u8 ++ defun2 put_pixels8_y2, _no_rnd, vhadd.u8 ++ defun2 put_pixels8_xy2, _no_rnd, vshrn.u16, 1 |