summary refs log tree commit diff
path: root/packages/mplayer/files/mru-neon-put-pixels.diff
diff options
context:
space:
mode:
Diffstat (limited to 'packages/mplayer/files/mru-neon-put-pixels.diff')
-rw-r--r-- packages/mplayer/files/mru-neon-put-pixels.diff 376
1 files changed, 376 insertions, 0 deletions
diff --git a/packages/mplayer/files/mru-neon-put-pixels.diff b/packages/mplayer/files/mru-neon-put-pixels.diff
new file mode 100644
index 0000000000..85650d913b
--- /dev/null
+++ b/packages/mplayer/files/mru-neon-put-pixels.diff
@@ -0,0 +1,376 @@
+From: Mans Rullgard <mans@mansr.com>
+Date: Fri, 13 Jun 2008 01:21:58 +0000 (+0100)
+Subject: ARM: NEON optimised put_pixels functions
+X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=86410ed1948118a29c70946d5294df9feb04dfef
+
+ARM: NEON optimised put_pixels functions
+---
+
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index d91185e..27746df 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -433,6 +433,10 @@ ASM_OBJS-$(HAVE_ARMV5TE) += armv4l/simple_idct_armv5te.o \
+
+ ASM_OBJS-$(HAVE_ARMV6) += armv4l/simple_idct_armv6.o \
+
++OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \
++
++ASM_OBJS-$(HAVE_NEON) += armv4l/dsputil_neon_s.o \
++
+ OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \
+ sparc/simple_idct_vis.o \
+
+diff --git a/libavcodec/armv4l/dsputil_arm.c b/libavcodec/armv4l/dsputil_arm.c
+index 100b89e..89b51e7 100644
+--- a/libavcodec/armv4l/dsputil_arm.c
++++ b/libavcodec/armv4l/dsputil_arm.c
+@@ -26,6 +26,7 @@
+
+ extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
+ extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
++extern void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
+
+ extern void j_rev_dct_ARM(DCTELEM *data);
+ extern void simple_idct_ARM(DCTELEM *data);
+@@ -302,4 +303,7 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
+ #ifdef HAVE_ARMVFP
+ ff_float_init_arm_vfp(c, avctx);
+ #endif
++#ifdef HAVE_NEON
++ ff_dsputil_init_neon(c, avctx);
++#endif
+ }
+diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
+new file mode 100644
+index 0000000..8a10dde
+--- /dev/null
++++ b/libavcodec/armv4l/dsputil_neon.c
+@@ -0,0 +1,67 @@
++/*
++ * ARM NEON optimised DSP functions
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "libavcodec/avcodec.h"
++#include "libavcodec/dsputil.h"
++
++void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); /* the routines declared here are defined in armv4l/dsputil_neon_s.S */
++void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); /* no_rnd variants: same macros expanded with a truncating op */
++void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
++void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
++
++void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); /* three-argument entry points: the assembly sets the row count (16 or 8) itself */
++void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
++
++void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) /* Point the put_pixels, put_no_rnd_pixels and H.264 qpel mc00 slots of c at the NEON routines; avctx is not read here. */
++{
++ c->put_pixels_tab[0][0] = ff_put_pixels16_neon; /* row [0]: 16-wide routines; columns 0..3 = copy, x2, y2, xy2 */
++ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
++ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
++ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
++ c->put_pixels_tab[1][0] = ff_put_pixels8_neon; /* row [1]: 8-wide routines, same column order */
++ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
++ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
++ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
++
++ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; /* column 0 is a plain copy with no averaging, so the same routine serves both tables */
++ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
++ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
++ c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
++ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; /* plain copy again shared with put_pixels_tab */
++ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
++ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
++ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
++
++ c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_neon; /* only entry 0 of each qpel row is replaced by this function */
++ c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_neon;
++}
+diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S
+new file mode 100644
+index 0000000..fc5e401
+--- /dev/null
++++ b/libavcodec/armv4l/dsputil_neon_s.S
+@@ -0,0 +1,254 @@
++/*
++ * ARM NEON optimised DSP functions
++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++ .fpu neon
++ .text
++
++ .macro put_pixels16 /* Copy r3 rows of 16 bytes from [r1] to [r0]; r2 is added to both pointers after each row. */
++ dmb /* NOTE(review): data memory barrier before the loop; its purpose is not evident from this file */
++1: vld1.64 {d0, d1}, [r1], r2 /* four source rows are loaded per iteration */
++ vld1.64 {d2, d3}, [r1], r2
++ vld1.64 {d4, d5}, [r1], r2
++ vld1.64 {d6, d7}, [r1], r2
++ pld [r1] /* preload hint for the next source row */
++ subs r3, r3, #4 /* rows remaining; sets the flags tested by the bne below */
++ vst1.64 {d0, d1}, [r0,:128], r2 /* :128 is an alignment qualifier: the destination address must be 16-byte aligned */
++ vst1.64 {d2, d3}, [r0,:128], r2
++ vst1.64 {d4, d5}, [r0,:128], r2
++ vst1.64 {d6, d7}, [r0,:128], r2
++ bne 1b /* leaves the loop only when r3 is exactly 0, so r3 must be a positive multiple of 4 */
++ bx lr
++ .endm
++
++ .macro put_pixels16_x2 vhadd=vrhadd.u8 /* Horizontal half-pel, 16 wide: out[x] = vhadd(src[x], src[x+1]); default vrhadd.u8 is (a+b+1)>>1, vhadd.u8 is (a+b)>>1. */
++ dmb
++1: vld1.64 {d0-d2}, [r1], r2 /* 24 bytes are loaded per row so that pixel 16 is available */
++ vld1.64 {d4-d6}, [r1], r2
++ pld [r1]
++ subs r3, r3, #2 /* two rows per iteration; flags are consumed by the bne below */
++ vext.8 q1, q0, q1, #1 /* q1 = bytes 1..16 of the first row, i.e. the row shifted by one pixel */
++ vext.8 q3, q2, q3, #1 /* same for the second row */
++ \vhadd q0, q0, q1
++ \vhadd q2, q2, q3
++ vst1.64 {d0, d1}, [r0,:128], r2
++ vst1.64 {d4, d5}, [r0,:128], r2
++ bne 1b /* leaves the loop only when r3 is exactly 0, so r3 must be a positive even number */
++ bx lr
++ .endm
++
++ .macro put_pixels16_y2 vhadd=vrhadd.u8 /* Vertical half-pel, 16 wide: each output row is vhadd of two consecutive source rows. */
++ push {lr} /* lr is reused as a scratch register below */
++ add ip, r1, r2 /* ip walks the odd source rows, r1 the even ones */
++ lsl lr, r2, #1 /* lr = 2 * r2, the step for both row pointers */
++ vld1.64 {d0, d1}, [r1], lr /* q0 = row 0 */
++ vld1.64 {d2, d3}, [ip], lr /* q1 = row 1 */
++ dmb
++1: subs r3, r3, #2 /* two output rows per iteration */
++ \vhadd q2, q0, q1 /* even row combined with the odd row below it */
++ vld1.64 {d0, d1}, [r1], lr /* next even row */
++ vst1.64 {d4, d5}, [r0,:128], r2
++ \vhadd q2, q0, q1 /* that odd row combined with the new even row below it */
++ vld1.64 {d2, d3}, [ip], lr /* next odd row; NOTE(review): on the last iteration this row is loaded but never used */
++ vst1.64 {d4, d5}, [r0,:128], r2
++ bne 1b
++ pop {pc} /* return by popping the saved lr into pc */
++ .endm
++
++ .macro put_pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 /* Diagonal half-pel, 16 wide: each output byte is the sum of a 2x2 source neighbourhood narrowed with a right shift by 2. */
++ push {lr} /* lr is reused as a scratch register below */
++ lsl lr, r2, #1 /* lr = 2 * r2, the step for both row pointers */
++ add ip, r1, r2 /* ip walks the odd source rows, r1 the even ones */
++ vld1.64 {d0-d2}, [r1], lr /* row 0, 24 bytes so that pixel 16 is available */
++ vld1.64 {d4-d6}, [ip], lr /* row 1 */
++ .if \no_rnd
++ vmov.i16 q13, #1 /* bias of 1 per 16-bit lane, added before the truncating shift: (sum+1)>>2 */
++ .endif
++ pld [r1]
++ pld [ip]
++ vext.8 q1, q0, q1, #1 /* row 0 shifted by one pixel */
++ vext.8 q3, q2, q3, #1 /* row 1 shifted by one pixel */
++ vaddl.u8 q8, d0, d2 /* q8/q10 = widened src[x]+src[x+1] for the low/high 8 pixels of the even row */
++ vaddl.u8 q10, d1, d3
++ vaddl.u8 q9, d4, d6 /* q9/q11 = the same pair sums for the odd row */
++ vaddl.u8 q11, d5, d7
++ dmb
++1: subs r3, r3, #2 /* two output rows per iteration; flags are consumed by the bgt at the end */
++ vld1.64 {d0-d2}, [r1], lr /* next even row */
++ vadd.u16 q12, q8, q9 /* 2x2 sums, low 8 pixels */
++ pld [r1]
++ .if \no_rnd
++ vadd.u16 q12, q12, q13
++ .endif
++ vext.8 q15, q0, q1, #1 /* new even row shifted by one pixel */
++ vadd.u16 q1 , q10, q11 /* 2x2 sums, high 8 pixels */
++ \vshrn d28, q12, #2
++ .if \no_rnd
++ vadd.u16 q1, q1, q13
++ .endif
++ \vshrn d29, q1, #2
++ vaddl.u8 q8, d0, d30 /* refresh the even-row pair sums from the row just loaded */
++ vld1.64 {d2-d4}, [ip], lr /* next odd row; NOTE(review): on the last iteration this row is loaded and summed but never stored */
++ vaddl.u8 q10, d1, d31
++ vst1.64 {d28,d29}, [r0,:128], r2
++ vadd.u16 q12, q8, q9 /* new even row combined with the previous odd row */
++ pld [ip]
++ .if \no_rnd
++ vadd.u16 q12, q12, q13
++ .endif
++ vext.8 q2, q1, q2, #1 /* new odd row shifted by one pixel */
++ vadd.u16 q0, q10, q11
++ \vshrn d30, q12, #2
++ .if \no_rnd
++ vadd.u16 q0, q0, q13
++ .endif
++ \vshrn d31, q0, #2
++ vaddl.u8 q9, d2, d4 /* refresh the odd-row pair sums */
++ vaddl.u8 q11, d3, d5
++ vst1.64 {d30,d31}, [r0,:128], r2
++ bgt 1b
++ pop {pc} /* return by popping the saved lr into pc */
++ .endm
++
++ .macro put_pixels8 /* Copy r3 rows of 8 bytes from [r1] to [r0]; r2 is added to both pointers after each row. */
++ dmb /* NOTE(review): data memory barrier before the loop; its purpose is not evident from this file */
++1: vld1.64 {d0}, [r1], r2 /* four source rows are loaded per iteration */
++ vld1.64 {d1}, [r1], r2
++ vld1.64 {d2}, [r1], r2
++ vld1.64 {d3}, [r1], r2
++ subs r3, r3, #4 /* rows remaining; sets the flags tested by the bne below */
++ vst1.64 {d0}, [r0,:64], r2 /* :64 is an alignment qualifier: the destination address must be 8-byte aligned */
++ vst1.64 {d1}, [r0,:64], r2
++ vst1.64 {d2}, [r0,:64], r2
++ vst1.64 {d3}, [r0,:64], r2
++ bne 1b /* leaves the loop only when r3 is exactly 0, so r3 must be a positive multiple of 4 */
++ bx lr
++ .endm
++
++ .macro put_pixels8_x2 vhadd=vrhadd.u8 /* Horizontal half-pel, 8 wide: out[x] = vhadd(src[x], src[x+1]); default vrhadd.u8 is (a+b+1)>>1, vhadd.u8 is (a+b)>>1. */
++ dmb
++1: vld1.64 {d0, d1}, [r1], r2 /* 16 bytes are loaded per row so that pixel 8 is available */
++ vld1.64 {d2, d3}, [r1], r2
++ pld [r1]
++ subs r3, r3, #2 /* two rows per iteration; flags are consumed by the bne below */
++ vext.8 d1, d0, d1, #1 /* d1 = first row shifted by one pixel */
++ vext.8 d3, d2, d3, #1 /* d3 = second row shifted by one pixel */
++ vswp d1, d2 /* regroup so that q0 = both rows and q1 = both shifted rows */
++ \vhadd q0, q0, q1 /* one q-wide op averages both rows: d0 = first output row, d1 = second */
++ vst1.64 {d0}, [r0,:64], r2
++ vst1.64 {d1}, [r0,:64], r2
++ bne 1b /* leaves the loop only when r3 is exactly 0, so r3 must be a positive even number */
++ bx lr
++ .endm
++
++ .macro put_pixels8_y2 vhadd=vrhadd.u8 /* Vertical half-pel, 8 wide: each output row is vhadd of two consecutive source rows. */
++ push {lr} /* lr is reused as a scratch register below */
++ add ip, r1, r2 /* ip walks the odd source rows, r1 the even ones */
++ lsl lr, r2, #1 /* lr = 2 * r2, the step for both row pointers */
++ vld1.64 {d0}, [r1], lr /* d0 = row 0 */
++ vld1.64 {d1}, [ip], lr /* d1 = row 1 */
++ dmb
++1: subs r3, r3, #2 /* two output rows per iteration */
++ \vhadd d4, d0, d1 /* even row combined with the odd row below it */
++ vld1.64 {d0}, [r1], lr /* next even row */
++ vst1.64 {d4}, [r0,:64], r2
++ \vhadd d4, d0, d1 /* that odd row combined with the new even row below it */
++ vld1.64 {d1}, [ip], lr /* next odd row; NOTE(review): on the last iteration this row is loaded but never used */
++ vst1.64 {d4}, [r0,:64], r2
++ bne 1b
++ pop {pc} /* return by popping the saved lr into pc */
++ .endm
++
++ .macro put_pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 /* Diagonal half-pel, 8 wide: each output byte is the sum of a 2x2 source neighbourhood narrowed with a right shift by 2. */
++ push {lr} /* lr is reused as a scratch register below */
++ lsl lr, r2, #1 /* lr = 2 * r2, the step for both row pointers */
++ add ip, r1, r2 /* ip walks the odd source rows, r1 the even ones */
++ vld1.64 {d0, d1}, [r1], lr /* row 0, 16 bytes so that pixel 8 is available */
++ vld1.64 {d2, d3}, [ip], lr /* row 1 */
++ .if \no_rnd
++ vmov.i16 q11, #1 /* bias of 1 per 16-bit lane, added before the truncating shift: (sum+1)>>2 */
++ .endif
++ pld [r1]
++ pld [ip]
++ vext.8 d4, d0, d1, #1 /* row 0 shifted by one pixel */
++ vext.8 d6, d2, d3, #1 /* row 1 shifted by one pixel */
++ vaddl.u8 q8, d0, d4 /* q8 = widened src[x]+src[x+1] for the even row */
++ vaddl.u8 q9, d2, d6 /* q9 = the same pair sums for the odd row */
++ dmb
++1: subs r3, r3, #2 /* two output rows per iteration; flags are consumed by the bgt at the end */
++ vld1.64 {d0, d1}, [r1], lr /* next even row */
++ pld [r1]
++ vadd.u16 q10, q8, q9 /* 2x2 sums for the first output row */
++ vext.8 d4, d0, d1, #1
++ .if \no_rnd
++ vadd.u16 q10, q10, q11
++ .endif
++ vaddl.u8 q8, d0, d4 /* refresh the even-row pair sums from the row just loaded */
++ \vshrn d5, q10, #2
++ vld1.64 {d2, d3}, [ip], lr /* next odd row; NOTE(review): on the last iteration this row is loaded and summed but never stored */
++ vadd.u16 q10, q8, q9 /* new even row combined with the previous odd row */
++ pld [ip]
++ .if \no_rnd
++ vadd.u16 q10, q10, q11
++ .endif
++ vst1.64 {d5}, [r0,:64], r2
++ \vshrn d7, q10, #2
++ vext.8 d6, d2, d3, #1
++ vaddl.u8 q9, d2, d6 /* refresh the odd-row pair sums */
++ vst1.64 {d7}, [r0,:64], r2
++ bgt 1b
++ pop {pc} /* return by popping the saved lr into pc */
++ .endm
++
++ .macro extern name /* Export name as a global function symbol and emit its label. */
++ .global \name
++ .type \name, %function
++ .func \name /* left open here; the user of this macro closes it with .endfunc */
++\name:
++ .endm
++
++ .macro defun name suf rnd_op args:vararg /* Emit function ff_<name><suf>_neon whose body is macro <name> expanded with rnd_op and args. */
++ extern ff_\name\suf\()_neon
++ \name \rnd_op \args
++ .endfunc /* closes the .func opened by the extern macro */
++ .endm
++
++ .macro defun2 name args:vararg /* Emit two functions from macro <name>: one with its default arguments, then one with args. */
++ defun \name
++ defun \name \args
++ .endm
++
++ extern ff_put_h264_qpel16_mc00_neon
++ mov r3, #16 /* row count for the copy loop; there is no return, so execution falls through into ff_put_pixels16_neon emitted next */
++ .endfunc /* closes only the .func region; the put_pixels16 expansion must stay directly after this */
++
++ defun put_pixels16
++ defun2 put_pixels16_x2, _no_rnd, vhadd.u8
++ defun2 put_pixels16_y2, _no_rnd, vhadd.u8
++ defun2 put_pixels16_xy2, _no_rnd, vshrn.u16, 1
++
++ extern ff_put_h264_qpel8_mc00_neon
++ mov r3, #8 /* row count for the copy loop; falls through into ff_put_pixels8_neon emitted next */
++ .endfunc /* closes only the .func region; the put_pixels8 expansion must stay directly after this */
++
++ defun put_pixels8
++ defun2 put_pixels8_x2, _no_rnd, vhadd.u8
++ defun2 put_pixels8_y2, _no_rnd, vhadd.u8
++ defun2 put_pixels8_xy2, _no_rnd, vshrn.u16, 1