From: Mans Rullgard
Date: Fri, 13 Jun 2008 01:21:58 +0000 (+0100)
Subject: ARM: NEON optimised put_pixels functions
X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=86410ed1948118a29c70946d5294df9feb04dfef

ARM: NEON optimised put_pixels functions
---

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index d91185e..27746df 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -433,6 +433,10 @@ ASM_OBJS-$(HAVE_ARMV5TE)      += armv4l/simple_idct_armv5te.o \
 
 ASM_OBJS-$(HAVE_ARMV6)        += armv4l/simple_idct_armv6.o \
 
+OBJS-$(HAVE_NEON)             += armv4l/dsputil_neon.o \
+
+ASM_OBJS-$(HAVE_NEON)         += armv4l/dsputil_neon_s.o \
+
 OBJS-$(HAVE_VIS)              += sparc/dsputil_vis.o \
                                  sparc/simple_idct_vis.o \
diff --git a/libavcodec/armv4l/dsputil_arm.c b/libavcodec/armv4l/dsputil_arm.c
index 100b89e..89b51e7 100644
--- a/libavcodec/armv4l/dsputil_arm.c
+++ b/libavcodec/armv4l/dsputil_arm.c
@@ -26,6 +26,7 @@
 extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
 extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
+extern void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
 
 extern void j_rev_dct_ARM(DCTELEM *data);
 extern void simple_idct_ARM(DCTELEM *data);
@@ -302,4 +303,7 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
 #ifdef HAVE_ARMVFP
     ff_float_init_arm_vfp(c, avctx);
 #endif
+#ifdef HAVE_NEON
+    ff_dsputil_init_neon(c, avctx);
+#endif
 }
diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
new file mode 100644
index 0000000..8a10dde
--- /dev/null
+++ b/libavcodec/armv4l/dsputil_neon.c
@@ -0,0 +1,67 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+
+void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+
+void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
+
+void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+{
+    c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+    c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+    c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_neon;
+    c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_neon;
+}
diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S
new file mode 100644
index 0000000..fc5e401
--- /dev/null
+++ b/libavcodec/armv4l/dsputil_neon_s.S
@@ -0,0 +1,254 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+        .fpu    neon
+        .text
+
+        .macro  put_pixels16
+        dmb
+1:      vld1.64 {d0, d1},  [r1], r2
+        vld1.64 {d2, d3},  [r1], r2
+        vld1.64 {d4, d5},  [r1], r2
+        vld1.64 {d6, d7},  [r1], r2
+        pld     [r1]
+        subs    r3, r3, #4
+        vst1.64 {d0, d1},  [r0,:128], r2
+        vst1.64 {d2, d3},  [r0,:128], r2
+        vst1.64 {d4, d5},  [r0,:128], r2
+        vst1.64 {d6, d7},  [r0,:128], r2
+        bne     1b
+        bx      lr
+        .endm
+
+        .macro  put_pixels16_x2 vhadd=vrhadd.u8
+        dmb
+1:      vld1.64 {d0-d2},   [r1], r2
+        vld1.64 {d4-d6},   [r1], r2
+        pld     [r1]
+        subs    r3, r3, #2
+        vext.8  q1, q0, q1, #1
+        vext.8  q3, q2, q3, #1
+        \vhadd  q0, q0, q1
+        \vhadd  q2, q2, q3
+        vst1.64 {d0, d1},  [r0,:128], r2
+        vst1.64 {d4, d5},  [r0,:128], r2
+        bne     1b
+        bx      lr
+        .endm
+
+        .macro  put_pixels16_y2 vhadd=vrhadd.u8
+        push    {lr}
+        add     ip, r1, r2
+        lsl     lr, r2, #1
+        vld1.64 {d0, d1},  [r1], lr
+        vld1.64 {d2, d3},  [ip], lr
+        dmb
+1:      subs    r3, r3, #2
+        \vhadd  q2, q0, q1
+        vld1.64 {d0, d1},  [r1], lr
+        vst1.64 {d4, d5},  [r0,:128], r2
+        \vhadd  q2, q0, q1
+        vld1.64 {d2, d3},  [ip], lr
+        vst1.64 {d4, d5},  [r0,:128], r2
+        bne     1b
+        pop     {pc}
+        .endm
+
+        .macro  put_pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
+        push    {lr}
+        lsl     lr, r2, #1
+        add     ip, r1, r2
+        vld1.64 {d0-d2},   [r1], lr
+        vld1.64 {d4-d6},   [ip], lr
+        .if \no_rnd
+        vmov.i16 q13, #1
+        .endif
+        pld     [r1]
+        pld     [ip]
+        vext.8  q1, q0, q1, #1
+        vext.8  q3, q2, q3, #1
+        vaddl.u8  q8,  d0, d2
+        vaddl.u8  q10, d1, d3
+        vaddl.u8  q9,  d4, d6
+        vaddl.u8  q11, d5, d7
+        dmb
+1:      subs    r3, r3, #2
+        vld1.64 {d0-d2},   [r1], lr
+        vadd.u16  q12, q8, q9
+        pld     [r1]
+        .if \no_rnd
+        vadd.u16  q12, q12, q13
+        .endif
+        vext.8  q15, q0, q1, #1
+        vadd.u16  q1, q10, q11
+        \vshrn  d28, q12, #2
+        .if \no_rnd
+        vadd.u16  q1, q1, q13
+        .endif
+        \vshrn  d29, q1, #2
+        vaddl.u8  q8, d0, d30
+        vld1.64 {d2-d4},   [ip], lr
+        vaddl.u8  q10, d1, d31
+        vst1.64 {d28,d29}, [r0,:128], r2
+        vadd.u16  q12, q8, q9
+        pld     [ip]
+        .if \no_rnd
+        vadd.u16  q12, q12, q13
+        .endif
+        vext.8  q2, q1, q2, #1
+        vadd.u16  q0, q10, q11
+        \vshrn  d30, q12, #2
+        .if \no_rnd
+        vadd.u16  q0, q0, q13
+        .endif
+        \vshrn  d31, q0, #2
+        vaddl.u8  q9,  d2, d4
+        vaddl.u8  q11, d3, d5
+        vst1.64 {d30,d31}, [r0,:128], r2
+        bgt     1b
+        pop     {pc}
+        .endm
+
+        .macro  put_pixels8
+        dmb
+1:      vld1.64 {d0}, [r1], r2
+        vld1.64 {d1}, [r1], r2
+        vld1.64 {d2}, [r1], r2
+        vld1.64 {d3}, [r1], r2
+        subs    r3, r3, #4
+        vst1.64 {d0}, [r0,:64], r2
+        vst1.64 {d1}, [r0,:64], r2
+        vst1.64 {d2}, [r0,:64], r2
+        vst1.64 {d3}, [r0,:64], r2
+        bne     1b
+        bx      lr
+        .endm
+
+        .macro  put_pixels8_x2 vhadd=vrhadd.u8
+        dmb
+1:      vld1.64 {d0, d1},  [r1], r2
+        vld1.64 {d2, d3},  [r1], r2
+        pld     [r1]
+        subs    r3, r3, #2
+        vext.8  d1, d0, d1, #1
+        vext.8  d3, d2, d3, #1
+        vswp    d1, d2
+        \vhadd  q0, q0, q1
+        vst1.64 {d0}, [r0,:64], r2
+        vst1.64 {d1}, [r0,:64], r2
+        bne     1b
+        bx      lr
+        .endm
+
+        .macro  put_pixels8_y2 vhadd=vrhadd.u8
+        push    {lr}
+        add     ip, r1, r2
+        lsl     lr, r2, #1
+        vld1.64 {d0}, [r1], lr
+        vld1.64 {d1}, [ip], lr
+        dmb
+1:      subs    r3, r3, #2
+        \vhadd  d4, d0, d1
+        vld1.64 {d0}, [r1], lr
+        vst1.64 {d4}, [r0,:64], r2
+        \vhadd  d4, d0, d1
+        vld1.64 {d1}, [ip], lr
+        vst1.64 {d4}, [r0,:64], r2
+        bne     1b
+        pop     {pc}
+        .endm
+
+        .macro  put_pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
+        push    {lr}
+        lsl     lr, r2, #1
+        add     ip, r1, r2
+        vld1.64 {d0, d1},  [r1], lr
+        vld1.64 {d2, d3},  [ip], lr
+        .if \no_rnd
+        vmov.i16 q11, #1
+        .endif
+        pld     [r1]
+        pld     [ip]
+        vext.8  d4, d0, d1, #1
+        vext.8  d6, d2, d3, #1
+        vaddl.u8  q8, d0, d4
+        vaddl.u8  q9, d2, d6
+        dmb
+1:      subs    r3, r3, #2
+        vld1.64 {d0, d1},  [r1], lr
+        pld     [r1]
+        vadd.u16  q10, q8, q9
+        vext.8  d4, d0, d1, #1
+        .if \no_rnd
+        vadd.u16  q10, q10, q11
+        .endif
+        vaddl.u8  q8, d0, d4
+        \vshrn  d5, q10, #2
+        vld1.64 {d2, d3},  [ip], lr
+        vadd.u16  q10, q8, q9
+        pld     [ip]
+        .if \no_rnd
+        vadd.u16  q10, q10, q11
+        .endif
+        vst1.64 {d5}, [r0,:64], r2
+        \vshrn  d7, q10, #2
+        vext.8  d6, d2, d3, #1
+        vaddl.u8  q9, d2, d6
+        vst1.64 {d7}, [r0,:64], r2
+        bgt     1b
+        pop     {pc}
+        .endm
+
+        .macro  extern name
+        .global \name
+        .type   \name, %function
+        .func   \name
+\name:
+        .endm
+
+        .macro  defun name suf rnd_op args:vararg
+        extern  ff_\name\suf\()_neon
+        \name   \rnd_op \args
+        .endfunc
+        .endm
+
+        .macro  defun2 name args:vararg
+        defun   \name
+        defun   \name \args
+        .endm
+
+        extern  ff_put_h264_qpel16_mc00_neon
+        mov     r3, #16
+        .endfunc
+
+        defun   put_pixels16
+        defun2  put_pixels16_x2,  _no_rnd, vhadd.u8
+        defun2  put_pixels16_y2,  _no_rnd, vhadd.u8
+        defun2  put_pixels16_xy2, _no_rnd, vshrn.u16, 1
+
+        extern  ff_put_h264_qpel8_mc00_neon
+        mov     r3, #8
+        .endfunc
+
+        defun   put_pixels8
+        defun2  put_pixels8_x2,  _no_rnd, vhadd.u8
+        defun2  put_pixels8_y2,  _no_rnd, vhadd.u8
+        defun2  put_pixels8_xy2, _no_rnd, vshrn.u16, 1
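
For reference only (not part of the patch), a plain-C sketch of what the 16-pixel-wide
variants above compute, assuming the usual op_pixels_func signature (destination block,
const source pixels, line_size, height); the *_c_ref names are illustrative:

#include <stdint.h>

/* ff_put_pixels16_neon: straight copy of a 16 x h block */
static void put_pixels16_c_ref(uint8_t *block, const uint8_t *pixels,
                               int line_size, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            block[x] = pixels[x];
        block  += line_size;
        pixels += line_size;
    }
}

/* ff_put_pixels16_x2_neon: horizontal half-pel average with rounding,
 * matching vrhadd.u8 which computes (a + b + 1) >> 1 */
static void put_pixels16_x2_c_ref(uint8_t *block, const uint8_t *pixels,
                                  int line_size, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            block[x] = (pixels[x] + pixels[x + 1] + 1) >> 1;
        block  += line_size;
        pixels += line_size;
    }
}

/* ff_put_pixels16_x2_no_rnd_neon: same average without rounding,
 * matching vhadd.u8 which computes (a + b) >> 1 */
static void put_pixels16_x2_no_rnd_c_ref(uint8_t *block, const uint8_t *pixels,
                                         int line_size, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            block[x] = (pixels[x] + pixels[x + 1]) >> 1;
        block  += line_size;
        pixels += line_size;
    }
}

The y2 variants average each pixel with the one a row below in the same way, and the
xy2 variants compute the 2x2 average (a + b + c + d + 2) >> 2 per output pixel, with
the no_rnd versions adding 1 instead of 2 before the shift (vshrn.u16 plus the q13/q11
bias registers rather than vrshrn.u16).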