1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
From: Mans Rullgard <mans@mansr.com>
Date: Fri, 1 Aug 2008 02:28:34 +0000 (+0100)
Subject: ARM: NEON optimised vorbis_inverse_coupling
X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=ac234c5ad52d8478be5aaa7c276e423873453d8b
ARM: NEON optimised vorbis_inverse_coupling
---
diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
index b584e5b..f9d32c0 100644
--- a/libavcodec/armv4l/dsputil_neon.c
+++ b/libavcodec/armv4l/dsputil_neon.c
@@ -94,6 +94,8 @@ void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_float_to_int16_neon(int16_t *, const float *, long);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); /* implemented in assembly in dsputil_neon_s.S */
+
void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
{
c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
@@ -164,4 +166,8 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->float_to_int16 = ff_float_to_int16_neon;
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+
+#ifdef CONFIG_VORBIS_DECODER
+ c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
+#endif
}
diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S
index 44f75ba..6a54803 100644
--- a/libavcodec/armv4l/dsputil_neon_s.S
+++ b/libavcodec/armv4l/dsputil_neon_s.S
@@ -19,6 +19,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
+
.fpu neon
.text
@@ -321,3 +323,24 @@ extern ff_float_to_int16_interleave_neon
bne 3b
pop {r4,r5,pc}
.endfunc
+
+#ifdef CONFIG_VORBIS_DECODER /* routine is only assembled when the Vorbis decoder is configured in */
+extern ff_vorbis_inverse_coupling_neon /* per the C prototype: r0 = mag, r1 = ang, r2 = blocksize; both arrays are rewritten in place, 4 floats per pass */
+ vmov.i32 q10, #(1<<31) /* q10 = 0x80000000 in every 32-bit lane: the float sign-bit mask */
+ dmb /* data memory barrier; NOTE(review): the reason for it is not evident from this code - confirm */
+1: vld1.64 {d2,d3}, [r1,:128] /* q1 = 4 floats from [r1] (ang); :128 requires a 16-byte aligned address */
+ vld1.64 {d0,d1}, [r0,:128] /* q0 = 4 floats from [r0] (mag); same alignment requirement */
+ vcle.f32 q8, q1, #0 /* q8 = all-ones in lanes where ang <= 0, zero elsewhere */
+ vand q9, q0, q10 /* q9 = sign bit of each mag lane */
+ veor q1, q1, q9 /* q1 = ang, negated in lanes where mag has its sign bit set */
+ vand q2, q1, q8 /* q2 = q1 in the ang <= 0 lanes, 0 in the others */
+ vbic q3, q1, q8 /* q3 = q1 in the other lanes, 0 where ang <= 0 */
+ vadd.f32 q1, q0, q2 /* q1 = mag + q2: the new mag values */
+ vsub.f32 q0, q0, q3 /* q0 = mag - q3: the new ang values */
+ subs r2, r2, #4 /* 4 elements consumed; sets the flags tested by bgt below */
+ vst1.64 {d0,d1}, [r1,:128]! /* store new ang, then advance r1 by 16 bytes */
+ vst1.64 {d2,d3}, [r0,:128]! /* store new mag, then advance r0 by 16 bytes */
+ bgt 1b /* repeat while r2 > 0; the body has already run once before this first test */
+ bx lr /* return to the caller */
+ .endfunc /* presumably pairs with a .func emitted by the extern macro, which is defined outside this hunk - confirm */
+#endif
|