1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
|
From: Mans Rullgard <mans@mansr.com>
Date: Sun, 3 Aug 2008 17:13:06 +0000 (+0100)
Subject: ARM: NEON optimised vector_fmul
X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=ba46eb14e3be96b627fd096aacaa4dbb2e186281
ARM: NEON optimised vector_fmul
---
diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
index 6c44940..c6fc173 100644
--- a/libavcodec/armv4l/dsputil_neon.c
+++ b/libavcodec/armv4l/dsputil_neon.c
@@ -91,6 +91,7 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_vector_fmul_neon(float *dst, const float *src, int len); /* in place: dst[i] = dst[i] * src[i] for i in [0, len); the NEON code handles 8 floats per step, so len is assumed to be a non-zero multiple of 8 -- TODO confirm with callers */
void ff_vector_fmul_window_neon(float *dst, const float *src0,
const float *src1, const float *win,
float add_bias, int len);
@@ -168,6 +169,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+ c->vector_fmul = ff_vector_fmul_neon;
c->vector_fmul_window = ff_vector_fmul_window_neon;
c->float_to_int16 = ff_float_to_int16_neon;
diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S
index 49a09b8..7310700 100644
--- a/libavcodec/armv4l/dsputil_neon_s.S
+++ b/libavcodec/armv4l/dsputil_neon_s.S
@@ -324,6 +324,23 @@ extern ff_float_to_int16_interleave_neon
pop {r4,r5,pc}
.endfunc
+extern ff_vector_fmul_neon /* In-place element-wise float multiply, dst[i] *= src[i]; per the C prototype and the ARM calling convention r0 = dst, r1 = src, r2 = len */
+ mov r3, r0 /* r3 = store pointer into dst; r0 is advanced by the loads below and so runs ahead of it */
+ vld1.64 {d0-d3}, [r0,:128]! /* q0,q1 = first 8 floats of dst (32 bytes), post-increment r0; :128 asserts a 16-byte aligned address */
+ vld1.64 {d4-d7}, [r1,:128]! /* q2,q3 = first 8 floats of src, post-increment r1 */
+ dmb /* data memory barrier; NOTE(review): the reason for a barrier here is not evident from this code -- confirm before touching */
+1: subs r2, r2, #8 /* 8 elements consumed per iteration; sets the zero flag tested by beq below */
+ vmul.f32 q8, q0, q2 /* q8 = dst[0..3] * src[0..3] of the current group */
+ vmul.f32 q9, q1, q3 /* q9 = dst[4..7] * src[4..7] of the current group */
+ beq 2f /* count reached exactly zero: this was the last group, skip the preload so nothing is read past the end */
+ vld1.64 {d0-d3}, [r0,:128]! /* preload the next 8 floats of dst before storing the current products */
+ vld1.64 {d4-d7}, [r1,:128]! /* preload the next 8 floats of src */
+ vst1.64 {d16-d19}, [r3,:128]! /* write q8,q9 (current products) back to dst, post-increment r3 */
+ b 1b /* loop; exit happens only when r2 hits exactly zero, so len is assumed to be a non-zero multiple of 8 -- TODO confirm */
+2: vst1.64 {d16-d19}, [r3,:128]! /* store the products of the final group */
+ bx lr /* return to caller */
+ .endfunc
+
extern ff_vector_fmul_window_neon
vld1.32 {d16[],d17[]}, [sp,:32]
push {r4,r5,lr}
|