1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
From: Mans Rullgard <mans@mansr.com>
Date: Tue, 29 Jul 2008 21:13:14 +0000 (+0100)
Subject: ARM: work around Cortex-A8 erratum 451034
X-Git-Url: http://git.mansr.com/?p=ffmpeg.mru;a=commitdiff_plain;h=c6bbb0c33f6f681b8265a43f8744735de5a9d45e
ARM: work around Cortex-A8 erratum 451034
On Cortex-A8 r1p0 and r1p1, executing a NEON store with an integer
store in the store buffer, can cause a processor deadlock under
certain conditions.
A DMB instruction at the start of every NEON function ensures that
the integer store buffer is always empty before executing any NEON
store, thus avoiding the deadlock condition.
See ARM Cortex-A8 Errata Notice (PR120-PRDC-008070) for full details.
---
diff --git b/mythtv/libs/libavcodec/armv4l/dsputil_neon.c b/mythtv/libs/libavcodec/armv4l/dsputil_neon.c
index fa0602d..4fbadfc 100644
--- b/mythtv/libs/libavcodec/armv4l/dsputil_neon.c
+++ b/mythtv/libs/libavcodec/armv4l/dsputil_neon.c
@@ -28,6 +28,7 @@ extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
int h, int x, int y);
#define PUT_PIXELS_16_X2(vhadd) \
+ "dmb \n\t" \
"1: \n\t" \
"vld1.64 {d0,d1,d2}, [%[p]], %[line_size] \n\t" \
"vld1.64 {d4,d5,d6}, [%[p]], %[line_size] \n\t" \
@@ -46,6 +47,7 @@ extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
"lsl %[l2], %[line_size], #1 \n\t" \
"vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
"vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \
+ "dmb \n\t" \
"1: \n\t" \
"subs %[h], %[h], #2 \n\t" \
vhadd".u8 q2, q0, q1 \n\t" \
@@ -69,6 +71,7 @@ extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
"vaddl.u8 q10, d1, d3 \n\t" \
"vaddl.u8 q9, d4, d6 \n\t" \
"vaddl.u8 q11, d5, d7 \n\t" \
+ "dmb \n\t" \
"1: \n\t" \
"subs %[h], %[h], #2 \n\t" \
"vld1.64 {d0,d1,d2}, [%[p0]], %[l2] \n\t" \
@@ -98,6 +101,7 @@ extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
"bgt 1b \n\t"
#define PUT_PIXELS_8_X2(vhadd) \
+ "dmb \n\t" \
"1: \n\t" \
"vld1.64 {d0,d1}, [%[p]], %[line_size] \n\t" \
"vld1.64 {d2,d3}, [%[p]], %[line_size] \n\t" \
@@ -116,6 +120,7 @@ extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
"lsl %[l2], %[line_size], #1 \n\t" \
"vld1.64 {d0}, [%[p0]], %[l2] \n\t" \
"vld1.64 {d1}, [%[p1]], %[l2] \n\t" \
+ "dmb \n\t" \
"1: \n\t" \
"subs %[h], %[h], #2 \n\t" \
vhadd".u8 d4, d0, d1 \n\t" \
@@ -137,6 +142,7 @@ extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
"vext.8 d6, d2, d3, #1 \n\t" \
"vaddl.u8 q8, d0, d4 \n\t" \
"vaddl.u8 q9, d2, d6 \n\t" \
+ "dmb \n\t" \
"1: \n\t" \
"subs %[h], %[h], #2 \n\t" \
"vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
@@ -161,6 +167,7 @@ static void put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
asm volatile(
+ "dmb \n\t"
"1: \n\t"
"vld1.64 {d0,d1}, [%[pixels]], %[line_size] \n\t"
"vld1.64 {d2,d3}, [%[pixels]], %[line_size] \n\t"
@@ -224,6 +231,7 @@ static void put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
asm volatile(
+ "dmb \n\t"
"1: \n\t"
"vld1.64 {d0}, [%[p]], %[line_size] \n\t"
"vld1.64 {d1}, [%[p]], %[line_size] \n\t"
diff --git b/mythtv/libs/libavcodec/armv4l/h264dsp_neon.S b/mythtv/libs/libavcodec/armv4l/h264dsp_neon.S
index a766867..8171ee2 100644
--- b/mythtv/libs/libavcodec/armv4l/h264dsp_neon.S
+++ b/mythtv/libs/libavcodec/armv4l/h264dsp_neon.S
@@ -40,6 +40,8 @@ ff_put_h264_chroma_mc8_neon:
sub r4, r4, r5, lsl #3
add r4, r4, #64
+ dmb
+
beq 2f
add r5, r1, r2
diff --git b/mythtv/libs/libavcodec/armv4l/simple_idct_neon.S b/mythtv/libs/libavcodec/armv4l/simple_idct_neon.S
index 943e04f..abda6b2 100644
--- b/mythtv/libs/libavcodec/armv4l/simple_idct_neon.S
+++ b/mythtv/libs/libavcodec/armv4l/simple_idct_neon.S
@@ -307,9 +307,10 @@ idct_col4_st8:
const: .short W1, W2, W3, W4, W5, W6, W7, W4c
.macro idct_start data
+ push {v1-v3, lr}
pld [\data]
pld [\data, #64]
- push {v1-v3, lr}
+ dmb
vpush {d8-d15}
adr a4, const
vld1.64 {d0,d1}, [a4,:128]
|