From d0044bfbd596f22ed1560579ea6537b39f3dc1af Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Thu, 29 Oct 2009 19:06:42 +0000
Subject: ARM: Don't emit prefetch code if prefetch distance is set to 0

Also it is now possible to disable prefetch globally with
a configuration macro
---
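As a quick, self-contained illustration of the mechanism this patch introduces (not part of the patch itself): a GNU as vararg macro that emits the instruction passed to it only when prefetching is enabled. The register operands below are illustrative, and defining both flags with .set is a simplification; in the patch, PREFETCH_GLOBALLY_DISABLED is a C preprocessor define and ADVANCED_PREFETCH_ENABLED is derived from prefetch_distance.

    @ Minimal sketch of conditional instruction emission, mirroring the PF macro below.
    .set ADVANCED_PREFETCH_ENABLED, 1    @ cleared when prefetch_distance == 0
    .set PREFETCH_GLOBALLY_DISABLED, 0   @ set to 1 to disable prefetch everywhere

.macro PF a, x:vararg
.if (ADVANCED_PREFETCH_ENABLED != 0) && (PREFETCH_GLOBALLY_DISABLED == 0)
    a x
.endif
.endm

    @ Assembled only when both flags above allow it; otherwise no code
    @ is emitted at all (operands here are illustrative).
    PF pld, [r0, r1, lsl #2]
    PF add r1, r1, #8
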
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index bca499a..35e6a7e 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -219,33 +219,33 @@
vshrn.u16 d7, q2, #3
vsli.u16 q2, q2, #5
vshll.u8 q14, d16, #8
- add PF_X, PF_X, #8
+ PF add PF_X, PF_X, #8
vshll.u8 q8, d19, #8
- tst PF_CTL, #0xF
+ PF tst PF_CTL, #0xF
vsri.u8 d6, d6, #5
- addne PF_X, PF_X, #8
+ PF addne PF_X, PF_X, #8
vmvn.8 d3, d3
- subne PF_CTL, PF_CTL, #1
+ PF subne PF_CTL, PF_CTL, #1
vsri.u8 d7, d7, #6
vshrn.u16 d30, q2, #2
vmull.u8 q10, d3, d6
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vmull.u8 q11, d3, d7
vmull.u8 q12, d3, d30
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vsri.u16 q14, q8, #5
- cmp PF_X, ORIG_W
+ PF cmp PF_X, ORIG_W
vshll.u8 q9, d18, #8
vrshr.u16 q13, q10, #8
- subge PF_X, PF_X, ORIG_W
+ PF subge PF_X, PF_X, ORIG_W
vrshr.u16 q3, q11, #8
vrshr.u16 q15, q12, #8
- subges PF_CTL, PF_CTL, #0x10
+ PF subges PF_CTL, PF_CTL, #0x10
vsri.u16 q14, q9, #11
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vraddhn.u16 d20, q10, q13
vraddhn.u16 d23, q11, q3
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vraddhn.u16 d22, q12, q15
vst1.16 {d28, d29}, [DST_W, :128]!
.endm
@@ -323,20 +323,20 @@ generate_composite_function \
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
vsri.u16 q14, q8, #5
- add PF_X, PF_X, #8
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
vld4.8 {d0, d1, d2, d3}, [SRC]!
- addne PF_X, PF_X, #8
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
vsri.u16 q14, q9, #11
- cmp PF_X, ORIG_W
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vshll.u8 q8, d1, #8
vst1.16 {d28, d29}, [DST_W, :128]!
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
vshll.u8 q14, d2, #8
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vshll.u8 q9, d0, #8
.endm
@@ -363,20 +363,20 @@ generate_composite_function \
.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
vld1.8 {d0, d1, d2, d3}, [SRC]!
- add PF_X, PF_X, #32
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #32
+ PF tst PF_CTL, #0xF
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- addne PF_X, PF_X, #32
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #32
+ PF subne PF_CTL, PF_CTL, #1
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- cmp PF_X, ORIG_W
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -418,32 +418,32 @@ generate_composite_function \
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
- add PF_X, PF_X, #8
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- addne PF_X, PF_X, #8
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- cmp PF_X, ORIG_W
+ PF cmp PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
vqadd.u8 q15, q1, q15
vld4.8 {d0, d1, d2, d3}, [SRC]!
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vmvn.8 d22, d3
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- subge PF_X, PF_X, ORIG_W
+ PF subge PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- subges PF_CTL, PF_CTL, #0x10
+ PF subges PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index d276ab9..a2941ae 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -58,6 +58,11 @@
#define RESPECT_STRICT_ALIGNMENT 1
/*
+ * If set to nonzero value, prefetch is globally disabled
+ */
+#define PREFETCH_GLOBALLY_DISABLED 0
+
+/*
* Definitions of supplementary pixld/pixst macros (for partial load/store of
* pixel data)
*/
@@ -218,37 +223,43 @@
* pixels processing like simple copy. Anyway, having prefetch is a must
* when working with graphics data.
*/
+.macro PF a, x:vararg
+.if (ADVANCED_PREFETCH_ENABLED != 0) && (PREFETCH_GLOBALLY_DISABLED == 0)
+ a x
+.endif
+.endm
+
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
- ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+ PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
- add PF_X, PF_X, #std_increment
+ PF add PF_X, PF_X, #std_increment
.endif
- tst PF_CTL, #0xF
- addne PF_X, PF_X, #boost_increment
- subne PF_CTL, PF_CTL, #1
- cmp PF_X, ORIG_W
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #boost_increment
+ PF subne PF_CTL, PF_CTL, #1
+ PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
- pld [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
- ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
@@ -297,6 +308,12 @@ fname:
PF_DST .req r12
PF_MASK .req r14
+.if prefetch_distance == 0
+ .set ADVANCED_PREFETCH_ENABLED, 0
+.else
+ .set ADVANCED_PREFETCH_ENABLED, 1
+.endif
+
.if mask_bpp == 0
ORIG_W .req r7 /* saved original width */
DUMMY .req r8 /* temporary register */
@@ -374,12 +391,12 @@ fname:
ldr MASK_STRIDE, [sp, #52]
.endif
mov DST_R, DST_W
- mov PF_SRC, SRC
- mov PF_DST, DST_R
- mov PF_MASK, MASK
- mov PF_CTL, H, lsl #4
- /* pf_ctl = 10 | ((h - 1) << 4) */
- add PF_CTL, #(prefetch_distance - 0x10)
+ PF mov PF_SRC, SRC
+ PF mov PF_DST, DST_R
+ PF mov PF_MASK, MASK
+ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+ PF mov PF_CTL, H, lsl #4
+ PF add PF_CTL, #(prefetch_distance - 0x10)
init
.if regs_shortage
@@ -412,7 +429,7 @@ fname:
.else
add DST_R, DST_R, #lowbit
.endif
- add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+ PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
sub W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
@@ -444,7 +461,7 @@ fname:
(src_basereg - pixblock_size * src_bpp / 64), SRC
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- add PF_X, PF_X, #pixblock_size
+ PF add PF_X, PF_X, #pixblock_size
process_pixblock_head
cache_preload 0, pixblock_size
subs W, W, #(pixblock_size * 2)
@@ -468,7 +485,7 @@ fname:
pixld chunk_size, src_bpp, src_basereg, SRC
pixld chunk_size, mask_bpp, mask_basereg, MASK
pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
- add PF_X, PF_X, #chunk_size
+ PF add PF_X, PF_X, #chunk_size
1:
.endif
.endr
--
cgit v0.8.2