From d0044bfbd596f22ed1560579ea6537b39f3dc1af Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Thu, 29 Oct 2009 19:06:42 +0000
Subject: ARM: Don't emit prefetch code if prefetch distance is set to 0

Also it is now possible to disable prefetch globally with
a configuration macro
---
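As a quick, self-contained illustration of the mechanism this patch introduces (not part of the patch itself): a GNU as vararg macro that emits the instruction passed to it only when prefetching is enabled. The register operands below are illustrative, and defining both flags with .set is a simplification; in the patch, PREFETCH_GLOBALLY_DISABLED is a C preprocessor define and ADVANCED_PREFETCH_ENABLED is derived from prefetch_distance.

    @ Minimal sketch of conditional instruction emission, mirroring the PF macro below.
    .set ADVANCED_PREFETCH_ENABLED, 1    @ cleared when prefetch_distance == 0
    .set PREFETCH_GLOBALLY_DISABLED, 0   @ set to 1 to disable prefetch everywhere

.macro PF a, x:vararg
.if (ADVANCED_PREFETCH_ENABLED != 0) && (PREFETCH_GLOBALLY_DISABLED == 0)
    a x
.endif
.endm

    @ Assembled only when both flags above allow it; otherwise no code
    @ is emitted at all (operands here are illustrative).
    PF pld, [r0, r1, lsl #2]
    PF add r1, r1, #8
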
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index bca499a..35e6a7e 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -219,33 +219,33 @@
vshrn.u16 d7, q2, #3
vsli.u16 q2, q2, #5
vshll.u8 q14, d16, #8
- add PF_X, PF_X, #8
+ PF add PF_X, PF_X, #8
vshll.u8 q8, d19, #8
- tst PF_CTL, #0xF
+ PF tst PF_CTL, #0xF
vsri.u8 d6, d6, #5
- addne PF_X, PF_X, #8
+ PF addne PF_X, PF_X, #8
vmvn.8 d3, d3
- subne PF_CTL, PF_CTL, #1
+ PF subne PF_CTL, PF_CTL, #1
vsri.u8 d7, d7, #6
vshrn.u16 d30, q2, #2
vmull.u8 q10, d3, d6
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vmull.u8 q11, d3, d7
vmull.u8 q12, d3, d30
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vsri.u16 q14, q8, #5
- cmp PF_X, ORIG_W
+ PF cmp PF_X, ORIG_W
vshll.u8 q9, d18, #8
vrshr.u16 q13, q10, #8
- subge PF_X, PF_X, ORIG_W
+ PF subge PF_X, PF_X, ORIG_W
vrshr.u16 q3, q11, #8
vrshr.u16 q15, q12, #8
- subges PF_CTL, PF_CTL, #0x10
+ PF subges PF_CTL, PF_CTL, #0x10
vsri.u16 q14, q9, #11
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vraddhn.u16 d20, q10, q13
vraddhn.u16 d23, q11, q3
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vraddhn.u16 d22, q12, q15
vst1.16 {d28, d29}, [DST_W, :128]!
.endm
@@ -323,20 +323,20 @@ generate_composite_function \
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
vsri.u16 q14, q8, #5
- add PF_X, PF_X, #8
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
vld4.8 {d0, d1, d2, d3}, [SRC]!
- addne PF_X, PF_X, #8
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
vsri.u16 q14, q9, #11
- cmp PF_X, ORIG_W
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vshll.u8 q8, d1, #8
vst1.16 {d28, d29}, [DST_W, :128]!
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
vshll.u8 q14, d2, #8
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vshll.u8 q9, d0, #8
.endm
@@ -363,20 +363,20 @@ generate_composite_function \
.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
vld1.8 {d0, d1, d2, d3}, [SRC]!
- add PF_X, PF_X, #32
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #32
+ PF tst PF_CTL, #0xF
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- addne PF_X, PF_X, #32
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #32
+ PF subne PF_CTL, PF_CTL, #1
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- cmp PF_X, ORIG_W
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -418,32 +418,32 @@ generate_composite_function \
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
- add PF_X, PF_X, #8
- tst PF_CTL, #0xF
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- addne PF_X, PF_X, #8
- subne PF_CTL, PF_CTL, #1
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- cmp PF_X, ORIG_W
+ PF cmp PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
vqadd.u8 q15, q1, q15
vld4.8 {d0, d1, d2, d3}, [SRC]!
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vmvn.8 d22, d3
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- subge PF_X, PF_X, ORIG_W
+ PF subge PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- subges PF_CTL, PF_CTL, #0x10
+ PF subges PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index d276ab9..a2941ae 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -58,6 +58,11 @@
#define RESPECT_STRICT_ALIGNMENT 1
/*
+ * If set to nonzero value, prefetch is globally disabled
+ */
+#define PREFETCH_GLOBALLY_DISABLED 0
+
+/*
* Definitions of supplementary pixld/pixst macros (for partial load/store of
* pixel data)
*/
@@ -218,37 +223,43 @@
* pixels processing like simple copy. Anyway, having prefetch is a must
* when working with graphics data.
*/
+.macro PF a, x:vararg
+.if (ADVANCED_PREFETCH_ENABLED != 0) && (PREFETCH_GLOBALLY_DISABLED == 0)
+ a x
+.endif
+.endm
+
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
- ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+ PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
- add PF_X, PF_X, #std_increment
+ PF add PF_X, PF_X, #std_increment
.endif
- tst PF_CTL, #0xF
- addne PF_X, PF_X, #boost_increment
- subne PF_CTL, PF_CTL, #1
- cmp PF_X, ORIG_W
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #boost_increment
+ PF subne PF_CTL, PF_CTL, #1
+ PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
- pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
- pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
- pld [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
- subge PF_X, PF_X, ORIG_W
- subges PF_CTL, PF_CTL, #0x10
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
- ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
@@ -297,6 +308,12 @@ fname:
PF_DST .req r12
PF_MASK .req r14
+.if prefetch_distance == 0
+ .set ADVANCED_PREFETCH_ENABLED, 0
+.else
+ .set ADVANCED_PREFETCH_ENABLED, 1
+.endif
+
.if mask_bpp == 0
ORIG_W .req r7 /* saved original width */
DUMMY .req r8 /* temporary register */
@@ -374,12 +391,12 @@ fname:
ldr MASK_STRIDE, [sp, #52]
.endif
mov DST_R, DST_W
- mov PF_SRC, SRC
- mov PF_DST, DST_R
- mov PF_MASK, MASK
- mov PF_CTL, H, lsl #4
- /* pf_ctl = 10 | ((h - 1) << 4) */
- add PF_CTL, #(prefetch_distance - 0x10)
+ PF mov PF_SRC, SRC
+ PF mov PF_DST, DST_R
+ PF mov PF_MASK, MASK
+ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+ PF mov PF_CTL, H, lsl #4
+ PF add PF_CTL, #(prefetch_distance - 0x10)
init
.if regs_shortage
@@ -412,7 +429,7 @@ fname:
.else
add DST_R, DST_R, #lowbit
.endif
- add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+ PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
sub W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
@@ -444,7 +461,7 @@ fname:
(src_basereg - pixblock_size * src_bpp / 64), SRC
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- add PF_X, PF_X, #pixblock_size
+ PF add PF_X, PF_X, #pixblock_size
process_pixblock_head
cache_preload 0, pixblock_size
subs W, W, #(pixblock_size * 2)
@@ -468,7 +485,7 @@ fname:
pixld chunk_size, src_bpp, src_basereg, SRC
pixld chunk_size, mask_bpp, mask_basereg, MASK
pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
- add PF_X, PF_X, #chunk_size
+ PF add PF_X, PF_X, #chunk_size
1:
.endif
.endr
--
cgit v0.8.2