1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
|
From e0017c2a676b267900e48c6f32a6e973395c83d3 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 31 May 2010 16:24:43 +0000
Subject: ARM: 'neon_combine_out_reverse_u' combiner
---
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index f30869e..44fbfce 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -597,7 +597,7 @@ generate_composite_function_single_scanline \
/******************************************************************************/
-.macro pixman_composite_over_8888_8888_process_pixblock_head
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
vmvn.8 d24, d3 /* get inverted alpha */
/* do alpha blending */
vmull.u8 q8, d24, d4
@@ -606,7 +606,7 @@ generate_composite_function_single_scanline \
vmull.u8 q11, d24, d7
.endm
-.macro pixman_composite_over_8888_8888_process_pixblock_tail
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
vrshr.u16 q14, q8, #8
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
@@ -615,6 +615,56 @@ generate_composite_function_single_scanline \
vraddhn.u16 d29, q15, q9
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vld4.8 {d0, d1, d2, d3}, [SRC]!
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail
vqadd.u8 q14, q0, q14
vqadd.u8 q15, q1, q15
.endm
@@ -1416,7 +1466,7 @@ generate_composite_function_single_scanline \
/******************************************************************************/
-.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
/* expecting source data in {d0, d1, d2, d3} */
/* destination data in {d4, d5, d6, d7} */
/* solid mask is in d15 */
@@ -1442,7 +1492,7 @@ generate_composite_function_single_scanline \
vmull.u8 q11, d24, d7
.endm
-.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
vrshr.u16 q14, q8, #8
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
@@ -1451,6 +1501,49 @@ generate_composite_function_single_scanline \
vraddhn.u16 d29, q15, q9
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_8888_init
+ vpush {d8-d15}
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_8888_cleanup
+ vpop {d8-d15}
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ vld4.8 {d0, d1, d2, d3}, [SRC]!
+ cache_preload 8, 8
+ vld4.8 {d12, d13, d14, d15}, [MASK]!
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ pixman_composite_out_reverse_8888_8888_8888_init, \
+ pixman_composite_out_reverse_8888_8888_8888_cleanup, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
vqadd.u8 q14, q0, q14
vqadd.u8 q15, q1, q15
.endm
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 394dcea..1be9606 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -335,6 +335,7 @@ neon_combine_##name##_u (pixman_implementation_t *imp, \
BIND_COMBINE_U (over)
BIND_COMBINE_U (add)
+BIND_COMBINE_U (out_reverse)
void
pixman_fetch_scanline_r5g6b5_asm_neon (int width,
@@ -382,6 +383,7 @@ _pixman_implementation_create_arm_neon (void)
imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
_pixman_bits_override_accessors (PIXMAN_r5g6b5,
neon_fetch_scanline_r5g6b5,
--
cgit v0.8.3-6-g21f6
|