diff options
Diffstat (limited to 'recipes/xorg-lib/pixman')
-rw-r--r-- | recipes/xorg-lib/pixman/neon-24bpp.patch | 264 | ||||
-rw-r--r-- | recipes/xorg-lib/pixman/prefetch.patch | 298 |
2 files changed, 562 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman/neon-24bpp.patch b/recipes/xorg-lib/pixman/neon-24bpp.patch new file mode 100644 index 0000000000..edfd367626 --- /dev/null +++ b/recipes/xorg-lib/pixman/neon-24bpp.patch @@ -0,0 +1,264 @@ +From b101c115102b83bb1fc4e28de6136dd4940796bc Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Fri, 30 Oct 2009 17:02:14 +0000 +Subject: ARM: initial 24bpp support + +--- +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index 35e6a7e..7f91ced 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -977,3 +977,32 @@ generate_composite_function \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_0888_0888_process_pixblock_head ++.endm ++ ++.macro pixman_composite_src_0888_0888_process_pixblock_tail ++.endm ++ ++.macro pixman_composite_src_0888_0888_process_pixblock_tail_head ++ vst3.8 {d0, d1, d2}, [DST_W]! ++ vld3.8 {d0, d1, d2}, [SRC]! 
++ cache_preload 8, 8 ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ ++ FLAG_DST_WRITEONLY, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 10, /* prefetch distance */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_src_0888_0888_process_pixblock_head, \ ++ pixman_composite_src_0888_0888_process_pixblock_tail, \ ++ pixman_composite_src_0888_0888_process_pixblock_tail_head, \ ++ 0, /* dst_w_basereg */ \ ++ 0, /* dst_r_basereg */ \ ++ 0, /* src_basereg */ \ ++ 0 /* mask_basereg */ +diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h +index a2941ae..1653ef4 100644 +--- a/pixman/pixman-arm-neon-asm.h ++++ b/pixman/pixman-arm-neon-asm.h +@@ -95,6 +95,14 @@ + op&.&elem_size {d&reg1[idx]}, [&mem_operand&]! + .endm + ++.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand ++ op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]! ++.endm ++ ++.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand ++ op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]! 
++.endm ++ + .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits + .if numbytes == 32 + pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ +@@ -134,6 +142,18 @@ + .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits ++.elseif (bpp == 24) && (numpix == 8) ++ pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand ++.elseif (bpp == 24) && (numpix == 4) ++ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand ++ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand ++ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand ++ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand ++.elseif (bpp == 24) && (numpix == 2) ++ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand ++ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand ++.elseif (bpp == 24) && (numpix == 1) ++ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand + .else + pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits + .endif +@@ -145,6 +165,18 @@ + .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits ++.elseif (bpp == 24) && (numpix == 8) ++ pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand ++.elseif (bpp == 24) && (numpix == 4) ++ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand ++ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand ++ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand ++ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand ++.elseif (bpp == 24) && (numpix == 2) ++ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, 
mem_operand ++ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand ++.elseif (bpp == 24) && (numpix == 1) ++ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand + .else + pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits + .endif +@@ -334,6 +366,8 @@ fname: + + .if src_bpp == 32 + .set src_bpp_shift, 2 ++.elseif src_bpp == 24 ++ .set src_bpp_shift, 0 + .elseif src_bpp == 16 + .set src_bpp_shift, 1 + .elseif src_bpp == 8 +@@ -345,6 +379,8 @@ fname: + .endif + .if mask_bpp == 32 + .set mask_bpp_shift, 2 ++.elseif mask_bpp == 24 ++ .set mask_bpp_shift, 0 + .elseif mask_bpp == 8 + .set mask_bpp_shift, 0 + .elseif mask_bpp == 0 +@@ -354,6 +390,8 @@ fname: + .endif + .if dst_w_bpp == 32 + .set dst_bpp_shift, 2 ++.elseif dst_w_bpp == 24 ++ .set dst_bpp_shift, 0 + .elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 + .elseif dst_w_bpp == 8 +@@ -398,6 +436,19 @@ fname: + PF mov PF_CTL, H, lsl #4 + PF add PF_CTL, #(prefetch_distance - 0x10) + ++.if src_bpp == 24 ++ sub SRC_STRIDE, SRC_STRIDE, W ++ sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 ++.endif ++.if mask_bpp == 24 ++ sub MASK_STRIDE, MASK_STRIDE, W ++ sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 ++.endif ++.if dst_w_bpp == 24 ++ sub DST_STRIDE, DST_STRIDE, W ++ sub DST_STRIDE, DST_STRIDE, W, lsl #1 ++.endif ++ + init + .if regs_shortage + push {r0, r1} +@@ -412,7 +463,8 @@ fname: + cmp W, #(pixblock_size * 2) + blt 8f + 0: +- /* ensure 16 byte alignment of the destination buffer */ ++ /* ensure 16 byte alignment of the destination buffer, except for 24bpp */ ++.if dst_w_bpp != 24 + tst DST_R, #0xF + beq 2f + +@@ -454,6 +506,7 @@ fname: + .endif + .endr + 2: ++.endif + + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R +@@ -520,11 +573,13 @@ fname: + .if mask_bpp != 0 + add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift + .endif ++.if (dst_w_bpp != 24) + sub DST_W, DST_W, W, lsl #dst_bpp_shift +-.if src_bpp != 0 ++.endif 
++.if (src_bpp != 24) && (src_bpp != 0) + sub SRC, SRC, W, lsl #src_bpp_shift + .endif +-.if mask_bpp != 0 ++.if (mask_bpp != 24) && (mask_bpp != 0) + sub MASK, MASK, W, lsl #mask_bpp_shift + .endif + subs H, H, #1 +@@ -539,7 +594,7 @@ fname: + cleanup + pop {r4-r12, pc} /* exit */ + +-8: /* handle small rectangle, width up to 15 pixels */ ++8: /* handle small rectangle, width up to (pixblock_size * 2 - 1) pixels */ + tst W, #pixblock_size + beq 1f + pixld pixblock_size, dst_r_bpp, \ +@@ -592,11 +647,13 @@ fname: + .if mask_bpp != 0 + add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift + .endif ++.if (dst_w_bpp != 24) + sub DST_W, DST_W, W, lsl #dst_bpp_shift +-.if src_bpp != 0 ++.endif ++.if (src_bpp != 24) && (src_bpp != 0) + sub SRC, SRC, W, lsl #src_bpp_shift + .endif +-.if mask_bpp != 0 ++.if (mask_bpp != 24) && (mask_bpp != 0) + sub MASK, MASK, W, lsl #mask_bpp_shift + .endif + subs H, H, #1 +diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c +index 2811099..f3f38a9 100644 +--- a/pixman/pixman-arm-neon.c ++++ b/pixman/pixman-arm-neon.c +@@ -2065,6 +2065,43 @@ neon_composite_src_8888_8888 (pixman_implementation_t *imp, + } + + void ++pixman_composite_src_0888_0888_asm_neon (int32_t w, ++ int32_t h, ++ uint8_t *dst, ++ int32_t dst_stride, ++ uint8_t *src, ++ int32_t src_stride); ++ ++static void ++neon_composite_src_0888_0888 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint8_t *dst_line; ++ uint8_t *src_line; ++ int32_t dst_stride, src_stride; ++ ++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, ++ src_stride, src_line, 3); ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, ++ dst_stride, dst_line, 3); ++ ++ pixman_composite_src_0888_0888_asm_neon (width, height, ++ dst_line, 
dst_stride, ++ src_line, src_stride); ++} ++ ++void + pixman_composite_over_8888_8888_asm_neon (int32_t w, + int32_t h, + uint32_t *dst, +@@ -2449,6 +2486,7 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] = + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_0565_0565, 0 }, ++ { PIXMAN_OP_SRC, PIXMAN_r8g8b8, PIXMAN_null, PIXMAN_r8g8b8, neon_composite_src_0888_0888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 }, +-- +cgit v0.8.2 diff --git a/recipes/xorg-lib/pixman/prefetch.patch b/recipes/xorg-lib/pixman/prefetch.patch new file mode 100644 index 0000000000..c2e856ec25 --- /dev/null +++ b/recipes/xorg-lib/pixman/prefetch.patch @@ -0,0 +1,298 @@ +From d0044bfbd596f22ed1560579ea6537b39f3dc1af Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Thu, 29 Oct 2009 19:06:42 +0000 +Subject: ARM: Don't emit prefetch code if prefetch distance is set to 0 + +Also it is now possible to disable prefetch globally with +a configuration macro +--- +diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S +index bca499a..35e6a7e 100644 +--- a/pixman/pixman-arm-neon-asm.S ++++ b/pixman/pixman-arm-neon-asm.S +@@ -219,33 +219,33 @@ + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vshll.u8 q14, d16, #8 +- add PF_X, PF_X, #8 ++ PF add PF_X, PF_X, #8 + vshll.u8 q8, d19, #8 +- tst PF_CTL, #0xF ++ PF tst PF_CTL, #0xF + vsri.u8 d6, d6, #5 +- addne PF_X, PF_X, #8 ++ PF addne PF_X, PF_X, #8 + vmvn.8 d3, d3 +- subne PF_CTL, PF_CTL, #1 ++ PF subne 
PF_CTL, PF_CTL, #1 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + vmull.u8 q10, d3, d6 +- pld [PF_SRC, PF_X, lsl #src_bpp_shift] ++ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 +- pld [PF_DST, PF_X, lsl #dst_bpp_shift] ++ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vsri.u16 q14, q8, #5 +- cmp PF_X, ORIG_W ++ PF cmp PF_X, ORIG_W + vshll.u8 q9, d18, #8 + vrshr.u16 q13, q10, #8 +- subge PF_X, PF_X, ORIG_W ++ PF subge PF_X, PF_X, ORIG_W + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 +- subges PF_CTL, PF_CTL, #0x10 ++ PF subges PF_CTL, PF_CTL, #0x10 + vsri.u16 q14, q9, #11 +- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 +- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vraddhn.u16 d22, q12, q15 + vst1.16 {d28, d29}, [DST_W, :128]! + .endm +@@ -323,20 +323,20 @@ generate_composite_function \ + + .macro pixman_composite_src_8888_0565_process_pixblock_tail_head + vsri.u16 q14, q8, #5 +- add PF_X, PF_X, #8 +- tst PF_CTL, #0xF ++ PF add PF_X, PF_X, #8 ++ PF tst PF_CTL, #0xF + vld4.8 {d0, d1, d2, d3}, [SRC]! +- addne PF_X, PF_X, #8 +- subne PF_CTL, PF_CTL, #1 ++ PF addne PF_X, PF_X, #8 ++ PF subne PF_CTL, PF_CTL, #1 + vsri.u16 q14, q9, #11 +- cmp PF_X, ORIG_W +- pld [PF_SRC, PF_X, lsl #src_bpp_shift] ++ PF cmp PF_X, ORIG_W ++ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vshll.u8 q8, d1, #8 + vst1.16 {d28, d29}, [DST_W, :128]! +- subge PF_X, PF_X, ORIG_W +- subges PF_CTL, PF_CTL, #0x10 ++ PF subge PF_X, PF_X, ORIG_W ++ PF subges PF_CTL, PF_CTL, #0x10 + vshll.u8 q14, d2, #8 +- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
+ vshll.u8 q9, d0, #8 + .endm + +@@ -363,20 +363,20 @@ generate_composite_function \ + + .macro pixman_composite_add_8000_8000_process_pixblock_tail_head + vld1.8 {d0, d1, d2, d3}, [SRC]! +- add PF_X, PF_X, #32 +- tst PF_CTL, #0xF ++ PF add PF_X, PF_X, #32 ++ PF tst PF_CTL, #0xF + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! +- addne PF_X, PF_X, #32 +- subne PF_CTL, PF_CTL, #1 ++ PF addne PF_X, PF_X, #32 ++ PF subne PF_CTL, PF_CTL, #1 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! +- cmp PF_X, ORIG_W +- pld [PF_SRC, PF_X, lsl #src_bpp_shift] +- pld [PF_DST, PF_X, lsl #dst_bpp_shift] +- subge PF_X, PF_X, ORIG_W +- subges PF_CTL, PF_CTL, #0x10 ++ PF cmp PF_X, ORIG_W ++ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] ++ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] ++ PF subge PF_X, PF_X, ORIG_W ++ PF subges PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 +- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q15, q1, q3 + .endm + +@@ -418,32 +418,32 @@ generate_composite_function \ + .macro pixman_composite_over_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q14, q8, #8 +- add PF_X, PF_X, #8 +- tst PF_CTL, #0xF ++ PF add PF_X, PF_X, #8 ++ PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 +- addne PF_X, PF_X, #8 +- subne PF_CTL, PF_CTL, #1 ++ PF addne PF_X, PF_X, #8 ++ PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 +- cmp PF_X, ORIG_W ++ PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + vld4.8 {d0, d1, d2, d3}, [SRC]! 
+- pld [PF_SRC, PF_X, lsl #src_bpp_shift] ++ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 +- pld [PF_DST, PF_X, lsl #dst_bpp_shift] ++ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +- subge PF_X, PF_X, ORIG_W ++ PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 +- subges PF_CTL, PF_CTL, #0x10 ++ PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 +- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 +- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 + .endm + +diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h +index d276ab9..a2941ae 100644 +--- a/pixman/pixman-arm-neon-asm.h ++++ b/pixman/pixman-arm-neon-asm.h +@@ -58,6 +58,11 @@ + #define RESPECT_STRICT_ALIGNMENT 1 + + /* ++ * If set to nonzero value, prefetch is globally disabled ++ */ ++#define PREFETCH_GLOBALLY_DISABLED 0 ++ ++/* + * Definitions of supplementary pixld/pixst macros (for partial load/store of + * pixel data) + */ +@@ -218,37 +223,43 @@ + * pixels processing like simple copy. Anyway, having prefetch is a must + * when working with graphics data. 
+ */ ++.macro PF a, x:vararg ++.if (ADVANCED_PREFETCH_ENABLED != 0) && (PREFETCH_GLOBALLY_DISABLED == 0) ++ a x ++.endif ++.endm ++ + .macro cache_preload std_increment, boost_increment + .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) + .if regs_shortage +- ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ ++ PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ + .endif + .if std_increment != 0 +- add PF_X, PF_X, #std_increment ++ PF add PF_X, PF_X, #std_increment + .endif +- tst PF_CTL, #0xF +- addne PF_X, PF_X, #boost_increment +- subne PF_CTL, PF_CTL, #1 +- cmp PF_X, ORIG_W ++ PF tst PF_CTL, #0xF ++ PF addne PF_X, PF_X, #boost_increment ++ PF subne PF_CTL, PF_CTL, #1 ++ PF cmp PF_X, ORIG_W + .if src_bpp_shift >= 0 +- pld [PF_SRC, PF_X, lsl #src_bpp_shift] ++ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + .endif + .if dst_r_bpp != 0 +- pld [PF_DST, PF_X, lsl #dst_bpp_shift] ++ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + .endif + .if mask_bpp_shift >= 0 +- pld [PF_MASK, PF_X, lsl #mask_bpp_shift] ++ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] + .endif +- subge PF_X, PF_X, ORIG_W +- subges PF_CTL, PF_CTL, #0x10 ++ PF subge PF_X, PF_X, ORIG_W ++ PF subges PF_CTL, PF_CTL, #0x10 + .if src_bpp_shift >= 0 +- ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + .endif + .if dst_r_bpp != 0 +- ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + .endif + .if mask_bpp_shift >= 0 +- ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! ++ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 
+ .endif + .endif + .endm +@@ -297,6 +308,12 @@ fname: + PF_DST .req r12 + PF_MASK .req r14 + ++.if prefetch_distance == 0 ++ .set ADVANCED_PREFETCH_ENABLED, 0 ++.else ++ .set ADVANCED_PREFETCH_ENABLED, 1 ++.endif ++ + .if mask_bpp == 0 + ORIG_W .req r7 /* saved original width */ + DUMMY .req r8 /* temporary register */ +@@ -374,12 +391,12 @@ fname: + ldr MASK_STRIDE, [sp, #52] + .endif + mov DST_R, DST_W +- mov PF_SRC, SRC +- mov PF_DST, DST_R +- mov PF_MASK, MASK +- mov PF_CTL, H, lsl #4 +- /* pf_ctl = 10 | ((h - 1) << 4) */ +- add PF_CTL, #(prefetch_distance - 0x10) ++ PF mov PF_SRC, SRC ++ PF mov PF_DST, DST_R ++ PF mov PF_MASK, MASK ++ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ ++ PF mov PF_CTL, H, lsl #4 ++ PF add PF_CTL, #(prefetch_distance - 0x10) + + init + .if regs_shortage +@@ -412,7 +429,7 @@ fname: + .else + add DST_R, DST_R, #lowbit + .endif +- add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) ++ PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) + sub W, W, #(lowbit * 8 / dst_w_bpp) + 1: + .endif +@@ -444,7 +461,7 @@ fname: + (src_basereg - pixblock_size * src_bpp / 64), SRC + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- add PF_X, PF_X, #pixblock_size ++ PF add PF_X, PF_X, #pixblock_size + process_pixblock_head + cache_preload 0, pixblock_size + subs W, W, #(pixblock_size * 2) +@@ -468,7 +485,7 @@ fname: + pixld chunk_size, src_bpp, src_basereg, SRC + pixld chunk_size, mask_bpp, mask_basereg, MASK + pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R +- add PF_X, PF_X, #chunk_size ++ PF add PF_X, PF_X, #chunk_size + 1: + .endif + .endr +-- +cgit v0.8.2 |