From b101c115102b83bb1fc4e28de6136dd4940796bc Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Fri, 30 Oct 2009 17:02:14 +0000 Subject: ARM: initial 24bpp support --- diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index 35e6a7e..7f91ced 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -977,3 +977,32 @@ generate_composite_function \ pixman_composite_over_8888_n_8888_process_pixblock_head, \ pixman_composite_over_8888_n_8888_process_pixblock_tail, \ pixman_composite_over_8888_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_src_0888_0888_process_pixblock_head +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail_head + vst3.8 {d0, d1, d2}, [DST_W]! + vld3.8 {d0, d1, d2}, [SRC]! + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0888_0888_process_pixblock_head, \ + pixman_composite_src_0888_0888_process_pixblock_tail, \ + pixman_composite_src_0888_0888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h index a2941ae..1653ef4 100644 --- a/pixman/pixman-arm-neon-asm.h +++ b/pixman/pixman-arm-neon-asm.h @@ -95,6 +95,14 @@ op&.&elem_size {d®1[idx]}, [&mem_operand&]! .endm +.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand + op&.&elem_size {d®1, d®2, d®3}, [&mem_operand&]! +.endm + +.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand + op&.&elem_size {d®1[idx], d®2[idx], d®3[idx]}, [&mem_operand&]! +.endm + .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits .if numbytes == 32 pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ @@ -134,6 +142,18 @@ .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand .else pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits .endif @@ -145,6 +165,18 @@ .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand .else pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits .endif @@ -334,6 +366,8 @@ fname: .if src_bpp == 32 .set src_bpp_shift, 2 +.elseif src_bpp == 24 + .set src_bpp_shift, 0 .elseif src_bpp == 16 .set src_bpp_shift, 1 .elseif src_bpp == 8 @@ -345,6 +379,8 @@ fname: .endif .if mask_bpp == 32 .set mask_bpp_shift, 2 +.elseif mask_bpp == 24 + .set mask_bpp_shift, 0 .elseif mask_bpp == 8 .set mask_bpp_shift, 0 .elseif mask_bpp == 0 @@ -354,6 +390,8 @@ fname: .endif .if dst_w_bpp == 32 .set dst_bpp_shift, 2 +.elseif dst_w_bpp == 24 + .set dst_bpp_shift, 0 .elseif dst_w_bpp == 16 .set dst_bpp_shift, 1 .elseif dst_w_bpp == 8 @@ -398,6 +436,19 @@ fname: PF mov PF_CTL, H, lsl #4 PF add PF_CTL, #(prefetch_distance - 0x10) +.if src_bpp == 24 + sub SRC_STRIDE, SRC_STRIDE, W + sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 +.endif +.if mask_bpp == 24 + sub MASK_STRIDE, MASK_STRIDE, W + sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 +.endif +.if dst_w_bpp == 24 + sub DST_STRIDE, DST_STRIDE, W + sub DST_STRIDE, DST_STRIDE, W, lsl #1 +.endif + init .if regs_shortage push {r0, r1} @@ -412,7 +463,8 @@ fname: cmp W, #(pixblock_size * 2) blt 8f 0: - /* ensure 16 byte alignment of the destination buffer */ + /* ensure 16 byte alignment of the destination buffer, except for 24bpp */ +.if dst_w_bpp != 24 tst DST_R, #0xF beq 2f @@ -454,6 +506,7 @@ fname: .endif .endr 2: +.endif pixld_a pixblock_size, dst_r_bpp, \ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R @@ -520,11 +573,13 @@ fname: .if mask_bpp != 0 add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift .endif +.if (dst_w_bpp != 24) sub DST_W, DST_W, W, lsl #dst_bpp_shift -.if src_bpp != 0 +.endif +.if (src_bpp != 24) && (src_bpp != 0) sub SRC, SRC, W, lsl #src_bpp_shift .endif -.if mask_bpp != 0 +.if (mask_bpp != 24) && (mask_bpp != 0) sub MASK, MASK, W, lsl #mask_bpp_shift .endif subs H, H, #1 @@ -539,7 +594,7 @@ fname: cleanup pop {r4-r12, pc} /* exit */ -8: /* handle small rectangle, width up to 15 pixels */ +8: /* handle small rectangle, width up to (pixblock_size * 2 - 1) pixels */ tst W, #pixblock_size beq 1f pixld pixblock_size, dst_r_bpp, \ @@ -592,11 +647,13 @@ fname: .if mask_bpp != 0 add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift .endif +.if (dst_w_bpp != 24) sub DST_W, DST_W, W, lsl #dst_bpp_shift -.if src_bpp != 0 +.endif +.if (src_bpp != 24) && (src_bpp != 0) sub SRC, SRC, W, lsl #src_bpp_shift .endif -.if mask_bpp != 0 +.if (mask_bpp != 24) && (mask_bpp != 0) sub MASK, MASK, W, lsl #mask_bpp_shift .endif subs H, H, #1 diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 2811099..f3f38a9 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -2065,6 +2065,43 @@ neon_composite_src_8888_8888 (pixman_implementation_t *imp, } void +pixman_composite_src_0888_0888_asm_neon (int32_t w, + int32_t h, + uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride); + +static void +neon_composite_src_0888_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line; + uint8_t *src_line; + int32_t dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, + src_stride, src_line, 3); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, + dst_stride, dst_line, 3); + + pixman_composite_src_0888_0888_asm_neon (width, height, + dst_line, dst_stride, + src_line, src_stride); +} + +void pixman_composite_over_8888_8888_asm_neon (int32_t w, int32_t h, uint32_t *dst, @@ -2449,6 +2486,7 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] = { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888, 0 }, { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888, 0 }, { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_0565_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_r8g8b8, PIXMAN_null, PIXMAN_r8g8b8, neon_composite_src_0888_0888, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 }, -- cgit v0.8.2