From b101c115102b83bb1fc4e28de6136dd4940796bc Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Fri, 30 Oct 2009 17:02:14 +0000
Subject: ARM: initial 24bpp support

---
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 35e6a7e..7f91ced 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -977,3 +977,32 @@ generate_composite_function \
     pixman_composite_over_8888_n_8888_process_pixblock_head, \
     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
     pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm /* empty body: the head stage emits no instructions for this 24bpp copy */
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm /* empty body: the tail stage emits no instructions for this 24bpp copy */
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+    vst3.8 {d0, d1, d2}, [DST_W]!  /* store 8 pixels (24 bytes), re-interleaving d0-d2; DST_W advances by 24 */
+    vld3.8 {d0, d1, d2}, [SRC]!    /* load the next 8 pixels, de-interleaved into d0-d2; SRC advances by 24 */
+    cache_preload 8, 8             /* NOTE(review): presumably steps the prefetcher by 8 src and 8 dst pixels; confirm */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, /* NOTE(review): presumably src, mask, dst bpp; confirm */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0888_process_pixblock_head, \
+    pixman_composite_src_0888_0888_process_pixblock_tail, \
+    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index a2941ae..1653ef4 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -95,6 +95,14 @@
     op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
 .endm
 
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!  /* whole-register 3-way structure load/store with address writeback */
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!  /* single-lane form: moves one 3-element structure, with address writeback */
+.endm
+
 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
 .if numbytes == 32
     pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
@@ -134,6 +142,18 @@
 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
     pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                       %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)  /* full block: one de-interleaving vld3 fills three whole D registers */
+    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand  /* NOTE(review): +3..+5 here but +0..+2 below; presumably callers bias basereg by pixblock_size * bpp / 64; confirm */
+.elseif (bpp == 24) && (numpix == 4)  /* 4 pixels: one lane load per pixel, into lanes 4-7 */
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)  /* 2 pixels: lanes 2-3 */
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)  /* 1 pixel: lane 1 */
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
 .else
     pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
 .endif
@@ -145,6 +165,18 @@
 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
     pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                       %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)  /* full block: one interleaving vst3 writes three whole D registers */
+    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand  /* NOTE(review): +3..+5 here but +0..+2 below; presumably callers bias basereg by pixblock_size * bpp / 64; confirm */
+.elseif (bpp == 24) && (numpix == 4)  /* 4 pixels: one lane store per pixel, from lanes 4-7 */
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)  /* 2 pixels: lanes 2-3 */
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)  /* 1 pixel: lane 1 */
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
 .else
     pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
 .endif
@@ -334,6 +366,8 @@ fname:
 
 .if src_bpp == 32
     .set src_bpp_shift, 2
+.elseif src_bpp == 24
+    .set src_bpp_shift, 0  /* 3 bytes per pixel is not a power of two: no shift, the row width is pre-subtracted from SRC_STRIDE instead */
 .elseif src_bpp == 16
     .set src_bpp_shift, 1
 .elseif src_bpp == 8
@@ -345,6 +379,8 @@ fname:
 .endif
 .if mask_bpp == 32
     .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+    .set mask_bpp_shift, 0  /* 3 bytes per pixel is not a power of two: no shift, the row width is pre-subtracted from MASK_STRIDE instead */
 .elseif mask_bpp == 8
     .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
@@ -354,6 +390,8 @@ fname:
 .endif
 .if dst_w_bpp == 32
     .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+    .set dst_bpp_shift, 0  /* 3 bytes per pixel is not a power of two: no shift, the row width is pre-subtracted from DST_STRIDE instead */
 .elseif dst_w_bpp == 16
     .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
@@ -398,6 +436,19 @@ fname:
     PF mov      PF_CTL, H, lsl #4
     PF add      PF_CTL, #(prefetch_distance - 0x10)
 
+.if src_bpp == 24  /* the per-row pointer rewind is skipped for 24bpp, so fold the row width (3 * W) into the stride up front */
+    sub         SRC_STRIDE, SRC_STRIDE, W           /* SRC_STRIDE -= W */
+    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1   /* SRC_STRIDE -= 2 * W, so 3 * W in total */
+.endif
+.if mask_bpp == 24  /* same adjustment for the mask */
+    sub         MASK_STRIDE, MASK_STRIDE, W          /* MASK_STRIDE -= W */
+    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1  /* MASK_STRIDE -= 2 * W, so 3 * W in total */
+.endif
+.if dst_w_bpp == 24  /* same adjustment for the destination */
+    sub         DST_STRIDE, DST_STRIDE, W            /* DST_STRIDE -= W */
+    sub         DST_STRIDE, DST_STRIDE, W, lsl #1    /* DST_STRIDE -= 2 * W, so 3 * W in total */
+.endif
+
     init
 .if regs_shortage
     push        {r0, r1}
@@ -412,7 +463,8 @@ fname:
     cmp         W, #(pixblock_size * 2)
     blt         8f
 0:
-    /* ensure 16 byte alignment of the destination buffer */
+    /* ensure 16 byte alignment of the destination buffer, except for 24bpp */
+.if dst_w_bpp != 24  /* NOTE(review): presumably skipped because 3-byte pixels cannot reach a 16 byte boundary in power-of-two steps; confirm */
     tst         DST_R, #0xF
     beq         2f
 
@@ -454,6 +506,7 @@ fname:
 .endif
 .endr
 2:
+.endif /* dst_w_bpp != 24 */
 
     pixld_a     pixblock_size, dst_r_bpp, \
                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
@@ -520,11 +573,13 @@ fname:
 .if mask_bpp != 0
     add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
 .endif
+.if (dst_w_bpp != 24)  /* for 24bpp the row width was already subtracted from DST_STRIDE */
     sub         DST_W, DST_W, W, lsl #dst_bpp_shift
-.if src_bpp != 0
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)  /* for 24bpp the row width was already subtracted from SRC_STRIDE */
     sub         SRC, SRC, W, lsl #src_bpp_shift
 .endif
-.if mask_bpp != 0
+.if (mask_bpp != 24) && (mask_bpp != 0)  /* for 24bpp the row width was already subtracted from MASK_STRIDE */
     sub         MASK, MASK, W, lsl #mask_bpp_shift
 .endif
     subs        H, H, #1
@@ -539,7 +594,7 @@ fname:
     cleanup
     pop         {r4-r12, pc}  /* exit */
 
-8: /* handle small rectangle, width up to 15 pixels */
+8: /* handle small rectangle, width up to (pixblock_size * 2 - 1) pixels */
     tst         W, #pixblock_size
     beq         1f
     pixld       pixblock_size, dst_r_bpp, \
@@ -592,11 +647,13 @@ fname:
 .if mask_bpp != 0
     add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
 .endif
+.if (dst_w_bpp != 24)  /* for 24bpp the row width was already subtracted from DST_STRIDE */
     sub         DST_W, DST_W, W, lsl #dst_bpp_shift
-.if src_bpp != 0
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)  /* for 24bpp the row width was already subtracted from SRC_STRIDE */
     sub         SRC, SRC, W, lsl #src_bpp_shift
 .endif
-.if mask_bpp != 0
+.if (mask_bpp != 24) && (mask_bpp != 0)  /* for 24bpp the row width was already subtracted from MASK_STRIDE */
     sub         MASK, MASK, W, lsl #mask_bpp_shift
 .endif
     subs        H, H, #1
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 2811099..f3f38a9 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -2065,6 +2065,43 @@ neon_composite_src_8888_8888 (pixman_implementation_t *imp,
 }
 
 void
+pixman_composite_src_0888_0888_asm_neon (int32_t   w,           /* rectangle width, in pixels */
+                                         int32_t   h,           /* number of rows */
+                                         uint8_t  *dst,         /* first destination pixel */
+                                         int32_t   dst_stride,  /* presumably in bytes for 24bpp; confirm */
+                                         uint8_t  *src,         /* first source pixel */
+                                         int32_t   src_stride); /* presumably in bytes for 24bpp; confirm */
+/* Fast path registered for OP_SRC r8g8b8 to r8g8b8: fetch line pointers and strides, then run the NEON copy. */
+static void
+neon_composite_src_0888_0888 (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,  /* not referenced by this function */
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,      /* not referenced by this function */
+                              int32_t                  mask_y,      /* not referenced by this function */
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint8_t *dst_line;
+    uint8_t *src_line;
+    int32_t dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t,
+                           src_stride, src_line, 3);  /* NOTE(review): trailing 3 is presumably bytes per pixel; confirm against the macro */
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t,
+                           dst_stride, dst_line, 3);  /* NOTE(review): trailing 3 is presumably bytes per pixel; confirm against the macro */
+
+    pixman_composite_src_0888_0888_asm_neon (width, height,
+                                             dst_line, dst_stride,
+                                             src_line, src_stride);
+}
+
+void
 pixman_composite_over_8888_8888_asm_neon (int32_t   w,
                                           int32_t   h,
                                           uint32_t *dst,
@@ -2449,6 +2486,7 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] =
     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_src_8888_8888,    0 },
     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_src_8888_8888,    0 },
     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_0565_0565,    0 },
+    { PIXMAN_OP_SRC,  PIXMAN_r8g8b8,   PIXMAN_null,     PIXMAN_r8g8b8,   neon_composite_src_0888_0888,    0 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_8888_0565,   0 },
     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_8888_0565,   0 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, neon_composite_over_8888_8888,   0 },
--
cgit v0.8.2