From: Siarhei Siamashka
Date: Mon, 27 Jul 2009 04:48:04 +0000 (+0300)
Subject: ARM: NEON optimized version of composite_over_8888_0565
X-Git-Url: http://siarhei.siamashka.name/gitweb/?p=pixman.git;a=commitdiff_plain;h=17d8ab82858511f212dfb30c347255393eb12b0c

ARM: NEON optimized version of composite_over_8888_0565
---

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 9404c70..f1dcf1f 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1447,6 +1447,274 @@ neon_composite_src_16_16 (pixman_implementation_t * impl,
     }
 }
 
+static inline void
+neon_composite_over_8888_0565_internal (uint32_t *src,
+                                        uint16_t *dst,
+                                        int32_t   w,
+                                        int32_t   h,
+                                        int32_t   src_stride,
+                                        int32_t   dst_stride)
+{
+    int32_t dst_newline_delta = (dst_stride - w) * 2;
+    int32_t src_newline_delta = (src_stride - w) * 4;
+    asm volatile (
+
+        ".macro process_pixblock_head size\n"
+        /* load pixel data from memory */
+        "    .if \\size == 8\n"
+        "        vld1.32 {d0, d1, d2, d3}, [%[src]]!\n"
+        "        vld1.16 {d4, d5}, [%[dst_r]]!\n"
+        "    .elseif \\size == 4\n"
+        "        vld1.32 {d0, d1}, [%[src]]!\n"
+        "        vld1.16 {d4}, [%[dst_r]]!\n"
+        "    .elseif \\size == 2\n"
+        "        vld1.32 {d0}, [%[src]]!\n"
+        "        vld1.16 {d4[0]}, [%[dst_r]]!\n"
+        "        vld1.16 {d4[1]}, [%[dst_r]]!\n"
+        "    .elseif \\size == 1\n"
+        "        vld1.32 {d0[0]}, [%[src]]!\n"
+        "        vld1.16 {d4[0]}, [%[dst_r]]!\n"
+        "    .endif\n"
+        /* deinterleave and convert both source and destination
+           to "planar" 8-bit format */
+        "    vshrn.u16   d16, q2, #8\n"
+        "    vuzp.8      d0, d1\n"
+        "    vshrn.u16   d17, q2, #3\n"
+        "    vuzp.8      d2, d3\n"
+        "    vsli.u16    q2, q2, #5\n"
+        "    vuzp.8      d1, d3\n"
+        "    vsri.u8     d16, d16, #5\n"
+        "    vuzp.8      d0, d2\n"
+        "    vmvn.8      d3, d3\n"
+        "    vsri.u8     d17, d17, #6\n"
+        "    vshrn.u16   d18, q2, #2\n"
+        /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
+        /* destination: d16 - red, d17 - green, d18 - blue */
+        /* now do alpha blending */
+        "    vmull.u8    q10, d3, d16\n"
+        "    pld         [%[src], #128]\n"
+        "    vmull.u8    q11, d3, d17\n"
+        "    pld         [%[dst_r], #64]\n"
+        "    vmull.u8    q12, d3, d18\n"
+        "    vrshr.u16   q13, q10, #8\n"
+        "    vrshr.u16   q8,  q11, #8\n"
+        "    vrshr.u16   q9,  q12, #8\n"
+        "    vraddhn.u16 d20, q10, q13\n"
+        "    vraddhn.u16 d21, q11, q8\n"
+        "    vraddhn.u16 d22, q12, q9\n"
+        ".endm\n"
+
+        ".macro process_pixblock_tail size\n"
+        /* result is ready in d28, d29, d30 (R, G, B) */
+        "    vqadd.u8    d28, d2, d20\n"
+        "    vqadd.u8    d29, d1, d21\n"
+        "    vqadd.u8    d30, d0, d22\n"
+        /* convert it to r5g6b5 */
+        "    vshll.u8    q3,  d28, #8\n"
+        "    vshll.u8    q14, d29, #8\n"
+        "    vshll.u8    q15, d30, #8\n"
+        "    vsri.u16    q3, q14, #5\n"
+        "    vsri.u16    q3, q15, #11\n"
+        /* store pixel data to memory */
+        "    .if \\size == 8\n"
+        "        vst1.16 {d6, d7}, [%[dst_w], :128]!\n"
+        "    .elseif \\size == 4\n"
+        "        vst1.16 {d6}, [%[dst_w]]!\n"
+        "    .elseif \\size == 2\n"
+        "        vst1.16 {d6[0]}, [%[dst_w]]!\n"
+        "        vst1.16 {d6[1]}, [%[dst_w]]!\n"
+        "    .elseif \\size == 1\n"
+        "        vst1.16 {d6[0]}, [%[dst_w]]!\n"
+        "    .endif\n"
+        ".endm\n"
+
+        /* the "tail" of the previous block and the "head" of the next
+           block are merged and interleaved for better instruction
+           scheduling */
+        ".macro process_pixblock_tail_head_8\n"
+        "    vqadd.u8    d28, d2, d20\n"
+        "    vld1.16     {d4, d5}, [%[dst_r], :128]!\n"
+        "    vqadd.u8    d29, d1, d21\n" /* TODO: try to join these into a */
+        "    vqadd.u8    d30, d0, d22\n" /* single 128-bit operation       */
+        "    vshrn.u16   d16, q2, #8\n"
+        "    vld1.32     {d0, d1, d2, d3}, [%[src]]!\n" /* TODO: maybe split */
+        "    vshrn.u16   d17, q2, #3\n"
+        "    vsli.u16    q2, q2, #5\n"
+        "    vuzp.8      d0, d1\n"
+        "    vshll.u8    q3, d28, #8\n"
+        "    vuzp.8      d2, d3\n"
+        "    vshll.u8    q14, d29, #8\n"
+        "    vuzp.8      d1, d3\n"
+        "    vsri.u8     d16, d16, #5\n"
+        "    vuzp.8      d0, d2\n"
+        "    vmvn.8      d3, d3\n"
+        "    vsri.u8     d17, d17, #6\n"
+        "    vshrn.u16   d18, q2, #2\n"
+        "    vmull.u8    q10, d3, d16\n"
+        "    pld         [%[src], #128]\n"
+        "    vmull.u8    q11, d3, d17\n"
+        "    pld         [%[dst_r], #64]\n"
+        "    vmull.u8    q12, d3, d18\n"
+        "    vsri.u16    d6, d28, #5\n"
+        "    vsri.u16    d7, d29, #5\n"
+        "    vshll.u8    q15, d30, #8\n"
+        "    vrshr.u16   q13, q10, #8\n"
+        "    vrshr.u16   q8,  q11, #8\n"
+        "    vrshr.u16   q9,  q12, #8\n"
+        "    vsri.u16    d6, d30, #11\n"
+        "    vsri.u16    d7, d31, #11\n"
+        "    vraddhn.u16 d20, q10, q13\n"
+        "    vraddhn.u16 d21, q11, q8\n"
+        "    vraddhn.u16 d22, q12, q9\n"
+        "    vst1.16     {d6, d7}, [%[dst_w], :128]!\n"
+        ".endm\n"
+
+        "    subs %[h], %[h], #1\n"
+        "    blt 9f\n"
+        "0:\n"
+        "    cmp %[w], #8\n"
+        "    blt 8f\n"
+
+        /* ensure 16 byte alignment of the destination buffer */
+        "    tst %[dst_r], #0xF\n"
+        "    beq 2f\n"
+        "    tst %[dst_r], #2\n"
+        "    beq 1f\n"
+        "    vld1.32 {d3[0]}, [%[src]]!\n"
+        "    vld1.16 {d5[2]}, [%[dst_r]]!\n"
+        "    sub %[w], %[w], #1\n"
+        "1:\n"
+        "    tst %[dst_r], #4\n"
+        "    beq 1f\n"
+        "    vld1.32 {d2}, [%[src]]!\n"
+        "    vld1.16 {d5[0]}, [%[dst_r]]!\n"
+        "    vld1.16 {d5[1]}, [%[dst_r]]!\n"
+        "    sub %[w], %[w], #2\n"
+        "1:\n"
+        "    tst %[dst_r], #8\n"
+        "    beq 1f\n"
+        "    vld1.32 {d0, d1}, [%[src]]!\n"
+        "    vld1.16 {d4}, [%[dst_r]]!\n"
+        "    sub %[w], %[w], #4\n"
+        "1:\n"
+        "    process_pixblock_head -1\n"
+        "    process_pixblock_tail -1\n"
+        "    tst %[dst_w], #2\n"
+        "    beq 1f\n"
+        "    vst1.16 {d7[2]}, [%[dst_w]]!\n"
+        "1:\n"
+        "    tst %[dst_w], #4\n"
+        "    beq 1f\n"
+        "    vst1.16 {d7[0]}, [%[dst_w]]!\n"
+        "    vst1.16 {d7[1]}, [%[dst_w]]!\n"
+        "1:\n"
+        "    tst %[dst_w], #8\n"
+        "    beq 2f\n"
+        "    vst1.16 {d6}, [%[dst_w]]!\n"
+        "2:\n"
+
+        "    subs %[w], %[w], #8\n"
+        "    blt 8f\n"
+        "    process_pixblock_head 8\n"
+        "    subs %[w], %[w], #8\n"
+        "    blt 2f\n"
+        "1:\n" /* innermost pipelined loop */
+        "    process_pixblock_tail_head_8\n"
+        "    subs %[w], %[w], #8\n"
+        "    bge 1b\n"
+        "2:\n"
+        "    process_pixblock_tail 8\n"
+
+        "8:\n"
+        /* process up to 7 remaining pixels */
+        "    tst %[w], #7\n"
+        "    beq 2f\n"
+        "    tst %[w], #4\n"
+        "    beq 1f\n"
+        "    vld1.32 {d0, d1}, [%[src]]!\n"
+        "    vld1.16 {d4}, [%[dst_r]]!\n"
+        "1:\n"
+        "    tst %[w], #2\n"
+        "    beq 1f\n"
+        "    vld1.32 {d2}, [%[src]]!\n"
+        "    vld1.16 {d5[0]}, [%[dst_r]]!\n"
+        "    vld1.16 {d5[1]}, [%[dst_r]]!\n"
+        "1:\n"
+        "    tst %[w], #1\n"
+        "    beq 1f\n"
+        "    vld1.32 {d3[0]}, [%[src]]!\n"
+        "    vld1.16 {d5[2]}, [%[dst_r]]!\n"
+        "1:\n"
+
+        "    process_pixblock_head -1\n"
+        "    process_pixblock_tail -1\n"
+
+        "    tst %[w], #4\n"
+        "    beq 1f\n"
+        "    vst1.16 {d6}, [%[dst_w]]!\n"
+        "1:\n"
+        "    tst %[w], #2\n"
+        "    beq 1f\n"
+        "    vst1.16 {d7[0]}, [%[dst_w]]!\n"
+        "    vst1.16 {d7[1]}, [%[dst_w]]!\n"
+        "1:\n"
+        "    tst %[w], #1\n"
+        "    beq 2f\n"
+        "    vst1.16 {d7[2]}, [%[dst_w]]!\n"
+        "2:\n"
+
+        "    add %[src], %[src], %[src_newline_delta]\n"
+        "    add %[dst_r], %[dst_r], %[dst_newline_delta]\n"
+        "    add %[dst_w], %[dst_w], %[dst_newline_delta]\n"
+        "    mov %[w], %[orig_w]\n"
+        "    subs %[h], %[h], #1\n"
+        "    bge 0b\n"
+        "9:\n"
+        ".purgem process_pixblock_head\n"
+        ".purgem process_pixblock_tail\n"
+        ".purgem process_pixblock_tail_head_8\n"
+
+        : [src] "+&r" (src), [dst_r] "+&r" (dst), [dst_w] "+&r" (dst),
+          [w] "+&r" (w), [h] "+&r" (h)
+        : [dst_newline_delta] "r" (dst_newline_delta),
+          [src_newline_delta] "r" (src_newline_delta), [orig_w] "r" (w)
+        : "cc", "memory",
+          "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+          /* "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", */
+          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+          "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+        );
+}
+
+static void
+neon_composite_over_8888_0565 (pixman_implementation_t *imp,
+                               pixman_op_t              op,
+                               pixman_image_t          *src_image,
+                               pixman_image_t          *mask_image,
+                               pixman_image_t          *dst_image,
+                               int32_t                  src_x,
+                               int32_t                  src_y,
+                               int32_t                  mask_x,
+                               int32_t                  mask_y,
+                               int32_t                  dest_x,
+                               int32_t                  dest_y,
+                               int32_t                  width,
+                               int32_t                  height)
+{
+    uint16_t *dst_line;
+    uint32_t *src_line;
+    int32_t   dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    neon_composite_over_8888_0565_internal (src_line,
+                                            dst_line,
+                                            width,
+                                            height,
+                                            src_stride,
+                                            dst_stride);
+}
+
 #endif /* USE_GCC_INLINE_ASM */
 
 static void
@@ -1908,6 +2176,8 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] =
 #ifdef USE_GCC_INLINE_ASM
     { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null, PIXMAN_r5g6b5,   neon_composite_src_16_16,      0 },
     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null, PIXMAN_b5g6r5,   neon_composite_src_16_16,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5,   neon_composite_over_8888_0565, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5,   neon_composite_over_8888_0565, 0 },
 #endif
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },
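
For reference, the per-pixel math that the NEON code above vectorizes is the
standard OVER operator for a premultiplied a8r8g8b8 source against an r5g6b5
destination. The scalar C sketch below is not part of the patch and the helper
name is purely illustrative; it mirrors the rounding of the
vmull.u8 + vrshr.u16 + vraddhn.u16 sequence, the bit-replicating 565-to-888
expansion done with vsri.u8/vsli.u16, and the saturating vqadd.u8 adds:

#include <stdint.h>

/* Scalar reference for one pixel of OVER (a8r8g8b8 over r5g6b5).
 * Hypothetical helper for illustration only, not part of pixman. */
static uint16_t
over_8888_0565_pixel (uint32_t s, uint16_t d)
{
    uint32_t a  = s >> 24;
    uint32_t sr = (s >> 16) & 0xff;
    uint32_t sg = (s >> 8)  & 0xff;
    uint32_t sb = s & 0xff;

    /* expand r5g6b5 to 8 bits per channel by replicating the high bits
     * into the low bits (the trick the vsri.u8/vsli.u16 instructions use) */
    uint32_t dr = (d >> 11) & 0x1f;
    uint32_t dg = (d >> 5)  & 0x3f;
    uint32_t db = d & 0x1f;
    dr = (dr << 3) | (dr >> 2);
    dg = (dg << 2) | (dg >> 4);
    db = (db << 3) | (db >> 2);

    /* dest * (255 - alpha) / 255 with the rounding produced by
     * vmull.u8 + vrshr.u16 #8 + vraddhn.u16:
     * t = x * na; result = (t + 128 + ((t + 128) >> 8)) >> 8 */
    uint32_t na = 255 - a;
    uint32_t tr = dr * na + 128; dr = (tr + (tr >> 8)) >> 8;
    uint32_t tg = dg * na + 128; dg = (tg + (tg >> 8)) >> 8;
    uint32_t tb = db * na + 128; db = (tb + (tb >> 8)) >> 8;

    /* saturating add of the premultiplied source (vqadd.u8) */
    uint32_t r = sr + dr; if (r > 255) r = 255;
    uint32_t g = sg + dg; if (g > 255) g = 255;
    uint32_t b = sb + db; if (b > 255) b = 255;

    /* pack back to r5g6b5 */
    return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}

Note also why the asm binds dst to both dst_r and dst_w: the destination is
read one pixel block ahead of where it is written, so the software-pipelined
inner loop needs two independently advancing pointers into the same row.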