From 98d458dea913d7d76c48c48de9ef3aee85cced3a Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Mon, 12 Oct 2009 22:25:38 +0300 Subject: [PATCH 7/7] ARM: Enabled new NEON optimizations --- pixman/pixman-arm-neon.c | 535 ++++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 520 insertions(+), 15 deletions(-) diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index fe57daa..2811099 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -34,6 +34,18 @@ #include #include "pixman-private.h" +/* + * Use GNU assembler optimizations only if we are completely sure that + * the target system has compatible ABI and calling conventions. This + * check can be updated/extended if more systems turn out to be actually + * compatible. + */ +#if defined(__linux__) && defined(__ARM_EABI__) && defined(USE_GCC_INLINE_ASM) +#define USE_GNU_ASSEMBLER_ARM_NEON +#endif + +#ifndef USE_GNU_ASSEMBLER_ARM_NEON + /* Deal with an intrinsic that is defined differently in GCC */ #if !defined(__ARMCC_VERSION) && !defined(__pld) #define __pld(_x) __builtin_prefetch (_x) @@ -1901,17 +1913,7 @@ pixman_fill_neon (uint32_t *bits, #endif } -/* - * Use GNU assembler optimizations only if we are completely sure that - * the target system has compatible ABI and calling conventions. This - * check can be updated/extended if more systems turn out to be actually - * compatible. - */ -#if defined(__linux__) && defined(__ARM_EABI__) && defined(USE_GCC_INLINE_ASM) -#define USE_GNU_ASSEMBLER_ASM -#endif - -#ifdef USE_GNU_ASSEMBLER_ASM +#else /* USE_GNU_ASSEMBLER_ARM_NEON */ void pixman_composite_over_8888_0565_asm_neon (int32_t w, @@ -1941,23 +1943,525 @@ neon_composite_over_8888_0565 (pixman_implementation_t *imp, int32_t dst_stride, src_stride; PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, - src_stride, src_line, 1); + src_stride, src_line, 1); PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, - dst_stride, dst_line, 1); + dst_stride, dst_line, 1); pixman_composite_over_8888_0565_asm_neon (width, height, dst_line, dst_stride, src_line, src_stride); } +void +pixman_composite_src_8888_0565_asm_neon (int32_t w, + int32_t h, + uint16_t *dst, + int32_t dst_stride, + uint32_t *src, + int32_t src_stride); + +static void +neon_composite_src_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line; + uint32_t *src_line; + int32_t dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, + dst_stride, dst_line, 1); + + pixman_composite_src_8888_0565_asm_neon (width, height, + dst_line, dst_stride, + src_line, src_stride); +} + +void +pixman_composite_src_0565_0565_asm_neon (int32_t w, + int32_t h, + uint16_t *dst, + int32_t dst_stride, + uint16_t *src, + int32_t src_stride); + +static void +neon_composite_src_0565_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line; + uint16_t *src_line; + int32_t dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, + dst_stride, dst_line, 1); + + pixman_composite_src_0565_0565_asm_neon (width, height, + dst_line, dst_stride, + src_line, src_stride); +} + + +void +pixman_composite_src_8888_8888_asm_neon (int32_t w, + int32_t h, + uint32_t *dst, + int32_t dst_stride, + uint32_t *src, + int32_t src_stride); + +static void +neon_composite_src_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line; + uint32_t *src_line; + int32_t dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, + dst_stride, dst_line, 1); + + pixman_composite_src_8888_8888_asm_neon (width, height, + dst_line, dst_stride, + src_line, src_stride); +} + +void +pixman_composite_over_8888_8888_asm_neon (int32_t w, + int32_t h, + uint32_t *dst, + int32_t dst_stride, + uint32_t *src, + int32_t src_stride); + +static void +neon_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line; + uint32_t *src_line; + int32_t dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, + dst_stride, dst_line, 1); + + pixman_composite_over_8888_8888_asm_neon (width, height, + dst_line, dst_stride, + src_line, src_stride); +} + +void +pixman_composite_add_8000_8000_asm_neon (int32_t w, + int32_t h, + uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride); + +static void +neon_composite_add_8000_8000 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line; + uint8_t *src_line; + int32_t dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, + dst_stride, dst_line, 1); + + pixman_composite_add_8000_8000_asm_neon (width, height, + dst_line, dst_stride, + src_line, src_stride); +} + +void +pixman_composite_over_n_8_0565_asm_neon (int32_t w, + int32_t h, + uint16_t *dst, + int32_t dst_stride, + uint32_t src, + int32_t unused, + uint8_t *mask, + int32_t mask_stride); + +static void +neon_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line; + uint8_t *mask_line; + int32_t dst_stride, mask_stride; + uint32_t src; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, + mask_stride, mask_line, 1); + + pixman_composite_over_n_8_0565_asm_neon (width, height, + dst_line, dst_stride, + src, 0, + mask_line, mask_stride); +} + +void +pixman_composite_over_n_8_8888_asm_neon (int32_t w, + int32_t h, + uint32_t *dst, + int32_t dst_stride, + uint32_t src, + int32_t unused, + uint8_t *mask, + int32_t mask_stride); + +static void +neon_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line; + uint8_t *mask_line; + int32_t dst_stride, mask_stride; + uint32_t src; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, + mask_stride, mask_line, 1); + + pixman_composite_over_n_8_8888_asm_neon (width, height, + dst_line, dst_stride, + src, 0, + mask_line, mask_stride); +} + +void +pixman_composite_add_8_8_8_asm_neon (int32_t w, + int32_t h, + uint8_t *dst, + int32_t dst_stride, + uint8_t *src, + int32_t src_stride, + uint8_t *mask, + int32_t mask_stride); + +static void +neon_composite_add_8_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *src_line; + uint8_t *dst_line; + uint8_t *mask_line; + int32_t src_stride, dst_stride, mask_stride; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, + mask_stride, mask_line, 1); + + pixman_composite_add_8_8_8_asm_neon (width, height, + dst_line, dst_stride, + src_line, src_stride, + mask_line, mask_stride); +} + +void +pixman_composite_add_n_8_8_asm_neon (int32_t w, + int32_t h, + uint8_t *dst, + int32_t dst_stride, + uint32_t src, + int32_t unused, + uint8_t *mask, + int32_t mask_stride); + +static void +neon_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line; + uint8_t *mask_line; + int32_t dst_stride, mask_stride; + uint32_t src; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, + dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, + mask_stride, mask_line, 1); + + pixman_composite_add_n_8_8_asm_neon (width, height, + dst_line, dst_stride, + src, 0, + mask_line, mask_stride); +} + +void +pixman_composite_over_8888_n_8888_asm_neon (int32_t w, + int32_t h, + uint32_t *dst, + int32_t dst_stride, + uint32_t *src, + int32_t src_stride, + uint32_t mask); + +static void +neon_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line; + uint32_t *src_line; + int32_t dst_stride, src_stride; + uint32_t mask; + + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); + + if (mask == 0) + return; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, + src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, + dst_stride, dst_line, 1); + + pixman_composite_over_8888_n_8888_asm_neon (width, height, + dst_line, dst_stride, + src_line, src_stride, + mask); +} + +void +pixman_composite_src_n_8_asm_neon (int32_t w, + int32_t h, + uint8_t *dst, + int32_t dst_stride, + uint8_t src); + +void +pixman_composite_src_n_0565_asm_neon (int32_t w, + int32_t h, + uint16_t *dst, + int32_t dst_stride, + uint16_t src); + +void +pixman_composite_src_n_8888_asm_neon (int32_t w, + int32_t h, + uint32_t *dst, + int32_t dst_stride, + uint32_t src); + +static pixman_bool_t +pixman_fill_neon (uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t _xor) +{ + /* stride is always multiple of 32bit units in pixman */ + uint32_t byte_stride = stride * sizeof(uint32_t); + + switch (bpp) + { + case 8: + pixman_composite_src_n_8_asm_neon ( + width, + height, + (uint8_t *)(((char *) bits) + y * byte_stride + x), + byte_stride, + _xor & 0xff); + return TRUE; + case 16: + pixman_composite_src_n_0565_asm_neon ( + width, + height, + (uint16_t *)(((char *) bits) + y * byte_stride + x * 2), + byte_stride / 2, + _xor & 0xffff); + return TRUE; + case 32: + pixman_composite_src_n_8888_asm_neon ( + width, + height, + (uint32_t *)(((char *) bits) + y * byte_stride + x * 4), + byte_stride / 4, + _xor); + return TRUE; + default: + return FALSE; + } +} + #endif static const pixman_fast_path_t arm_neon_fast_path_array[] = { -#ifdef USE_GNU_ASSEMBLER_ASM +#ifdef USE_GNU_ASSEMBLER_ARM_NEON + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_0565_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_8888_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_8888_0565, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_a8, PIXMAN_a8, neon_composite_add_8_8_8, 0 }, + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_n_8_8, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_composite_add_8000_8000, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_8888_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_8888_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_src_8888_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_src_8888_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_src_8888_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_0565_0565, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 }, -#endif + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888, 0 }, +#else { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_n_8_8, 0 }, { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_composite_add_8000_8000, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565, 0 }, @@ -1980,6 +2484,7 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] = { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888, 0 }, +#endif { PIXMAN_OP_NONE }, }; -- 1.6.2.4