From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Fri, 17 Jul 2009 10:22:23 +0000 (+0300)
Subject: Fastpath for nearest neighbour scaled compositing operations.
X-Git-Url: http://siarhei.siamashka.name/gitweb/?p=pixman.git;a=commitdiff_plain;h=247531c6978725a88fd3706129b9d3e339026f54

Fastpath for nearest neighbour scaled compositing operations.

OVER 8888x8888, OVER 8888x0565, SRC 8888x8888, SRC 8888x0565
and SRC 0565x0565 cases are supported.
---

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 7f80578..7f3a6ad 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1261,6 +1261,993 @@ fast_composite_src_scale_nearest (pixman_implementation_t *imp,
     }
 }
 
+/*
+ * Functions which implement the core inner loops for the nearest neighbour
+ * scaled fastpath compositing operations. They do not need to do clipping
+ * checks. The loops are unrolled to process two pixels per iteration
+ * for better performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instruction latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#undef READ
+#undef WRITE
+#define READ(img,x) (*(x))
+#define WRITE(img,ptr,v) ((*(ptr)) = (v))
+/* READ/WRITE above are plain loads/stores (img is unused). Below: run the UN8x4 helper on x, then pack x to 0565. */
+#define UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(x, a, y) do {                     \
+        UN8x4_MUL_UN8_ADD_UN8x4(x, a, y);                                      \
+        x = CONVERT_8888_TO_0565(x);                                       \
+    } while (0)
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipOver_8888x0565 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint16_t *dstLine;
+    uint32_t *srcFirstLine;
+    uint32_t  d;
+    uint32_t  s1, s2;
+    uint8_t   a1, a2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+
+    uint32_t *src;
+    uint16_t *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1);
+    /* OVER a 32-bit source onto a 0565 destination, row by row; vx/vy are 16.16 source coords. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        /* Source row out of range: this destination row is left unmodified. */
+        if ((y < 0) || (y >= pSrc->bits.height)) {
+            continue;
+        }
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            s2 = READ(pSrc, src + x2);
+
+            a1 = s1 >> 24;
+            a2 = s2 >> 24;
+            /* a == 0xff: store converted source. s == 0: skip. Else combine dst and source using a ^ 0xff (= 255 - a). */
+            if (a1 == 0xff)
+                WRITE(pDst, dst, CONVERT_8888_TO_0565(s1));
+            else if (s1) {
+                d = CONVERT_0565_TO_0888(READ(pDst, dst));
+                a1 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a1, s1);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+
+            if (a2 == 0xff)
+                WRITE(pDst, dst, CONVERT_8888_TO_0565(s2));
+            else if (s2) {
+                d = CONVERT_0565_TO_0888(READ(pDst, dst));
+                a2 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a2, s2);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+
+            a1 = s1 >> 24;
+            if (a1 == 0xff)
+                WRITE(pDst, dst, CONVERT_8888_TO_0565(s1));
+            else if (s1) {
+                d = CONVERT_0565_TO_0888(READ(pDst, dst));
+                a1 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a1, s1);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+        }
+    }
+}
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatOver_8888x0565 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint16_t *dstLine;
+    uint32_t *srcFirstLine;
+    uint32_t  d;
+    uint32_t  s1, s2;
+    uint8_t   a1, a2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+    int32_t   max_vx, max_vy;
+
+    uint32_t *src;
+    uint16_t *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1);
+    /* Source size in 16.16 fixed point: the wrap-around period for vx and vy. */
+    max_vx = pSrc->bits.width << 16;
+    max_vy = pSrc->bits.height << 16;
+    /* Normalise the starting coordinates into [0, max_vx) and [0, max_vy). */
+    while (orig_vx < 0) orig_vx += max_vx;
+    while (vy < 0) vy += max_vy;
+    while (orig_vx >= max_vx) orig_vx -= max_vx;
+    while (vy >= max_vy) vy -= max_vy;
+    /* OVER onto 0565 with a wrapping source; inside the loops only overflow past max is wrapped. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        while (vy >= max_vy) vy -= max_vy;
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s2 = READ(pSrc, src + x2);
+
+            a1 = s1 >> 24;
+            a2 = s2 >> 24;
+            /* a == 0xff: store converted source. s == 0: skip. Else combine dst and source using a ^ 0xff (= 255 - a). */
+            if (a1 == 0xff)
+                WRITE(pDst, dst, CONVERT_8888_TO_0565(s1));
+            else if (s1) {
+                d = CONVERT_0565_TO_0888(READ(pDst, dst));
+                a1 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a1, s1);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+
+            if (a2 == 0xff)
+                WRITE(pDst, dst, CONVERT_8888_TO_0565(s2));
+            else if (s2) {
+                d = CONVERT_0565_TO_0888(READ(pDst, dst));
+                a2 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a2, s2);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            a1 = s1 >> 24;
+            if (a1 == 0xff)
+                WRITE(pDst, dst, CONVERT_8888_TO_0565(s1));
+            else if (s1) {
+                d = CONVERT_0565_TO_0888(READ(pDst, dst));
+                a1 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4_store_r5g6b5(d, a1, s1);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+        }
+    }
+}
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipOver_8888x8888 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint32_t *dstLine;
+    uint32_t *srcFirstLine;
+    uint32_t  d;
+    uint32_t  s1, s2;
+    uint8_t   a1, a2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+
+    uint32_t *src, *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1);
+    /* OVER a 32-bit source onto a 32-bit destination, row by row; vx/vy are 16.16 source coords. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        /* Source row out of range: this destination row is left unmodified. */
+        if ((y < 0) || (y >= pSrc->bits.height)) {
+            continue;
+        }
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            s2 = READ(pSrc, src + x2);
+
+            a1 = s1 >> 24;
+            a2 = s2 >> 24;
+            /* a == 0xff: store source as is. s == 0: skip. Else combine dst and source using a ^ 0xff (= 255 - a). */
+            if (a1 == 0xff)
+                WRITE(pDst, dst, s1);
+            else if (s1) {
+                d = READ(pDst, dst);
+                a1 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4(d, a1, s1);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+
+            if (a2 == 0xff)
+                WRITE(pDst, dst, s2);
+            else if (s2) {
+                d = READ(pDst, dst);
+                a2 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4(d, a2, s2);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+
+            a1 = s1 >> 24;
+            if (a1 == 0xff)
+                WRITE(pDst, dst, s1);
+            else if (s1) {
+                d = READ(pDst, dst);
+                a1 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4(d, a1, s1);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+        }
+    }
+}
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatOver_8888x8888 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint32_t *dstLine;
+    uint32_t *srcFirstLine;
+    uint32_t  d;
+    uint32_t  s1, s2;
+    uint8_t   a1, a2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+    int32_t   max_vx, max_vy;
+
+    uint32_t *src, *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1);
+    /* Source size in 16.16 fixed point: the wrap-around period for vx and vy. */
+    max_vx = pSrc->bits.width << 16;
+    max_vy = pSrc->bits.height << 16;
+    /* Normalise the starting coordinates into [0, max_vx) and [0, max_vy). */
+    while (orig_vx < 0) orig_vx += max_vx;
+    while (vy < 0) vy += max_vy;
+    while (orig_vx >= max_vx) orig_vx -= max_vx;
+    while (vy >= max_vy) vy -= max_vy;
+    /* OVER onto a 32-bit dst with a wrapping source; inside the loops only overflow past max is wrapped. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        while (vy >= max_vy) vy -= max_vy;
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s2 = READ(pSrc, src + x2);
+
+            a1 = s1 >> 24;
+            a2 = s2 >> 24;
+            /* a == 0xff: store source as is. s == 0: skip. Else combine dst and source using a ^ 0xff (= 255 - a). */
+            if (a1 == 0xff)
+                WRITE(pDst, dst, s1);
+            else if (s1) {
+                d = READ(pDst, dst);
+                a1 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4(d, a1, s1);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+
+            if (a2 == 0xff)
+                WRITE(pDst, dst, s2);
+            else if (s2) {
+                d = READ(pDst, dst);
+                a2 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4(d, a2, s2);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            a1 = s1 >> 24;
+            if (a1 == 0xff)
+                WRITE(pDst, dst, s1);
+            else if (s1) {
+                d = READ(pDst, dst);
+                a1 ^= 0xff;
+                UN8x4_MUL_UN8_ADD_UN8x4(d, a1, s1);
+                WRITE(pDst, dst, d);
+            }
+            dst++;
+        }
+    }
+}
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_8888x8888 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint32_t *dstLine;
+    uint32_t *srcFirstLine;
+    uint32_t  s1, s2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+
+    uint32_t *src, *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1);
+    /* SRC: copy the nearest 32-bit source pixel into each 32-bit destination pixel; vx/vy are 16.16 coords. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        /* Source row out of range: the destination row (width pixels) is zero-filled. */
+        if ((y < 0) || (y >= pSrc->bits.height)) {
+            memset(dst, 0, width * sizeof(*dst));
+            continue;
+        }
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            s2 = READ(pSrc, src + x2);
+
+            WRITE(pDst, dst, s1);
+            dst++;
+            WRITE(pDst, dst, s2);
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+            WRITE(pDst, dst, s1);
+            dst++;
+        }
+    }
+}
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_8888x8888 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint32_t *dstLine;
+    uint32_t *srcFirstLine;
+    uint32_t  s1, s2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+    int32_t   max_vx, max_vy;
+
+    uint32_t *src, *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1);
+    /* Source size in 16.16 fixed point: the wrap-around period for vx and vy. */
+    max_vx = pSrc->bits.width << 16;
+    max_vy = pSrc->bits.height << 16;
+    /* Normalise the starting coordinates into [0, max_vx) and [0, max_vy). */
+    while (orig_vx < 0) orig_vx += max_vx;
+    while (vy < 0) vy += max_vy;
+    while (orig_vx >= max_vx) orig_vx -= max_vx;
+    while (vy >= max_vy) vy -= max_vy;
+    /* SRC 32-bit copy with a wrapping source; inside the loops only overflow past max is wrapped. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        while (vy >= max_vy) vy -= max_vy;
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s2 = READ(pSrc, src + x2);
+
+            WRITE(pDst, dst, s1);
+            dst++;
+            WRITE(pDst, dst, s2);
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            WRITE(pDst, dst, s1);
+            dst++;
+        }
+    }
+}
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_0565x0565 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint16_t *dstLine;
+    uint16_t *srcFirstLine;
+    uint16_t  s1, s2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+
+    uint16_t *src, *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint16_t, srcStride, srcFirstLine, 1);
+    /* SRC: copy the nearest 16-bit source pixel into each 16-bit destination pixel; vx/vy are 16.16 coords. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        /* Source row out of range: the destination row (width pixels) is zero-filled. */
+        if ((y < 0) || (y >= pSrc->bits.height)) {
+            memset(dst, 0, width * sizeof(*dst));
+            continue;
+        }
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            s2 = READ(pSrc, src + x2);
+
+            WRITE(pDst, dst, s1);
+            dst++;
+            WRITE(pDst, dst, s2);
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+            WRITE(pDst, dst, s1);
+            dst++;
+        }
+    }
+}
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_0565x0565 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint16_t *dstLine;
+    uint16_t *srcFirstLine;
+    uint16_t  s1, s2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+    int32_t   max_vx, max_vy;
+
+    uint16_t *src, *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint16_t, srcStride, srcFirstLine, 1);
+    /* Source size in 16.16 fixed point: the wrap-around period for vx and vy. */
+    max_vx = pSrc->bits.width << 16;
+    max_vy = pSrc->bits.height << 16;
+    /* Normalise the starting coordinates into [0, max_vx) and [0, max_vy). */
+    while (orig_vx < 0) orig_vx += max_vx;
+    while (vy < 0) vy += max_vy;
+    while (orig_vx >= max_vx) orig_vx -= max_vx;
+    while (vy >= max_vy) vy -= max_vy;
+    /* SRC 16-bit copy with a wrapping source; inside the loops only overflow past max is wrapped. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        while (vy >= max_vy) vy -= max_vy;
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s2 = READ(pSrc, src + x2);
+
+            WRITE(pDst, dst, s1);
+            dst++;
+            WRITE(pDst, dst, s2);
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            WRITE(pDst, dst, s1);
+            dst++;
+        }
+    }
+}
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_8888x0565 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint16_t *dstLine;
+    uint32_t *srcFirstLine;
+    uint32_t  s1, s2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+
+    uint32_t *src;
+    uint16_t *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1);
+    /* SRC: store the nearest 32-bit source pixel, converted to 0565, into each destination pixel; vx/vy are 16.16 coords. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        /* Source row out of range: the destination row (width pixels) is zero-filled. */
+        if ((y < 0) || (y >= pSrc->bits.height)) {
+            memset(dst, 0, width * sizeof(*dst));
+            continue;
+        }
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            s2 = READ(pSrc, src + x2);
+
+            WRITE(pDst, dst, CONVERT_8888_TO_0565(s1));
+            dst++;
+            WRITE(pDst, dst, CONVERT_8888_TO_0565(s2));
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            s1 = READ(pSrc, src + x1);
+            WRITE(pDst, dst, CONVERT_8888_TO_0565(s1));
+            dst++;
+        }
+    }
+}
+
+static void fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_8888x0565 (
+    pixman_image_t *pSrc, pixman_image_t *pDst, int xSrc, int ySrc, int xDst, int yDst,
+    int width, int height, int32_t vx, int32_t vy, int32_t unit_x, int32_t unit_y)
+{
+    uint16_t *dstLine;
+    uint32_t *srcFirstLine;
+    uint32_t  s1, s2;
+    int       w;
+    int       x1, x2, y;
+    int32_t   orig_vx = vx;
+    int32_t   max_vx, max_vy;
+
+    uint32_t *src;
+    uint16_t *dst;
+    int       srcStride, dstStride;
+    PIXMAN_IMAGE_GET_LINE (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+    /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be
+     * transformed from destination space to source space */
+    PIXMAN_IMAGE_GET_LINE (pSrc, 0, 0, uint32_t, srcStride, srcFirstLine, 1);
+    /* Source size in 16.16 fixed point: the wrap-around period for vx and vy. */
+    max_vx = pSrc->bits.width << 16;
+    max_vy = pSrc->bits.height << 16;
+    /* Normalise the starting coordinates into [0, max_vx) and [0, max_vy). */
+    while (orig_vx < 0) orig_vx += max_vx;
+    while (vy < 0) vy += max_vy;
+    while (orig_vx >= max_vx) orig_vx -= max_vx;
+    while (vy >= max_vy) vy -= max_vy;
+    /* SRC 32-bit to 0565 with a wrapping source; inside the loops only overflow past max is wrapped. */
+    while (--height >= 0)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+
+        y = vy >> 16;
+        vy += unit_y;
+        while (vy >= max_vy) vy -= max_vy;
+
+        src = srcFirstLine + srcStride * y;
+
+        w = width;
+        vx = orig_vx;
+        while ((w -= 2) >= 0)
+        {
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            x2 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s2 = READ(pSrc, src + x2);
+
+            WRITE(pDst, dst, CONVERT_8888_TO_0565(s1));
+            dst++;
+            WRITE(pDst, dst, CONVERT_8888_TO_0565(s2));
+            dst++;
+        }
+        if (w & 1) { /* w is -1 (odd width) or -2 (even width) here: handle the last pixel */
+            x1 = vx >> 16;
+            vx += unit_x;
+            while (vx >= max_vx) vx -= max_vx;
+            s1 = READ(pSrc, src + x1);
+
+            WRITE(pDst, dst, CONVERT_8888_TO_0565(s1));
+            dst++;
+        }
+    }
+}
+
+/*
+ * Return 1 if any of `width` samples vx, vx + unit_x, ... (16.16) leaves [0, pict->bits.width)
+ */
+static inline pixman_bool_t
+fbTransformVerifyHorizontalClipping(pixman_image_t *pict, int width, int32_t vx, int32_t unit_x)
+{
+    int i;
+    for (i = 0; i < width; i++, vx += unit_x) {
+        int col = vx >> 16;
+        if (col < 0 || col >= pict->bits.width) return 1;
+    }
+    return 0;
+}
+
+/*
+ * Return 1 if any of `height` samples vy, vy + unit_y, ... (16.16) leaves [0, pict->bits.height)
+ */
+static inline pixman_bool_t
+fbTransformVerifyVerticalClipping(pixman_image_t *pict, int height, int32_t vy, int32_t unit_y)
+{
+    int i;
+    for (i = 0; i < height; i++, vy += unit_y) {
+        int row = vy >> 16;
+        if (row < 0 || row >= pict->bits.height) return 1;
+    }
+    return 0;
+}
+
+/*
+ * Scale-only (no rotation) nearest-neighbour composite, clipped to the destination clip extents.
+ * Returns 1 if the operation was handled here (including "nothing to draw"), 0 if the caller must fall back.
+ */
+static pixman_bool_t
+fbCompositeTransformNonrotatedAffineTrivialclip (
+			    pixman_op_t     op,
+			    pixman_image_t *pSrc,
+			    pixman_image_t *pMask,
+			    pixman_image_t *pDst,
+			    int16_t         xSrc,
+			    int16_t         ySrc,
+			    int16_t         xMask,
+			    int16_t         yMask,
+			    int16_t         xDst,
+			    int16_t         yDst,
+			    uint16_t        width,
+			    uint16_t        height)
+{
+    pixman_vector_t v, unit;
+    int skipdst_x = 0, skipdst_y = 0;
+
+    /* Handle destination clipping */
+    if (xDst < pDst->common.clip_region.extents.x1) {
+        skipdst_x = pDst->common.clip_region.extents.x1 - xDst;
+        if (skipdst_x >= (int)width)
+            return 1; /* entirely clipped away: nothing to draw, reported as handled */
+        xDst = pDst->common.clip_region.extents.x1;
+        width -= skipdst_x;
+    }
+
+    if (yDst < pDst->common.clip_region.extents.y1) {
+        skipdst_y = pDst->common.clip_region.extents.y1 - yDst;
+        if (skipdst_y >= (int)height)
+            return 1;
+        yDst = pDst->common.clip_region.extents.y1;
+        height -= skipdst_y;
+    }
+
+    if (xDst >= pDst->common.clip_region.extents.x2 ||
+        yDst >= pDst->common.clip_region.extents.y2)
+    {
+        return 1;
+    }
+
+    if (xDst + width > pDst->common.clip_region.extents.x2)
+        width = pDst->common.clip_region.extents.x2 - xDst;
+    if (yDst + height > pDst->common.clip_region.extents.y2)
+        height = pDst->common.clip_region.extents.y2 - yDst;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed(xSrc) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed(ySrc) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (pSrc->common.transform, &v))
+        return 0;
+
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
+    v.vector[0] -= pixman_fixed_e;
+    v.vector[1] -= pixman_fixed_e;
+    /* Source step per destination pixel: the diagonal (x and y scale) entries of the transform. */
+    unit.vector[0] = pSrc->common.transform->matrix[0][0];
+    unit.vector[1] = pSrc->common.transform->matrix[1][1];
+    /* Advance past the source samples of destination pixels clipped off at the left/top. */
+    v.vector[0] += unit.vector[0] * skipdst_x;
+    v.vector[1] += unit.vector[1] * skipdst_y;
+    /* NOTE(review): the products above are computed before the range checks below - confirm they cannot overflow. */
+    /* Check for possible fixed point arithmetics problems/overflows */
+    if (unit.vector[0] <= 0 || unit.vector[1] <= 0)
+        return 0;
+    if (width == 0 || height == 0)
+        return 0;
+    if ((uint32_t)width + (unit.vector[0] >> 16) >= 0x7FFF)
+        return 0;
+    if ((uint32_t)height + (unit.vector[1] >> 16) >= 0x7FFF)
+        return 0;
+
+    /* Horizontal source clipping is only supported for NORMAL repeat */
+    if (pSrc->common.repeat != PIXMAN_REPEAT_NORMAL
+        && fbTransformVerifyHorizontalClipping(pSrc, width, v.vector[0], unit.vector[0])) {
+        return 0;
+    }
+
+    /* Vertical source clipping is only supported for NONE and NORMAL repeat */
+    if (pSrc->common.repeat != PIXMAN_REPEAT_NONE && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL
+        && fbTransformVerifyVerticalClipping(pSrc, height, v.vector[1], unit.vector[1])) {
+        return 0;
+    }
+    /* Dispatch on op and formats; REPEAT_NORMAL picks the wrapping ("Repeat") loop variants. */
+    if (op == PIXMAN_OP_OVER && pSrc->bits.format == PIXMAN_a8r8g8b8
+            && (pDst->bits.format == PIXMAN_x8r8g8b8 || pDst->bits.format == PIXMAN_a8r8g8b8))
+    {
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipOver_8888x8888(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatOver_8888x8888(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+    }
+
+    if (op == PIXMAN_OP_SRC && (pSrc->bits.format == PIXMAN_x8r8g8b8 || pSrc->bits.format == PIXMAN_a8r8g8b8)
+            && (pDst->bits.format == PIXMAN_x8r8g8b8 || pDst->bits.format == pSrc->bits.format))
+    {
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_8888x8888(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_8888x8888(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+    }
+
+    if (op == PIXMAN_OP_OVER && pSrc->bits.format == PIXMAN_a8r8g8b8 && pDst->bits.format == PIXMAN_r5g6b5)
+    {
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipOver_8888x0565(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatOver_8888x0565(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+    }
+
+    if (op == PIXMAN_OP_SRC && pSrc->bits.format == PIXMAN_r5g6b5 && pDst->bits.format == PIXMAN_r5g6b5)
+    {
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_0565x0565(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_0565x0565(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+    }
+
+    if (op == PIXMAN_OP_SRC && (pSrc->bits.format == PIXMAN_x8r8g8b8 || pSrc->bits.format == PIXMAN_a8r8g8b8)
+        && pDst->bits.format == PIXMAN_r5g6b5)
+    {
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat != PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipSrc_8888x0565(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+        if (pSrc->common.filter == PIXMAN_FILTER_NEAREST && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL) {
+            fbCompositeTransformNearestNonrotatedAffineTrivialclipRepeatSrc_8888x0565(
+                pSrc, pDst, xSrc, ySrc, xDst, yDst, width, height,
+                v.vector[0], v.vector[1], unit.vector[0], unit.vector[1]);
+            return 1;
+        }
+    }
+
+    /* No fastpath scaling implemented for this case */
+    return 0;
+}
+
 static void
 fast_path_composite (pixman_implementation_t *imp,
                      pixman_op_t              op,
@@ -1279,6 +2266,30 @@ fast_path_composite (pixman_implementation_t *imp,
     if (src->type == BITS
         && src->common.transform
         && !mask
+        && !src->common.alpha_map && !dest->common.alpha_map
+        && (src->common.filter == PIXMAN_FILTER_NEAREST)
+        && !src->bits.read_func && !src->bits.write_func
+        && !dest->bits.read_func && !dest->bits.write_func)
+    {
+        /* ensure that the transform matrix only has a scale */
+        if (src->common.transform->matrix[0][1] == 0 &&
+            src->common.transform->matrix[1][0] == 0 &&
+            src->common.transform->matrix[2][0] == 0 &&
+            src->common.transform->matrix[2][1] == 0 &&
+            src->common.transform->matrix[2][2] == pixman_fixed_1 &&
+            dest->common.clip_region.data == NULL)
+        {
+            if (fbCompositeTransformNonrotatedAffineTrivialclip (op, src, mask, dest,
+                    src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height))
+            {
+                return;
+            }
+        }
+    }
+
+    if (src->type == BITS
+        && src->common.transform
+        && !mask
         && op == PIXMAN_OP_SRC
         && !src->common.alpha_map && !dest->common.alpha_map
         && (src->common.filter == PIXMAN_FILTER_NEAREST)