summaryrefslogtreecommitdiff
path: root/packages/xorg-driver/xf86-video-omapfb
diff options
context:
space:
mode:
authorKoen Kooi <koen@openembedded.org>2009-02-03 10:54:18 +0100
committerKoen Kooi <koen@openembedded.org>2009-02-03 10:54:18 +0100
commit0e58397cce01711f0c547d98f6a00224bdaaf2ec (patch)
treec48df19a6afbc10ba2d45d866df8d9d517229e8d /packages/xorg-driver/xf86-video-omapfb
parent1bdacf8052f9034e4a1cf1457ba8b8ab67e95a73 (diff)
xf86-video-omapfb: add patch for armv7a to do pixel conversion with NEON SIMD
Diffstat (limited to 'packages/xorg-driver/xf86-video-omapfb')
-rw-r--r--packages/xorg-driver/xf86-video-omapfb/omapfb-neon.diff146
1 files changed, 146 insertions, 0 deletions
diff --git a/packages/xorg-driver/xf86-video-omapfb/omapfb-neon.diff b/packages/xorg-driver/xf86-video-omapfb/omapfb-neon.diff
new file mode 100644
index 0000000000..325ca66f0c
--- /dev/null
+++ b/packages/xorg-driver/xf86-video-omapfb/omapfb-neon.diff
@@ -0,0 +1,146 @@
+--- /tmp/image-format-conversions.h 2009-02-03 10:18:04.000000000 +0100
++++ git/src/image-format-conversions.h 2009-02-03 10:19:18.000000000 +0100
+@@ -30,6 +30,8 @@
+ /* Basic C implementation of YV12/I420 to UYVY conversion */
+ void uv12_to_uyvy(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest);
+
++/* NEON implementation of YV12/I420 to UYVY conversion */
++void uv12_to_uyvy_neon(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest);
+
+ #endif /* __IMAGE_FORMAT_CONVERSIONS_H__ */
+
+--- /tmp/image-format-conversions.c 2009-02-03 10:18:04.000000000 +0100
++++ git/src/image-format-conversions.c 2009-02-03 10:16:47.000000000 +0100
+@@ -2,6 +2,7 @@
+ * Copyright 2008 Kalle Vahlman, <zuh@iki.fi>
+ * Ilpo Ruotsalainen, <lonewolf@iki.fi>
+ * Tuomas Kulve, <tuomas.kulve@movial.com>
++ * Ian Rickards, <ian.rickards@arm.com>
+ *
+ *
+ * Permission to use, copy, modify, distribute and sell this software and its
+@@ -89,3 +90,104 @@
+ }
+ }
+
++void uv12_to_uyvy_neon(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest)
++{
++ int x, y;
++ uint8_t *dest_even = dest;
++ uint8_t *dest_odd = dest + w * 2;
++ uint8_t *y_p_even = y_p;
++ uint8_t *y_p_odd = y_p + y_pitch;
++
++ /*ErrorF("in uv12_to_uyvy, w: %d, pitch: %d\n", w, pitch);*/
++ if (w<16)
++ {
++ for (y=0; y<h; y+=2)
++ {
++ for (x=0; x<w; x+=2)
++ {
++ /* Output two 2x1 macroblocks to form a 2x2 block from input */
++ uint8_t u_val = *u_p++;
++ uint8_t v_val = *v_p++;
++
++ /* Even row, first pixel */
++ *dest_even++ = u_val;
++ *dest_even++ = *y_p_even++;
++
++ /* Even row, second pixel */
++ *dest_even++ = v_val;
++ *dest_even++ = *y_p_even++;
++
++ /* Odd row, first pixel */
++ *dest_odd++ = u_val;
++ *dest_odd++ = *y_p_odd++;
++
++ /* Odd row, second pixel */
++ *dest_odd++ = v_val;
++ *dest_odd++ = *y_p_odd++;
++ }
++
++ dest_even += w * 2;
++ dest_odd += w * 2;
++
++ u_p += ((uv_pitch << 1) - w) >> 1;
++ v_p += ((uv_pitch << 1) - w) >> 1;
++
++ y_p_even += (y_pitch - w) + y_pitch;
++ y_p_odd += (y_pitch - w) + y_pitch;
++ }
++ }
++ else
++ {
++ for (y=0; y<h; y+=2)
++ {
++ x=w;
++ do {
++ // avoid using d8-d15 (q4-q7) aapcs callee-save registers
++ asm volatile (
++ "1:\n\t"
++ "vld1.u8 {d0}, [%[u_p]]!\n\t"
++ "sub %[x],%[x],#16\n\t"
++ "cmp %[x],#16\n\t"
++ "vld1.u8 {d1}, [%[v_p]]!\n\t"
++ "vld1.u8 {q1}, [%[y_p_even]]!\n\t"
++ "vzip.u8 d0, d1\n\t"
++ "vld1.u8 {q2}, [%[y_p_odd]]!\n\t"
++ // use 2-element struct stores to zip up y with y&v
++ "vst2.u8 {q0,q1}, [%[dest_even]]!\n\t"
++ "vmov.u8 q1, q2\n\t"
++ "vst2.u8 {q0,q1}, [%[dest_odd]]!\n\t"
++ "bhs 1b\n\t"
++ : [u_p] "+r" (u_p), [v_p] "+r" (v_p), [y_p_even] "+r" (y_p_even), [y_p_odd] "+r" (y_p_odd),
++ [dest_even] "+r" (dest_even), [dest_odd] "+r" (dest_odd),
++ [x] "+r" (x)
++ :
++ : "cc", "memory", "d0","d1","d2","d3","d4","d5"
++ );
++ if (x!=0)
++ {
++ // overlap final 16-pixel block to process requested width exactly
++ x = 16-x;
++ u_p -= x/2;
++ v_p -= x/2;
++ y_p_even -= x;
++ y_p_odd -= x;
++ dest_even -= x*2;
++ dest_odd -= x*2;
++ x = 16;
++ // do another 16-pixel block
++ }
++ }
++ while (x!=0);
++
++ dest_even += w * 2;
++ dest_odd += w * 2;
++
++ u_p += ((uv_pitch << 1) - w) >> 1;
++ v_p += ((uv_pitch << 1) - w) >> 1;
++
++ y_p_even += (y_pitch - w) + y_pitch;
++ y_p_odd += (y_pitch - w) + y_pitch;
++ }
++ }
++}
++
+--- /tmp/omapfb-xv-generic.c 2009-02-03 10:52:18.000000000 +0100
++++ git/src/omapfb-xv-generic.c 2009-02-03 10:52:24.000000000 +0100
+@@ -240,7 +240,7 @@
+ uint8_t *yb = buf;
+ uint8_t *ub = yb + (src_y_pitch * src_h);
+ uint8_t *vb = ub + (src_uv_pitch * (src_h / 2));
+- uv12_to_uyvy(src_w & ~15,
++ uv12_to_uyvy_neon(src_w & ~15,
+ src_h & ~15,
+ src_y_pitch,
+ src_uv_pitch,
+@@ -256,7 +256,7 @@
+ uint8_t *yb = buf;
+ uint8_t *vb = yb + (src_y_pitch * src_h);
+ uint8_t *ub = vb + (src_uv_pitch * (src_h / 2));
+- uv12_to_uyvy(src_w & ~15,
++ uv12_to_uyvy_neon(src_w & ~15,
+ src_h & ~15,
+ src_y_pitch,
+ src_uv_pitch,