diff options
author | Koen Kooi <koen@openembedded.org> | 2008-07-24 18:19:14 +0000 |
---|---|---|
committer | Koen Kooi <koen@openembedded.org> | 2008-07-24 18:19:14 +0000 |
commit | 45f4f7874cb1ff999638eab0561ba310bebfd11d (patch) | |
tree | 5e5fecf7041e6193b3bdbdc720441d905cff305a /packages/mythtv/mythtv-0.21/ffmpeg-arm-update.diff | |
parent | cb8ecbc2191f9a29bf4c669bd7161b24b44513ba (diff) |
mythtv 0.21: enable appropriate ARM optimization dependent on CPU and patch in NEON support for video
Diffstat (limited to 'packages/mythtv/mythtv-0.21/ffmpeg-arm-update.diff')
-rw-r--r-- | packages/mythtv/mythtv-0.21/ffmpeg-arm-update.diff | 1669 |
1 files changed, 1669 insertions, 0 deletions
diff --git a/packages/mythtv/mythtv-0.21/ffmpeg-arm-update.diff b/packages/mythtv/mythtv-0.21/ffmpeg-arm-update.diff new file mode 100644 index 0000000000..5abf52fcbb --- /dev/null +++ b/packages/mythtv/mythtv-0.21/ffmpeg-arm-update.diff @@ -0,0 +1,1669 @@ +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_arm.c mythtv/libs/libavcodec/armv4l/dsputil_arm.c +--- mythtv.orig/libs/libavcodec/armv4l/dsputil_arm.c 2008-07-23 12:19:05.000000000 +0200 ++++ mythtv/libs/libavcodec/armv4l/dsputil_arm.c 2008-07-24 19:54:00.753198000 +0200 +@@ -19,12 +19,14 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +-#include "dsputil.h" ++#include "libavcodec/dsputil.h" + #ifdef HAVE_IPP +-#include "ipp.h" ++#include <ipp.h> + #endif + + extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); ++extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx); ++extern void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx); + + extern void j_rev_dct_ARM(DCTELEM *data); + extern void simple_idct_ARM(DCTELEM *data); +@@ -41,6 +43,12 @@ + extern void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, + DCTELEM *data); + ++extern void ff_simple_idct_neon(DCTELEM *data); ++extern void ff_simple_idct_put_neon(uint8_t *dest, int line_size, ++ DCTELEM *data); ++extern void ff_simple_idct_add_neon(uint8_t *dest, int line_size, ++ DCTELEM *data); ++ + /* XXX: local hack */ + static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); + static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); +@@ -202,6 +210,24 @@ + } + #endif + ++#ifdef HAVE_ARMV5TE ++static void prefetch_arm(void *mem, int stride, int h) /* issues one pld per row: h rows, stride bytes apart */ ++{ ++ asm volatile( ++ "1: \n\t" ++ "subs %0, %0, #1 \n\t" ++ "pld [%1] \n\t" ++ "add %1, %1, %2 \n\t" ++ "bgt 1b \n\t" ++ : "+r"(h), "+r"(mem) : "r"(stride) : "cc"); /* "cc": subs updates the condition flags */ ++} ++#endif ++ ++int mm_support(void) ++{ ++ return ENABLE_IWMMXT * MM_IWMMXT; ++} ++ + 
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) + { + int idct_algo= avctx->idct_algo; +@@ -209,49 +235,60 @@ + ff_put_pixels_clamped = c->put_pixels_clamped; + ff_add_pixels_clamped = c->add_pixels_clamped; + +- if(idct_algo == FF_IDCT_AUTO){ ++ if (avctx->lowres == 0) { ++ if(idct_algo == FF_IDCT_AUTO){ + #if defined(HAVE_IPP) +- idct_algo = FF_IDCT_IPP; ++ idct_algo = FF_IDCT_IPP; ++#elif defined(HAVE_NEON) ++ idct_algo = FF_IDCT_SIMPLENEON; + #elif defined(HAVE_ARMV6) +- idct_algo = FF_IDCT_SIMPLEARMV6; ++ idct_algo = FF_IDCT_SIMPLEARMV6; + #elif defined(HAVE_ARMV5TE) +- idct_algo = FF_IDCT_SIMPLEARMV5TE; ++ idct_algo = FF_IDCT_SIMPLEARMV5TE; + #else +- idct_algo = FF_IDCT_ARM; ++ idct_algo = FF_IDCT_ARM; + #endif +- } ++ } + +- if(idct_algo==FF_IDCT_ARM){ +- c->idct_put= j_rev_dct_ARM_put; +- c->idct_add= j_rev_dct_ARM_add; +- c->idct = j_rev_dct_ARM; +- c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */ +- } else if (idct_algo==FF_IDCT_SIMPLEARM){ +- c->idct_put= simple_idct_ARM_put; +- c->idct_add= simple_idct_ARM_add; +- c->idct = simple_idct_ARM; +- c->idct_permutation_type= FF_NO_IDCT_PERM; ++ if(idct_algo==FF_IDCT_ARM){ ++ c->idct_put= j_rev_dct_ARM_put; ++ c->idct_add= j_rev_dct_ARM_add; ++ c->idct = j_rev_dct_ARM; ++ c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */ ++ } else if (idct_algo==FF_IDCT_SIMPLEARM){ ++ c->idct_put= simple_idct_ARM_put; ++ c->idct_add= simple_idct_ARM_add; ++ c->idct = simple_idct_ARM; ++ c->idct_permutation_type= FF_NO_IDCT_PERM; + #ifdef HAVE_ARMV6 +- } else if (idct_algo==FF_IDCT_SIMPLEARMV6){ +- c->idct_put= ff_simple_idct_put_armv6; +- c->idct_add= ff_simple_idct_add_armv6; +- c->idct = ff_simple_idct_armv6; +- c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; ++ } else if (idct_algo==FF_IDCT_SIMPLEARMV6){ ++ c->idct_put= ff_simple_idct_put_armv6; ++ c->idct_add= ff_simple_idct_add_armv6; ++ c->idct = ff_simple_idct_armv6; ++ c->idct_permutation_type= 
FF_LIBMPEG2_IDCT_PERM; + #endif + #ifdef HAVE_ARMV5TE +- } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){ +- c->idct_put= simple_idct_put_armv5te; +- c->idct_add= simple_idct_add_armv5te; +- c->idct = simple_idct_armv5te; +- c->idct_permutation_type = FF_NO_IDCT_PERM; ++ } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){ ++ c->idct_put= simple_idct_put_armv5te; ++ c->idct_add= simple_idct_add_armv5te; ++ c->idct = simple_idct_armv5te; ++ c->idct_permutation_type = FF_NO_IDCT_PERM; + #endif + #ifdef HAVE_IPP +- } else if (idct_algo==FF_IDCT_IPP){ +- c->idct_put= simple_idct_ipp_put; +- c->idct_add= simple_idct_ipp_add; +- c->idct = simple_idct_ipp; +- c->idct_permutation_type= FF_NO_IDCT_PERM; ++ } else if (idct_algo==FF_IDCT_IPP){ ++ c->idct_put= simple_idct_ipp_put; ++ c->idct_add= simple_idct_ipp_add; ++ c->idct = simple_idct_ipp; ++ c->idct_permutation_type= FF_NO_IDCT_PERM; ++#endif ++#ifdef HAVE_NEON ++ } else if (idct_algo==FF_IDCT_SIMPLENEON){ ++ c->idct_put= ff_simple_idct_put_neon; ++ c->idct_add= ff_simple_idct_add_neon; ++ c->idct = ff_simple_idct_neon; ++ c->idct_permutation_type = FF_NO_IDCT_PERM; + #endif ++ } + } + + c->put_pixels_tab[0][0] = put_pixels16_arm; +@@ -271,7 +308,17 @@ + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm; + ++#ifdef HAVE_ARMV5TE ++ c->prefetch = prefetch_arm; ++#endif ++ + #ifdef HAVE_IWMMXT + dsputil_init_iwmmxt(c, avctx); + #endif ++#ifdef HAVE_ARMVFP ++ ff_float_init_arm_vfp(c, avctx); ++#endif ++#ifdef HAVE_NEON ++ ff_dsputil_init_neon(c, avctx); ++#endif + } +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_arm_s.S mythtv/libs/libavcodec/armv4l/dsputil_arm_s.S +--- mythtv.orig/libs/libavcodec/armv4l/dsputil_arm_s.S 2008-07-23 12:19:05.000000000 +0200 ++++ mythtv/libs/libavcodec/armv4l/dsputil_arm_s.S 2008-07-24 19:54:00.753198000 +0200 +@@ -19,6 +19,13 @@ + @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + @ + 
++#include "config.h" ++ ++#ifndef HAVE_PLD ++.macro pld reg ++.endm ++#endif ++ + .macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 + mov \Rd0, \Rn0, lsr #(\shift * 8) + mov \Rd1, \Rn1, lsr #(\shift * 8) +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt.c mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt.c +--- mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt.c 2008-07-23 12:19:05.000000000 +0200 ++++ mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt.c 2008-07-24 19:54:00.753198000 +0200 +@@ -19,10 +19,10 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +-#include "dsputil.h" ++#include "libavcodec/dsputil.h" + + #define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt +-#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); ++#define SET_RND(regd) asm volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); + #define WAVG2B "wavg2b" + #include "dsputil_iwmmxt_rnd.h" + #undef DEF +@@ -30,7 +30,7 @@ + #undef WAVG2B + + #define DEF(x, y) x ## _ ## y ##_iwmmxt +-#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); ++#define SET_RND(regd) asm volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); + #define WAVG2B "wavg2br" + #include "dsputil_iwmmxt_rnd.h" + #undef DEF +@@ -89,7 +89,7 @@ + { + uint8_t *pixels2 = pixels + line_size; + +- __asm__ __volatile__ ( ++ asm volatile ( + "mov r12, #4 \n\t" + "1: \n\t" + "pld [%[pixels], %[line_size2]] \n\t" +@@ -125,7 +125,7 @@ + + static void clear_blocks_iwmmxt(DCTELEM *blocks) + { +- __asm __volatile( ++ asm volatile( + "wzero wr0 \n\t" + "mov r1, #(128 * 6 / 32) \n\t" + "1: \n\t" +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h +--- mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h 2008-07-23 12:19:05.000000000 +0200 ++++ mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h 2008-07-24 
19:54:01.023198000 +0200 +@@ -19,13 +19,14 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +-#ifndef FFMPEG_DSPUTIL_IWMMXT_RND_H +-#define FFMPEG_DSPUTIL_IWMMXT_RND_H ++/* This header intentionally has no multiple inclusion guards. It is meant to ++ * be included multiple times and generates different code depending on the ++ * value of certain #defines. */ + + void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) + { + int stride = line_size; +- __asm__ __volatile__ ( ++ asm volatile ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" +@@ -59,7 +60,7 @@ + void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) + { + int stride = line_size; +- __asm__ __volatile__ ( ++ asm volatile ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" +@@ -101,7 +102,7 @@ + void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) + { + int stride = line_size; +- __asm__ __volatile__ ( ++ asm volatile ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" +@@ -141,7 +142,7 @@ + void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) + { + int stride = line_size; +- __asm__ __volatile__ ( ++ asm volatile ( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" +@@ -200,7 +201,7 @@ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version +- __asm__ __volatile__( ++ asm volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" +@@ -249,7 +250,7 @@ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version +- __asm__ __volatile__( ++ asm volatile( + "pld 
[%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" +@@ -310,7 +311,7 @@ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version +- __asm__ __volatile__( ++ asm volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" +@@ -371,7 +372,7 @@ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version +- __asm__ __volatile__( ++ asm volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" +@@ -447,7 +448,7 @@ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line +- __asm__ __volatile__( ++ asm volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" +@@ -501,7 +502,7 @@ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line +- __asm__ __volatile__( ++ asm volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" +@@ -558,7 +559,7 @@ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line +- __asm__ __volatile__( ++ asm volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" +@@ -626,7 +627,7 @@ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version +- __asm__ __volatile__( ++ asm volatile( + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" +@@ -720,7 +721,7 @@ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version +- __asm__ __volatile__( ++ asm volatile( + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" +@@ -862,7 +863,7 @@ + // [wr0 wr1 
wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version +- __asm__ __volatile__( ++ asm volatile( + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [%[pixels]] \n\t" +@@ -966,7 +967,7 @@ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version +- __asm__ __volatile__( ++ asm volatile( + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [%[pixels]] \n\t" +@@ -1115,5 +1116,3 @@ + : [line_size]"r"(line_size) + : "r12", "memory"); + } +- +-#endif /* FFMPEG_DSPUTIL_IWMMXT_RND_H */ +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_neon.c mythtv/libs/libavcodec/armv4l/dsputil_neon.c +--- mythtv.orig/libs/libavcodec/armv4l/dsputil_neon.c 1970-01-01 01:00:00.000000000 +0100 ++++ mythtv/libs/libavcodec/armv4l/dsputil_neon.c 2008-07-24 19:54:01.023198000 +0200 +@@ -0,0 +1,397 @@ ++/* ++ * ARM NEON optimised DSP functions ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include <stdint.h> ++ ++#include "libavcodec/avcodec.h" ++#include "libavcodec/dsputil.h" ++ ++extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, ++ int h, int x, int y); ++ ++#define PUT_PIXELS_16_X2(vhadd) \ ++ "1: \n\t" \ ++ "vld1.64 {d0,d1,d2}, [%[p]], %[line_size] \n\t" \ ++ "vld1.64 {d4,d5,d6}, [%[p]], %[line_size] \n\t" \ ++ "pld [%[p]] \n\t" \ ++ "subs %[h], %[h], #2 \n\t" \ ++ "vext.8 q1, q0, q1, #1 \n\t" \ ++ "vext.8 q3, q2, q3, #1 \n\t" \ ++ vhadd".u8 q0, q0, q1 \n\t" \ ++ vhadd".u8 q2, q2, q3 \n\t" \ ++ "vst1.64 {d0,d1}, [%[b],:64], %[line_size] \n\t" \ ++ "vst1.64 {d4,d5}, [%[b],:64], %[line_size] \n\t" \ ++ "bne 1b \n\t" ++ ++#define PUT_PIXELS_16_Y2(vhadd) \ ++ "add %[p1], %[p0], %[line_size] \n\t" \ ++ "lsl %[l2], %[line_size], #1 \n\t" \ ++ "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \ ++ "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \ ++ "1: \n\t" \ ++ "subs %[h], %[h], #2 \n\t" \ ++ vhadd".u8 q2, q0, q1 \n\t" \ ++ "vst1.64 {d4,d5}, [%[b],:128], %[line_size] \n\t" \ ++ "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \ ++ vhadd".u8 q2, q0, q1 \n\t" \ ++ "vst1.64 {d4,d5}, [%[b],:128], %[line_size] \n\t" \ ++ "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \ ++ "bne 1b \n\t" ++ ++#define PUT_PIXELS_16_XY2(vshrn, no_rnd) \ ++ "lsl %[l2], %[line_size], #1 \n\t" \ ++ "add %[p1], %[p0], %[line_size] \n\t" \ ++ "vld1.64 {d0,d1,d2}, [%[p0]], %[l2] \n\t" \ ++ "vld1.64 {d4,d5,d6}, [%[p1]], %[l2] \n\t" \ ++ "pld [%[p0]] \n\t" \ ++ "pld [%[p1]] \n\t" \ ++ "vext.8 q1, q0, q1, #1 \n\t" \ ++ "vext.8 q3, q2, q3, #1 \n\t" \ ++ "vaddl.u8 q8, d0, d2 \n\t" \ ++ "vaddl.u8 q10, d1, d3 \n\t" \ ++ "vaddl.u8 q9, d4, d6 \n\t" \ ++ "vaddl.u8 q11, d5, d7 \n\t" \ ++ "1: \n\t" \ ++ "subs %[h], %[h], #2 \n\t" \ ++ "vld1.64 {d0,d1,d2}, [%[p0]], 
%[l2] \n\t" \ ++ "vadd.u16 q12, q8, q9 \n\t" \ ++ "pld [%[p0]] \n\t" \ ++ no_rnd "vadd.u16 q12, q12, q13 \n\t" \ ++ "vext.8 q15, q0, q1, #1 \n\t" \ ++ "vadd.u16 q1, q10, q11 \n\t" \ ++ vshrn".u16 d28, q12, #2 \n\t" \ ++ no_rnd "vadd.u16 q1, q1, q13 \n\t" \ ++ vshrn".u16 d29, q1, #2 \n\t" \ ++ "vaddl.u8 q8, d0, d30 \n\t" \ ++ "vld1.64 {d2,d3,d4}, [%[p1]], %[l2] \n\t" \ ++ "vaddl.u8 q10, d1, d31 \n\t" \ ++ "vst1.64 {d28,d29}, [%[b],:128], %[line_size] \n\t" \ ++ "vadd.u16 q12, q8, q9 \n\t" \ ++ "pld [%[p1]] \n\t" \ ++ no_rnd "vadd.u16 q12, q12, q13 \n\t" \ ++ "vext.8 q2, q1, q2, #1 \n\t" \ ++ "vadd.u16 q0, q10, q11 \n\t" \ ++ vshrn".u16 d30, q12, #2 \n\t" \ ++ no_rnd "vadd.u16 q0, q0, q13 \n\t" \ ++ vshrn".u16 d31, q0, #2 \n\t" \ ++ "vaddl.u8 q9, d2, d4 \n\t" \ ++ "vst1.64 {d30,d31}, [%[b],:128], %[line_size] \n\t" \ ++ "vaddl.u8 q11, d3, d5 \n\t" \ ++ "bgt 1b \n\t" ++ ++#define PUT_PIXELS_8_X2(vhadd) \ ++ "1: \n\t" \ ++ "vld1.64 {d0,d1}, [%[p]], %[line_size] \n\t" \ ++ "vld1.64 {d2,d3}, [%[p]], %[line_size] \n\t" \ ++ "pld [%[p]] \n\t" \ ++ "subs %[h], %[h], #2 \n\t" \ ++ "vext.8 d1, d0, d1, #1 \n\t" \ ++ "vext.8 d3, d2, d3, #1 \n\t" \ ++ "vswp d1, d2 \n\t" \ ++ vhadd".u8 q0, q0, q1 \n\t" \ ++ "vst1.64 {d0}, [%[b],:64], %[line_size] \n\t" \ ++ "vst1.64 {d1}, [%[b],:64], %[line_size] \n\t" \ ++ "bne 1b \n\t" ++ ++#define PUT_PIXELS_8_Y2(vhadd) \ ++ "add %[p1], %[p0], %[line_size] \n\t" \ ++ "lsl %[l2], %[line_size], #1 \n\t" \ ++ "vld1.64 {d0}, [%[p0]], %[l2] \n\t" \ ++ "vld1.64 {d1}, [%[p1]], %[l2] \n\t" \ ++ "1: \n\t" \ ++ "subs %[h], %[h], #2 \n\t" \ ++ vhadd".u8 d4, d0, d1 \n\t" \ ++ "vst1.64 {d4}, [%[b],:64], %[line_size] \n\t" \ ++ "vld1.64 {d0}, [%[p0]], %[l2] \n\t" \ ++ vhadd".u8 d4, d0, d1 \n\t" \ ++ "vst1.64 {d4}, [%[b],:64], %[line_size] \n\t" \ ++ "vld1.64 {d1}, [%[p1]], %[l2] \n\t" \ ++ "bne 1b \n\t" ++ ++#define PUT_PIXELS8_XY2(vshrn, no_rnd) \ ++ "lsl %[l2], %[line_size], #1 \n\t" \ ++ "add %[p1], %[p0], %[line_size] \n\t" \ ++ "vld1.64 {d0,d1}, 
[%[p0]], %[l2] \n\t" \ ++ "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \ ++ "pld [%[p0]] \n\t" \ ++ "pld [%[p1]] \n\t" \ ++ "vext.8 d4, d0, d1, #1 \n\t" \ ++ "vext.8 d6, d2, d3, #1 \n\t" \ ++ "vaddl.u8 q8, d0, d4 \n\t" \ ++ "vaddl.u8 q9, d2, d6 \n\t" \ ++ "1: \n\t" \ ++ "subs %[h], %[h], #2 \n\t" \ ++ "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \ ++ "pld [%[p0]] \n\t" \ ++ "vadd.u16 q10, q8, q9 \n\t" \ ++ "vext.8 d4, d0, d1, #1 \n\t" \ ++ no_rnd "vadd.u16 q10, q10, q11 \n\t" \ ++ "vaddl.u8 q8, d0, d4 \n\t" \ ++ vshrn".u16 d5, q10, #2 \n\t" \ ++ "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \ ++ "vadd.u16 q10, q8, q9 \n\t" \ ++ "pld [%[p1]] \n\t" \ ++ no_rnd "vadd.u16 q10, q10, q11 \n\t" \ ++ "vst1.64 {d5}, [%[b],:64], %[line_size] \n\t" \ ++ vshrn".u16 d7, q10, #2 \n\t" \ ++ "vext.8 d6, d2, d3, #1 \n\t" \ ++ "vaddl.u8 q9, d2, d6 \n\t" \ ++ "vst1.64 {d7}, [%[b],:64], %[line_size] \n\t" \ ++ "bgt 1b \n\t" ++ ++static void put_pixels16_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ asm volatile( ++ "1: \n\t" ++ "vld1.64 {d0,d1}, [%[pixels]], %[line_size] \n\t" ++ "vld1.64 {d2,d3}, [%[pixels]], %[line_size] \n\t" ++ "vld1.64 {d4,d5}, [%[pixels]], %[line_size] \n\t" ++ "vld1.64 {d6,d7}, [%[pixels]], %[line_size] \n\t" ++ "pld [%[pixels]] \n\t" ++ "subs %[h], %[h], #4 \n\t" ++ "vst1.64 {d0,d1}, [%[block],:128], %[line_size] \n\t" ++ "vst1.64 {d2,d3}, [%[block],:128], %[line_size] \n\t" ++ "vst1.64 {d4,d5}, [%[block],:128], %[line_size] \n\t" ++ "vst1.64 {d6,d7}, [%[block],:128], %[line_size] \n\t" ++ "bne 1b \n\t" ++ : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory"); /* "cc": subs updates the flags */ ++} ++ ++static void put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ asm volatile( ++ PUT_PIXELS_16_X2("vrhadd") ++ : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory"); /* d7: high half of q3, written by vext.8 q3 */ ++} ++ 
++static void put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ const uint8_t *p1; ++ int l2; ++ ++ asm volatile( ++ PUT_PIXELS_16_Y2("vrhadd") ++ : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h), ++ [l2]"=&r"(l2) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "d4", "d5", "cc", "memory"); /* "cc": subs updates the flags */ ++} ++ ++static void put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ const uint8_t *p1; ++ int l2; ++ ++ asm volatile( ++ PUT_PIXELS_16_XY2("vrshrn", "@") ++ : [b]"+r"(block), ++ [p0]"+r"(pixels), ++ [p1]"=&r"(p1), [h]"+r"(h), ++ [l2]"=&r"(l2) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", ++ "d28", "d29", "d30", "d31", ++ "q8", "q9", "q10", "q11", "q12", "cc", "memory"); ++} ++ ++static void put_pixels8_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ asm volatile( ++ "1: \n\t" ++ "vld1.64 {d0}, [%[p]], %[line_size] \n\t" ++ "vld1.64 {d1}, [%[p]], %[line_size] \n\t" ++ "vld1.64 {d2}, [%[p]], %[line_size] \n\t" ++ "vld1.64 {d3}, [%[p]], %[line_size] \n\t" ++ "subs %[h], %[h], #4 \n\t" ++ "vst1.64 {d0}, [%[b],:64], %[line_size] \n\t" ++ "vst1.64 {d1}, [%[b],:64], %[line_size] \n\t" ++ "vst1.64 {d2}, [%[b],:64], %[line_size] \n\t" ++ "vst1.64 {d3}, [%[b],:64], %[line_size] \n\t" ++ "bne 1b \n\t" ++ : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "cc", "memory"); ++} ++ ++static void put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ asm volatile( ++ PUT_PIXELS_8_X2("vrhadd") ++ : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "cc", "memory"); ++} ++ ++static void put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ const uint8_t *p1; ++ int l2; ++ ++ asm volatile( ++ PUT_PIXELS_8_Y2("vrhadd") ++ : [b]"+r"(block), [p0]"+r"(pixels), 
[p1]"=&r"(p1), [h]"+r"(h), ++ [l2]"=&r"(l2) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d4", "cc", "memory"); ++} ++ ++static void put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ const uint8_t *p1; ++ int l2; ++ ++ asm volatile( ++ PUT_PIXELS8_XY2("vrshrn", "@") ++ : [b]"+r"(block), ++ [p0]"+r"(pixels), ++ [p1]"=&r"(p1), [h]"+r"(h), ++ [l2]"=&r"(l2) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", /* d5 is written by the first vshrn */ ++ "q8", "q9", "q10", "cc", "memory"); ++} ++ ++static void put_no_rnd_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ asm volatile( ++ PUT_PIXELS_16_X2("vhadd") ++ : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory"); /* d7: high half of q3, written by vext.8 q3 */ ++} ++ ++static void put_no_rnd_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ const uint8_t *p1; ++ int l2; ++ ++ asm volatile( ++ PUT_PIXELS_16_Y2("vhadd") ++ : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h), ++ [l2]"=&r"(l2) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "d4", "d5", "cc", "memory"); ++} ++ ++static void put_no_rnd_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ const uint8_t *p1; ++ int l2; ++ ++ asm volatile( ++ "vmov.i16 q13, #1 \n\t" ++ PUT_PIXELS_16_XY2("vshrn", "") ++ : [b]"+r"(block), ++ [p0]"+r"(pixels), ++ [p1]"=&r"(p1), [h]"+r"(h), ++ [l2]"=&r"(l2) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", ++ "d28", "d29", "d30", "d31", ++ "q8", "q9", "q10", "q11", "q12", "q13", "cc", "memory"); ++} ++ ++static void put_no_rnd_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ asm volatile( ++ PUT_PIXELS_8_X2("vhadd") ++ : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "cc", "memory"); ++} ++ ++static void put_no_rnd_pixels8_y2_neon(uint8_t 
*block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ const uint8_t *p1; ++ int l2; ++ ++ asm volatile( ++ PUT_PIXELS_8_Y2("vhadd") ++ : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h), ++ [l2]"=&r"(l2) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d4", "cc", "memory"); ++} ++ ++static void put_no_rnd_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels, ++ int line_size, int h) ++{ ++ const uint8_t *p1; ++ int l2; ++ ++ asm volatile( ++ "vmov.i16 q11, #1 \n\t" ++ PUT_PIXELS8_XY2("vshrn", "") ++ : [b]"+r"(block), ++ [p0]"+r"(pixels), ++ [p1]"=&r"(p1), [h]"+r"(h), ++ [l2]"=&r"(l2) ++ : [line_size]"r"(line_size) ++ : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", /* d5 is written by the first vshrn */ ++ "q8", "q9", "q10", "q11", "cc", "memory"); ++} ++ ++static void put_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, int stride) ++{ ++ put_pixels16_neon(dst, src, stride, 16); ++} ++ ++void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) ++{ ++ c->put_pixels_tab[0][0] = put_pixels16_neon; ++ c->put_pixels_tab[0][1] = put_pixels16_x2_neon; ++ c->put_pixels_tab[0][2] = put_pixels16_y2_neon; ++ c->put_pixels_tab[0][3] = put_pixels16_xy2_neon; ++ c->put_pixels_tab[1][0] = put_pixels8_neon; ++ c->put_pixels_tab[1][1] = put_pixels8_x2_neon; ++ c->put_pixels_tab[1][2] = put_pixels8_y2_neon; ++ c->put_pixels_tab[1][3] = put_pixels8_xy2_neon; ++ ++ c->put_no_rnd_pixels_tab[0][0] = put_pixels16_neon; ++ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_neon; ++ c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_neon; ++ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_neon; ++ c->put_no_rnd_pixels_tab[1][0] = put_pixels8_neon; ++ c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_neon; ++ c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_neon; ++ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_neon; ++ ++ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; ++ ++ c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_neon; ++} +diff -Nurd 
mythtv.orig/libs/libavcodec/armv4l/float_arm_vfp.c mythtv/libs/libavcodec/armv4l/float_arm_vfp.c +--- mythtv.orig/libs/libavcodec/armv4l/float_arm_vfp.c 1970-01-01 01:00:00.000000000 +0100 ++++ mythtv/libs/libavcodec/armv4l/float_arm_vfp.c 2008-07-24 19:54:01.023198000 +0200 +@@ -0,0 +1,208 @@ ++/* ++ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavcodec/dsputil.h" ++ ++/* ++ * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle ++ * throughput for almost all the instructions (except for double precision ++ * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles ++ * for arithmetic operations. Scheduling code to avoid pipeline stalls is very ++ * important for performance. One more interesting feature is that VFP has ++ * independent load/store and arithmetics pipelines, so it is possible to make ++ * them work simultaneously and get more than 1 operation per cycle. Load/store ++ * pipeline can process 2 single precision floating point values per cycle and ++ * supports bulk loads and stores for large sets of registers. 
Arithmetic operations ++ * can be done on vectors, which allows to keep the arithmetics pipeline busy, ++ * while the processor may issue and execute other instructions. Detailed ++ * optimization manuals can be found at http://www.arm.com ++ */ ++ ++/** ++ * ARM VFP optimized implementation of 'vector_fmul_c' function. ++ * Assume that len is a positive number and is multiple of 8 ++ */ ++static void vector_fmul_vfp(float *dst, const float *src, int len) ++{ ++ int tmp; ++ asm volatile( ++ "fmrx %[tmp], fpscr\n\t" ++ "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */ ++ "fmxr fpscr, %[tmp]\n\t" ++ ++ "fldmias %[dst_r]!, {s0-s3}\n\t" ++ "fldmias %[src]!, {s8-s11}\n\t" ++ "fldmias %[dst_r]!, {s4-s7}\n\t" ++ "fldmias %[src]!, {s12-s15}\n\t" ++ "fmuls s8, s0, s8\n\t" ++ "1:\n\t" ++ "subs %[len], %[len], #16\n\t" ++ "fmuls s12, s4, s12\n\t" ++ "fldmiasge %[dst_r]!, {s16-s19}\n\t" ++ "fldmiasge %[src]!, {s24-s27}\n\t" ++ "fldmiasge %[dst_r]!, {s20-s23}\n\t" ++ "fldmiasge %[src]!, {s28-s31}\n\t" ++ "fmulsge s24, s16, s24\n\t" ++ "fstmias %[dst_w]!, {s8-s11}\n\t" ++ "fstmias %[dst_w]!, {s12-s15}\n\t" ++ "fmulsge s28, s20, s28\n\t" ++ "fldmiasgt %[dst_r]!, {s0-s3}\n\t" ++ "fldmiasgt %[src]!, {s8-s11}\n\t" ++ "fldmiasgt %[dst_r]!, {s4-s7}\n\t" ++ "fldmiasgt %[src]!, {s12-s15}\n\t" ++ "fmulsge s8, s0, s8\n\t" ++ "fstmiasge %[dst_w]!, {s24-s27}\n\t" ++ "fstmiasge %[dst_w]!, {s28-s31}\n\t" ++ "bgt 1b\n\t" ++ ++ "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */ ++ "fmxr fpscr, %[tmp]\n\t" ++ : [dst_w] "+&r" (dst), [dst_r] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len), [tmp] "=&r" (tmp) ++ : ++ : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", ++ "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", ++ "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", ++ "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", ++ "cc", "memory"); ++} ++ ++/** ++ * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. 
++ * Assume that len is a positive number and is multiple of 8 ++ */ ++static void vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len) ++{ ++ src1 += len; ++ asm volatile( ++ "fldmdbs %[src1]!, {s0-s3}\n\t" ++ "fldmias %[src0]!, {s8-s11}\n\t" ++ "fldmdbs %[src1]!, {s4-s7}\n\t" ++ "fldmias %[src0]!, {s12-s15}\n\t" ++ "fmuls s8, s3, s8\n\t" ++ "fmuls s9, s2, s9\n\t" ++ "fmuls s10, s1, s10\n\t" ++ "fmuls s11, s0, s11\n\t" ++ "1:\n\t" ++ "subs %[len], %[len], #16\n\t" ++ "fldmdbsge %[src1]!, {s16-s19}\n\t" ++ "fmuls s12, s7, s12\n\t" ++ "fldmiasge %[src0]!, {s24-s27}\n\t" ++ "fmuls s13, s6, s13\n\t" ++ "fldmdbsge %[src1]!, {s20-s23}\n\t" ++ "fmuls s14, s5, s14\n\t" ++ "fldmiasge %[src0]!, {s28-s31}\n\t" ++ "fmuls s15, s4, s15\n\t" ++ "fmulsge s24, s19, s24\n\t" ++ "fldmdbsgt %[src1]!, {s0-s3}\n\t" ++ "fmulsge s25, s18, s25\n\t" ++ "fstmias %[dst]!, {s8-s13}\n\t" ++ "fmulsge s26, s17, s26\n\t" ++ "fldmiasgt %[src0]!, {s8-s11}\n\t" ++ "fmulsge s27, s16, s27\n\t" ++ "fmulsge s28, s23, s28\n\t" ++ "fldmdbsgt %[src1]!, {s4-s7}\n\t" ++ "fmulsge s29, s22, s29\n\t" ++ "fstmias %[dst]!, {s14-s15}\n\t" ++ "fmulsge s30, s21, s30\n\t" ++ "fmulsge s31, s20, s31\n\t" ++ "fmulsge s8, s3, s8\n\t" ++ "fldmiasgt %[src0]!, {s12-s15}\n\t" ++ "fmulsge s9, s2, s9\n\t" ++ "fmulsge s10, s1, s10\n\t" ++ "fstmiasge %[dst]!, {s24-s27}\n\t" ++ "fmulsge s11, s0, s11\n\t" ++ "fstmiasge %[dst]!, {s28-s31}\n\t" ++ "bgt 1b\n\t" ++ ++ : [dst] "+&r" (dst), [src0] "+&r" (src0), [src1] "+&r" (src1), [len] "+&r" (len) ++ : ++ : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", ++ "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", ++ "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", ++ "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", ++ "cc", "memory"); ++} ++ ++#ifdef HAVE_ARMV6 ++/** ++ * ARM VFP optimized float to int16 conversion. 
++ * Assume that len is a positive number and is multiple of 8, destination ++ * buffer is at least 4 bytes aligned (8 bytes alignment is better for ++ * performance), little endian byte sex ++ */ ++void float_to_int16_vfp(int16_t *dst, const float *src, int len) ++{ ++ asm volatile( ++ "fldmias %[src]!, {s16-s23}\n\t" ++ "ftosis s0, s16\n\t" ++ "ftosis s1, s17\n\t" ++ "ftosis s2, s18\n\t" ++ "ftosis s3, s19\n\t" ++ "ftosis s4, s20\n\t" ++ "ftosis s5, s21\n\t" ++ "ftosis s6, s22\n\t" ++ "ftosis s7, s23\n\t" ++ "1:\n\t" ++ "subs %[len], %[len], #8\n\t" ++ "fmrrs r3, r4, {s0, s1}\n\t" ++ "fmrrs r5, r6, {s2, s3}\n\t" ++ "fmrrs r7, r8, {s4, s5}\n\t" ++ "fmrrs ip, lr, {s6, s7}\n\t" ++ "fldmiasgt %[src]!, {s16-s23}\n\t" ++ "ssat r4, #16, r4\n\t" ++ "ssat r3, #16, r3\n\t" ++ "ssat r6, #16, r6\n\t" ++ "ssat r5, #16, r5\n\t" ++ "pkhbt r3, r3, r4, lsl #16\n\t" ++ "pkhbt r4, r5, r6, lsl #16\n\t" ++ "ftosisgt s0, s16\n\t" ++ "ftosisgt s1, s17\n\t" ++ "ftosisgt s2, s18\n\t" ++ "ftosisgt s3, s19\n\t" ++ "ftosisgt s4, s20\n\t" ++ "ftosisgt s5, s21\n\t" ++ "ftosisgt s6, s22\n\t" ++ "ftosisgt s7, s23\n\t" ++ "ssat r8, #16, r8\n\t" ++ "ssat r7, #16, r7\n\t" ++ "ssat lr, #16, lr\n\t" ++ "ssat ip, #16, ip\n\t" ++ "pkhbt r5, r7, r8, lsl #16\n\t" ++ "pkhbt r6, ip, lr, lsl #16\n\t" ++ "stmia %[dst]!, {r3-r6}\n\t" ++ "bgt 1b\n\t" ++ ++ : [dst] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len) ++ : ++ : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", ++ "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", ++ "r3", "r4", "r5", "r6", "r7", "r8", "ip", "lr", ++ "cc", "memory"); ++} ++#endif ++ ++void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx) ++{ ++ c->vector_fmul = vector_fmul_vfp; ++ c->vector_fmul_reverse = vector_fmul_reverse_vfp; ++#ifdef HAVE_ARMV6 ++ c->float_to_int16 = float_to_int16_vfp; ++#endif ++} +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/h264dsp_neon.S mythtv/libs/libavcodec/armv4l/h264dsp_neon.S +--- mythtv.orig/libs/libavcodec/armv4l/h264dsp_neon.S 
1970-01-01 01:00:00.000000000 +0100 ++++ mythtv/libs/libavcodec/armv4l/h264dsp_neon.S 2008-07-24 19:54:01.033198000 +0200 +@@ -0,0 +1,148 @@ ++/* ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ .fpu neon ++ ++ .text ++ .align ++ .global ff_put_h264_chroma_mc8_neon ++ .func ff_put_h264_chroma_mc8_neon ++/* void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, ++ int h, int x, int y) */ ++ff_put_h264_chroma_mc8_neon: ++ push {r4-r7} ++ ldrd r4, [sp, #16] ++ ++ pld [r1] ++ pld [r1, r2] ++ ++ muls r7, r4, r5 ++ rsb r6, r7, r5, lsl #3 ++ rsb ip, r7, r4, lsl #3 ++ sub r4, r7, r4, lsl #3 ++ sub r4, r4, r5, lsl #3 ++ add r4, r4, #64 ++ ++ beq 2f ++ ++ add r5, r1, r2 ++ ++ vdup.8 d0, r4 ++ lsl r4, r2, #1 ++ vdup.8 d1, ip ++ vld1.64 {d4,d5}, [r1], r4 ++ vdup.8 d2, r6 ++ vld1.64 {d6,d7}, [r5], r4 ++ vdup.8 d3, r7 ++ ++ mov r6, #32 ++ vext.8 d5, d4, d5, #1 ++ vdup.16 q12, r6 ++ vext.8 d7, d6, d7, #1 ++1: ++ pld [r5] ++ vmull.u8 q8, d4, d0 ++ vmlal.u8 q8, d5, d1 ++ vld1.64 {d4,d5}, [r1], r4 ++ vmlal.u8 q8, d6, d2 ++ vext.8 d5, d4, d5, #1 ++ vmlal.u8 q8, d7, d3 ++ vmull.u8 q9, d6, d0 ++ vadd.i16 q8, q8, q12 ++ subs r3, r3, #2 ++ vmlal.u8 q9, d7, d1 ++ vshrn.u16 d16, 
q8, #6 ++ vld1.64 {d6,d7}, [r5], r4 ++ vmlal.u8 q9, d4, d2 ++ vmlal.u8 q9, d5, d3 ++ pld [r1] ++ vadd.i16 q9, q9, q12 ++ vst1.64 {d16}, [r0,:64], r2 ++ vshrn.u16 d17, q9, #6 ++ vext.8 d7, d6, d7, #1 ++ vst1.64 {d17}, [r0,:64], r2 ++ bgt 1b ++ ++ pop {r4-r7} ++ bx lr ++ ++2: ++ tst r6, r6 ++ add ip, ip, r6 ++ vdup.8 d0, r4 ++ vdup.8 d1, ip ++ mov r6, #32 ++ vdup.16 q12, r6 ++ ++ beq 4f ++ ++ add r5, r1, r2 ++ lsl r4, r2, #1 ++ vld1.64 {d4}, [r1], r4 ++ vld1.64 {d6}, [r5], r4 ++3: ++ pld [r5] ++ vmull.u8 q8, d4, d0 ++ vmlal.u8 q8, d6, d1 ++ vld1.64 {d4}, [r1], r4 ++ vmull.u8 q9, d6, d0 ++ vadd.i16 q8, q8, q12 ++ vmlal.u8 q9, d4, d1 ++ vshrn.u16 d16, q8, #6 ++ vadd.i16 q9, q9, q12 ++ vst1.64 {d16}, [r0,:64], r2 ++ vshrn.u16 d17, q9, #6 ++ subs r3, r3, #2 ++ vld1.64 {d6}, [r5], r4 ++ pld [r1] ++ vst1.64 {d17}, [r0,:64], r2 ++ bgt 3b ++ ++ pop {r4-r7} ++ bx lr ++ ++4: ++ vld1.64 {d4,d5}, [r1], r2 ++ vld1.64 {d6,d7}, [r1], r2 ++ vext.8 d5, d4, d5, #1 ++ vext.8 d7, d6, d7, #1 ++5: ++ pld [r1] ++ subs r3, r3, #2 ++ vmull.u8 q8, d4, d0 ++ vmlal.u8 q8, d5, d1 ++ vld1.64 {d4,d5}, [r1], r2 ++ vmull.u8 q9, d6, d0 ++ vmlal.u8 q9, d7, d1 ++ pld [r1] ++ vadd.i16 q8, q8, q12 ++ vadd.i16 q9, q9, q12 ++ vext.8 d5, d4, d5, #1 ++ vshrn.u16 d16, q8, #6 ++ vld1.64 {d6,d7}, [r1], r2 ++ vshrn.u16 d17, q9, #6 ++ vst1.64 {d16}, [r0,:64], r2 ++ vext.8 d7, d6, d7, #1 ++ vst1.64 {d17}, [r0,:64], r2 ++ bgt 5b ++ ++ pop {r4-r7} ++ bx lr ++ .endfunc +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_arm.c mythtv/libs/libavcodec/armv4l/mpegvideo_arm.c +--- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_arm.c 2008-07-23 12:19:05.000000000 +0200 ++++ mythtv/libs/libavcodec/armv4l/mpegvideo_arm.c 2008-07-24 19:54:01.263198000 +0200 +@@ -18,9 +18,9 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +-#include "dsputil.h" +-#include "mpegvideo.h" +-#include "avcodec.h" ++#include "libavcodec/avcodec.h" ++#include "libavcodec/dsputil.h" ++#include 
"libavcodec/mpegvideo.h" + + extern void MPV_common_init_iwmmxt(MpegEncContext *s); + extern void MPV_common_init_armv5te(MpegEncContext *s); +@@ -28,7 +28,7 @@ + void MPV_common_init_armv4l(MpegEncContext *s) + { + /* IWMMXT support is a superset of armv5te, so +- * allow optimised functions for armv5te unless ++ * allow optimized functions for armv5te unless + * a better iwmmxt function exists + */ + #ifdef HAVE_ARMV5TE +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_armv5te.c mythtv/libs/libavcodec/armv4l/mpegvideo_armv5te.c +--- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_armv5te.c 2008-07-23 12:19:05.000000000 +0200 ++++ mythtv/libs/libavcodec/armv4l/mpegvideo_armv5te.c 2008-07-24 19:54:01.263198000 +0200 +@@ -19,9 +19,9 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +-#include "dsputil.h" +-#include "mpegvideo.h" +-#include "avcodec.h" ++#include "libavcodec/avcodec.h" ++#include "libavcodec/dsputil.h" ++#include "libavcodec/mpegvideo.h" + + + #ifdef ENABLE_ARM_TESTS +@@ -65,7 +65,7 @@ + ({ DCTELEM *xblock = xxblock; \ + int xqmul = xxqmul, xqadd = xxqadd, xcount = xxcount, xtmp; \ + int xdata1, xdata2; \ +-__asm__ __volatile__( \ ++asm volatile( \ + "subs %[count], %[count], #2 \n\t" \ + "ble 2f \n\t" \ + "ldrd r4, [%[block], #0] \n\t" \ +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c mythtv/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c +--- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c 2008-07-23 12:19:05.000000000 +0200 ++++ mythtv/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c 2008-07-24 19:54:01.273198000 +0200 +@@ -18,9 +18,9 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +-#include "dsputil.h" +-#include "mpegvideo.h" +-#include "avcodec.h" ++#include "libavcodec/avcodec.h" ++#include "libavcodec/dsputil.h" ++#include "libavcodec/mpegvideo.h" + + static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, + DCTELEM *block, int n, int 
qscale) +@@ -48,7 +48,7 @@ + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + +- __asm__ __volatile__ ( ++ asm volatile ( + /* "movd %1, %%mm6 \n\t" //qmul */ + /* "packssdw %%mm6, %%mm6 \n\t" */ + /* "packssdw %%mm6, %%mm6 \n\t" */ +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/simple_idct_arm.S mythtv/libs/libavcodec/armv4l/simple_idct_arm.S +--- mythtv.orig/libs/libavcodec/armv4l/simple_idct_arm.S 2008-07-23 12:19:05.000000000 +0200 ++++ mythtv/libs/libavcodec/armv4l/simple_idct_arm.S 2008-07-24 19:54:01.503198000 +0200 +@@ -79,7 +79,7 @@ + + + __row_loop: +- @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :) ++ @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :) + ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer) + ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1] + ldr r3, [r14, #8] @ R3=ROWr32[2] +@@ -421,7 +421,7 @@ + @@ col[40] = ((a2 - b2) >> COL_SHIFT); + @@ col[48] = ((a1 - b1) >> COL_SHIFT); + @@ col[56] = ((a0 - b0) >> COL_SHIFT); +- @@@@@ no optimisation here @@@@@ ++ @@@@@ no optimization here @@@@@ + add r8, r6, r0 @ R8=a0+b0 + add r9, r2, r1 @ R9=a1+b1 + mov r8, r8, asr #COL_SHIFT +diff -Nurd mythtv.orig/libs/libavcodec/armv4l/simple_idct_neon.S mythtv/libs/libavcodec/armv4l/simple_idct_neon.S +--- mythtv.orig/libs/libavcodec/armv4l/simple_idct_neon.S 1970-01-01 01:00:00.000000000 +0100 ++++ mythtv/libs/libavcodec/armv4l/simple_idct_neon.S 2008-07-24 19:54:01.503198000 +0200 +@@ -0,0 +1,388 @@ ++/* ++ * ARM NEON IDCT ++ * ++ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> ++ * ++ * Based on Simple IDCT ++ * Copyright (c) 2001 Michael Niedermayer 
<michaelni@gmx.at> ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 ++#define W4c ((1<<(COL_SHIFT-1))/W4) ++#define ROW_SHIFT 11 ++#define COL_SHIFT 20 ++ ++#define w1 d0[0] ++#define w2 d0[1] ++#define w3 d0[2] ++#define w4 d0[3] ++#define w5 d1[0] ++#define w6 d1[1] ++#define w7 d1[2] ++#define w4c d1[3] ++ ++ .fpu neon ++ ++ .macro idct_col4_top ++ vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */ ++ vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */ ++ vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ ++ vadd.i32 q11, q15, q7 ++ vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ ++ vadd.i32 q12, q15, q8 ++ vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ ++ vsub.i32 q13, q15, q8 ++ vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ ++ vsub.i32 q14, q15, q7 ++ ++ vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ ++ vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ ++
vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ ++ vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ ++ .endm ++ ++ .macro idct_col4_mid1 ++ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ ++ vadd.i32 q11, q11, q7 ++ vsub.i32 q12, q12, q7 ++ vsub.i32 q13, q13, q7 ++ vadd.i32 q14, q14, q7 ++ .endm ++ ++ .macro idct_col4_mid2 ++ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ ++ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ ++ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ ++ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ ++ .endm ++ ++ .macro idct_col4_mid3 ++ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ ++ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ ++ vadd.i32 q11, q11, q7 ++ vsub.i32 q12, q12, q8 ++ vadd.i32 q13, q13, q8 ++ vsub.i32 q14, q14, q7 ++ .endm ++ ++ .macro idct_col4_mid4 ++ vmlal.s16 q9, d9, w7 ++ vmlsl.s16 q10, d9, w5 ++ vmlal.s16 q5, d9, w3 ++ vmlsl.s16 q6, d9, w1 ++ .endm ++ ++ .macro idct_col4_mid ++ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ ++ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ ++ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ ++ vadd.i32 q11, q11, q7 ++ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ ++ vsub.i32 q12, q12, q7 ++ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ ++ vsub.i32 q13, q13, q7 ++ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ ++ vadd.i32 q14, q14, q7 ++ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ ++ vadd.i32 q11, q11, q7 ++ vmlal.s16 q9, d9, w7 ++ vsub.i32 q12, q12, q8 ++ vmlsl.s16 q10, d9, w5 ++ vadd.i32 q13, q13, q8 ++ vmlal.s16 q5, d9, w3 ++ vsub.i32 q14, q14, q7 ++ vmlsl.s16 q6, d9, w1 ++ .endm ++ ++ .macro idct_col4_end ++ vadd.i32 q3, q11, q9 ++ vadd.i32 q4, q12, q10 ++ vadd.i32 q7, q13, q5 ++ vadd.i32 q8, q14, q6 ++ vsub.i32 q11, q11, q9 ++ vsub.i32 q12, q12, q10 ++ vsub.i32 q13, q13, q5 ++ vsub.i32 q14, q14, q6 ++ .endm ++ ++ .text ++ .align ++ .type idct_row4_neon, %function ++ .func idct_row4_neon ++idct_row4_neon: ++ vld1.64 {d2,d3}, [a3,:128]! ++ vld1.64 {d4,d5}, [a3,:128]! ++ vld1.64 {d6,d7}, [a3,:128]! 
++ vld1.64 {d8,d9}, [a3,:128]! ++ add a3, a3, #-64 ++ ++ vmov.i32 q15, #(1<<(ROW_SHIFT-1)) ++ vorr d10, d3, d5 ++ vtrn.16 q1, q2 ++ vorr d11, d7, d9 ++ vtrn.16 q3, q4 ++ vorr d10, d10, d11 ++ vtrn.32 q1, q3 ++ vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ ++ vtrn.32 q2, q4 ++ vmov a4, v1, d10 ++ ++ idct_col4_top ++ ++ orrs a4, a4, v1 ++ beq 1f ++ idct_col4_mid ++1: ++ vadd.i32 q3, q11, q9 ++ vadd.i32 q4, q12, q10 ++ vshrn.i32 d2, q3, #ROW_SHIFT ++ vadd.i32 q7, q13, q5 ++ vshrn.i32 d4, q4, #ROW_SHIFT ++ vadd.i32 q8, q14, q6 ++ vshrn.i32 d6, q7, #ROW_SHIFT ++ vsub.i32 q11, q11, q9 ++ vshrn.i32 d8, q8, #ROW_SHIFT ++ vsub.i32 q12, q12, q10 ++ vshrn.i32 d9, q11, #ROW_SHIFT ++ vsub.i32 q13, q13, q5 ++ vshrn.i32 d7, q12, #ROW_SHIFT ++ vsub.i32 q14, q14, q6 ++ vshrn.i32 d5, q13, #ROW_SHIFT ++ vshrn.i32 d3, q14, #ROW_SHIFT ++ ++ vtrn.16 q1, q2 ++ vtrn.16 q3, q4 ++ vtrn.32 q1, q3 ++ vtrn.32 q2, q4 ++ ++ vst1.64 {d2,d3}, [a3,:128]! ++ vst1.64 {d4,d5}, [a3,:128]! ++ vst1.64 {d6,d7}, [a3,:128]! ++ vst1.64 {d8,d9}, [a3,:128]! 
++ ++ mov pc, lr ++ .endfunc ++ ++ .align ++ .type idct_col4_neon, %function ++ .func idct_col4_neon ++idct_col4_neon: ++ mov ip, #16 ++ vld1.64 {d2}, [a3,:64], ip /* d2 = col[0] */ ++ vld1.64 {d4}, [a3,:64], ip /* d4 = col[1] */ ++ vld1.64 {d6}, [a3,:64], ip /* d6 = col[2] */ ++ vld1.64 {d8}, [a3,:64], ip /* d8 = col[3] */ ++ vld1.64 {d3}, [a3,:64], ip /* d3 = col[4] */ ++ vld1.64 {d5}, [a3,:64], ip /* d5 = col[5] */ ++ vld1.64 {d7}, [a3,:64], ip /* d7 = col[6] */ ++ vld1.64 {d9}, [a3,:64], ip /* d9 = col[7] */ ++ ++ vrev64.32 d11, d3 ++ vrev64.32 d13, d5 ++ vorr d11, d3, d11 ++ vrev64.32 d15, d7 ++ vorr d13, d5, d13 ++ vrev64.32 d17, d9 ++ vorr d15, d7, d15 ++ vmov.32 v1, d11[0] ++ vmov.32 v2, d13[0] ++ vorr d17, d9, d17 ++ vmov.32 v3, d15[0] ++ vmov.32 ip, d17[0] ++ vdup.16 d30, w4c ++ vadd.i16 d30, d30, d2 ++ vmull.s16 q15, d30, w4 /* q15 = W4 * (col[0]+(1<<(COL_SHIFT-1))/W4) */ ++ ++ idct_col4_top ++ tst v1, v1 ++ beq 1f ++ idct_col4_mid1 ++1: tst v2, v2 ++ beq 2f ++ idct_col4_mid2 ++2: tst v3, v3 ++ beq 3f ++ idct_col4_mid3 ++3: tst ip, ip ++ beq 4f ++ idct_col4_mid4 ++4: ++ idct_col4_end ++ ++ vshr.s32 q2, q3, #COL_SHIFT ++ vshr.s32 q3, q4, #COL_SHIFT ++ vmovn.i32 d2, q2 ++ vshr.s32 q4, q7, #COL_SHIFT ++ vmovn.i32 d3, q3 ++ vshr.s32 q5, q8, #COL_SHIFT ++ vmovn.i32 d4, q4 ++ vshr.s32 q6, q14, #COL_SHIFT ++ vmovn.i32 d5, q5 ++ vshr.s32 q7, q13, #COL_SHIFT ++ vmovn.i32 d6, q6 ++ vshr.s32 q8, q12, #COL_SHIFT ++ vmovn.i32 d7, q7 ++ vshr.s32 q9, q11, #COL_SHIFT ++ vmovn.i32 d8, q8 ++ vmovn.i32 d9, q9 ++ ++ mov pc, lr ++ .endfunc ++ ++ .macro idct_col4_st16 ++ mov ip, #16 ++ vst1.64 {d2}, [a3,:64], ip ++ vst1.64 {d3}, [a3,:64], ip ++ vst1.64 {d4}, [a3,:64], ip ++ vst1.64 {d5}, [a3,:64], ip ++ vst1.64 {d6}, [a3,:64], ip ++ vst1.64 {d7}, [a3,:64], ip ++ vst1.64 {d8}, [a3,:64], ip ++ vst1.64 {d9}, [a3,:64], ip ++ .endm ++ ++ .align ++ .type idct_col4_add8, %function ++ .func idct_col4_add8 ++idct_col4_add8: ++ vld1.32 {d10[0]}, [a1,:32], a2 ++ vld1.32 {d10[1]},
[a1,:32], a2 ++ vld1.32 {d11[0]}, [a1,:32], a2 ++ vld1.32 {d11[1]}, [a1,:32], a2 ++ vld1.32 {d12[0]}, [a1,:32], a2 ++ vld1.32 {d12[1]}, [a1,:32], a2 ++ vld1.32 {d13[0]}, [a1,:32], a2 ++ vld1.32 {d13[1]}, [a1,:32], a2 ++ ++ vaddw.u8 q1, q1, d10 ++ vaddw.u8 q2, q2, d11 ++ vaddw.u8 q3, q3, d12 ++ vaddw.u8 q4, q4, d13 ++ ++ sub a1, a1, a2, lsl #3 ++ .endfunc ++ ++ .type idct_col4_st8, %function ++ .func idct_col4_st8 ++idct_col4_st8: ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q2 ++ vqmovun.s16 d4, q3 ++ vqmovun.s16 d5, q4 ++ ++ vst1.32 {d2[0]}, [a1,:32], a2 ++ vst1.32 {d2[1]}, [a1,:32], a2 ++ vst1.32 {d3[0]}, [a1,:32], a2 ++ vst1.32 {d3[1]}, [a1,:32], a2 ++ vst1.32 {d4[0]}, [a1,:32], a2 ++ vst1.32 {d4[1]}, [a1,:32], a2 ++ vst1.32 {d5[0]}, [a1,:32], a2 ++ vst1.32 {d5[1]}, [a1,:32], a2 ++ mov pc, lr ++ .endfunc ++ ++ .align 4 ++const: .short W1, W2, W3, W4, W5, W6, W7, W4c ++ ++ .macro idct_start data ++ pld [\data] ++ pld [\data, #64] ++ push {v1-v3, lr} ++ vpush {d8-d15} ++ adr a4, const ++ vld1.64 {d0,d1}, [a4,:128] ++ .endm ++ ++ .macro idct_end ++ vpop {d8-d15} ++ pop {v1-v3, pc} ++ .endm ++ ++ .align ++ .global ff_simple_idct_neon ++ .type ff_simple_idct_neon, %function ++ .func ff_simple_idct_neon ++/* void ff_simple_idct_neon(DCTELEM *data); */ ++ff_simple_idct_neon: ++ idct_start a1 ++ ++ mov a3, a1 ++ bl idct_row4_neon ++ bl idct_row4_neon ++ add a3, a3, #-128 ++ bl idct_col4_neon ++ add a3, a3, #-128 ++ idct_col4_st16 ++ add a3, a3, #-120 ++ bl idct_col4_neon ++ add a3, a3, #-128 ++ idct_col4_st16 ++ ++ idct_end ++ .endfunc ++ ++ .align ++ .global ff_simple_idct_put_neon ++ .type ff_simple_idct_put_neon, %function ++ .func ff_simple_idct_put_neon ++/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */ ++ff_simple_idct_put_neon: ++ idct_start a3 ++ ++ bl idct_row4_neon ++ bl idct_row4_neon ++ add a3, a3, #-128 ++ bl idct_col4_neon ++ bl idct_col4_st8 ++ sub a1, a1, a2, lsl #3 ++ add a1, a1, #4 ++ add a3, a3, #-120 ++ bl idct_col4_neon ++ 
bl idct_col4_st8 ++ ++ idct_end ++ .endfunc ++ ++ .align ++ .global ff_simple_idct_add_neon ++ .type ff_simple_idct_add_neon, %function ++ .func ff_simple_idct_add_neon ++/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */ ++ff_simple_idct_add_neon: ++ idct_start a3 ++ ++ bl idct_row4_neon ++ bl idct_row4_neon ++ add a3, a3, #-128 ++ bl idct_col4_neon ++ bl idct_col4_add8 ++ sub a1, a1, a2, lsl #3 ++ add a1, a1, #4 ++ add a3, a3, #-120 ++ bl idct_col4_neon ++ bl idct_col4_add8 ++ ++ idct_end ++ .endfunc +diff -Nurd mythtv.orig/libs/libavcodec/avcodec.h mythtv/libs/libavcodec/avcodec.h +--- mythtv.orig/libs/libavcodec/avcodec.h 2008-07-23 12:19:11.000000000 +0200 ++++ mythtv/libs/libavcodec/avcodec.h 2008-07-24 19:56:46.953198000 +0200 +@@ -1328,6 +1328,8 @@ + #define FF_IDCT_SIMPLEARMV6 17 + #define FF_IDCT_SIMPLEVIS 18 + #define FF_IDCT_WMV2 19 ++#define FF_IDCT_FAAN 20 ++#define FF_IDCT_SIMPLENEON 21 + + /** + * slice count +diff -Nurd mythtv.orig/libs/libavcodec/libavcodec.pro mythtv/libs/libavcodec/libavcodec.pro +--- mythtv.orig/libs/libavcodec/libavcodec.pro 2008-07-23 12:19:10.000000000 +0200 ++++ mythtv/libs/libavcodec/libavcodec.pro 2008-07-24 19:54:01.503198000 +0200 +@@ -413,6 +413,8 @@ + + contains( HAVE_ARMV6, yes ) { SOURCES += armv4l/simple_idct_armv6.S } + ++contains( HAVE_NEON, yes ) { SOURCES += armv4l/simple_idct_neon.S } ++ + contains( HAVE_VIS, yes ) { + SOURCES += sparc/dsputil_vis.c + SOURCES += sparc/simple_idct_vis.c +diff -Nurd mythtv.orig/libs/libavcodec/utils.c mythtv/libs/libavcodec/utils.c +--- mythtv.orig/libs/libavcodec/utils.c 2008-07-23 12:19:10.000000000 +0200 ++++ mythtv/libs/libavcodec/utils.c 2008-07-24 19:58:12.403198000 +0200 +@@ -594,6 +594,7 @@ + {"sh4", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SH4, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"simplearm", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARM, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"simplearmv5te", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV5TE, 
INT_MIN, INT_MAX, V|E|D, "idct"}, ++{"simpleneon", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLENEON, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"h264", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_H264, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"vp3", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_VP3, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"ipp", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_IPP, INT_MIN, INT_MAX, V|E|D, "idct"}, |