6 files changed, 5481 insertions, 0 deletions
diff --git a/noncore/multimedia/opieplayer2/alphablend.c b/noncore/multimedia/opieplayer2/alphablend.c
new file mode 100644
index 0000000..57f6013
--- a/dev/null
+++ b/noncore/multimedia/opieplayer2/alphablend.c
@@ -0,0 +1,753 @@
+//TOAST_SPU will define ALL spu entries - no matter the transparency
+//#define TOAST_SPU
+/* #define PRIV_CLUT */
+/* Currently only blend_yuv(..) works */
+/*
+ *
+ * Copyright (C) James Courtier-Dutton James@superbug.demon.co.uk - July 2001
+ * 
+ * Copyright (C) 2000  Thomas Mirlacher
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * The author may be reached as <dent@linuxvideo.org>
+ *
+ *------------------------------------------------------------
+ *
+ */
+/*
+#define LOG_BLEND_YUV
+*/
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <xine/video_out.h>
+#include "alphablend.h"
+/*
+ * Alpha-blend helpers.  `o` is the overlay opacity on a 0..0xf scale:
+ * 0 yields dst unchanged, 0xf yields src.
+ *
+ * BLEND_COLOR blends only the bits selected by `mask` (one channel of a
+ * packed pixel) and returns them still in place; BLEND_BYTE blends one
+ * whole sample.
+ *
+ * Every argument is parenthesised so callers may pass arbitrary
+ * expressions.  Arguments are still evaluated more than once, so they
+ * must not have side effects.
+ */
+#define BLEND_COLOR(dst, src, mask, o) ((((((src) & (mask)) * (o)) + (((dst) & (mask)) * (0x0f - (o)))) / 0xf) & (mask))
+#define BLEND_BYTE(dst, src, o) ((((src) * (o)) + ((dst) * (0xf - (o)))) / 0xf)
+/*
+ * Blend the 16-bit colour `clr` with opacity `o` (0..0xf) into `len`
+ * consecutive 16-bit pixels starting at `mem`.  The three bit fields
+ * 0xf800, 0x07e0 and 0x001f are blended independently and recombined.
+ */
+static void mem_blend16(uint16_t *mem, uint16_t clr, uint8_t o, int len) {
+  uint16_t *const end = mem + len;
+  for (; mem < end; ++mem) {
+    const uint16_t hi  = BLEND_COLOR(*mem, clr, 0xf800, o);
+    const uint16_t mid = BLEND_COLOR(*mem, clr, 0x07e0, o);
+    const uint16_t lo  = BLEND_COLOR(*mem, clr, 0x001f, o);
+    *mem = hi | mid | lo;
+  }
+}
+/*
+ * Blend the byte triple (r, g, b) with opacity `o` (0..0xf) into `len`
+ * consecutive 3-byte pixels starting at `mem`.  The bytes are written in
+ * the order r, g, b.
+ */
+static void mem_blend24(uint8_t *mem, uint8_t r, uint8_t g, uint8_t b,
+ uint8_t o, int len) {
+  uint8_t *const end = mem + len*3;
+  for (; mem < end; mem += 3) {
+    mem[0] = BLEND_BYTE(mem[0], r, o);
+    mem[1] = BLEND_BYTE(mem[1], g, o);
+    mem[2] = BLEND_BYTE(mem[2], b, o);
+  }
+}
+/*
+ * Same as mem_blend24, but for 4-byte pixels: the first three bytes of
+ * each pixel are blended with (r, g, b) and the fourth byte is left
+ * untouched.
+ */
+static void mem_blend24_32(uint8_t *mem, uint8_t r, uint8_t g, uint8_t b,
+ uint8_t o, int len) {
+  uint8_t *const end = mem + len*4;
+  for (; mem < end; mem += 4) {
+    mem[0] = BLEND_BYTE(mem[0], r, o);
+    mem[1] = BLEND_BYTE(mem[1], g, o);
+    mem[2] = BLEND_BYTE(mem[2], b, o);
+    /* mem[3] is deliberately not written */
+  }
+}
+/*
+ * Blend the 4-byte pattern at `src` with opacity `o` (0..0xf) into `len`
+ * consecutive 4-byte groups starting at `mem`; byte k of every group is
+ * blended with src[k].
+ */
+static void mem_blend32(uint8_t *mem, uint8_t *src, uint8_t o, int len) {
+  uint8_t *const end = mem + len*4;
+  int k;
+  for (; mem < end; mem += 4) {
+    for (k = 0; k < 4; k++) {
+      mem[k] = BLEND_BYTE(mem[k], src[k], o);
+    }
+  }
+}
+/*
+ * Some macros for fixed point arithmetic.
+ *
+ * The blend_rgb* routines perform rle image scaling using
+ * scale factors that are expressed as integers scaled with
+ * a factor of 2**16.
+ *
+ * INT_TO_SCALED()/SCALED_TO_INT() converts from integer
+ * to scaled fixed point and back.
+ */
+/* Number of fractional bits in a scaled (fixed point) value. */
+#define SCALE_SHIFT       16
+/* The scaled representation of 1. */
+#define SCALE_FACTOR      (1<<SCALE_SHIFT)
+/* Integer -> fixed point and fixed point -> integer (truncating). */
+#define INT_TO_SCALED(i)  ((i)  << SCALE_SHIFT)
+#define SCALED_TO_INT(sc) ((sc) >> SCALE_SHIFT)
+/*
+ * Skip over one encoded line of an rle image: consume runs until their
+ * lengths add up to at least `w` pixels or `rle_limit` is reached.
+ * Returns the position of the first run that was not consumed.
+ */
+static rle_elem_t *
+rle_img_advance_line(rle_elem_t *rle, rle_elem_t *rle_limit, int w)
+{
+  int covered = 0;
+  while (covered < w && rle < rle_limit) {
+    covered += rle->len;
+    ++rle;
+  }
+  return rle;
+}
+/*
+ * Blend an RLE-encoded overlay onto a 16-bit-per-pixel image, scaling it
+ * on the fly from the dst_width x dst_height coordinate space (in which
+ * the overlay position and size are expressed) to the img_width x
+ * img_height image that is actually written.
+ *
+ *   img        image buffer, accessed as rows of img_width uint16_t pixels
+ *   img_overl  overlay: position, size, rle data, clip rectangle and the
+ *              clip_color / clip_trans tables (the only tables used here)
+ *
+ * Runs with zero opacity and rows/columns outside the clip rectangle are
+ * not drawn.  The pixel value blended in is the first 16 bits of the
+ * selected clut entry.
+ * NOTE(review): presumably the caller has already converted clip_color to
+ * the target 16-bit pixel format - confirm.
+ */
+void blend_rgb16 (uint8_t * img, vo_overlay_t * img_overl,
+                  int img_width, int img_height,
+                  int dst_width, int dst_height)
+{
+  uint8_t *trans;
+  clut_t* clut = (clut_t*) img_overl->clip_color;
+  int src_width = img_overl->width;
+  int src_height = img_overl->height;
+  rle_elem_t *rle = img_overl->rle;
+  rle_elem_t *rle_limit = rle + img_overl->num_rle;
+  int x, y, x1_scaled, x2_scaled;
+          int dy, dy_step, x_scale;/* scaled 2**SCALE_SHIFT */
+  int clip_right;
+  uint16_t *img_pix;
+  /* overlay lines consumed per image row, and image pixels per overlay pixel */
+  dy_step = INT_TO_SCALED(dst_height) / img_height;
+  x_scale = INT_TO_SCALED(img_width)  / dst_width;
+  /* first image pixel covered by the overlay (position scaled to image size) */
+  img_pix = (uint16_t *) img
+      + (img_overl->y * img_height / dst_height) * img_width
+      + (img_overl->x * img_width / dst_width);
+  trans = img_overl->clip_trans;
+  /* avoid wrapping overlay if drawing to small image */
+  if( (img_overl->x + img_overl->clip_right) < dst_width )
+    clip_right = img_overl->clip_right;
+  else
+    clip_right = dst_width - 1 - img_overl->x;
+  /* avoid buffer overflow */
+  if( (src_height + img_overl->y) >= dst_height )
+    src_height = dst_height - 1 - img_overl->y;
+  /* one iteration per image row; y counts overlay lines */
+  for (y = dy = 0; y < src_height && rle < rle_limit;) {
+    /* mask is 0 for overlay lines above clip_top or below clip_bottom */
+    int mask = !(img_overl->clip_top > y || img_overl->clip_bottom < y);
+    rle_elem_t *rle_start = rle;
+    for (x = x1_scaled = 0; x < src_width;) {
+      uint8_t clr;
+      uint16_t o;
+      int rlelen;
+      clr = rle->color;
+      o   = trans[clr];
+      rlelen = rle->len;
+      if (o && mask) {
+        /* treat cases where clipping border is inside rle->len pixels */
+        if ( img_overl->clip_left > x ) {
+          if( img_overl->clip_left < x + rlelen ) {
+            /* run straddles the left clip edge: drop the part left of it */
+            x1_scaled = SCALED_TO_INT( img_overl->clip_left * x_scale );
+            rlelen -= img_overl->clip_left - x;
+            x += img_overl->clip_left - x;
+          } else {
+            o = 0;
+          }
+        } else if( clip_right < x + rlelen ) {
+          if( clip_right > x ) {
+            /* run straddles the right clip edge: draw up to the edge only */
+            x2_scaled = SCALED_TO_INT( clip_right * x_scale);
+            mem_blend16(img_pix+x1_scaled, *((uint16_t *)&clut[clr]), o,
+                        x2_scaled-x1_scaled);
+            o = 0;            
+          } else {
+            o = 0;
+          }
+        } 
+      }
+      
+      x2_scaled = SCALED_TO_INT((x + rlelen) * x_scale);
+      if (o && mask) {
+        mem_blend16(img_pix+x1_scaled, *((uint16_t *)&clut[clr]), o, x2_scaled-x1_scaled);
+      }
+      x1_scaled = x2_scaled;
+      x += rlelen;
+      rle++;
+      if (rle >= rle_limit) break;
+    }
+    img_pix += img_width;
+    dy += dy_step;
+    if (dy >= INT_TO_SCALED(1)) {
+      dy -= INT_TO_SCALED(1);
+      ++y;
+      /* shrinking vertically: skip the overlay lines that get no image row */
+      while (dy >= INT_TO_SCALED(1)) {
+        rle = rle_img_advance_line(rle, rle_limit, src_width);
+        dy -= INT_TO_SCALED(1);
+        ++y;
+      }
+    } else {
+              rle = rle_start;  /* y-scaling, reuse the last rle encoded line */
+    }
+  }
+}
+/*
+ * Blend an RLE-encoded overlay onto an image with 3 bytes per pixel,
+ * scaling it on the fly from the dst_width x dst_height coordinate space
+ * (in which the overlay position and size are expressed) to the
+ * img_width x img_height image that is actually written.
+ *
+ *   img        image buffer, rows of img_width pixels, 3 bytes each
+ *   img_overl  overlay: position, size, rle data, clip rectangle and the
+ *              clip_color / clip_trans tables (the only tables used here)
+ *
+ * Runs with zero opacity and rows/columns outside the clip rectangle are
+ * not drawn.  The clut fields cb, cr and y are written, in that order, to
+ * the three bytes of each pixel.
+ * NOTE(review): presumably the caller has already converted clip_color to
+ * RGB so that these fields hold colour components - confirm.
+ */
+void blend_rgb24 (uint8_t * img, vo_overlay_t * img_overl,
+                  int img_width, int img_height,
+                  int dst_width, int dst_height)
+{
+  clut_t* clut = (clut_t*) img_overl->clip_color;
+  uint8_t *trans;
+  int src_width = img_overl->width;
+  int src_height = img_overl->height;
+  rle_elem_t *rle = img_overl->rle;
+  rle_elem_t *rle_limit = rle + img_overl->num_rle;
+  int x, y, x1_scaled, x2_scaled;
+          int dy, dy_step, x_scale;/* scaled 2**SCALE_SHIFT */
+  int clip_right;
+  uint8_t *img_pix;
+  /* overlay lines consumed per image row, and image pixels per overlay pixel */
+  dy_step = INT_TO_SCALED(dst_height) / img_height;
+  x_scale = INT_TO_SCALED(img_width)  / dst_width;
+  /* first image byte covered by the overlay (position scaled to image size) */
+  img_pix = img + 3 * (  (img_overl->y * img_height / dst_height) * img_width
+                       + (img_overl->x * img_width  / dst_width));
+  trans = img_overl->clip_trans;
+  /* avoid wrapping overlay if drawing to small image */
+  if( (img_overl->x + img_overl->clip_right) < dst_width )
+    clip_right = img_overl->clip_right;
+  else
+    clip_right = dst_width - 1 - img_overl->x;
+  /* avoid buffer overflow */
+  if( (src_height + img_overl->y) >= dst_height )
+    src_height = dst_height - 1 - img_overl->y;
+  /* one iteration per image row; y counts overlay lines */
+  for (dy = y = 0; y < src_height && rle < rle_limit; ) {
+    /* mask is 0 for overlay lines above clip_top or below clip_bottom */
+    int mask = !(img_overl->clip_top > y || img_overl->clip_bottom < y);
+    rle_elem_t *rle_start = rle;
+    for (x = x1_scaled = 0; x < src_width;) {
+      uint8_t clr;
+      uint16_t o;
+      int rlelen;
+      clr = rle->color;
+      o   = trans[clr];
+      rlelen = rle->len;
+      if (o && mask) {
+        /* treat cases where clipping border is inside rle->len pixels */
+        if ( img_overl->clip_left > x ) {
+          if( img_overl->clip_left < x + rlelen ) {
+            /* run straddles the left clip edge: drop the part left of it */
+            x1_scaled = SCALED_TO_INT( img_overl->clip_left * x_scale );
+            rlelen -= img_overl->clip_left - x;
+            x += img_overl->clip_left - x;
+          } else {
+            o = 0;
+          }
+        } else if( clip_right < x + rlelen ) {
+          if( clip_right > x ) {
+            /* run straddles the right clip edge: draw up to the edge only */
+            x2_scaled = SCALED_TO_INT( clip_right * x_scale);
+            mem_blend24(img_pix + x1_scaled*3, clut[clr].cb,
+                    clut[clr].cr, clut[clr].y,
+                    o, x2_scaled-x1_scaled);
+            o = 0;            
+          } else {
+            o = 0;
+          }
+        } 
+      }
+      
+      x2_scaled = SCALED_TO_INT((x + rlelen) * x_scale);
+      if (o && mask) {
+        mem_blend24(img_pix + x1_scaled*3, clut[clr].cb,
+                    clut[clr].cr, clut[clr].y,
+                    o, x2_scaled-x1_scaled);
+      }
+      x1_scaled = x2_scaled;
+      x += rlelen;
+      rle++;
+      if (rle >= rle_limit) break;
+    }
+    img_pix += img_width * 3;
+    dy += dy_step;
+    if (dy >= INT_TO_SCALED(1)) {
+      dy -= INT_TO_SCALED(1);
+      ++y;
+      /* shrinking vertically: skip the overlay lines that get no image row */
+      while (dy >= INT_TO_SCALED(1)) {
+        rle = rle_img_advance_line(rle, rle_limit, src_width);
+        dy -= INT_TO_SCALED(1);
+        ++y;
+      }
+    } else {
+              rle = rle_start;  /* y-scaling, reuse the last rle encoded line */
+    }
+  }
+}
+/*
+ * Blend an RLE-encoded overlay onto an image with 4 bytes per pixel,
+ * scaling it on the fly from the dst_width x dst_height coordinate space
+ * (in which the overlay position and size are expressed) to the
+ * img_width x img_height image that is actually written.
+ *
+ *   img        image buffer, rows of img_width pixels, 4 bytes each
+ *   img_overl  overlay: position, size, rle data, clip rectangle and the
+ *              clip_color / clip_trans tables (the only tables used here)
+ *
+ * Runs with zero opacity and rows/columns outside the clip rectangle are
+ * not drawn.  The clut fields cb, cr and y are written, in that order, to
+ * the first three bytes of each pixel; the fourth byte is left untouched.
+ * NOTE(review): presumably the caller has already converted clip_color to
+ * RGB so that these fields hold colour components - confirm.
+ */
+void blend_rgb32 (uint8_t * img, vo_overlay_t * img_overl,
+                  int img_width, int img_height,
+                  int dst_width, int dst_height)
+{
+  clut_t* clut = (clut_t*) img_overl->clip_color;
+  uint8_t *trans;
+  int src_width = img_overl->width;
+  int src_height = img_overl->height;
+  rle_elem_t *rle = img_overl->rle;
+  rle_elem_t *rle_limit = rle + img_overl->num_rle;
+  int x, y, x1_scaled, x2_scaled;
+          int dy, dy_step, x_scale;/* scaled 2**SCALE_SHIFT */
+  int clip_right;
+  uint8_t *img_pix;
+  /* overlay lines consumed per image row, and image pixels per overlay pixel */
+  dy_step = INT_TO_SCALED(dst_height) / img_height;
+  x_scale = INT_TO_SCALED(img_width)  / dst_width;
+  /* first image byte covered by the overlay (position scaled to image size) */
+  img_pix = img + 4 * (  (img_overl->y * img_height / dst_height) * img_width
+                       + (img_overl->x * img_width / dst_width));
+  trans = img_overl->clip_trans;
+  /* avoid wrapping overlay if drawing to small image */
+  if( (img_overl->x + img_overl->clip_right) < dst_width )
+    clip_right = img_overl->clip_right;
+  else
+    clip_right = dst_width - 1 - img_overl->x;
+  /* avoid buffer overflow */
+  if( (src_height + img_overl->y) >= dst_height )
+    src_height = dst_height - 1 - img_overl->y;
+  /* one iteration per image row; y counts overlay lines */
+  for (y = dy = 0; y < src_height && rle < rle_limit; ) {
+    /* mask is 0 for overlay lines above clip_top or below clip_bottom */
+    int mask = !(img_overl->clip_top > y || img_overl->clip_bottom < y);
+    rle_elem_t *rle_start = rle;
+    for (x = x1_scaled = 0; x < src_width;) {
+      uint8_t clr;
+      uint16_t o;
+      int rlelen;
+      clr = rle->color;
+      o   = trans[clr];
+      rlelen = rle->len;
+      if (o && mask) {
+        /* treat cases where clipping border is inside rle->len pixels */
+        if ( img_overl->clip_left > x ) {
+          if( img_overl->clip_left < x + rlelen ) {
+            /* run straddles the left clip edge: drop the part left of it */
+            x1_scaled = SCALED_TO_INT( img_overl->clip_left * x_scale );
+            rlelen -= img_overl->clip_left - x;
+            x += img_overl->clip_left - x;
+          } else {
+            o = 0;
+          }
+        } else if( clip_right < x + rlelen ) {
+          if( clip_right > x ) {
+            /* run straddles the right clip edge: draw up to the edge only */
+            x2_scaled = SCALED_TO_INT( clip_right * x_scale);
+            mem_blend24_32(img_pix + x1_scaled*4, clut[clr].cb,
+                    clut[clr].cr, clut[clr].y,
+                    o, x2_scaled-x1_scaled);
+            o = 0;            
+          } else {
+            o = 0;
+          }
+        } 
+      }
+      x2_scaled = SCALED_TO_INT((x + rlelen) * x_scale);
+      if (o && mask) {
+        mem_blend24_32(img_pix + x1_scaled*4, clut[clr].cb,
+                    clut[clr].cr, clut[clr].y,
+                    o, x2_scaled-x1_scaled);
+      }
+      x1_scaled = x2_scaled;
+      x += rlelen;
+      rle++;
+      if (rle >= rle_limit) break;
+    }
+    img_pix += img_width * 4;
+    dy += dy_step;
+    if (dy >= INT_TO_SCALED(1)) {
+      dy -= INT_TO_SCALED(1);
+      ++y;
+      /* shrinking vertically: skip the overlay lines that get no image row */
+      while (dy >= INT_TO_SCALED(1)) {
+        rle = rle_img_advance_line(rle, rle_limit, src_width);
+        dy -= INT_TO_SCALED(1);
+        ++y;
+      }
+    } else {
+              rle = rle_start;  /* y-scaling, reuse the last rle encoded line */
+    }
+  }
+}
+/*
+ * Blend the sample `val` with opacity `o` (0..0xf) into `sz` consecutive
+ * bytes starting at `mem`.
+ */
+static void mem_blend8(uint8_t *mem, uint8_t val, uint8_t o, size_t sz)
+{
+  uint8_t *const end = mem + sz;
+  for (; mem < end; ++mem) {
+    *mem = BLEND_BYTE(*mem, val, o);
+  }
+}
+/*
+ * Blend an RLE-encoded overlay onto a planar YUV image, without scaling.
+ *
+ *   dst_base    three plane pointers: [0] is used as the Y plane (row
+ *               stride dst_width), [1] as the Cb plane and [2] as the Cr
+ *               plane; chroma is addressed at half the luma x and y
+ *   img_overl   overlay: position, size, rle data, clip rectangle and
+ *               both colour/transparency table pairs
+ *   dst_width, dst_height   size of the destination image
+ *
+ * Pixels inside the clip rectangle use clip_color/clip_trans, all others
+ * use color/trans.  The rle stream is consumed continuously: `rlelen` is
+ * what is left of the current run and each loop pass handles one "bite"
+ * of it (rle_this_bite), cut at the clip borders and at the end of the
+ * line.  Chroma is only written on odd overlay lines.
+ *
+ * NOTE(review): the first rle element is read before any check against
+ * rle_limit, so this relies on num_rle > 0 - confirm with the caller.
+ */
+void blend_yuv (uint8_t *dst_base[3], vo_overlay_t * img_overl,
+                int dst_width, int dst_height)
+{
+  clut_t *my_clut;
+  uint8_t *my_trans;
+  int src_width = img_overl->width;
+  int src_height = img_overl->height;
+  rle_elem_t *rle = img_overl->rle;
+  rle_elem_t *rle_limit = rle + img_overl->num_rle;
+  int x_off = img_overl->x;
+  int y_off = img_overl->y;
+  int ymask,xmask;
+  int rle_this_bite;
+  int rle_remainder;
+  int rlelen;
+  int x, y;
+  int clip_right;
+  uint8_t clr=0;
+  uint8_t *dst_y = dst_base[0] + dst_width * y_off + x_off;
+  /* NOTE(review): the trailing "+ 1" and the dst_width / 2 stride used here
+   * differ from the (dst_width + 1) / 2 row step used at the end of the
+   * line loop - looks suspicious for odd widths, confirm. */
+  uint8_t *dst_cr = dst_base[2] +
+    (y_off / 2) * (dst_width / 2) + (x_off / 2) + 1;
+  uint8_t *dst_cb = dst_base[1] +
+    (y_off / 2) * (dst_width / 2) + (x_off / 2) + 1;
+#ifdef LOG_BLEND_YUV
+  printf("overlay_blend started x=%d, y=%d, w=%d h=%d\n",img_overl->x,img_overl->y,img_overl->width,img_overl->height);
+#endif
+  my_clut = (clut_t*) img_overl->clip_color;
+  my_trans = img_overl->clip_trans;
+  /* avoid wrapping overlay if drawing to small image */
+  if( (x_off + img_overl->clip_right) < dst_width )
+    clip_right = img_overl->clip_right;
+  else
+    clip_right = dst_width - 1 - x_off;
+  /* avoid buffer overflow */
+  if( (src_height + y_off) >= dst_height )
+    src_height = dst_height - 1 - y_off;
+  rlelen=rle_remainder=0;
+  for (y = 0; y < src_height; y++) {
+    /* ymask is non-zero when this line lies outside the clip rectangle */
+    ymask = ((img_overl->clip_top > y) || (img_overl->clip_bottom < y));
+    xmask = 0;
+#ifdef LOG_BLEND_YUV
+    printf("X started ymask=%d y=%d src_height=%d\n",ymask, y, src_height);
+#endif
+    for (x = 0; x < src_width;) {
+      uint16_t o;
+#ifdef LOG_BLEND_YUV
+      printf("1:rle_len=%d, remainder=%d, x=%d\n",rlelen, rle_remainder, x);
+#endif
+      if ((rlelen < 0) || (rle_remainder < 0)) {
+        printf("alphablend: major bug in blend_yuv < 0\n");
+      } 
+      /* current run used up: fetch the next one */
+      if (rlelen == 0) {
+        rle_remainder = rlelen = rle->len;
+        clr = rle->color;
+        rle++;
+      }
+      /* previous bite finished but the run has pixels left: continue it */
+      if (rle_remainder == 0) {
+        rle_remainder = rlelen;
+      }
+      if ((rle_remainder + x) > src_width) {
+        /* Do something for long rlelengths */
+        rle_remainder = src_width - x; 
+        ;
+      }
+#ifdef LOG_BLEND_YUV
+      printf("2:rle_len=%d, remainder=%d, x=%d\n",rlelen, rle_remainder, x);
+#endif
+      if (ymask == 0) {
+        if (x <= img_overl->clip_left) { 
+          /* Starts outside clip area */
+          if ((x + rle_remainder - 1) > img_overl->clip_left ) {
+#ifdef LOG_BLEND_YUV
+            printf("Outside clip left %d, ending inside\n", img_overl->clip_left); 
+#endif
+            /* Cutting needed, starts outside, ends inside */
+            rle_this_bite = (img_overl->clip_left - x + 1);
+            rle_remainder -= rle_this_bite;
+            rlelen -= rle_this_bite;
+            my_clut = (clut_t*) img_overl->color;
+            my_trans = img_overl->trans;
+            xmask = 0;
+          } else {
+#ifdef LOG_BLEND_YUV
+            printf("Outside clip left %d, ending outside\n", img_overl->clip_left); 
+#endif
+          /* no cutting needed, starts outside, ends outside */
+            rle_this_bite = rle_remainder;
+            rle_remainder = 0;
+            rlelen -= rle_this_bite;
+            my_clut = (clut_t*) img_overl->color;
+            my_trans = img_overl->trans;
+            xmask = 0;
+          }
+        } else if (x < clip_right) {
+          /* Starts inside clip area */
+          if ((x + rle_remainder) > clip_right ) {
+#ifdef LOG_BLEND_YUV
+            printf("Inside clip right %d, ending outside\n", clip_right);
+#endif
+            /* Cutting needed, starts inside, ends outside */
+            rle_this_bite = (clip_right - x);
+            rle_remainder -= rle_this_bite;
+            rlelen -= rle_this_bite;
+            my_clut = (clut_t*) img_overl->clip_color;
+            my_trans = img_overl->clip_trans;
+            xmask++;
+          } else {
+#ifdef LOG_BLEND_YUV
+            printf("Inside clip right %d, ending inside\n", clip_right);
+#endif
+          /* no cutting needed, starts inside, ends inside */
+            rle_this_bite = rle_remainder;
+            rle_remainder = 0;
+            rlelen -= rle_this_bite;
+            my_clut = (clut_t*) img_overl->clip_color;
+            my_trans = img_overl->clip_trans;
+            xmask++;
+          }
+        } else if (x >= clip_right) {
+          /* Starts outside clip area, ends outside clip area */
+          if ((x + rle_remainder ) > src_width ) { 
+#ifdef LOG_BLEND_YUV
+            printf("Outside clip right %d, ending eol\n", clip_right);
+#endif
+            /* Cutting needed, starts outside, ends at right edge */
+            /* It should never reach here due to the earlier test of src_width */
+            rle_this_bite = (src_width - x );
+            rle_remainder -= rle_this_bite;
+            rlelen -= rle_this_bite;
+            my_clut = (clut_t*) img_overl->color;
+            my_trans = img_overl->trans;
+            xmask = 0;
+          } else {
+          /* no cutting needed, starts outside, ends outside */
+#ifdef LOG_BLEND_YUV
+            printf("Outside clip right %d, ending outside\n", clip_right);
+#endif
+            rle_this_bite = rle_remainder;
+            rle_remainder = 0;
+            rlelen -= rle_this_bite;
+            my_clut = (clut_t*) img_overl->color;
+            my_trans = img_overl->trans;
+            xmask = 0;
+          }
+        }
+      } else {
+        /* Outside clip area due to y */
+        /* no cutting needed, starts outside, ends outside */
+        rle_this_bite = rle_remainder;
+        rle_remainder = 0;
+        rlelen -= rle_this_bite;
+        my_clut = (clut_t*) img_overl->color;
+        my_trans = img_overl->trans;
+        xmask = 0;
+      }
+      o   = my_trans[clr];
+#ifdef LOG_BLEND_YUV
+      printf("Trans=%d clr=%d xmask=%d my_clut[clr]=%d\n",o, clr, xmask, my_clut[clr].y);
+#endif
+      if (o) {
+        /* fully opaque: plain fill; otherwise blend with the existing image */
+        if(o >= 15) {
+          memset(dst_y + x, my_clut[clr].y, rle_this_bite);
+          if (y & 1) {
+            memset(dst_cr + (x >> 1), my_clut[clr].cr, (rle_this_bite+1) >> 1);
+            memset(dst_cb + (x >> 1), my_clut[clr].cb, (rle_this_bite+1) >> 1);
+          }
+        } else {
+          mem_blend8(dst_y + x, my_clut[clr].y, o, rle_this_bite);
+          if (y & 1) {
+            /* Blending cr and cb should use a different function, with pre -128 to each sample */
+            mem_blend8(dst_cr + (x >> 1), my_clut[clr].cr, o, (rle_this_bite+1) >> 1);
+            mem_blend8(dst_cb + (x >> 1), my_clut[clr].cb, o, (rle_this_bite+1) >> 1);
+          }
+        }
+      }
+#ifdef LOG_BLEND_YUV
+      printf("rle_this_bite=%d, remainder=%d, x=%d\n",rle_this_bite, rle_remainder, x);
+#endif
+      x += rle_this_bite;
+      if (rle >= rle_limit) {
+#ifdef LOG_BLEND_YUV
+        printf("x-rle_limit\n");
+#endif
+        break;
+      }
+    }
+    if (rle >= rle_limit) {
+#ifdef LOG_BLEND_YUV
+        printf("x-rle_limit\n");
+#endif
+        break;
+    }
+    dst_y += dst_width;
+    /* the chroma planes advance one row for every two luma rows */
+    if (y & 1) {
+      dst_cr += (dst_width + 1) / 2;
+      dst_cb += (dst_width + 1) / 2;
+    }
+  }
+#ifdef LOG_BLEND_YUV
+  printf("overlay_blend ended\n");
+#endif
+}
+            
+/*
+ * Blend an RLE-encoded overlay onto a packed YUY2 image, without scaling.
+ *
+ *   dst_img     destination image, 2 bytes per pixel, row stride
+ *               dst_width * 2 bytes
+ *   img_overl   overlay: position, size, rle data, clip rectangle and the
+ *               clip_color / clip_trans tables (the only tables used here)
+ *   dst_width, dst_height   size of the destination image in pixels
+ *
+ * Runs with zero opacity and everything outside the clip rectangle are
+ * skipped.  A run that straddles the right clip border is dropped as a
+ * whole (not implemented, see the fixme below).
+ */
+void blend_yuy2 (uint8_t * dst_img, vo_overlay_t * img_overl,
+                int dst_width, int dst_height)
+{
+  clut_t *my_clut;
+  uint8_t *my_trans;
+  int src_width = img_overl->width;
+  int src_height = img_overl->height;
+  rle_elem_t *rle = img_overl->rle;
+  rle_elem_t *rle_limit = rle + img_overl->num_rle;
+  int x_off = img_overl->x;
+  int y_off = img_overl->y;
+  int mask;
+  int x, y;
+  int l;
+  int clip_right;
+  uint32_t yuy2;
+
+  uint8_t *dst_y = dst_img + 2 * (dst_width * y_off + x_off);
+  uint8_t *dst;
+  my_clut = (clut_t*) img_overl->clip_color;
+  my_trans = img_overl->clip_trans;
+  /* avoid wrapping overlay if drawing to small image */
+  if( (x_off + img_overl->clip_right) < dst_width )
+    clip_right = img_overl->clip_right;
+  else
+    clip_right = dst_width - 1 - x_off;
+  /* avoid buffer overflow */
+  if( (src_height + y_off) >= dst_height )
+    src_height = dst_height - 1 - y_off;
+  /* like the rgb blenders, never touch *rle once the rle data is used up
+   * (this also covers an overlay without any rle elements) */
+  for (y = 0; y < src_height && rle < rle_limit; y++) {
+    /* mask is 0 for lines above clip_top or below clip_bottom */
+    mask = !(img_overl->clip_top > y || img_overl->clip_bottom < y);
+    /* dst always points at the destination pixel for overlay column x */
+    dst = dst_y;
+    for (x = 0; x < src_width;) {
+      uint8_t clr;
+      uint16_t o;
+      int rlelen;
+      clr = rle->color;
+      o   = my_trans[clr];
+      rlelen = rle->len;
+      if (o && mask) {
+        /* treat cases where clipping border is inside rle->len pixels */
+        if ( img_overl->clip_left > x ) {
+          if( img_overl->clip_left < x + rlelen ) {
+            /* run straddles the left clip edge: skip the clipped pixels
+             * in both the overlay column and the destination pointer */
+            int skip = img_overl->clip_left - x;
+            rlelen -= skip;
+            x += skip;
+            dst += 2 * skip;
+          } else {
+            o = 0;
+          }
+        } else if( clip_right < x + rlelen ) {
+          if( clip_right > x ) {
+            /* fixme: case not implemented */
+            o = 0;
+          } else {
+            o = 0;
+          }
+        }
+      }
+
+      if (o && mask) {
+        l = rlelen>>1;
+        /* two pixels share one chroma pair; which chroma sample comes
+         * first depends on whether the run starts on an even or odd x */
+        if( !(x & 1) ) {
+          yuy2 =  my_clut[clr].y + (my_clut[clr].cb << 8) +
+                 (my_clut[clr].y << 16) + (my_clut[clr].cr << 24);
+        } else {
+          yuy2 =  my_clut[clr].y + (my_clut[clr].cr << 8) +
+                 (my_clut[clr].y << 16) + (my_clut[clr].cb << 24);
+        }
+
+        if (o >= 15) {
+          /* fully opaque: store the pattern in host byte order.  memcpy
+           * replaces the former cast-as-lvalue stores, which are not
+           * valid C, and has no alignment requirement on dst. */
+          while(l--) {
+            memcpy(dst, &yuy2, sizeof yuy2);
+            dst += sizeof yuy2;
+          }
+          if(rlelen & 1) {
+            uint16_t half = (uint16_t)(yuy2 & 0xffff);
+            memcpy(dst, &half, sizeof half);
+            dst += sizeof half;
+          }
+        } else {
+          if( l ) {
+            mem_blend32(dst, (uint8_t *)&yuy2, o, l);
+            dst += 4*l;
+          }
+
+          /* odd run length: blend the one remaining pixel (2 bytes) */
+          if(rlelen & 1) {
+            *dst = BLEND_BYTE(*dst, *((uint8_t *)&yuy2), o);
+            dst++;
+            *dst = BLEND_BYTE(*dst, *((uint8_t *)&yuy2+1), o);
+            dst++;
+          }
+        }
+      } else {
+        dst += rlelen*2;
+      }
+      x += rlelen;
+      rle++;
+      if (rle >= rle_limit) break;
+    }
+    if (rle >= rle_limit) break;
+    dst_y += dst_width*2;
+  }
+}
diff --git a/noncore/multimedia/opieplayer2/alphablend.h b/noncore/multimedia/opieplayer2/alphablend.h
new file mode 100644
index 0000000..7230f41
--- a/dev/null
+++ b/noncore/multimedia/opieplayer2/alphablend.h
@@ -0,0 +1,57 @@
+/*
+ *
+ * Copyright (C) 2000  Thomas Mirlacher
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * The author may be reached as <dent@linuxvideo.org>
+ *
+ *------------------------------------------------------------
+ *
+ */
+#ifndef __ALPHABLEND_H__
+#define __ALPHABLEND_H__
+#include <xine/video_out.h>
+/*
+ * One palette entry: four 8-bit fields in a packed struct.  The blend
+ * routines in alphablend.c cast the overlay's color / clip_color tables
+ * to clut_t* and index them by rle colour, so the field order and size
+ * must not change.
+ */
+typedef struct {         /* CLUT == Color LookUp Table */
+  uint8_t cb    : 8;     /* blue-difference chroma sample */
+  uint8_t cr    : 8;     /* red-difference chroma sample */
+  uint8_t y     : 8;     /* luma sample */
+  uint8_t foo   : 8;     /* NOTE(review): not referenced by the blend code - presumably padding, confirm */
+} __attribute__ ((packed)) clut_t;
+/*
+ * Overlay blending entry points.
+ *
+ * The blend_rgb* functions scale the overlay from dst_width x dst_height
+ * coordinates to the img_width x img_height image while blending;
+ * blend_yuv and blend_yuy2 blend unscaled into a dst_width x dst_height
+ * image.
+ */
+
+/* Blend onto an image with 16 bits per pixel. */
+void blend_rgb16 (uint8_t * img, vo_overlay_t * img_overl,
+                  int img_width, int img_height,
+                  int dst_width, int dst_height);
+/* Blend onto an image with 3 bytes per pixel. */
+void blend_rgb24 (uint8_t * img, vo_overlay_t * img_overl,
+                  int img_width, int img_height,
+                  int dst_width, int dst_height);
+/* Blend onto an image with 4 bytes per pixel (fourth byte untouched). */
+void blend_rgb32 (uint8_t * img, vo_overlay_t * img_overl,
+                  int img_width, int img_height,
+                  int dst_width, int dst_height);
+/* Blend onto a planar image; dst_base[0] is Y, [1] is Cb, [2] is Cr. */
+void blend_yuv (uint8_t *dst_base[3], vo_overlay_t * img_overl,
+                int dst_width, int dst_height);
+/* Blend onto a packed YUY2 image (2 bytes per pixel). */
+void blend_yuy2 (uint8_t * dst_img, vo_overlay_t * img_overl,
+                int dst_width, int dst_height);
+/* NOTE(review): no definition of crop_overlay is visible in alphablend.c
+ * as added by this patch - confirm that it is defined elsewhere. */
+void crop_overlay (vo_overlay_t * overlay);
+#endif
diff --git a/noncore/multimedia/opieplayer2/yuv2rgb.c b/noncore/multimedia/opieplayer2/yuv2rgb.c
new file mode 100644
index 0000000..d1d6627
--- a/dev/null
+++ b/noncore/multimedia/opieplayer2/yuv2rgb.c
@@ -0,0 +1,3160 @@
+/*
+ * yuv2rgb.c
+ *
+ * This file is part of xine, a unix video player.
+ *
+ * based on work from mpeg2dec:
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * $Id$
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include "yuv2rgb.h"
+#include <xine/xineutils.h>
+static int prof_scale_line = -1;   /* profiler slot id; stays -1 until yuv2rgb_configure allocates the slot */
+static scale_line_func_t find_scale_line_func(int step);   /* forward declaration; used by yuv2rgb_configure */
+const int32_t Inverse_Table_6_9[8][4] = {   /* one row of 4 ints per colour standard named at right; presumably YUV->RGB coefficients -- TODO confirm */
+  {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
+  {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
+  {104597, 132201, 25675, 53279}, /* unspecified */
+  {104597, 132201, 25675, 53279}, /* reserved */
+  {104448, 132798, 24759, 53109}, /* FCC */
+  {104597, 132201, 25675, 53279}, /* ITU-R Rec. 624-4 System B, G */
+  {104597, 132201, 25675, 53279}, /* SMPTE 170M */
+  {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
+};
+/* Returns an alignment-aligned pointer into a fresh (size+alignment)-byte block; *chunk gets the block to free(). */
+static void *my_malloc_aligned (size_t alignment, size_t size, void **chunk) {
+  char *pMem = xine_xmalloc (size+alignment);
+  *chunk = pMem;
+  while ((uintptr_t) pMem % alignment)   /* uintptr_t, not int: an int cast truncates 64-bit pointers */
+    pMem++;
+  return pMem;                           /* a NULL from xine_xmalloc is passed through unchanged */
+}
+/*
+ * (Re)configures a converter for a new source/destination geometry.
+ *
+ * Stores the sizes and strides in 'this', releases the line buffers of the
+ * previous configuration, derives the fixed-point scaling steps
+ * (32768 == 1.0), selects the line scaler for the horizontal step and
+ * allocates fresh 16-byte aligned line buffers sized from dest_width.
+ * this->do_scale is set to 1 when source and destination sizes differ.
+ * The stride arguments are only recorded; nothing in this function
+ * reads them back.
+ *
+ * Returns 1 on success.  Returns 0 when dest_width or dest_height is not
+ * positive (the step computation would divide by zero) or when a line
+ * buffer cannot be allocated.
+ */
+static int yuv2rgb_configure (yuv2rgb_t *this,
+                              int source_width, int source_height,
+                              int y_stride, int uv_stride,
+                              int dest_width, int dest_height,
+                              int rgb_stride) {
+  /* reject sizes the divisions below cannot handle, before touching 'this' */
+  if (dest_width <= 0 || dest_height <= 0)
+    return 0;
+
+  if (prof_scale_line == -1)
+    prof_scale_line = xine_profiler_allocate_slot("xshm scale line");
+
+  this->source_width  = source_width;
+  this->source_height = source_height;
+  this->y_stride      = y_stride;
+  this->uv_stride     = uv_stride;
+  this->dest_width    = dest_width;
+  this->dest_height   = dest_height;
+  this->rgb_stride    = rgb_stride;
+
+  /* free(NULL) is a no-op, so no guards are needed */
+  free (this->y_chunk);
+  free (this->u_chunk);
+  free (this->v_chunk);
+  this->y_buffer = this->y_chunk = NULL;
+  this->u_buffer = this->u_chunk = NULL;
+  this->v_buffer = this->v_chunk = NULL;
+
+  /* source advance per destination pixel/line */
+  this->step_dx = source_width  * 32768 / dest_width;
+  this->step_dy = source_height * 32768 / dest_height;
+
+  this->scale_line = find_scale_line_func(this->step_dx);
+  this->do_scale   = (source_width != dest_width) || (source_height != dest_height);
+
+  /*
+   * space for two y-lines (for yuv2rgb_mlib)
+   * u,v subsampled 2:1
+   *
+   * Both the scaling and the non-scaling case need the same buffers, so
+   * they are allocated once here.  On failure the chunks obtained so far
+   * stay recorded in 'this' and are released by the next configure call.
+   */
+  this->y_buffer = my_malloc_aligned (16, 2*dest_width, &this->y_chunk);
+  if (!this->y_buffer)
+    return 0;
+  this->u_buffer = my_malloc_aligned (16, (dest_width+1)/2, &this->u_chunk);
+  if (!this->u_buffer)
+    return 0;
+  this->v_buffer = my_malloc_aligned (16, (dest_width+1)/2, &this->v_chunk);
+  if (!this->v_buffer)
+    return 0;
+
+  return 1;
+}
+/*
+ * Generic horizontal scaler: scales a yuv source row to a dest row, with
+ * linear interpolation (good quality, but slow).
+ *
+ * source  input row; the first two samples are read before the loop
+ * dest    output row, receives 'width' bytes
+ * width   number of destination pixels to produce
+ * step    source advance per destination pixel, 32768 == one sample;
+ *         it also selects which of the three loops below is used
+ */
+static void scale_line_gen (uint8_t *source, uint8_t *dest,
+                            int width, int step) {
+  int left, right;   /* the two source samples being blended */
+  int frac = 0;      /* position between them in 1/32768 units */
+
+  xine_profiler_start_count(prof_scale_line);
+  left  = *source++;
+  right = *source++;
+
+  /*
+   * Optimization by Scott Smith <ssmith@akamai.com>: instead of one loop
+   * with a nested "while (frac > 32768)" catch-up loop there are three
+   * flat loops, one per range of 'step': enlarging (< 32768), shrinking to
+   * between 50% and 100% (<= 65536) and shrinking by more than that.  The
+   * flat loops are easier for the compiler to unroll.
+   */
+  if (step < 32768) {
+    /* enlarging: at most one new sample per output pixel */
+    for (; width; width--, dest++) {
+      *dest = left + (((right-left) * frac)>>15);
+      frac += step;
+      if (frac > 32768) {
+        /* moved past 'right': slide the sample window by one */
+        frac -= 32768;
+        left  = right;
+        right = *source++;
+      }
+    }
+  } else if (step <= 65536) {
+    /* shrinking to 50%..100%: one or two new samples per output pixel */
+    for (; width; width--, dest++) {
+      *dest = left + (((right-left) * frac)>>15);
+      frac += step;
+      if (frac <= 65536) {
+        /* slide the sample window by one */
+        frac -= 32768;
+        left  = right;
+        right = *source++;
+      } else {
+        /* slide the sample window by two */
+        frac -= 65536;
+        left  = *source++;
+        right = *source++;
+      }
+    }
+  } else {
+    /* shrinking by more than half: several samples may be skipped */
+    for (; width; width--, dest++) {
+      int skip;
+      *dest = left + (((right-left) * frac)>>15);
+      frac += step;
+      /* whole samples to move forward; frac keeps the remainder */
+      skip  = (frac-1)>>15;
+      frac -= skip<<15;
+      /* 'source' already points two samples past 'left' */
+      source += skip-2;
+      left  = *source++;
+      right = *source++;
+    }
+  }
+
+  xine_profiler_stop_count(prof_scale_line);
+}
+/*
+ * Interpolates 16 output pixels from 15 source pixels using shifts, e.g. for
+ * scaling a PAL mpeg2 dvd input source to 4:3 format on a monitor using
+ * square pixels (720 x 576 ==> 768 x 576).  'width' counts output pixels and
+ * 'step' is unused; a partial last group is produced by the goto chain.
+ */
+static void scale_line_15_16 (uint8_t *source, uint8_t *dest,
+                              int width, int step) {
+  int p1, p2;   /* alternately hold the left/right sample of the current pair */
+  xine_profiler_start_count(prof_scale_line);
+  while ((width -= 16) >= 0) {   /* one whole group: 15 source -> 16 dest */
+    p1 = source[0];
+    dest[0] = p1;
+    p2 = source[1];
+    dest[1] = (1*p1 + 7*p2) >> 3;
+    p1 = source[2];
+    dest[2] = (1*p2 + 7*p1) >> 3;
+    p2 = source[3];
+    dest[3] = (1*p1 + 3*p2) >> 2;
+    p1 = source[4];
+    dest[4] = (1*p2 + 3*p1) >> 2;
+    p2 = source[5];
+    dest[5] = (3*p1 + 5*p2) >> 3;
+    p1 = source[6];
+    dest[6] = (3*p2 + 5*p1) >> 3;
+    p2 = source[7];
+    dest[7] = (1*p1 + 1*p2) >> 1;   /* was 1*p1 + 1*p1, which ignored source[7] and disagreed with the tail */
+    p1 = source[8];
+    dest[8] = (1*p2 + 1*p1) >> 1;
+    p2 = source[9];
+    dest[9] = (5*p1 + 3*p2) >> 3;
+    p1 = source[10];
+    dest[10] = (5*p2 + 3*p1) >> 3;
+    p2 = source[11];
+    dest[11] = (3*p1 + 1*p2) >> 2;
+    p1 = source[12];
+    dest[12] = (3*p2 + 1*p1) >> 2;
+    p2 = source[13];
+    dest[13] = (7*p1 + 1*p2) >> 3;
+    p1 = source[14];
+    dest[14] = (7*p2 + 1*p1) >> 3;
+    dest[15] = p1;
+    source += 15;
+    dest += 16;
+  }
+  if ((width += 16) <= 0) goto done;   /* undo the last subtraction: pixels still to write */
+  *dest++ = source[0];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[0] + 7*source[1]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[1] + 7*source[2]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[2] + 3*source[3]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[3] + 3*source[4]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[4] + 5*source[5]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[5] + 5*source[6]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[6] + 1*source[7]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[7] + 1*source[8]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[8] + 3*source[9]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[9] + 3*source[10]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[10] + 1*source[11]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[11] + 1*source[12]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[12] + 1*source[13]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[13] + 1*source[14]) >> 3;
+ done:
+  xine_profiler_stop_count(prof_scale_line);
+}
+/*
+ * Interpolates 53 output pixels from 45 source pixels using shifts.
+ * Useful for scaling an NTSC mpeg2 dvd input source to a 16:9 display
+ * resolution, or to 16:9 format on a monitor using square pixels
+ * (720 x 480 ==> 848 x 480).
+ * 'width' counts output pixels; 'step' is not used by this function.
+ * Whole groups use the unrolled loop, a partial last group the goto chain.
+ */
+static void scale_line_45_53 (uint8_t *source, uint8_t *dest,
+                              int width, int step) {
+  int p1, p2;   /* alternately hold the left/right sample of the current pair */
+  xine_profiler_start_count(prof_scale_line);
+  while ((width -= 53) >= 0) {   /* one whole group: 45 source -> 53 dest */
+    p1 = source[0];
+    p2 = source[1];
+    dest[0] = p1;
+    dest[1] = (1*p1 + 7*p2) >> 3;
+    p1 = source[2];
+    dest[2] = (1*p2 + 3*p1) >> 2;
+    p2 = source[3];
+    dest[3] = (1*p1 + 1*p2) >> 1;
+    p1 = source[4];
+    dest[4] = (5*p2 + 3*p1) >> 3;
+    p2 = source[5];
+    dest[5] = (3*p1 + 1*p2) >> 2;
+    p1 = source[6];
+    dest[6] = (7*p2 + 1*p1) >> 3;
+    dest[7] = p1;
+    p2 = source[7];
+    dest[8] = (1*p1 + 3*p2) >> 2;
+    p1 = source[8];
+    dest[9] = (3*p2 + 5*p1) >> 3;
+    p2 = source[9];
+    dest[10] = (1*p1 + 1*p2) >> 1;
+    p1 = source[10];
+    dest[11] = (5*p2 + 3*p1) >> 3;
+    p2 = source[11];
+    dest[12] = (3*p1 + 1*p2) >> 2;
+    p1 = source[12];
+    dest[13] = p2;
+    dest[14] = (1*p2 + 7*p1) >> 3;
+    p2 = source[13];
+    dest[15] = (1*p1 + 3*p2) >> 2;
+    p1 = source[14];
+    dest[16] = (3*p2 + 5*p1) >> 3;
+    p2 = source[15];
+    dest[17] = (5*p1 + 3*p2) >> 3;
+    p1 = source[16];
+    dest[18] = (3*p2 + 1*p1) >> 2;
+    p2 = source[17];
+    dest[19] = (7*p1 + 1*p2) >> 3;
+    dest[20] = p2;
+    p1 = source[18];
+    dest[21] = (1*p2 + 7*p1) >> 3;
+    p2 = source[19];
+    dest[22] = (3*p1 + 5*p2) >> 3;
+    p1 = source[20];
+    dest[23] = (1*p2 + 1*p1) >> 1;
+    p2 = source[21];
+    dest[24] = (5*p1 + 3*p2) >> 3;
+    p1 = source[22];
+    dest[25] = (3*p2 + 1*p1) >> 2;
+    p2 = source[23];
+    dest[26] = (7*p1 + 1*p2) >> 3;
+    dest[27] = (1*p1 + 7*p2) >> 3;
+    p1 = source[24];
+    dest[28] = (1*p2 + 3*p1) >> 2;
+    p2 = source[25];
+    dest[29] = (3*p1 + 5*p2) >> 3;
+    p1 = source[26];
+    dest[30] = (1*p2 + 1*p1) >> 1;
+    p2 = source[27];
+    dest[31] = (5*p1 + 3*p2) >> 3;
+    p1 = source[28];
+    dest[32] = (7*p2 + 1*p1) >> 3;
+    p2 = source[29];
+    dest[33] = p1;
+    dest[34] = (1*p1 + 7*p2) >> 3;
+    p1 = source[30];
+    dest[35] = (1*p2 + 3*p1) >> 2;
+    p2 = source[31];
+    dest[36] = (3*p1 + 5*p2) >> 3;
+    p1 = source[32];
+    dest[37] = (5*p2 + 3*p1) >> 3;
+    p2 = source[33];
+    dest[38] = (3*p1 + 1*p2) >> 2;
+    p1 = source[34];
+    dest[39] = (7*p2 + 1*p1) >> 3;
+    dest[40] = p1;
+    p2 = source[35];
+    dest[41] = (1*p1 + 3*p2) >> 2;
+    p1 = source[36];
+    dest[42] = (3*p2 + 5*p1) >> 3;
+    p2 = source[37];
+    dest[43] = (1*p1 + 1*p2) >> 1;
+    p1 = source[38];
+    dest[44] = (5*p2 + 3*p1) >> 3;
+    p2 = source[39];
+    dest[45] = (3*p1 + 1*p2) >> 2;
+    p1 = source[40];
+    dest[46] = p2;
+    dest[47] = (1*p2 + 7*p1) >> 3;
+    p2 = source[41];
+    dest[48] = (1*p1 + 3*p2) >> 2;
+    p1 = source[42];
+    dest[49] = (3*p2 + 5*p1) >> 3;
+    p2 = source[43];
+    dest[50] = (1*p1 + 1*p2) >> 1;
+    p1 = source[44];
+    dest[51] = (3*p2 + 1*p1) >> 2;
+    p2 = source[45];   /* note: this is the first sample of the next group */
+    dest[52] = (7*p1 + 1*p2) >> 3;
+    source += 45;
+    dest += 53;
+  }
+  if ((width += 53) <= 0) goto done;   /* undo the last subtraction: pixels still to write */
+  *dest++ = source[0];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[0] + 7*source[1]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[1] + 3*source[2]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[2] + 1*source[3]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[3] + 3*source[4]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[4] + 1*source[5]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[5] + 1*source[6]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = source[6];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[6] + 3*source[7]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[7] + 5*source[8]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[8] + 1*source[9]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[9] + 3*source[10]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[10] + 1*source[11]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = source[11];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[11] + 7*source[12]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[12] + 3*source[13]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[13] + 5*source[14]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[14] + 3*source[15]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[15] + 1*source[16]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[16] + 1*source[17]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = source[17];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[17] + 7*source[18]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[18] + 5*source[19]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[19] + 1*source[20]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[20] + 3*source[21]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[21] + 1*source[22]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[22] + 1*source[23]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[22] + 7*source[23]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[23] + 3*source[24]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[24] + 5*source[25]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[25] + 1*source[26]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[26] + 3*source[27]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[27] + 1*source[28]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = source[28];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[28] + 7*source[29]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[29] + 3*source[30]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[30] + 5*source[31]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[31] + 3*source[32]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[32] + 1*source[33]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[33] + 1*source[34]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = source[34];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[34] + 3*source[35]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[35] + 5*source[36]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[36] + 1*source[37]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[37] + 3*source[38]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[38] + 1*source[39]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = source[39];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[39] + 7*source[40]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[40] + 3*source[41]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[41] + 5*source[42]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[42] + 1*source[43]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[43] + 1*source[44]) >> 2;
+ done:
+  xine_profiler_stop_count(prof_scale_line);
+}
+/*
+ * Interpolates 64 output pixels from 45 source pixels using shifts.
+ * Useful for scaling a PAL mpeg2 dvd input source to 1024x768
+ * fullscreen resolution, or to 16:9 format on a monitor using square
+ * pixels (720 x 576 ==> 1024 x 576).
+ * 'width' counts output pixels; 'step' is not used by this function.
+ */
+static void scale_line_45_64 (uint8_t *source, uint8_t *dest,
+                             int width, int step) {
+  int p1, p2;   /* alternately hold the left/right sample of the current pair */
+  xine_profiler_start_count(prof_scale_line);
+  while ((width -= 64) >= 0) {   /* one whole group: 45 source -> 64 dest */
+    p1 = source[0];
+    p2 = source[1];
+    dest[0] = p1;
+    dest[1] = (1*p1 + 3*p2) >> 2;
+    p1 = source[2];
+    dest[2] = (5*p2 + 3*p1) >> 3;
+    p2 = source[3];
+    dest[3] = (7*p1 + 1*p2) >> 3;
+    dest[4] = (1*p1 + 3*p2) >> 2;
+    p1 = source[4];
+    dest[5] = (1*p2 + 1*p1) >> 1;
+    p2 = source[5];
+    dest[6] = (3*p1 + 1*p2) >> 2;
+    dest[7] = (1*p1 + 7*p2) >> 3;
+    p1 = source[6];
+    dest[8] = (3*p2 + 5*p1) >> 3;
+    p2 = source[7];
+    dest[9] = (5*p1 + 3*p2) >> 3;
+    p1 = source[8];
+    dest[10] = p2;
+    dest[11] = (1*p2 + 3*p1) >> 2;
+    p2 = source[9];
+    dest[12] = (5*p1 + 3*p2) >> 3;
+    p1 = source[10];
+    dest[13] = (7*p2 + 1*p1) >> 3;
+    dest[14] = (1*p2 + 7*p1) >> 3;
+    p2 = source[11];
+    dest[15] = (1*p1 + 1*p2) >> 1;
+    p1 = source[12];
+    dest[16] = (3*p2 + 1*p1) >> 2;
+    dest[17] = p1;
+    p2 = source[13];
+    dest[18] = (3*p1 + 5*p2) >> 3;
+    p1 = source[14];
+    dest[19] = (5*p2 + 3*p1) >> 3;
+    p2 = source[15];
+    dest[20] = p1;
+    dest[21] = (1*p1 + 3*p2) >> 2;
+    p1 = source[16];
+    dest[22] = (1*p2 + 1*p1) >> 1;
+    p2 = source[17];
+    dest[23] = (7*p1 + 1*p2) >> 3;
+    dest[24] = (1*p1 + 7*p2) >> 3;
+    p1 = source[18];
+    dest[25] = (3*p2 + 5*p1) >> 3;
+    p2 = source[19];
+    dest[26] = (3*p1 + 1*p2) >> 2;
+    dest[27] = p2;
+    p1 = source[20];
+    dest[28] = (3*p2 + 5*p1) >> 3;
+    p2 = source[21];
+    dest[29] = (5*p1 + 3*p2) >> 3;
+    p1 = source[22];
+    dest[30] = (7*p2 + 1*p1) >> 3;
+    dest[31] = (1*p2 + 3*p1) >> 2;
+    p2 = source[23];
+    dest[32] = (1*p1 + 1*p2) >> 1;
+    p1 = source[24];
+    dest[33] = (3*p2 + 1*p1) >> 2;
+    dest[34] = (1*p2 + 7*p1) >> 3;
+    p2 = source[25];
+    dest[35] = (3*p1 + 5*p2) >> 3;
+    p1 = source[26];
+    dest[36] = (3*p2 + 1*p1) >> 2;
+    p2 = source[27];
+    dest[37] = p1;
+    dest[38] = (1*p1 + 3*p2) >> 2;
+    p1 = source[28];
+    dest[39] = (5*p2 + 3*p1) >> 3;
+    p2 = source[29];
+    dest[40] = (7*p1 + 1*p2) >> 3;
+    dest[41] = (1*p1 + 7*p2) >> 3;
+    p1 = source[30];
+    dest[42] = (1*p2 + 1*p1) >> 1;
+    p2 = source[31];
+    dest[43] = (3*p1 + 1*p2) >> 2;
+    dest[44] = (1*p1 + 7*p2) >> 3;
+    p1 = source[32];
+    dest[45] = (3*p2 + 5*p1) >> 3;
+    p2 = source[33];
+    dest[46] = (5*p1 + 3*p2) >> 3;
+    p1 = source[34];
+    dest[47] = p2;
+    dest[48] = (1*p2 + 3*p1) >> 2;
+    p2 = source[35];
+    dest[49] = (1*p1 + 1*p2) >> 1;
+    p1 = source[36];
+    dest[50] = (7*p2 + 1*p1) >> 3;
+    dest[51] = (1*p2 + 7*p1) >> 3;
+    p2 = source[37];
+    dest[52] = (1*p1 + 1*p2) >> 1;
+    p1 = source[38];
+    dest[53] = (3*p2 + 1*p1) >> 2;
+    dest[54] = p1;
+    p2 = source[39];
+    dest[55] = (3*p1 + 5*p2) >> 3;
+    p1 = source[40];
+    dest[56] = (5*p2 + 3*p1) >> 3;
+    p2 = source[41];
+    dest[57] = (7*p1 + 1*p2) >> 3;
+    dest[58] = (1*p1 + 3*p2) >> 2;
+    p1 = source[42];
+    dest[59] = (1*p2 + 1*p1) >> 1;
+    p2 = source[43];
+    dest[60] = (7*p1 + 1*p2) >> 3;
+    dest[61] = (1*p1 + 7*p2) >> 3;
+    p1 = source[44];
+    dest[62] = (3*p2 + 5*p1) >> 3;
+    p2 = source[45];   /* note: this is the first sample of the next group */
+    dest[63] = (3*p1 + 1*p2) >> 2;
+    source += 45;
+    dest += 64;
+  }
+  if ((width += 64) <= 0) goto done;   /* undo the last subtraction: pixels still to write */
+  *dest++ = source[0];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[0] + 3*source[1]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[1] + 3*source[2]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[2] + 1*source[3]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[2] + 3*source[3]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[3] + 1*source[4]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[4] + 1*source[5]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[4] + 7*source[5]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[5] + 5*source[6]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[6] + 3*source[7]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = source[7];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[7] + 3*source[8]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[8] + 3*source[9]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[9] + 1*source[10]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[9] + 7*source[10]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[10] + 1*source[11]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[11] + 1*source[12]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = source[12];
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[12] + 5*source[13]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[13] + 3*source[14]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = source[14];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[14] + 3*source[15]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[15] + 1*source[16]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[16] + 1*source[17]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[16] + 7*source[17]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[17] + 5*source[18]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[18] + 1*source[19]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = source[19];
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[19] + 5*source[20]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[20] + 3*source[21]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[21] + 1*source[22]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[21] + 3*source[22]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[22] + 1*source[23]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[23] + 1*source[24]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[23] + 7*source[24]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[24] + 5*source[25]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[25] + 1*source[26]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = source[26];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[26] + 3*source[27]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[27] + 3*source[28]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[28] + 1*source[29]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[28] + 7*source[29]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[29] + 1*source[30]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[30] + 1*source[31]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[30] + 7*source[31]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[31] + 5*source[32]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[32] + 3*source[33]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = source[33];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[33] + 3*source[34]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[34] + 1*source[35]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[35] + 1*source[36]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[35] + 7*source[36]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[36] + 1*source[37]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[37] + 1*source[38]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = source[38];
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[38] + 5*source[39]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[39] + 3*source[40]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[40] + 1*source[41]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[40] + 3*source[41]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[41] + 1*source[42]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[42] + 1*source[43]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[42] + 7*source[43]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[43] + 5*source[44]) >> 3;
+ done:
+  xine_profiler_stop_count(prof_scale_line);
+}
+/*
+ * 9 -> 16 horizontal upscaler made of shift-only blends; 'step' is unused.
+ * Intended for showing a PAL mpeg2 dvd input source fullscreen at 1280x1024
+ * (720 x 576 ==> 1280 x XXX).
+ */
+static void scale_line_9_16 (uint8_t *source, uint8_t *dest,
+                             int width, int step) {
+  int a, b;   /* alternately hold the left/right sample of the current pair */
+  xine_profiler_start_count(prof_scale_line);
+  for (; width >= 16; width -= 16) {   /* whole groups: 9 source -> 16 dest */
+    a = source[0];
+    b = source[1];
+    dest[0] = a;
+    dest[1] = (a + b) >> 1;
+    a = source[2];
+    dest[2] = (7*b + a) >> 3;
+    dest[3] = (3*b + 5*a) >> 3;
+    b = source[3];
+    dest[4] = (3*a + b) >> 2;
+    dest[5] = (a + 3*b) >> 2;
+    a = source[4];
+    dest[6] = (5*b + 3*a) >> 3;
+    dest[7] = (b + 7*a) >> 3;
+    b = source[5];
+    dest[8] = (a + b) >> 1;
+    a = source[6];
+    dest[9] = b;
+    dest[10] = (3*b + 5*a) >> 3;
+    b = source[7];
+    dest[11] = (7*a + b) >> 3;
+    dest[12] = (a + 3*b) >> 2;
+    a = source[8];
+    dest[13] = (3*b + a) >> 2;
+    dest[14] = (b + 7*a) >> 3;
+    b = source[9];
+    dest[15] = (5*a + 3*b) >> 3;
+    source += 9;
+    dest += 16;
+  }
+  if (width <= 0) goto out;   /* otherwise 1..15 pixels of a partial group remain */
+  dest[0] = source[0];
+  if (width <= 1) goto out;
+  dest[1] = (source[0] + source[1]) >> 1;
+  if (width <= 2) goto out;
+  dest[2] = (7*source[1] + source[2]) >> 3;
+  if (width <= 3) goto out;
+  dest[3] = (3*source[1] + 5*source[2]) >> 3;
+  if (width <= 4) goto out;
+  dest[4] = (3*source[2] + source[3]) >> 2;
+  if (width <= 5) goto out;
+  dest[5] = (source[2] + 3*source[3]) >> 2;
+  if (width <= 6) goto out;
+  dest[6] = (5*source[3] + 3*source[4]) >> 3;
+  if (width <= 7) goto out;
+  dest[7] = (source[3] + 7*source[4]) >> 3;
+  if (width <= 8) goto out;
+  dest[8] = (source[4] + source[5]) >> 1;
+  if (width <= 9) goto out;
+  dest[9] = source[5];
+  if (width <= 10) goto out;
+  dest[10] = (3*source[5] + 5*source[6]) >> 3;
+  if (width <= 11) goto out;
+  dest[11] = (7*source[6] + source[7]) >> 3;
+  if (width <= 12) goto out;
+  dest[12] = (source[6] + 3*source[7]) >> 2;
+  if (width <= 13) goto out;
+  dest[13] = (3*source[7] + source[8]) >> 2;
+  if (width <= 14) goto out;
+  dest[14] = (source[7] + 7*source[8]) >> 3;
+out:
+  xine_profiler_stop_count(prof_scale_line);
+}
+/*
+ * 11 -> 12 horizontal upscaler made of shift-only blends ('step' is unused).
+ * Intended for bringing a PAL vcd input source to 4:3 display format.
+ */
+static void scale_line_11_12 (uint8_t *source, uint8_t *dest,
+                             int width, int step) {
+  int a, b;   /* alternately hold the left/right sample of the current pair */
+  xine_profiler_start_count(prof_scale_line);
+  for (; width >= 12; width -= 12) {   /* whole groups: 11 source -> 12 dest */
+    a = source[0];
+    b = source[1];
+    dest[0] = a;
+    dest[1] = (a + 7*b) >> 3;
+    a = source[2];
+    dest[2] = (b + 7*a) >> 3;
+    b = source[3];
+    dest[3] = (a + 3*b) >> 2;
+    a = source[4];
+    dest[4] = (3*b + 5*a) >> 3;
+    b = source[5];
+    dest[5] = (3*a + 5*b) >> 3;
+    a = source[6];
+    dest[6] = (b + a) >> 1;
+    b = source[7];
+    dest[7] = (5*a + 3*b) >> 3;
+    a = source[8];
+    dest[8] = (5*b + 3*a) >> 3;
+    b = source[9];
+    dest[9] = (3*a + b) >> 2;
+    a = source[10];
+    dest[10] = (7*b + a) >> 3;
+    b = source[11];
+    dest[11] = (7*a + b) >> 3;
+    source += 11;
+    dest += 12;
+  }
+  if (width <= 0) goto out;   /* otherwise 1..11 pixels of a partial group remain */
+  dest[0] = source[0];
+  if (width <= 1) goto out;
+  dest[1] = (source[0] + 7*source[1]) >> 3;
+  if (width <= 2) goto out;
+  dest[2] = (source[1] + 7*source[2]) >> 3;
+  if (width <= 3) goto out;
+  dest[3] = (source[2] + 3*source[3]) >> 2;
+  if (width <= 4) goto out;
+  dest[4] = (3*source[3] + 5*source[4]) >> 3;
+  if (width <= 5) goto out;
+  dest[5] = (3*source[4] + 5*source[5]) >> 3;
+  if (width <= 6) goto out;
+  dest[6] = (source[5] + source[6]) >> 1;
+  if (width <= 7) goto out;
+  dest[7] = (5*source[6] + 3*source[7]) >> 3;
+  if (width <= 8) goto out;
+  dest[8] = (5*source[7] + 3*source[8]) >> 3;
+  if (width <= 9) goto out;
+  dest[9] = (3*source[8] + source[9]) >> 2;
+  if (width <= 10) goto out;
+  dest[10] = (7*source[9] + source[10]) >> 3;
+out:
+  xine_profiler_stop_count(prof_scale_line);
+}
+/*
+ * Interpolates 24 output pixels from 11 source pixels using shifts.
+ * Useful for scaling a PAL vcd input source to 4:3 display format
+ * at 2*zoom.  'width' counts output pixels; 'step' is not used here.
+ */
+static void scale_line_11_24 (uint8_t *source, uint8_t *dest,
+                             int width, int step) {
+  int p1, p2;   /* alternately hold the left/right sample of the current pair */
+  xine_profiler_start_count(prof_scale_line);
+  while ((width -= 24) >= 0) {   /* one whole group: 11 source -> 24 dest */
+    p1 = source[0];
+    p2 = source[1];
+    dest[0] = p1;
+    dest[1] = (1*p1 + 1*p2) >> 1;
+    dest[2] = (1*p1 + 7*p2) >> 3;
+    p1 = source[2];
+    dest[3] = (5*p2 + 3*p1) >> 3;
+    dest[4] = (1*p2 + 7*p1) >> 3;
+    p2 = source[3];
+    dest[5] = (3*p1 + 1*p2) >> 2;
+    dest[6] = (1*p1 + 3*p2) >> 2;
+    p1 = source[4];
+    dest[7] = (3*p2 + 1*p1) >> 2;
+    dest[8] = (3*p2 + 5*p1) >> 3;
+    p2 = source[5];
+    dest[9] = (7*p1 + 1*p2) >> 3;
+    dest[10] = (3*p1 + 5*p2) >> 3;
+    p1 = source[6];
+    dest[11] = p2;
+    dest[12] = (1*p2 + 1*p1) >> 1;
+    dest[13] = p1;
+    p2 = source[7];
+    dest[14] = (5*p1 + 3*p2) >> 3;
+    dest[15] = (1*p1 + 7*p2) >> 3;
+    p1 = source[8];
+    dest[16] = (5*p2 + 3*p1) >> 3;
+    dest[17] = (1*p2 + 3*p1) >> 2;
+    p2 = source[9];
+    dest[18] = (3*p1 + 1*p2) >> 2;
+    dest[19] = (1*p1 + 3*p2) >> 2;
+    p1 = source[10];
+    dest[20] = (7*p2 + 1*p1) >> 3;
+    dest[21] = (3*p2 + 5*p1) >> 3;
+    p2 = source[11];   /* note: this is the first sample of the next group */
+    dest[22] = (7*p1 + 1*p2) >> 3;
+    dest[23] = (1*p1 + 1*p2) >> 1;
+    source += 11;
+    dest += 24;
+  }
+  if ((width += 24) <= 0) goto done;   /* undo the last subtraction: pixels still to write */
+  *dest++ = source[0];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[0] + 1*source[1]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[0] + 7*source[1]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[1] + 3*source[2]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[1] + 7*source[2]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[2] + 1*source[3]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[2] + 3*source[3]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[3] + 1*source[4]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[3] + 5*source[4]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[4] + 1*source[5]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[4] + 5*source[5]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = source[5];
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[5] + 1*source[6]) >> 1;
+  if (--width <= 0) goto done;
+  *dest++ = source[6];
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[6] + 3*source[7]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[6] + 7*source[7]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (5*source[7] + 3*source[8]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[7] + 3*source[8]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[8] + 1*source[9]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (1*source[8] + 3*source[9]) >> 2;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[9] + 1*source[10]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (3*source[9] + 5*source[10]) >> 3;
+  if (--width <= 0) goto done;
+  *dest++ = (7*source[10] + 1*source[11]) >> 3;
+done:
+  xine_profiler_stop_count(prof_scale_line);
+}
+/*
+ * Horizontal 5:8 upscaler: every 5 source pixels produce 8 output pixels,
+ * blended with small integer weights and shifts.
+ * Useful for scaling a PAL svcd input source to 4:3 display format.
+ *
+ * source - input samples
+ * dest   - output samples, `width` of them are written
+ * width  - number of output samples wanted
+ * step   - not used by this fixed-ratio scaler
+ */
+static void scale_line_5_8 (uint8_t *source, uint8_t *dest,
+                            int width, int step) {
+  int s0, s1, s2, s3, s4, s5;
+  xine_profiler_start_count(prof_scale_line);
+  /* Full groups: 8 outputs per pass; source[5] is already the first
+   * pixel of the following group. */
+  for (width -= 8; width >= 0; width -= 8) {
+    s0 = source[0];
+    s1 = source[1];
+    dest[0] = s0;
+    dest[1] = (3*s0 + 5*s1) >> 3;
+    s2 = source[2];
+    dest[2] = (3*s1 + s2) >> 2;
+    dest[3] = (s1 + 7*s2) >> 3;
+    s3 = source[3];
+    dest[4] = (s2 + s3) >> 1;
+    s4 = source[4];
+    dest[5] = (7*s3 + s4) >> 3;
+    dest[6] = (s3 + 3*s4) >> 2;
+    s5 = source[5];
+    dest[7] = (5*s4 + 3*s5) >> 3;
+    source += 5;
+    dest += 8;
+  }
+  /* Remainder: between 0 and 7 outputs are still owed. */
+  width += 8;
+  do {
+    if (width < 1) break;
+    dest[0] = source[0];
+    if (width < 2) break;
+    dest[1] = (3*source[0] + 5*source[1]) >> 3;
+    if (width < 3) break;
+    dest[2] = (3*source[1] + 1*source[2]) >> 2;
+    if (width < 4) break;
+    dest[3] = (1*source[1] + 7*source[2]) >> 3;
+    if (width < 5) break;
+    dest[4] = (1*source[2] + 1*source[3]) >> 1;
+    if (width < 6) break;
+    dest[5] = (7*source[3] + 1*source[4]) >> 3;
+    if (width < 7) break;
+    dest[6] = (1*source[3] + 3*source[4]) >> 2;
+  } while (0);
+  xine_profiler_stop_count(prof_scale_line);
+}
+/*
+ * Horizontal 3:4 upscaler: every 3 source pixels produce 4 output pixels,
+ * blended with small integer weights and shifts.
+ * Useful for scaling a NTSC svcd input source to 4:3 display format.
+ *
+ * source - input samples
+ * dest   - output samples, `width` of them are written
+ * width  - number of output samples wanted
+ * step   - not used by this fixed-ratio scaler
+ */
+static void scale_line_3_4 (uint8_t *source, uint8_t *dest,
+                            int width, int step) {
+  int s0, s1, s2, s3;
+  xine_profiler_start_count(prof_scale_line);
+  /* Full groups: 4 outputs per pass; source[3] belongs to the next group. */
+  for (width -= 4; width >= 0; width -= 4) {
+    s0 = source[0];
+    s1 = source[1];
+    dest[0] = s0;
+    dest[1] = (s0 + 3*s1) >> 2;
+    s2 = source[2];
+    dest[2] = (s1 + s2) >> 1;
+    s3 = source[3];
+    dest[3] = (3*s2 + s3) >> 2;
+    source += 3;
+    dest += 4;
+  }
+  /* Remainder: between 0 and 3 outputs are still owed. */
+  width += 4;
+  do {
+    if (width < 1) break;
+    dest[0] = source[0];
+    if (width < 2) break;
+    dest[1] = (1*source[0] + 3*source[1]) >> 2;
+    if (width < 3) break;
+    dest[2] = (1*source[1] + 1*source[2]) >> 1;
+  } while (0);
+  xine_profiler_stop_count(prof_scale_line);
+}
+/*
+ * Horizontal 1:2 upscaler: each source pixel is emitted once, followed by
+ * the average of it and its right-hand neighbour.
+ *
+ * source - input samples
+ * dest   - output samples, `width` of them are written
+ * width  - number of output samples wanted
+ * step   - not used by this fixed-ratio scaler
+ */
+static void scale_line_1_2 (uint8_t *source, uint8_t *dest,
+                            int width, int step) {
+  int cur, next;
+  xine_profiler_start_count(prof_scale_line);
+  cur = *source;
+  /* Four outputs (two source pixels) per pass. */
+  for (width -= 4; width >= 0; width -= 4) {
+    dest[0] = cur;
+    next = source[1];
+    dest[1] = (cur + next) >> 1;
+    dest[2] = next;
+    cur = source[2];
+    dest[3] = (next + cur) >> 1;
+    source += 2;
+    dest += 4;
+  }
+  /* Remainder: between 0 and 3 outputs are still owed. */
+  width += 4;
+  do {
+    if (width < 1) break;
+    dest[0] = source[0];
+    if (width < 2) break;
+    dest[1] = (source[0] + source[1]) >> 1;
+    if (width < 3) break;
+    dest[2] = source[1];
+  } while (0);
+  xine_profiler_stop_count(prof_scale_line);
+}
+                        
+/*
+ * "Scaler" for the 1:1 case: the line is copied unchanged, `width` bytes
+ * from source to dest. For NTSC mpeg2 dvd input in 4:3 output format
+ * (720x480 -> 720x540), where only the vertical size changes.
+ */
+static void scale_line_1_1 (uint8_t *source, uint8_t *dest,
+                            int width, int step) {
+  (void) step;  /* no horizontal resampling, so the step is irrelevant */
+  xine_profiler_start_count(prof_scale_line);
+  xine_fast_memcpy(dest, source, width);
+  xine_profiler_stop_count(prof_scale_line);
+}
+                        
+/*
+ * Select the horizontal line scaler for a given step.
+ *
+ * step - source/destination ratio in 1/32768 units; it is compared against
+ *        src_step*32768/dest_step (integer division) of every table entry.
+ *
+ * Returns the matching fixed-ratio scaler, or scale_line_gen when no entry
+ * matches. The choice is reported on stdout.
+ */
+static scale_line_func_t find_scale_line_func(int step) {
+  static const struct {
+    int                src_step;
+    int                dest_step;
+    scale_line_func_t  func;   /* type and name were fused into one token */
+    const char        *desc;
+  } scale_line[] = {
+    { 15, 16, scale_line_15_16, "dvd 4:3(pal)" },
+    { 45, 64, scale_line_45_64, "dvd 16:9(pal), fullscreen(1024x768)" },
+    {  9, 16, scale_line_9_16,  "dvd fullscreen(1280x1024)" },
+    { 45, 53, scale_line_45_53, "dvd 16:9(ntsc)" },
+    { 11, 12, scale_line_11_12, "vcd 4:3(pal)" },
+    { 11, 24, scale_line_11_24, "vcd 4:3(pal) 2*zoom" },
+    {  5,  8, scale_line_5_8,   "svcd 4:3(pal)" },
+    {  3,  4, scale_line_3_4,   "svcd 4:3(ntsc)" },
+    {  1,  2, scale_line_1_2,   "2*zoom" },
+    {  1,  1, scale_line_1_1,   "non-scaled" },
+  };
+  /* unsigned index: sizeof yields an unsigned count */
+  unsigned int i;
+  for (i = 0; i < sizeof(scale_line)/sizeof(scale_line[0]); i++) {
+    if (step == scale_line[i].src_step*32768/scale_line[i].dest_step) {
+      printf("yuv2rgb: using %s optimized scale_line\n", scale_line[i].desc);
+      return scale_line[i].func;
+    }
+  }
+  printf("yuv2rgb: using generic scale_line with interpolation\n");
+  return scale_line_gen;
+}
+/*
+ * Generic linear-interpolating line scaler for samples that sit two bytes
+ * apart in the source (every second byte is read).
+ *
+ * source - first sample; later samples are read at a stride of 2 bytes
+ * dest   - output, written contiguously, `width` bytes
+ * width  - number of output samples
+ * step   - source advance per output sample in 1/32768 units
+ */
+static void scale_line_2 (uint8_t *source, uint8_t *dest,
+                          int width, int step) {
+  enum { ONE = 32768, STRIDE = 2 };
+  int left  = source[0];
+  int right = source[STRIDE];
+  int frac  = 0;
+
+  source += 2 * STRIDE;
+  for (; width != 0; --width, ++dest) {
+    /* blend the two neighbouring samples by the fractional position */
+    *dest = (left * (ONE - frac) + right * frac) / ONE;
+    /* move on; pull in a new right-hand sample for every whole step */
+    for (frac += step; frac > ONE; frac -= ONE) {
+      left  = right;
+      right = *source;
+      source += STRIDE;
+    }
+  }
+}
+/*
+ * Generic linear-interpolating line scaler for samples that sit four bytes
+ * apart in the source (every fourth byte is read).
+ *
+ * source - first sample; later samples are read at a stride of 4 bytes
+ * dest   - output, written contiguously, `width` bytes
+ * width  - number of output samples
+ * step   - source advance per output sample in 1/32768 units
+ */
+static void scale_line_4 (uint8_t *source, uint8_t *dest,
+                          int width, int step) {
+  enum { ONE = 32768, STRIDE = 4 };
+  int left  = source[0];
+  int right = source[STRIDE];
+  int frac  = 0;
+
+  source += 2 * STRIDE;
+  for (; width != 0; --width, ++dest) {
+    /* blend the two neighbouring samples by the fractional position */
+    *dest = (left * (ONE - frac) + right * frac) / ONE;
+    /* move on; pull in a new right-hand sample for every whole step */
+    for (frac += step; frac > ONE; frac -= ONE) {
+      left  = right;
+      right = *source;
+      source += STRIDE;
+    }
+  }
+}
+/*
+ * Helper macros for the C colour converters. They are not self-contained:
+ * each one expands to plain statements that use variables of the calling
+ * function (this, pu, pv, py_1, py_2, dst_1, dst_2, U, V, Y, r, g, b).
+ */
+/*
+ * RGB(i): read chroma sample i (U = pu[i], V = pv[i]) and point r, g and b
+ * at the lookup tables selected by it. g is table_gU[U] moved by the byte
+ * offset stored in table_gV[V].
+ */
+        #define RGB(i)                                          \
+                U = pu[i];                                      \
+                V = pv[i];                                      \
+                r = this->table_rV[V];                          \
+                g = (void *) (((uint8_t *)this->table_gU[U]) + this->table_gV[V]);\
+        b = this->table_bU[U];
+/*
+ * DST1(i) / DST2(i): convert luma samples 2*i and 2*i+1 of row py_1 / py_2
+ * and store r[Y] + g[Y] + b[Y] as one element of dst_1 / dst_2 each.
+ */
+        #define DST1(i)                                 \
+        Y = py_1[2*i];                          \
+                dst_1[2*i] = r[Y] + g[Y] + b[Y];\
+                Y = py_1[2*i+1];                \
+        dst_1[2*i+1] = r[Y] + g[Y] + b[Y];
+        #define DST2(i)                                 \
+                Y = py_2[2*i];                  \
+                dst_2[2*i] = r[Y] + g[Y] + b[Y];\
+                Y = py_2[2*i+1];                \
+        dst_2[2*i+1] = r[Y] + g[Y] + b[Y];
+/*
+ * DST1RGB(i) / DST2RGB(i): same two luma samples, but each pixel is stored
+ * as three consecutive elements in r, g, b order (6 elements per macro).
+ */
+        #define DST1RGB(i)                                              \
+                Y = py_1[2*i];                                          \
+                dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y];\
+                Y = py_1[2*i+1];                                        \
+        dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y];
+        #define DST2RGB(i)                                              \
+                Y = py_2[2*i];                                          \
+                dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y];\
+                Y = py_2[2*i+1];                                        \
+        dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y];
+/*
+ * DST1BGR(i) / DST2BGR(i): as the RGB variants, with the three elements
+ * stored in b, g, r order.
+ */
+        #define DST1BGR(i)                                              \
+                Y = py_1[2*i];                                          \
+                dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y];\
+                Y = py_1[2*i+1];                                        \
+        dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y];
+        #define DST2BGR(i)                                              \
+                Y = py_2[2*i];                                          \
+                dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y];\
+                Y = py_2[2*i+1];                                        \
+        dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y];
+/*
+ * DST1CMAP(i) / DST2CMAP(i): the sum r[Y] + g[Y] + b[Y] is used as an index
+ * into this->cmap, and the looked-up value is what gets stored.
+ */
+        #define DST1CMAP(i)                                             \
+                Y = py_1[2*i];                                          \
+                dst_1[2*i] = this->cmap[r[Y] + g[Y] + b[Y]];    \
+                Y = py_1[2*i+1];                                        \
+        dst_1[2*i+1] = this->cmap[r[Y] + g[Y] + b[Y]];
+        #define DST2CMAP(i)                                             \
+                Y = py_2[2*i];                                          \
+                dst_2[2*i] = this->cmap[r[Y] + g[Y] + b[Y]];    \
+                Y = py_2[2*i+1];                                        \
+        dst_2[2*i+1] = this->cmap[r[Y] + g[Y] + b[Y]];
+/*
+ * Convert separate Y, U and V planes into 32-bit pixels.
+ *
+ * this - converter state (lookup tables, strides, sizes, scale buffers)
+ * _dst - output image; rows are this->rgb_stride bytes apart
+ * _py  - luma plane, rows this->y_stride bytes apart
+ * _pu, _pv - chroma planes, rows this->uv_stride bytes apart
+ *
+ * Each pixel is r[Y] + g[Y] + b[Y], with r/g/b chosen per chroma sample by
+ * the RGB() macro. One chroma sample serves two horizontally adjacent luma
+ * samples; chroma rows advance once per two luma rows.
+ *
+ * NOTE(review): every do/while below runs at least once, so widths below 8
+ * and heights below 2 look unsupported - confirm the callers guarantee this.
+ */
+static void yuv2rgb_c_32 (yuv2rgb_t *this, uint8_t * _dst,
+                          uint8_t * _py, uint8_t * _pu, uint8_t * _pv)
+{
+  int U, V, Y;
+  uint8_t  * py_1, * py_2, * pu, * pv;
+  uint32_t * r, * g, * b;
+  uint32_t * dst_1, * dst_2;
+  int width, height, dst_height;
+  int dy;
+  if (this->do_scale) {
+    /* Scaled path: stretch one line of each plane into the scratch
+     * buffers, convert that line, then step vertically using dy. */
+    scale_line_func_t scale_line = this->scale_line;
+    scale_line (_pu, this->u_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_pv, this->v_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_py, this->y_buffer, 
+                this->dest_width, this->step_dx);
+    dy = 0;
+    dst_height = this->dest_height;
+    /* height counts the source luma rows consumed so far */
+    for (height = 0;; ) {
+      dst_1 = (uint32_t*)_dst;
+      py_1  = this->y_buffer;
+      pu    = this->u_buffer;
+      pv    = this->v_buffer;
+      /* 8 output pixels (4 chroma samples) per iteration */
+      width = this->dest_width >> 3;
+      do {
+          RGB(0);
+          DST1(0);
+          RGB(1);
+          DST1(1);
+      
+          RGB(2);
+          DST1(2);
+          RGB(3);
+          DST1(3);
+          pu += 4;
+          pv += 4;
+          py_1 += 8;
+          dst_1 += 8;
+      } while (--width);
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+      /* while less than one source row (32768) has accumulated, the next
+       * output row is a copy of the one just written */
+      while (--dst_height > 0 && dy < 32768) {
+        xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width*4); 
+        dy += this->step_dy;
+        _dst += this->rgb_stride;
+      }
+      if (dst_height <= 0)
+        break;
+      /* consume one source luma row per 32768 accumulated in dy */
+      do {
+          dy -= 32768;
+          _py += this->y_stride;
+          scale_line (_py, this->y_buffer, 
+                      this->dest_width, this->step_dx);
+          /* chroma rows advance on every second luma row */
+          if (height & 1) {
+              _pu += this->uv_stride;
+              _pv += this->uv_stride;
+          
+              scale_line (_pu, this->u_buffer,
+                          this->dest_width >> 1, this->step_dx);
+              scale_line (_pv, this->v_buffer,
+                          this->dest_width >> 1, this->step_dx);
+          
+          }
+          height++;
+      } while( dy>=32768);
+    }
+  } else {
+    /* Unscaled path: two luma rows share one chroma row, so two output
+     * rows (dst_1, dst_2) are produced per iteration. */
+    height = this->source_height >> 1;
+    do {
+      dst_1 = (uint32_t*)_dst;
+      dst_2 = (void*)( (uint8_t *)_dst + this->rgb_stride );
+      py_1 = _py;
+      py_2 = _py + this->y_stride;
+      pu   = _pu;
+      pv   = _pv;
+      width = this->source_width >> 3;
+      do {
+        RGB(0);
+        DST1(0);
+        DST2(0);
+        RGB(1);
+        DST2(1);
+        DST1(1);
+        RGB(2);
+        DST1(2);
+        DST2(2);
+        RGB(3);
+        DST2(3);
+        DST1(3);
+      
+        pu += 4;
+        pv += 4;
+        py_1 += 8;
+        py_2 += 8;
+        dst_1 += 8;
+        dst_2 += 8;
+      } while (--width);
+      _dst += 2 * this->rgb_stride; 
+      _py += 2 * this->y_stride;
+      _pu += this->uv_stride;
+      _pv += this->uv_stride;
+    } while (--height);
+  }
+}
+/*
+ * Convert separate Y, U and V planes into 24-bit pixels stored as three
+ * bytes in r, g, b order. The structure closely follows yuv2rgb_c_32; the
+ * differences are the byte-wide tables and the DST*RGB store macros.
+ *
+ * this - converter state (lookup tables, strides, sizes, scale buffers)
+ * _dst - output image; rows are this->rgb_stride bytes apart
+ * _py  - luma plane, rows this->y_stride bytes apart
+ * _pu, _pv - chroma planes, rows this->uv_stride bytes apart
+ *
+ * NOTE(review): every do/while below runs at least once, so widths below 8
+ * and heights below 2 look unsupported - confirm the callers guarantee this.
+ */
+static void yuv2rgb_c_24_rgb (yuv2rgb_t *this, uint8_t * _dst,
+                              uint8_t * _py, uint8_t * _pu, uint8_t * _pv)
+{
+  int U, V, Y;
+  uint8_t * py_1, * py_2, * pu, * pv;
+  uint8_t * r, * g, * b;
+  uint8_t * dst_1, * dst_2;
+  int width, height, dst_height;
+  int dy;
+  if (this->do_scale) {
+    /* Scaled path: stretch one line of each plane into the scratch
+     * buffers, convert that line, then step vertically using dy. */
+    scale_line_func_t scale_line = this->scale_line;
+    scale_line (_pu, this->u_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_pv, this->v_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_py, this->y_buffer, 
+                this->dest_width, this->step_dx);
+    dy = 0;
+    dst_height = this->dest_height;
+    /* height counts the source luma rows consumed so far */
+    for (height = 0;; ) {
+      dst_1 = _dst;
+      py_1  = this->y_buffer;
+      pu    = this->u_buffer;
+      pv    = this->v_buffer;
+      /* 8 output pixels = 24 output bytes per iteration */
+      width = this->dest_width >> 3;
+      do {
+          RGB(0);
+          DST1RGB(0);
+          RGB(1);
+          DST1RGB(1);
+      
+          RGB(2);
+          DST1RGB(2);
+          RGB(3);
+          DST1RGB(3);
+          pu += 4;
+          pv += 4;
+          py_1 += 8;
+          dst_1 += 24;
+      } while (--width);
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+      /* while less than one source row (32768) has accumulated, the next
+       * output row is a copy of the one just written */
+      while (--dst_height > 0 && dy < 32768) {
+        xine_fast_memcpy (_dst, _dst-this->rgb_stride, this->dest_width*3); 
+        dy += this->step_dy;
+        _dst += this->rgb_stride;
+      }
+      if (dst_height <= 0)
+        break;
+      /* consume one source luma row per 32768 accumulated in dy */
+      do {
+          dy -= 32768;
+          _py += this->y_stride;
+          scale_line (_py, this->y_buffer, 
+                      this->dest_width, this->step_dx);
+          /* chroma rows advance on every second luma row */
+          if (height & 1) {
+              _pu += this->uv_stride;
+              _pv += this->uv_stride;
+          
+              scale_line (_pu, this->u_buffer,
+                          this->dest_width >> 1, this->step_dx);
+              scale_line (_pv, this->v_buffer,
+                          this->dest_width >> 1, this->step_dx);
+          
+          }
+          height++;
+      } while (dy>=32768);
+    }
+  } else {
+    /* Unscaled path: two luma rows share one chroma row, so two output
+     * rows (dst_1, dst_2) are produced per iteration. */
+    height = this->source_height >> 1;
+    do {
+      dst_1 = _dst;
+      dst_2 = (void*)( (uint8_t *)_dst + this->rgb_stride );
+      py_1  = _py;
+      py_2  = _py + this->y_stride;
+      pu    = _pu;
+      pv    = _pv;
+      width = this->source_width >> 3;
+      do {
+        RGB(0);
+        DST1RGB(0);
+        DST2RGB(0);
+        RGB(1);
+        DST2RGB(1);
+        DST1RGB(1);
+        RGB(2);
+        DST1RGB(2);
+        DST2RGB(2);
+        RGB(3);
+        DST2RGB(3);
+        DST1RGB(3);
+        pu += 4;
+        pv += 4;
+        py_1 += 8;
+        py_2 += 8;
+        dst_1 += 24;
+        dst_2 += 24;
+      } while (--width);
+      _dst += 2 * this->rgb_stride; 
+      _py += 2 * this->y_stride;
+      _pu += this->uv_stride;
+      _pv += this->uv_stride;
+      
+    } while (--height);
+  }
+}
+/*
+ * Convert separate Y, U and V planes into 24-bit pixels stored as three
+ * bytes in b, g, r order. Only trivially different from yuv2rgb_c_24_rgb:
+ * the DST*BGR store macros replace the DST*RGB ones.
+ *
+ * this - converter state (lookup tables, strides, sizes, scale buffers)
+ * _dst - output image; rows are this->rgb_stride bytes apart
+ * _py  - luma plane, rows this->y_stride bytes apart
+ * _pu, _pv - chroma planes, rows this->uv_stride bytes apart
+ *
+ * NOTE(review): every do/while below runs at least once, so widths below 8
+ * and heights below 2 look unsupported - confirm the callers guarantee this.
+ */
+static void yuv2rgb_c_24_bgr (yuv2rgb_t *this, uint8_t * _dst,
+                              uint8_t * _py, uint8_t * _pu, uint8_t * _pv)
+{
+  int U, V, Y;
+  uint8_t * py_1, * py_2, * pu, * pv;
+  uint8_t * r, * g, * b;
+  uint8_t * dst_1, * dst_2;
+  int width, height, dst_height;
+  int dy;
+  if (this->do_scale) {
+    /* Scaled path: stretch one line of each plane into the scratch
+     * buffers, convert that line, then step vertically using dy. */
+    scale_line_func_t scale_line = this->scale_line;
+    scale_line (_pu, this->u_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_pv, this->v_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_py, this->y_buffer, 
+                this->dest_width, this->step_dx);
+    dy = 0;
+    dst_height = this->dest_height;
+    /* height counts the source luma rows consumed so far */
+    for (height = 0;; ) {
+      dst_1 = _dst;
+      py_1  = this->y_buffer;
+      pu    = this->u_buffer;
+      pv    = this->v_buffer;
+      /* 8 output pixels = 24 output bytes per iteration */
+      width = this->dest_width >> 3;
+      do {
+          RGB(0);
+          DST1BGR(0);
+          RGB(1);
+          DST1BGR(1);
+      
+          RGB(2);
+          DST1BGR(2);
+          RGB(3);
+          DST1BGR(3);
+          pu += 4;
+          pv += 4;
+          py_1 += 8;
+          dst_1 += 24;
+      } while (--width);
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+      /* while less than one source row (32768) has accumulated, the next
+       * output row is a copy of the one just written */
+      while (--dst_height > 0 && dy < 32768) {
+        xine_fast_memcpy (_dst, _dst-this->rgb_stride, this->dest_width*3);
+        dy += this->step_dy;
+        _dst += this->rgb_stride;
+      }
+      if (dst_height <= 0)
+        break;
+      /* consume one source luma row per 32768 accumulated in dy */
+      do {
+          dy -= 32768;
+          _py += this->y_stride;
+          scale_line (_py, this->y_buffer, 
+                      this->dest_width, this->step_dx);
+          /* chroma rows advance on every second luma row */
+          if (height & 1) {
+              _pu += this->uv_stride;
+              _pv += this->uv_stride;
+          
+              scale_line (_pu, this->u_buffer,
+                          this->dest_width >> 1, this->step_dx);
+              scale_line (_pv, this->v_buffer,
+                          this->dest_width >> 1, this->step_dx);
+          
+          }
+          height++;
+      } while( dy>=32768 );
+    }
+  } else {
+    /* Unscaled path: two luma rows share one chroma row, so two output
+     * rows (dst_1, dst_2) are produced per iteration. */
+    height = this->source_height >> 1;
+    do {
+      dst_1 = _dst;
+      dst_2 = (void*)( (uint8_t *)_dst + this->rgb_stride );
+      py_1 = _py;
+      py_2 = _py + this->y_stride;
+      pu   = _pu;
+      pv   = _pv;
+      width = this->source_width >> 3;
+      do {
+        RGB(0);
+        DST1BGR(0);
+        DST2BGR(0);
+        RGB(1);
+        DST2BGR(1);
+        DST1BGR(1);
+        RGB(2);
+        DST1BGR(2);
+        DST2BGR(2);
+        RGB(3);
+        DST2BGR(3);
+        DST1BGR(3);
+        pu += 4;
+        pv += 4;
+        py_1 += 8;
+        py_2 += 8;
+        dst_1 += 24;
+        dst_2 += 24;
+      } while (--width);
+      _dst += 2 * this->rgb_stride; 
+      _py += 2 * this->y_stride;
+      _pu += this->uv_stride;
+      _pv += this->uv_stride;
+    } while (--height);
+  }
+}
+/*
+ * Convert separate Y, U and V planes into 16-bit pixels. Same structure as
+ * yuv2rgb_c_32; what differs is the 16-bit type of r, g, b, dst_1, dst_2
+ * and the row size used when duplicating rows (dest_width*2 bytes).
+ *
+ * this - converter state (lookup tables, strides, sizes, scale buffers)
+ * _dst - output image; rows are this->rgb_stride bytes apart
+ * _py  - luma plane, rows this->y_stride bytes apart
+ * _pu, _pv - chroma planes, rows this->uv_stride bytes apart
+ *
+ * NOTE(review): every do/while below runs at least once, so widths below 8
+ * and heights below 2 look unsupported - confirm the callers guarantee this.
+ */
+static void yuv2rgb_c_16 (yuv2rgb_t *this, uint8_t * _dst,
+                          uint8_t * _py, uint8_t * _pu, uint8_t * _pv)
+{
+  int U, V, Y;
+  uint8_t * py_1, * py_2, * pu, * pv;
+  uint16_t * r, * g, * b;
+  uint16_t * dst_1, * dst_2;
+  int width, height, dst_height;
+  int dy;
+  if (this->do_scale) {
+    /* Scaled path: stretch one line of each plane into the scratch
+     * buffers, convert that line, then step vertically using dy. */
+    scale_line_func_t scale_line = this->scale_line;
+    scale_line (_pu, this->u_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_pv, this->v_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_py, this->y_buffer, 
+                this->dest_width, this->step_dx);
+    dy = 0;
+    dst_height = this->dest_height;
+    /* height counts the source luma rows consumed so far */
+    for (height = 0;; ) {
+      dst_1 = (uint16_t*)_dst;
+      py_1  = this->y_buffer;
+      pu    = this->u_buffer;
+      pv    = this->v_buffer;
+      /* 8 output pixels (4 chroma samples) per iteration */
+      width = this->dest_width >> 3;
+      do {
+          RGB(0);
+          DST1(0);
+          RGB(1);
+          DST1(1);
+      
+          RGB(2);
+          DST1(2);
+          RGB(3);
+          DST1(3);
+          pu += 4;
+          pv += 4;
+          py_1 += 8;
+          dst_1 += 8;
+      } while (--width);
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+      /* while less than one source row (32768) has accumulated, the next
+       * output row is a copy of the one just written */
+      while (--dst_height > 0 && dy < 32768) {
+        xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width*2); 
+        dy += this->step_dy;
+        _dst += this->rgb_stride;
+      }
+      if (dst_height <= 0)
+        break;
+      /* consume one source luma row per 32768 accumulated in dy */
+      do {
+          dy -= 32768;
+          _py += this->y_stride;
+          scale_line (_py, this->y_buffer, 
+                      this->dest_width, this->step_dx);
+          /* chroma rows advance on every second luma row */
+          if (height & 1) {
+              _pu += this->uv_stride;
+              _pv += this->uv_stride;
+          
+              scale_line (_pu, this->u_buffer,
+                          this->dest_width >> 1, this->step_dx);
+              scale_line (_pv, this->v_buffer,
+                          this->dest_width >> 1, this->step_dx);
+          
+          }
+          height++;
+      } while( dy>=32768);
+    }
+  } else {
+    /* Unscaled path: two luma rows share one chroma row, so two output
+     * rows (dst_1, dst_2) are produced per iteration. */
+    height = this->source_height >> 1;
+    do {
+      dst_1 = (uint16_t*)_dst;
+      dst_2 = (void*)( (uint8_t *)_dst + this->rgb_stride );
+      py_1 = _py;
+      py_2 = _py + this->y_stride;
+      pu   = _pu;
+      pv   = _pv;
+      width = this->source_width >> 3;
+      do {
+        RGB(0);
+        DST1(0);
+        DST2(0);
+        RGB(1);
+        DST2(1);
+        DST1(1);
+        RGB(2);
+        DST1(2);
+        DST2(2);
+        RGB(3);
+        DST2(3);
+        DST1(3);
+        pu += 4;
+        pv += 4;
+        py_1 += 8;
+        py_2 += 8;
+        dst_1 += 8;
+        dst_2 += 8;
+      } while (--width);
+      _dst += 2 * this->rgb_stride; 
+      _py += 2 * this->y_stride;
+      _pu += this->uv_stride;
+      _pv += this->uv_stride;
+    } while (--height);
+  }
+}
+/*
+ * Convert separate Y, U and V planes into 8-bit pixels. Same structure as
+ * yuv2rgb_c_32; what differs is the 8-bit type of r, g, b, dst_1, dst_2
+ * and the row size used when duplicating rows (dest_width bytes).
+ *
+ * this - converter state (lookup tables, strides, sizes, scale buffers)
+ * _dst - output image; rows are this->rgb_stride bytes apart
+ * _py  - luma plane, rows this->y_stride bytes apart
+ * _pu, _pv - chroma planes, rows this->uv_stride bytes apart
+ *
+ * NOTE(review): every do/while below runs at least once, so widths below 8
+ * and heights below 2 look unsupported - confirm the callers guarantee this.
+ */
+static void yuv2rgb_c_8 (yuv2rgb_t *this, uint8_t * _dst,
+                          uint8_t * _py, uint8_t * _pu, uint8_t * _pv)
+{
+  int U, V, Y;
+  uint8_t  * py_1, * py_2, * pu, * pv;
+  uint8_t * r, * g, * b;
+  uint8_t * dst_1, * dst_2;
+  int width, height, dst_height;
+  int dy;
+  if (this->do_scale) {
+    /* Scaled path: stretch one line of each plane into the scratch
+     * buffers, convert that line, then step vertically using dy. */
+    scale_line_func_t scale_line = this->scale_line;
+    scale_line (_pu, this->u_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_pv, this->v_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_py, this->y_buffer, 
+                this->dest_width, this->step_dx);
+    dy = 0;
+    dst_height = this->dest_height;
+    /* height counts the source luma rows consumed so far */
+    for (height = 0;; ) {
+      dst_1 = (uint8_t*)_dst;
+      py_1  = this->y_buffer;
+      pu    = this->u_buffer;
+      pv    = this->v_buffer;
+      /* 8 output pixels (4 chroma samples) per iteration */
+      width = this->dest_width >> 3;
+      do {
+          RGB(0);
+          DST1(0);
+          RGB(1);
+          DST1(1);
+      
+          RGB(2);
+          DST1(2);
+          RGB(3);
+          DST1(3);
+          pu += 4;
+          pv += 4;
+          py_1 += 8;
+          dst_1 += 8;
+      } while (--width);
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+      /* while less than one source row (32768) has accumulated, the next
+       * output row is a copy of the one just written */
+      while (--dst_height > 0 && dy < 32768) {
+        xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width); 
+        dy += this->step_dy;
+        _dst += this->rgb_stride;
+      }
+      if (dst_height <= 0)
+        break;
+      /* consume one source luma row per 32768 accumulated in dy */
+      do {
+          dy -= 32768;
+          _py += this->y_stride;
+          scale_line (_py, this->y_buffer, 
+                      this->dest_width, this->step_dx);
+          /* chroma rows advance on every second luma row */
+          if (height & 1) {
+              _pu += this->uv_stride;
+              _pv += this->uv_stride;
+          
+              scale_line (_pu, this->u_buffer,
+                          this->dest_width >> 1, this->step_dx);
+              scale_line (_pv, this->v_buffer,
+                          this->dest_width >> 1, this->step_dx);
+          
+          }
+          height++;
+      } while( dy>=32768 );
+    }
+  } else {
+    /* Unscaled path: two luma rows share one chroma row, so two output
+     * rows (dst_1, dst_2) are produced per iteration. */
+    height = this->source_height >> 1;
+    do {
+      dst_1 = (uint8_t*)_dst;
+      dst_2 = (void*)( (uint8_t *)_dst + this->rgb_stride );
+      py_1 = _py;
+      py_2 = _py + this->y_stride;
+      pu   = _pu;
+      pv   = _pv;
+      width = this->source_width >> 3;
+      do {
+        RGB(0);
+        DST1(0);
+        DST2(0);
+        RGB(1);
+        DST2(1);
+        DST1(1);
+        RGB(2);
+        DST1(2);
+        DST2(2);
+        RGB(3);
+        DST2(3);
+        DST1(3);
+      
+        pu += 4;
+        pv += 4;
+        py_1 += 8;
+        py_2 += 8;
+        dst_1 += 8;
+        dst_2 += 8;
+      } while (--width);
+      _dst += 2 * this->rgb_stride; 
+      _py += 2 * this->y_stride;
+      _pu += this->uv_stride;
+      _pv += this->uv_stride;
+    } while (--height);
+  }
+}
+/*
+ * 256 grayscale mode: the output is the luma plane itself, optionally
+ * scaled. The chroma pointers _pu and _pv are accepted but never read.
+ *
+ * this - converter state (strides, sizes, scaler, step values)
+ * _dst - output image; rows are this->rgb_stride bytes apart
+ * _py  - luma plane, rows this->y_stride bytes apart
+ */
+static void yuv2rgb_c_gray (yuv2rgb_t *this, uint8_t * _dst,
+                            uint8_t * _py, uint8_t * _pu, uint8_t * _pv)
+{
+  if (!this->do_scale) {
+    /* Unscaled: plain row-by-row copy of the luma plane. */
+    int rows_left = this->source_height;
+    while (rows_left-- > 0) {
+      xine_fast_memcpy(_dst, _py, this->dest_width);
+      _dst += this->rgb_stride;
+      _py += this->y_stride;
+    }
+  } else {
+    scale_line_func_t scale_line = this->scale_line;
+    int rows_left = this->dest_height;
+    int dy = 0;
+    for (;;) {
+      /* scale the current source row straight into the output */
+      scale_line (_py, _dst, this->dest_width, this->step_dx);
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+      /* repeat that row while less than one source row (32768) has
+       * accumulated in dy */
+      for (;;) {
+        --rows_left;
+        if (rows_left <= 0 || dy >= 32768)
+          break;
+        xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width);
+        dy += this->step_dy;
+        _dst += this->rgb_stride;
+      }
+      if (rows_left <= 0)
+        return;
+      /* skip as many source rows as whole units accumulated, keep the
+       * fractional part */
+      _py += this->y_stride*(dy>>15);
+      dy &= 32767;
+    }
+  }
+}
+/*
+ * 256 color mode: convert separate Y, U and V planes into 8-bit palette
+ * indices. The 16-bit value r[Y] + g[Y] + b[Y] is looked up in this->cmap
+ * (DST*CMAP macros) and the result is stored as one output byte.
+ *
+ * this - converter state (lookup tables, cmap, strides, sizes, buffers)
+ * _dst - output image; rows are this->rgb_stride bytes apart
+ * _py  - luma plane, rows this->y_stride bytes apart
+ * _pu, _pv - chroma planes, rows this->uv_stride bytes apart
+ *
+ * NOTE(review): every do/while below runs at least once, so widths below 8
+ * and heights below 2 look unsupported - confirm the callers guarantee this.
+ */
+static void yuv2rgb_c_palette (yuv2rgb_t *this, uint8_t * _dst,
+                               uint8_t * _py, uint8_t * _pu, uint8_t * _pv)
+{
+  int U, V, Y;
+  uint8_t * py_1, * py_2, * pu, * pv;
+  uint16_t * r, * g, * b;
+  uint8_t * dst_1, * dst_2;
+  int width, height, dst_height;
+  int dy;
+  if (this->do_scale) {
+    /* Scaled path: stretch one line of each plane into the scratch
+     * buffers, convert that line, then step vertically using dy. */
+    scale_line_func_t scale_line = this->scale_line;
+    scale_line (_pu, this->u_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_pv, this->v_buffer,
+                this->dest_width >> 1, this->step_dx);
+    scale_line (_py, this->y_buffer,
+                this->dest_width, this->step_dx);
+    dy = 0;
+    dst_height = this->dest_height;
+    /* height counts the source luma rows consumed so far */
+    for (height = 0;; ) {
+      dst_1 = _dst;
+      py_1  = this->y_buffer;
+      pu    = this->u_buffer;
+      pv    = this->v_buffer;
+      /* 8 output pixels (4 chroma samples) per iteration */
+      width = this->dest_width >> 3;
+      do {
+          RGB(0);
+          DST1CMAP(0);
+          RGB(1);
+          DST1CMAP(1);
+      
+          RGB(2);
+          DST1CMAP(2);
+          RGB(3);
+          DST1CMAP(3);
+          pu += 4;
+          pv += 4;
+          py_1 += 8;
+          dst_1 += 8;
+      } while (--width);
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+      /* while less than one source row (32768) has accumulated, the next
+       * output row is a copy of the one just written */
+      while (--dst_height > 0 && dy < 32768) {
+        xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width); 
+        dy += this->step_dy;
+        _dst += this->rgb_stride;
+      }
+      if (dst_height <= 0)
+        break;
+      /* consume one source luma row per 32768 accumulated in dy */
+      do {
+          dy -= 32768;
+          _py += this->y_stride;
+          scale_line (_py, this->y_buffer,
+                      this->dest_width, this->step_dx);
+          /* chroma rows advance on every second luma row */
+          if (height & 1) {
+              _pu += this->uv_stride;
+              _pv += this->uv_stride;
+          
+              scale_line (_pu, this->u_buffer,
+                          this->dest_width >> 1, this->step_dx);
+              scale_line (_pv, this->v_buffer,
+                          this->dest_width >> 1, this->step_dx);
+          
+          }
+          height++;
+      } while( dy>=32768 );
+    }
+  } else {
+    /* Unscaled path: two luma rows share one chroma row, so two output
+     * rows (dst_1, dst_2) are produced per iteration. */
+    height = this->source_height >> 1;
+    do {
+      dst_1 = _dst;
+      dst_2 = _dst + this->rgb_stride;
+      py_1 = _py;
+      py_2 = _py + this->y_stride;
+      pu   = _pu;
+      pv   = _pv;
+      width = this->source_width >> 3;
+      do {
+        RGB(0);
+        DST1CMAP(0);
+        DST2CMAP(0);
+        RGB(1);
+        DST2CMAP(1);
+        DST1CMAP(1);
+        RGB(2);
+        DST1CMAP(2);
+        DST2CMAP(2);
+        RGB(3);
+        DST2CMAP(3);
+        DST1CMAP(3);
+        pu += 4;
+        pv += 4;
+        py_1 += 8;
+        py_2 += 8;
+        dst_1 += 8;
+        dst_2 += 8;
+      } while (--width);
+      _dst += 2 * this->rgb_stride; 
+      _py += 2 * this->y_stride;
+      _pu += this->uv_stride;
+      _pv += this->uv_stride;
+    } while (--height);
+  }
+}
+/*
+ * Integer division rounded to the nearest value instead of truncated:
+ * half the divisor is added to the magnitude of the dividend before
+ * dividing, and the sign of the dividend is restored afterwards.
+ */
+static int div_round (int dividend, int divisor)
+{
+  const int half      = divisor >> 1;
+  const int magnitude = (dividend > 0) ? dividend : -dividend;
+  const int quotient  = (magnitude + half) / divisor;
+
+  return (dividend > 0) ? quotient : -quotient;
+}
+/*
+ * Allocate a zero-filled lookup table of `count` entries of `entry_size`
+ * bytes each. Never returns NULL: on failure it reports the error and
+ * aborts, the same way an unsupported mode is handled below.
+ */
+static void *yuv2rgb_alloc_table (size_t count, size_t entry_size)
+{
+  void *table = calloc (count, entry_size);
+  if (!table) {
+    fprintf (stderr, "yuv2rgb: out of memory while allocating lookup tables\n");
+    abort();
+  }
+  return table;
+}
+/*
+ * Build the lookup tables used by the C converters for one output mode.
+ *
+ * this    - factory receiving table_rV, table_gU, table_gV, table_bU,
+ *           gamma and entry_size
+ * mode    - one of the MODE_* constants; an unknown mode aborts
+ * swapped - non-zero selects the alternative component shifts (and, for
+ *           the 15/16-bit modes, the split green layout)
+ *
+ * table_Y maps an index (biased by 384) to a luma value clamped to 0..255.
+ * Each mode then fills r, g and b tables with that value moved into the
+ * component's position. Finally, for each of the 256 chroma values, a
+ * pointer into those tables is stored, displaced by the chroma
+ * contribution (table_gV holds a byte offset rather than a pointer).
+ *
+ * MODE_8_GRAY needs no tables and returns without touching `this`.
+ */
+static void yuv2rgb_setup_tables (yuv2rgb_factory_t *this, int mode, int swapped) 
+{
+  int i;
+  uint8_t table_Y[1024];
+  uint32_t * table_32 = NULL;
+  uint16_t * table_16 = NULL;
+  uint8_t * table_8 = NULL;
+  int entry_size = 0;
+  void *table_r = NULL, *table_g = NULL, *table_b = NULL;
+  int shift_r = 0, shift_g = 0, shift_b = 0;
+  int crv = Inverse_Table_6_9[this->matrix_coefficients][0];
+  int cbu = Inverse_Table_6_9[this->matrix_coefficients][1];
+  int cgu = -Inverse_Table_6_9[this->matrix_coefficients][2];
+  int cgv = -Inverse_Table_6_9[this->matrix_coefficients][3];
+  /* clamped luma ramp; index 384 corresponds to an input of 0 */
+  for (i = 0; i < 1024; i++) {
+    int j;
+    j = (76309 * (i - 384 - 16) + 32768) >> 16;
+    j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
+    table_Y[i] = j;
+  }
+  switch (mode) {
+  case MODE_32_RGB:
+  case MODE_32_BGR:
+    table_32 = yuv2rgb_alloc_table (197 + 2*682 + 256 + 132, sizeof (uint32_t));
+    entry_size = sizeof (uint32_t);
+    table_r = table_32 + 197;
+    table_b = table_32 + 197 + 685;
+    table_g = table_32 + 197 + 2*682;
+    if (swapped) {
+      switch (mode) {
+      case MODE_32_RGB: shift_r =  8; shift_g = 16; shift_b = 24; break;
+      case MODE_32_BGR: shift_r = 24; shift_g = 16; shift_b =  8; break;
+      }
+    } else {
+      switch (mode) {
+      case MODE_32_RGB: shift_r = 16; shift_g =  8; shift_b =  0; break;
+      case MODE_32_BGR: shift_r =  0; shift_g =  8; shift_b = 16; break;
+      }
+    }
+    for (i = -197; i < 256+197; i++)
+      ((uint32_t *) table_r)[i] = table_Y[i+384] << shift_r;
+    for (i = -132; i < 256+132; i++)
+      ((uint32_t *) table_g)[i] = table_Y[i+384] << shift_g;
+    for (i = -232; i < 256+232; i++)
+      ((uint32_t *) table_b)[i] = table_Y[i+384] << shift_b;
+    break;
+  case MODE_24_RGB:
+  case MODE_24_BGR:
+    /* all three components share one byte table */
+    table_8 = yuv2rgb_alloc_table (256 + 2*232, sizeof (uint8_t));
+    entry_size = sizeof (uint8_t);
+    table_r = table_g = table_b = table_8 + 232;
+    for (i = -232; i < 256+232; i++)
+      ((uint8_t * )table_b)[i] = table_Y[i+384];
+    break;
+  case MODE_15_BGR:
+  case MODE_16_BGR:
+  case MODE_15_RGB:
+  case MODE_16_RGB:
+    table_16 = yuv2rgb_alloc_table (197 + 2*682 + 256 + 132, sizeof (uint16_t));
+    entry_size = sizeof (uint16_t);
+    table_r = table_16 + 197;
+    table_b = table_16 + 197 + 685;
+    table_g = table_16 + 197 + 2*682;
+    if (swapped) {
+      switch (mode) {
+      case MODE_15_BGR: shift_r =  8; shift_g =  5; shift_b = 2; break;
+      case MODE_16_BGR: shift_r =  8; shift_g =  5; shift_b = 3; break;
+      case MODE_15_RGB: shift_r =  2; shift_g =  5; shift_b = 8; break;
+      case MODE_16_RGB: shift_r =  3; shift_g =  5; shift_b = 8; break;
+      }
+    } else {
+      switch (mode) {
+      case MODE_15_BGR: shift_r =  0; shift_g =  5; shift_b = 10; break;
+      case MODE_16_BGR: shift_r =  0; shift_g =  5; shift_b = 11; break;
+      case MODE_15_RGB: shift_r = 10; shift_g =  5; shift_b =  0; break;
+      case MODE_16_RGB: shift_r = 11; shift_g =  5; shift_b =  0; break;
+      }
+    }
+    for (i = -197; i < 256+197; i++)
+      ((uint16_t *)table_r)[i] = (table_Y[i+384] >> 3) << shift_r;
+    for (i = -132; i < 256+132; i++) {
+      /* green keeps 6 bits in the 16-bit modes, 5 bits otherwise */
+      int j = table_Y[i+384] >> (((mode==MODE_16_RGB) || (mode==MODE_16_BGR)) ? 2 : 3);
+      if (swapped)
+        ((uint16_t *)table_g)[i] = (j&7) << 13 | (j>>3);
+      else
+        ((uint16_t *)table_g)[i] = j << 5;
+    }
+    for (i = -232; i < 256+232; i++)
+      ((uint16_t *)table_b)[i] = (table_Y[i+384] >> 3) << shift_b;
+    break;
+  case MODE_8_RGB:
+  case MODE_8_BGR:
+    table_8 = yuv2rgb_alloc_table (197 + 2*682 + 256 + 132, sizeof (uint8_t));
+    entry_size = sizeof (uint8_t);
+    table_r = table_8 + 197;
+    table_b = table_8 + 197 + 685;
+    table_g = table_8 + 197 + 2*682;
+    switch (mode) {
+    case MODE_8_RGB: shift_r =  5; shift_g =  2; shift_b =  0; break;
+    case MODE_8_BGR: shift_r =  0; shift_g =  3; shift_b =  6; break;
+    }
+    /* 3 bits of red, 3 bits of green, 2 bits of blue */
+    for (i = -197; i < 256+197; i++)
+      ((uint8_t *) table_r)[i] = (table_Y[i+384] >> 5) << shift_r;
+    for (i = -132; i < 256+132; i++)
+      ((uint8_t *) table_g)[i] = (table_Y[i+384] >> 5) << shift_g;
+    for (i = -232; i < 256+232; i++)
+      ((uint8_t *) table_b)[i] = (table_Y[i+384] >> 6) << shift_b;
+    break;
+  case MODE_8_GRAY:
+    return;
+  case MODE_PALETTE:
+    table_16 = yuv2rgb_alloc_table (197 + 2*682 + 256 + 132, sizeof (uint16_t));
+    entry_size = sizeof (uint16_t);
+    table_r = table_16 + 197;
+    table_b = table_16 + 197 + 685;
+    table_g = table_16 + 197 + 2*682;
+    shift_r = 10;
+    shift_g = 5;
+    shift_b = 0;
+    for (i = -197; i < 256+197; i++)
+      ((uint16_t *)table_r)[i] = (table_Y[i+384] >> 3) << 10;
+    for (i = -132; i < 256+132; i++)
+      ((uint16_t *)table_g)[i] = (table_Y[i+384] >> 3) << 5;
+    for (i = -232; i < 256+232; i++)
+      ((uint16_t *)table_b)[i] = (table_Y[i+384] >> 3) << 0;
+    break;
+  default:
+    fprintf (stderr, "mode %d not supported by yuv2rgb\n", mode);
+    abort();
+  }
+  
+  /* per-chroma entry points: table base displaced by the rounded chroma
+   * contribution, expressed in bytes (hence the entry_size factor) */
+  for (i = 0; i < 256; i++) {
+    this->table_rV[i] = (((uint8_t *) table_r) +
+                         entry_size * div_round (crv * (i-128), 76309));
+    this->table_gU[i] = (((uint8_t *) table_g) +
+                         entry_size * div_round (cgu * (i-128), 76309));
+    this->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
+    this->table_bU[i] = (((uint8_t *)table_b) +
+                         entry_size * div_round (cbu * (i-128), 76309));
+  }
+  this->gamma = 0;
+  this->entry_size = entry_size;
+}
+/*
+ * Convert one y/u/v sample into a 32 bit pixel value by adding up the
+ * entries of the three per-channel lookup tables.
+ */
+static uint32_t yuv2rgb_single_pixel_32 (yuv2rgb_t *this, uint8_t y, uint8_t u, uint8_t v)
+{
+  /* v selects the red table, u the blue table, green depends on both */
+  uint32_t *red   = this->table_rV[v];
+  uint32_t *blue  = this->table_bU[u];
+  uint32_t *green = (void *) ((uint8_t *) this->table_gU[u] + this->table_gV[v]);
+
+  return red[y] + green[y] + blue[y];
+}
+/*
+ * Convert one y/u/v sample into a 24 bit value with the red table entry
+ * in bits 0-7, green in bits 8-15 and blue in bits 16-23.
+ */
+static uint32_t yuv2rgb_single_pixel_24_rgb (yuv2rgb_t *this, uint8_t y, uint8_t u, uint8_t v)
+{
+  uint8_t *red   = this->table_rV[v];
+  uint8_t *blue  = this->table_bU[u];
+  uint8_t *green = (void *) ((uint8_t *) this->table_gU[u] + this->table_gV[v]);
+  uint32_t pixel;
+
+  pixel  = (uint32_t) red[y];
+  pixel += (uint32_t) green[y] << 8;
+  pixel += (uint32_t) blue[y] << 16;
+  return pixel;
+}
+/*
+ * Convert one y/u/v sample into a 24 bit value with the blue table entry
+ * in bits 0-7, green in bits 8-15 and red in bits 16-23.
+ */
+static uint32_t yuv2rgb_single_pixel_24_bgr (yuv2rgb_t *this, uint8_t y, uint8_t u, uint8_t v)
+{
+  uint8_t *red   = this->table_rV[v];
+  uint8_t *blue  = this->table_bU[u];
+  uint8_t *green = (void *) ((uint8_t *) this->table_gU[u] + this->table_gV[v]);
+  uint32_t pixel;
+
+  pixel  = (uint32_t) blue[y];
+  pixel += (uint32_t) green[y] << 8;
+  pixel += (uint32_t) red[y] << 16;
+  return pixel;
+}
+/*
+ * Convert one y/u/v sample into a 15/16 bit pixel value; the 16 bit wide
+ * table entries of the three channels are simply added.
+ */
+static uint32_t yuv2rgb_single_pixel_16 (yuv2rgb_t *this, uint8_t y, uint8_t u, uint8_t v)
+{
+  uint16_t *red   = this->table_rV[v];
+  uint16_t *blue  = this->table_bU[u];
+  uint16_t *green = (void *) ((uint8_t *) this->table_gU[u] + this->table_gV[v]);
+
+  return red[y] + green[y] + blue[y];
+}
+/*
+ * Convert one y/u/v sample into an 8 bit pixel value; the byte wide
+ * table entries of the three channels are simply added.
+ */
+static uint32_t yuv2rgb_single_pixel_8 (yuv2rgb_t *this, uint8_t y, uint8_t u, uint8_t v)
+{
+  uint8_t *red   = this->table_rV[v];
+  uint8_t *blue  = this->table_bU[u];
+  uint8_t *green = (void *) ((uint8_t *) this->table_gU[u] + this->table_gV[v]);
+
+  return red[y] + green[y] + blue[y];
+}
+/*
+ * Grayscale "conversion" of a single sample: the luma value is the result,
+ * the converter state and both chroma values are ignored.
+ */
+static uint32_t yuv2rgb_single_pixel_gray (yuv2rgb_t *this, uint8_t y, uint8_t u, uint8_t v)
+{
+  (void) this;
+  (void) u;
+  (void) v;
+
+  return (uint32_t) y;
+}
+/*
+ * Convert one y/u/v sample for palette output: the sum of the three table
+ * entries is used as an index into the converter's colormap.
+ */
+static uint32_t yuv2rgb_single_pixel_palette (yuv2rgb_t *this, uint8_t y, uint8_t u, uint8_t v)
+{
+  uint16_t *red   = this->table_rV[v];
+  uint16_t *blue  = this->table_bU[u];
+  uint16_t *green = (void *) ((uint8_t *) this->table_gU[u] + this->table_gV[v]);
+  int cmap_index  = red[y] + green[y] + blue[y];
+
+  return this->cmap[cmap_index];
+}
+/*
+ * Select the plain C yuv -> rgb routine that matches the factory's output
+ * mode and store it in this->yuv2rgb_fun.
+ *
+ * For the 24 bit modes the routine depends on both the mode and the
+ * "swapped" flag.  An unknown mode is fatal: a diagnostic is written to
+ * stderr and the process is aborted.
+ */
+static void yuv2rgb_c_init (yuv2rgb_factory_t *this)
+{
+  switch (this->mode) {
+  case MODE_32_RGB:
+  case MODE_32_BGR:
+    this->yuv2rgb_fun = yuv2rgb_c_32;
+    break;
+  case MODE_24_RGB:
+  case MODE_24_BGR:
+    /* swapping the byte order turns RGB output into BGR and vice versa */
+    this->yuv2rgb_fun =
+        (this->mode==MODE_24_RGB && !this->swapped) || (this->mode==MODE_24_BGR && this->swapped)
+            ? yuv2rgb_c_24_rgb
+            : yuv2rgb_c_24_bgr;
+    break;
+  case MODE_15_BGR:
+  case MODE_16_BGR:
+  case MODE_15_RGB:
+  case MODE_16_RGB:
+    this->yuv2rgb_fun = yuv2rgb_c_16;
+    break;
+  case MODE_8_RGB:
+  case MODE_8_BGR:
+    this->yuv2rgb_fun = yuv2rgb_c_8;
+    break;
+  case MODE_8_GRAY:
+    this->yuv2rgb_fun = yuv2rgb_c_gray;
+    break;
+  case MODE_PALETTE:
+    this->yuv2rgb_fun = yuv2rgb_c_palette;
+    break;
+  default:
+    /* use stderr: a message sitting in the stdout buffer may be lost on abort() */
+    fprintf (stderr, "yuv2rgb: mode %d not supported by yuv2rgb\n", this->mode);
+    abort();
+  }
+}
+/*
+ * Select the single pixel conversion routine that matches the factory's
+ * output mode and store it in this->yuv2rgb_single_pixel_fun.
+ *
+ * For the 24 bit modes the routine depends on both the mode and the
+ * "swapped" flag.  An unknown mode is fatal: a diagnostic is written to
+ * stderr and the process is aborted.
+ */
+static void yuv2rgb_single_pixel_init (yuv2rgb_factory_t *this) {
+  switch (this->mode) {
+  case MODE_32_RGB:
+  case MODE_32_BGR:
+    this->yuv2rgb_single_pixel_fun = yuv2rgb_single_pixel_32;
+    break;
+  case MODE_24_RGB:
+  case MODE_24_BGR:
+    /* swapping the byte order turns RGB output into BGR and vice versa */
+    this->yuv2rgb_single_pixel_fun =
+        (this->mode==MODE_24_RGB && !this->swapped) || (this->mode==MODE_24_BGR && this->swapped)
+            ? yuv2rgb_single_pixel_24_rgb
+            : yuv2rgb_single_pixel_24_bgr;
+    break;
+  case MODE_15_BGR:
+  case MODE_16_BGR:
+  case MODE_15_RGB:
+  case MODE_16_RGB:
+    this->yuv2rgb_single_pixel_fun = yuv2rgb_single_pixel_16;
+    break;
+  case MODE_8_RGB:
+  case MODE_8_BGR:
+    this->yuv2rgb_single_pixel_fun = yuv2rgb_single_pixel_8;
+    break;
+  case MODE_8_GRAY:
+    this->yuv2rgb_single_pixel_fun = yuv2rgb_single_pixel_gray;
+    break;
+  case MODE_PALETTE:
+    this->yuv2rgb_single_pixel_fun = yuv2rgb_single_pixel_palette;
+    break;
+  default:
+    /* use stderr: a message sitting in the stdout buffer may be lost on abort() */
+    fprintf (stderr, "yuv2rgb: mode %d not supported by yuv2rgb\n", this->mode);
+    abort();
+  }
+}
+/*
+ * yuy2 stuff
+ */
+static void yuy22rgb_c_32 (yuv2rgb_t *this, uint8_t * _dst, uint8_t * _p)
+{ /* Convert packed yuy2 data at _p into 32 bit pixels at _dst.
+   * Each source line is first scaled into this->y_buffer / u_buffer /
+   * v_buffer, then converted 8 pixels per loop pass.  Vertical scaling
+   * repeats or skips lines with the help of the accumulator dy.
+   */
+  int U, V, Y; /* not referenced directly - presumably scratch for the RGB()/DST1() macros, TODO confirm */
+  uint8_t * py_1, * pu, * pv;
+  uint32_t * r, * g, * b;
+  uint32_t * dst_1;
+  int width, height;
+  int dy;
+  /* FIXME: implement unscaled version */
+  scale_line_4 (_p+1, this->u_buffer,
+                this->dest_width >> 1, this->step_dx); /* u: dest_width/2 samples, read from _p+1 */
+  scale_line_4 (_p+3, this->v_buffer,
+                this->dest_width >> 1, this->step_dx); /* v: dest_width/2 samples, read from _p+3 */
+  scale_line_2 (_p, this->y_buffer,
+                this->dest_width, this->step_dx); /* y: dest_width samples, read from _p */
+  
+  dy = 0; /* vertical accumulator, compared against 32768 below */
+  height = this->dest_height; /* output lines still to produce */
+  
+  for (;;) {
+    dst_1 = (uint32_t*)_dst;
+    py_1  = this->y_buffer;
+    pu    = this->u_buffer;
+    pv    = this->v_buffer;
+    width = this->dest_width >> 3; /* loop count: 8 output pixels per pass */
+    do {
+      RGB(0);
+      DST1(0);
+      RGB(1);
+      DST1(1);
+      
+      RGB(2);
+      DST1(2);
+      
+      RGB(3);
+      DST1(3);
+      pu += 4;
+      pv += 4;
+      py_1 += 8;
+      dst_1 += 8;
+    } while (--width); /* NOTE(review): relies on dest_width >= 8; a count of 0 would not stop here */
+    
+    dy += this->step_dy;
+    _dst += this->rgb_stride;
+    
+    while (--height > 0 && dy < 32768) { /* duplicate the line just written while dy stays below 32768 */
+      
+      xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width*4); /* 4 bytes per pixel */
+      
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+    }
+    
+    if (height <= 0) /* all output lines produced */
+      break;
+    _p += this->y_stride*2*(dy>>15); /* advance by dy>>15 steps of y_stride*2 bytes */
+    dy &= 32767; /* keep the remainder below 32768 */
+    /*
+      dy -= 32768;
+      _p += this->y_stride*2;
+    */
+    
+    scale_line_4 (_p+1, this->u_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_4 (_p+3, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_2 (_p, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+  }
+}
+static void yuy22rgb_c_24_rgb (yuv2rgb_t *this, uint8_t * _dst, uint8_t * _p)
+{ /* Convert packed yuy2 data at _p into 24 bit pixels at _dst using the
+   * DST1RGB() store macro.  Each source line is first scaled into
+   * this->y_buffer / u_buffer / v_buffer, then converted 8 pixels per loop
+   * pass.  Vertical scaling repeats or skips lines via the accumulator dy.
+   */
+  int U, V, Y; /* not referenced directly - presumably scratch for the RGB()/DST1RGB() macros, TODO confirm */
+  uint8_t * py_1, * pu, * pv;
+  uint8_t * r, * g, * b;
+  uint8_t * dst_1;
+  int width, height;
+  int dy;
+  /* FIXME: implement unscaled version */
+  scale_line_4 (_p+1, this->u_buffer,
+                this->dest_width >> 1, this->step_dx); /* u: dest_width/2 samples, read from _p+1 */
+  scale_line_4 (_p+3, this->v_buffer,
+                this->dest_width >> 1, this->step_dx); /* v: dest_width/2 samples, read from _p+3 */
+  scale_line_2 (_p, this->y_buffer, 
+                this->dest_width, this->step_dx); /* y: dest_width samples, read from _p */
+  dy = 0; /* vertical accumulator, compared against 32768 below */
+  height = this->dest_height; /* output lines still to produce */
+  
+  for (;;) {
+    dst_1 = _dst;
+    py_1  = this->y_buffer;
+    pu    = this->u_buffer;
+    pv    = this->v_buffer;
+    
+    width = this->dest_width >> 3; /* loop count: 8 output pixels per pass */
+    
+    do {
+      RGB(0);
+      DST1RGB(0);
+      
+      RGB(1);
+      DST1RGB(1);
+      
+      RGB(2);
+      DST1RGB(2);
+      
+      RGB(3);
+      DST1RGB(3);
+      pu += 4;
+      pv += 4;
+      py_1 += 8;
+      dst_1 += 24; /* 8 pixels of 3 bytes each */
+    } while (--width); /* NOTE(review): relies on dest_width >= 8; a count of 0 would not stop here */
+    dy += this->step_dy;
+    _dst += this->rgb_stride;
+    
+    while (--height > 0 && dy < 32768) { /* duplicate the line just written while dy stays below 32768 */
+      xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width*3); /* 3 bytes per pixel */
+      
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+    }
+    
+    if (height <= 0) /* all output lines produced */
+      break;
+    _p += this->y_stride*2*(dy>>15); /* advance by dy>>15 steps of y_stride*2 bytes */
+    dy &= 32767; /* keep the remainder below 32768 */
+    /*
+      dy -= 32768;
+      _p += this->y_stride*2;
+    */
+    
+    scale_line_4 (_p+1, this->u_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_4 (_p+3, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_2 (_p, this->y_buffer,
+                  this->dest_width, this->step_dx);
+  }
+}
+static void yuy22rgb_c_24_bgr (yuv2rgb_t *this, uint8_t * _dst, uint8_t * _p)
+{ /* Convert packed yuy2 data at _p into 24 bit pixels at _dst using the
+   * DST1BGR() store macro.  Each source line is first scaled into
+   * this->y_buffer / u_buffer / v_buffer, then converted 8 pixels per loop
+   * pass.  Vertical scaling repeats or skips lines via the accumulator dy.
+   */
+  int U, V, Y; /* not referenced directly - presumably scratch for the RGB()/DST1BGR() macros, TODO confirm */
+  uint8_t * py_1, * pu, * pv;
+  uint8_t * r, * g, * b;
+  uint8_t * dst_1;
+  int width, height;
+  int dy;
+  /* FIXME: implement unscaled version */
+  scale_line_4 (_p+1, this->u_buffer,
+                this->dest_width >> 1, this->step_dx); /* u: dest_width/2 samples, read from _p+1 */
+  scale_line_4 (_p+3, this->v_buffer,
+                this->dest_width >> 1, this->step_dx); /* v: dest_width/2 samples, read from _p+3 */
+  scale_line_2 (_p, this->y_buffer, 
+                this->dest_width, this->step_dx); /* y: dest_width samples, read from _p */
+  dy = 0; /* vertical accumulator, compared against 32768 below */
+  height = this->dest_height; /* output lines still to produce */
+  
+  for (;;) {
+    dst_1 = _dst;
+    py_1  = this->y_buffer;
+    pu    = this->u_buffer;
+    pv    = this->v_buffer;
+    
+    width = this->dest_width >> 3; /* loop count: 8 output pixels per pass */
+    
+    do {
+      RGB(0);
+      DST1BGR(0);
+      
+      RGB(1);
+      DST1BGR(1);
+      
+      RGB(2);
+      DST1BGR(2);
+      RGB(3);
+      DST1BGR(3);
+      
+      pu += 4;
+      pv += 4;
+      py_1 += 8;
+      dst_1 += 24; /* 8 pixels of 3 bytes each */
+    } while (--width); /* NOTE(review): relies on dest_width >= 8; a count of 0 would not stop here */
+    
+    dy += this->step_dy;
+    _dst += this->rgb_stride;
+    
+    while (--height > 0 && dy < 32768) { /* duplicate the line just written while dy stays below 32768 */
+      xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width*3); /* 3 bytes per pixel */
+      
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+    }
+    
+    if (height <= 0) /* all output lines produced */
+      break;
+    _p += this->y_stride*2*(dy>>15); /* advance by dy>>15 steps of y_stride*2 bytes */
+    dy &= 32767; /* keep the remainder below 32768 */
+    scale_line_4 (_p+1, this->u_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_4 (_p+3, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_2 (_p, this->y_buffer,
+                  this->dest_width, this->step_dx);
+  }
+}
+static void yuy22rgb_c_16 (yuv2rgb_t *this, uint8_t * _dst, uint8_t * _p)
+{ /* Convert packed yuy2 data at _p into 16 bit wide pixels at _dst.
+   * Each source line is first scaled into this->y_buffer / u_buffer /
+   * v_buffer, then converted 8 pixels per loop pass.  Vertical scaling
+   * repeats or skips lines with the help of the accumulator dy.
+   */
+  int U, V, Y; /* not referenced directly - presumably scratch for the RGB()/DST1() macros, TODO confirm */
+  uint8_t * py_1, * pu, * pv;
+  uint16_t * r, * g, * b;
+  uint16_t * dst_1;
+  int width, height;
+  int dy;
+  /* FIXME: implement unscaled version */
+  scale_line_4 (_p+1, this->u_buffer,
+                this->dest_width >> 1, this->step_dx); /* u: dest_width/2 samples, read from _p+1 */
+  scale_line_4 (_p+3, this->v_buffer,
+                this->dest_width >> 1, this->step_dx); /* v: dest_width/2 samples, read from _p+3 */
+  scale_line_2 (_p, this->y_buffer, 
+                this->dest_width, this->step_dx); /* y: dest_width samples, read from _p */
+  
+  dy = 0; /* vertical accumulator, compared against 32768 below */
+  height = this->dest_height; /* output lines still to produce */
+  for (;;) {
+    dst_1 = (uint16_t*)_dst;
+    py_1  = this->y_buffer;
+    pu    = this->u_buffer;
+    pv    = this->v_buffer;
+    
+    width = this->dest_width >> 3; /* loop count: 8 output pixels per pass */
+    
+    do {
+      RGB(0);
+      DST1(0);
+      RGB(1);
+      DST1(1);
+      
+      RGB(2);
+      DST1(2);
+      
+      RGB(3);
+      DST1(3);
+      pu += 4;
+      pv += 4;
+      py_1 += 8;
+      dst_1 += 8;
+    } while (--width); /* NOTE(review): relies on dest_width >= 8; a count of 0 would not stop here */
+    
+    dy += this->step_dy;
+    _dst += this->rgb_stride;
+    while (--height > 0 && dy < 32768) { /* duplicate the line just written while dy stays below 32768 */
+      
+      xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width*2); /* 2 bytes per pixel */
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+    }
+    
+    if (height <= 0) /* all output lines produced */
+      break;
+    _p += this->y_stride*2*(dy>>15); /* advance by dy>>15 steps of y_stride*2 bytes */
+    dy &= 32767; /* keep the remainder below 32768 */
+    
+    scale_line_4 (_p+1, this->u_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_4 (_p+3, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_2 (_p, this->y_buffer,
+                  this->dest_width, this->step_dx);
+  }
+}
+static void yuy22rgb_c_8 (yuv2rgb_t *this, uint8_t * _dst, uint8_t * _p)
+{ /* Convert packed yuy2 data at _p into 8 bit pixels at _dst.
+   * Each source line is first scaled into this->y_buffer / u_buffer /
+   * v_buffer, then converted 8 pixels per loop pass.  Vertical scaling
+   * repeats or skips lines with the help of the accumulator dy.
+   */
+  int U, V, Y; /* not referenced directly - presumably scratch for the RGB()/DST1() macros, TODO confirm */
+  uint8_t * py_1, * pu, * pv;
+  uint8_t * r, * g, * b;
+  uint8_t * dst_1;
+  int width, height;
+  int dy;
+  /* FIXME: implement unscaled version */
+  scale_line_4 (_p+1, this->u_buffer,
+                this->dest_width >> 1, this->step_dx); /* u: dest_width/2 samples, read from _p+1 */
+  scale_line_4 (_p+3, this->v_buffer,
+                this->dest_width >> 1, this->step_dx); /* v: dest_width/2 samples, read from _p+3 */
+  scale_line_2 (_p, this->y_buffer,
+                this->dest_width, this->step_dx); /* y: dest_width samples, read from _p */
+  
+  dy = 0; /* vertical accumulator, compared against 32768 below */
+  height = this->dest_height; /* output lines still to produce */
+  
+  for (;;) {
+    dst_1 = _dst;
+    py_1  = this->y_buffer;
+    pu    = this->u_buffer;
+    pv    = this->v_buffer;
+    
+    width = this->dest_width >> 3; /* loop count: 8 output pixels per pass */
+    
+    do {
+      RGB(0);
+      DST1(0);
+      
+      RGB(1);
+      DST1(1);
+      RGB(2);
+      DST1(2);
+      
+      RGB(3);
+      DST1(3);
+      pu += 4;
+      pv += 4;
+      py_1 += 8;
+      dst_1 += 8;
+    } while (--width); /* NOTE(review): relies on dest_width >= 8; a count of 0 would not stop here */
+    
+    dy += this->step_dy;
+    _dst += this->rgb_stride;
+    
+    while (--height > 0 && dy < 32768) { /* duplicate the line just written while dy stays below 32768 */
+      
+      xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width); /* 1 byte per pixel */
+      
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+    }
+    
+    if (height <= 0) /* all output lines produced */
+      break;
+    _p += this->y_stride*2*(dy>>15); /* advance by dy>>15 steps of y_stride*2 bytes */
+    dy &= 32767; /* keep the remainder below 32768 */
+    
+    scale_line_4 (_p+1, this->u_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_4 (_p+3, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_2 (_p, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+  }
+}
+/*
+ * Convert packed yuy2 data at _p into 8 bit grayscale at _dst.
+ *
+ * Without scaling every second source byte is copied to the output.
+ * With scaling each source line is passed through scale_line_2() straight
+ * into the output, and finished lines are repeated or source lines skipped
+ * under control of a vertical accumulator.
+ */
+static void yuy22rgb_c_gray (yuv2rgb_t *this, uint8_t * _dst, uint8_t * _p)
+{
+  if (!this->do_scale) {
+    int rows_left;
+
+    for (rows_left = this->source_height; rows_left > 0; rows_left--) {
+      uint8_t *out = _dst;
+      uint8_t *in  = _p;
+      int cols_left;
+
+      /* take the bytes at even offsets of the line */
+      for (cols_left = this->source_width; cols_left > 0; cols_left--) {
+        *out++ = *in;
+        in += 2;
+      }
+      _dst += this->rgb_stride;
+      _p += this->y_stride*2;
+    }
+    return;
+  }
+
+  {
+    int lines_left = this->dest_height;
+    int accum = 0;
+
+    for (;;) {
+      /* produce one fresh output line */
+      scale_line_2 (_p, _dst, this->dest_width, this->step_dx);
+      accum += this->step_dy;
+      _dst += this->rgb_stride;
+
+      /* repeat it while the accumulator has not reached 32768 */
+      while (--lines_left > 0 && accum < 32768) {
+        xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width);
+        accum += this->step_dy;
+        _dst += this->rgb_stride;
+      }
+
+      if (lines_left <= 0)
+        break;
+
+      /* move on in the source and keep the remainder */
+      _p += this->y_stride*2*(accum>>15);
+      accum &= 32767;
+    }
+  }
+}
+static void yuy22rgb_c_palette (yuv2rgb_t *this, uint8_t * _dst, uint8_t * _p)
+{ /* Convert packed yuy2 data at _p into 8 bit output at _dst using the
+   * DST1CMAP() store macro.  Each source line is first scaled into
+   * this->y_buffer / u_buffer / v_buffer, then converted 8 pixels per loop
+   * pass.  Vertical scaling repeats or skips lines via the accumulator dy.
+   */
+  int U, V, Y; /* not referenced directly - presumably scratch for the RGB()/DST1CMAP() macros, TODO confirm */
+  uint8_t * py_1, * pu, * pv;
+  uint16_t * r, * g, * b;
+  uint8_t * dst_1;
+  int width, height;
+  int dy;
+  
+  scale_line_4 (_p+1, this->u_buffer,
+                this->dest_width >> 1, this->step_dx); /* u: dest_width/2 samples, read from _p+1 */
+  scale_line_4 (_p+3, this->v_buffer,
+                this->dest_width >> 1, this->step_dx); /* v: dest_width/2 samples, read from _p+3 */
+  scale_line_2 (_p, this->y_buffer,
+                this->dest_width, this->step_dx); /* y: dest_width samples, read from _p */
+    
+  dy = 0; /* vertical accumulator, compared against 32768 below */
+  height = this->dest_height; /* output lines still to produce */
+  
+  for (;;) {
+    dst_1 = _dst;
+    py_1  = this->y_buffer;
+    pu    = this->u_buffer;
+    pv    = this->v_buffer;
+    
+    width = this->dest_width >> 3; /* loop count: 8 output pixels per pass */
+    
+    do {
+      RGB(0);
+      DST1CMAP(0);
+      RGB(1);
+      DST1CMAP(1);
+      RGB(2);
+      DST1CMAP(2);
+      RGB(3);
+      DST1CMAP(3);
+      pu += 4;
+      pv += 4;
+      py_1 += 8;
+      dst_1 += 8;
+    } while (--width); /* NOTE(review): relies on dest_width >= 8; a count of 0 would not stop here */
+    dy += this->step_dy;
+    _dst += this->rgb_stride;
+    while (--height > 0 && dy < 32768) { /* duplicate the line just written while dy stays below 32768 */
+      xine_fast_memcpy (_dst, (uint8_t*)_dst-this->rgb_stride, this->dest_width); /* 1 byte per pixel */
+      dy += this->step_dy;
+      _dst += this->rgb_stride;
+    }
+    if (height <= 0) /* all output lines produced */
+      break;
+    _p += this->y_stride*2*(dy>>15); /* advance by dy>>15 steps of y_stride*2 bytes */
+    dy &= 32767; /* keep the remainder below 32768 */
+    scale_line_4 (_p+1, this->u_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_4 (_p+3, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+    scale_line_2 (_p, this->y_buffer,
+                  this->dest_width, this->step_dx);
+  }
+}
+/*
+ * Select the plain C yuy2 -> rgb routine that matches the factory's output
+ * mode and store it in this->yuy22rgb_fun.
+ *
+ * For the 24 bit modes the routine depends on both the mode and the
+ * "swapped" flag.  An unknown mode is not fatal here: a diagnostic is
+ * written to stderr and the function pointer is set to NULL so that it
+ * never holds an indeterminate value.
+ */
+static void yuy22rgb_c_init (yuv2rgb_factory_t *this)
+{
+  switch (this->mode) {
+  case MODE_32_RGB:
+  case MODE_32_BGR:
+    this->yuy22rgb_fun = yuy22rgb_c_32;
+    break;
+  case MODE_24_RGB:
+  case MODE_24_BGR:
+    /* swapping the byte order turns RGB output into BGR and vice versa */
+    this->yuy22rgb_fun =
+        (this->mode==MODE_24_RGB && !this->swapped) || (this->mode==MODE_24_BGR && this->swapped)
+            ? yuy22rgb_c_24_rgb
+            : yuy22rgb_c_24_bgr;
+    break;
+  case MODE_15_BGR:
+  case MODE_16_BGR:
+  case MODE_15_RGB:
+  case MODE_16_RGB:
+    this->yuy22rgb_fun = yuy22rgb_c_16;
+    break;
+  case MODE_8_RGB:
+  case MODE_8_BGR:
+    this->yuy22rgb_fun = yuy22rgb_c_8;
+    break;
+  case MODE_8_GRAY:
+    this->yuy22rgb_fun = yuy22rgb_c_gray;
+    break;
+  case MODE_PALETTE:
+    this->yuy22rgb_fun = yuy22rgb_c_palette;
+    break;
+  default:
+    fprintf (stderr, "yuv2rgb: mode %d not supported for yuy2\n", this->mode);
+    /* leave a well defined "no converter" marker behind */
+    this->yuy22rgb_fun = NULL;
+    break;
+  }
+}
+/*
+ * Create a converter that shares the factory's colormap, lookup tables and
+ * preselected conversion routines.
+ *
+ * The scaling buffers and their backing chunks start out as NULL and the
+ * configure callback is set to yuv2rgb_configure.
+ * Returns the new converter, or NULL if the allocation failed.
+ */
+yuv2rgb_t *yuv2rgb_create_converter (yuv2rgb_factory_t *factory) {
+  yuv2rgb_t *this = xine_xmalloc (sizeof (yuv2rgb_t));
+  
+  /* never write through a failed allocation */
+  if (this == NULL)
+    return NULL;
+
+  this->cmap                     = factory->cmap;
+  this->y_chunk = this->y_buffer = NULL;
+  this->u_chunk = this->u_buffer = NULL;
+  this->v_chunk = this->v_buffer = NULL;
+  /* the tables are owned by the factory; the converter only points at them */
+  this->table_rV                 = factory->table_rV;
+  this->table_gU                 = factory->table_gU;
+  this->table_gV                 = factory->table_gV;
+  this->table_bU                 = factory->table_bU;
+  this->yuv2rgb_fun              = factory->yuv2rgb_fun;
+  this->yuy22rgb_fun             = factory->yuy22rgb_fun;
+  this->yuv2rgb_single_pixel_fun = factory->yuv2rgb_single_pixel_fun;
+  this->configure                = yuv2rgb_configure;
+  return this;
+}
+/*
+ * factory functions 
+ */
+/*
+ * Change the gamma setting of a factory.
+ *
+ * The red, green(U) and blue(U) table pointers are moved by the difference
+ * between the new and the current gamma, counted in table entries of
+ * this->entry_size bytes.  On ARCH_X86 builds the MMX gamma constants are
+ * updated as well.  The new value is remembered in this->gamma.
+ */
+void yuv2rgb_set_gamma (yuv2rgb_factory_t *this, int gamma) {
+  int i;
+  /* byte offset to apply, identical for all three tables */
+  int delta = this->entry_size*(gamma - this->gamma);
+  
+  for (i = 0; i < 256; i++) {
+    /* a cast expression is not an lvalue in ISO C, so assign the sum back */
+    this->table_rV[i] = (uint8_t *)this->table_rV[i] + delta;
+    this->table_gU[i] = (uint8_t *)this->table_gU[i] + delta;
+    this->table_bU[i] = (uint8_t *)this->table_bU[i] + delta;
+  }
+#ifdef ARCH_X86
+  mmx_yuv2rgb_set_gamma(gamma);
+#endif  
+  this->gamma = gamma;
+}
+/* Report the gamma value currently stored in the factory. */
+int yuv2rgb_get_gamma (yuv2rgb_factory_t *this) {
+  const int current_gamma = this->gamma;
+
+  return current_gamma;
+}
+/*
+ * Create a converter factory for the given output mode.
+ *
+ * mode    - one of the MODE_* constants
+ * swapped - passed on to the table setup and the function selection
+ * cmap    - colormap pointer, stored for the converters
+ *
+ * Sets up the lookup tables, then picks the yuv->rgb routine: MMXEXT, MMX
+ * (ARCH_X86 builds) and mlib (HAVE_MLIB builds) are tried in that order and
+ * the plain C version is the fallback.  The yuy2 and single pixel routines
+ * always come from the C implementations.
+ *
+ * Returns the new factory, or NULL if the allocation failed.
+ */
+yuv2rgb_factory_t* yuv2rgb_factory_init (int mode, int swapped, 
+                                         uint8_t *cmap) {
+  yuv2rgb_factory_t *this;
+#ifdef ARCH_X86
+  uint32_t mm = xine_mm_accel();
+#endif
+  /*
+   * zero-filled allocation: the table setup code returns early for
+   * MODE_8_GRAY, which would otherwise leave gamma, entry_size and the
+   * table pointers indeterminate
+   */
+  this = calloc (1, sizeof (yuv2rgb_factory_t));
+  if (this == NULL)
+    return NULL;
+  this->mode                = mode;
+  this->swapped             = swapped;
+  this->cmap                = cmap;
+  this->create_converter    = yuv2rgb_create_converter;
+  this->set_gamma           = yuv2rgb_set_gamma;
+  this->get_gamma           = yuv2rgb_get_gamma;
+  this->matrix_coefficients = 6;
+  yuv2rgb_setup_tables (this, mode, swapped);
+  /*
+   * auto-probe for the best yuv2rgb function
+   */
+  this->yuv2rgb_fun = NULL;
+#ifdef ARCH_X86
+  if ((this->yuv2rgb_fun == NULL) && (mm & MM_ACCEL_X86_MMXEXT)) {
+    yuv2rgb_init_mmxext (this);
+    if (this->yuv2rgb_fun != NULL)
+      printf ("yuv2rgb: using MMXEXT for colorspace transform\n");
+  }
+  if ((this->yuv2rgb_fun == NULL) && (mm & MM_ACCEL_X86_MMX)) {
+    yuv2rgb_init_mmx (this);
+    if (this->yuv2rgb_fun != NULL)
+      printf ("yuv2rgb: using MMX for colorspace transform\n");
+  }
+#endif
+#if HAVE_MLIB
+  if (this->yuv2rgb_fun == NULL) {
+    yuv2rgb_init_mlib (this);
+    if (this->yuv2rgb_fun != NULL)
+      printf ("yuv2rgb: using medialib for colorspace transform\n");
+  }
+#endif
+  /* nothing accelerated matched: fall back to the C routines */
+  if (this->yuv2rgb_fun == NULL) {
+    printf ("yuv2rgb: no accelerated colorspace conversion found\n");
+    yuv2rgb_c_init (this);
+  }
+  /*
+   * auto-probe for the best yuy22rgb function
+   */
+  /* FIXME: implement mmx/mlib functions */
+  yuy22rgb_c_init (this);
+  /*
+   * set up single pixel function
+   */
+  yuv2rgb_single_pixel_init (this);
+  return this;
+}
diff --git a/noncore/multimedia/opieplayer2/yuv2rgb.h b/noncore/multimedia/opieplayer2/yuv2rgb.h
new file mode 100644
index 0000000..5b9c3f6
--- a/dev/null
+++ b/noncore/multimedia/opieplayer2/yuv2rgb.h
@@ -0,0 +1,151 @@
+#ifndef HAVE_YUV2RGB_H
+#define HAVE_YUV2RGB_H /* must match the name tested above, otherwise the guard never triggers */
+#include <inttypes.h>
+/* handle types; the structures behind them are declared later in this header */
+typedef struct yuv2rgb_s yuv2rgb_t;
+typedef struct yuv2rgb_factory_s yuv2rgb_factory_t;
+/*
+ * function types for functions which can be replaced
+ * by hardware-accelerated versions
+ */
+/* internal function used to scale a line of yuv data: reads from source, writes width values to dest */
+typedef void (*scale_line_func_t) (uint8_t *source, uint8_t *dest, int width, int step);
+typedef void (*yuv2rgb_fun_t) (yuv2rgb_t *this, uint8_t * image, uint8_t * py, uint8_t * pu, uint8_t * pv) ; /* separate y/u/v pointers -> image */
+typedef void (*yuy22rgb_fun_t) (yuv2rgb_t *this, uint8_t * image, uint8_t * p); /* single yuy2 pointer -> image */
+typedef uint32_t (*yuv2rgb_single_pixel_fun_t) (yuv2rgb_t *this, uint8_t y, uint8_t u, uint8_t v); /* one y/u/v sample -> one pixel value */
+/*
+ * modes supported - feel free to implement yours
+ *
+ * these values are passed as "mode" to yuv2rgb_factory_init()
+ */
+#define MODE_8_RGB    1
+#define MODE_8_BGR    2
+#define MODE_15_RGB   3
+#define MODE_15_BGR   4
+#define MODE_16_RGB   5
+#define MODE_16_BGR   6
+#define MODE_24_RGB   7
+#define MODE_24_BGR   8
+#define MODE_32_RGB   9
+#define MODE_32_BGR  10
+#define MODE_8_GRAY  11
+#define MODE_PALETTE 12
+/*
+ * one converter instance; created through a factory
+ */
+struct yuv2rgb_s {
+
+  /* configure the converter for source/destination sizes and strides */
+  int (*configure) (yuv2rgb_t *this,
+                    int source_width, int source_height,
+                    int y_stride, int uv_stride,
+                    int dest_width, int dest_height,
+                    int rgb_stride);
+
+  /* call this for the yuv2rgb and scaling process */
+  yuv2rgb_fun_t              yuv2rgb_fun;
+
+  /* call this for the yuy2->rgb and scaling process */
+  yuy22rgb_fun_t             yuy22rgb_fun;
+
+  /* call this to convert a single pixel (used for converting clut colors) */
+  yuv2rgb_single_pixel_fun_t yuv2rgb_single_pixel_fun;
+
+  /*
+   * private members
+   */
+
+  /* geometry */
+  int                source_width;
+  int                source_height;
+  int                y_stride;
+  int                uv_stride;
+  int                dest_width;
+  int                dest_height;
+  int                rgb_stride;
+
+  /* scaling state */
+  int                step_dx;
+  int                step_dy;
+  int                do_scale;
+
+  /* line buffers and the chunks paired with them */
+  uint8_t           *y_buffer;
+  uint8_t           *u_buffer;
+  uint8_t           *v_buffer;
+  void              *y_chunk;
+  void              *u_chunk;
+  void              *v_chunk;
+
+  /* lookup tables and colormap, taken over from the factory */
+  void             **table_rV;
+  void             **table_gU;
+  int               *table_gV;
+  void             **table_bU;
+  uint8_t           *cmap;
+
+  scale_line_func_t  scale_line;
+};
+/*
+ * convenience class to easily create a lot of converters
+ */
+struct yuv2rgb_factory_s {
+
+  /* produce a new converter from this factory */
+  yuv2rgb_t* (*create_converter) (yuv2rgb_factory_t *this);
+
+  /*
+   * adjust gamma for all converters produced by this factory
+   * (-100 to 100 looks fine)
+   */
+  void (*set_gamma) (yuv2rgb_factory_t *this, int gamma);
+
+  /* read back the gamma value */
+  int (*get_gamma) (yuv2rgb_factory_t *this);
+
+  /*
+   * private data
+   */
+  int       mode;
+  int       swapped;
+  uint8_t  *cmap;
+  int       gamma;
+  int       entry_size;
+  uint32_t  matrix_coefficients;
+
+  /* one lookup entry per possible 8 bit chroma value */
+  void     *table_rV[256];
+  void     *table_gU[256];
+  int       table_gV[256];
+  void     *table_bU[256];
+
+  /* preselected functions for mode/swap/hardware */
+  yuv2rgb_fun_t              yuv2rgb_fun;
+  yuy22rgb_fun_t             yuy22rgb_fun;
+  yuv2rgb_single_pixel_fun_t yuv2rgb_single_pixel_fun;
+};
+yuv2rgb_factory_t *yuv2rgb_factory_init (int mode, int swapped, uint8_t *colormap); /* mode: one of the MODE_* constants */
+                   
+/*
+ * internal stuff below this line
+ */
+void mmx_yuv2rgb_set_gamma(int gamma); /* updates the gamma constants used by the MMX code */
+void yuv2rgb_init_mmxext (yuv2rgb_factory_t *this); /* presumably sets this->yuv2rgb_fun when an MMXEXT routine fits - TODO confirm */
+void yuv2rgb_init_mmx (yuv2rgb_factory_t *this); /* presumably sets this->yuv2rgb_fun when an MMX routine fits - TODO confirm */
+void yuv2rgb_init_mlib (yuv2rgb_factory_t *this); /* sets this->yuv2rgb_fun for the modes mlib handles */
+#endif
diff --git a/noncore/multimedia/opieplayer2/yuv2rgb_mlib.c b/noncore/multimedia/opieplayer2/yuv2rgb_mlib.c
new file mode 100644
index 0000000..908b439
--- a/dev/null
+++ b/noncore/multimedia/opieplayer2/yuv2rgb_mlib.c
@@ -0,0 +1,313 @@
+/*
+ * yuv2rgb_mlib.c
+ * Copyright (C) 2000-2001 Silicon Integrated System Corp.
+ * All Rights Reserved.
+ *
+ * Author: Juergen Keil <jk@tools.de>
+ *
+ * This file is part of xine, a free unix video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#if HAVE_MLIB
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <mlib_video.h>
+#include "attributes.h"
+#include "yuv2rgb.h"
+/*
+ * Horizontally scale one line of samples with linear interpolation.
+ *
+ * source - input samples; at least two are read up front
+ * dest   - receives "width" output samples
+ * step   - source advance per output sample in 1/32768 units
+ *          (32768 means one source sample per output sample)
+ */
+static void scale_line (uint8_t *source, uint8_t *dest,
+                        int width, int step) {
+  unsigned left  = source[0];
+  unsigned right = source[1];
+  int frac = 0;
+  int remaining;
+
+  source += 2;
+  for (remaining = width; remaining != 0; remaining--) {
+    /* blend the two neighbours by the fractional position */
+    *dest++ = (left * (32768 - frac) + right * frac) / 32768;
+
+    /* advance; pull in new samples while we are beyond the right neighbour */
+    for (frac += step; frac > 32768; frac -= 32768) {
+      left  = right;
+      right = *source++;
+    }
+  }
+}
+                        
+static void mlib_yuv420_rgb24 (yuv2rgb_t *this,
+                               uint8_t * image, uint8_t * py,
+                               uint8_t * pu, uint8_t * pv)
+{ /* yuv -> rgb conversion through mlib_VideoColorYUV2RGB420().
+   * Unscaled frames are converted with a single call.  With scaling, one
+   * u line, one v line and two y lines are scaled into the line buffers and
+   * converted per loop pass; output lines are then repeated under control
+   * of the accumulator dy.
+   */
+  int dst_height;
+  int dy;
+  mlib_status mlib_stat; /* assigned from the mlib calls but never inspected */
+  if (this->do_scale) {
+    dy = 0;
+    dst_height = this->dest_height;
+    for (;;) {
+      scale_line (pu, this->u_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      pu += this->uv_stride;
+      scale_line (pv, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      pv += this->uv_stride;
+      scale_line (py, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+      py += this->y_stride;
+      scale_line (py, this->y_buffer + this->dest_width, 
+                  this->dest_width, this->step_dx); /* second y line goes right behind the first */
+      py += this->y_stride;
+        
+      mlib_stat = mlib_VideoColorYUV2RGB420(image,
+                                            this->y_buffer, 
+                                            this->u_buffer,
+                                            this->v_buffer,
+                                            this->dest_width & ~1, 2, /* width forced even, height of 2 lines */
+                                            this->rgb_stride,
+                                            this->dest_width,
+                                            this->dest_width >> 1);
+      dy += this->step_dy;
+      image += this->rgb_stride;
+      
+      while (--dst_height > 0 && dy < 32768) {
+        memcpy (image, (uint8_t*)image-this->rgb_stride, this->dest_width*6); /* NOTE(review): dest_width*6 here but dest_width*3 in the loop below - confirm intent */
+        dy += this->step_dy;
+        image += this->rgb_stride;
+      }
+      if (dst_height <= 0) /* all output lines produced */
+        break;
+      dy -= 32768;
+      dy += this->step_dy;
+      image += this->rgb_stride;
+      
+      while (--dst_height > 0 && dy < 32768) {
+        memcpy (image, (uint8_t*)image-this->rgb_stride, this->dest_width*3);
+        dy += this->step_dy;
+        image += this->rgb_stride;
+      }
+      if (dst_height <= 0) /* all output lines produced */
+        break;
+      dy -= 32768;
+    }
+  } else { /* unscaled: one call converts source_width x source_height */
+    mlib_stat = mlib_VideoColorYUV2RGB420(image, py, pu, pv,
+                                          this->source_width,
+                                          this->source_height,
+                                          this->rgb_stride,
+                                          this->y_stride,
+                                          this->uv_stride);
+  }
+}
+static void mlib_yuv420_argb32 (yuv2rgb_t *this,
+                                uint8_t * image, uint8_t * py,
+                                uint8_t * pu, uint8_t * pv)
+{ /* yuv -> argb conversion through mlib_VideoColorYUV2ARGB420().
+   * Unscaled frames are converted with a single call.  With scaling, one
+   * u line, one v line and two y lines are scaled into the line buffers and
+   * converted per loop pass; output lines are then repeated under control
+   * of the accumulator dy.
+   */
+  int dst_height;
+  int dy;
+  mlib_status mlib_stat; /* assigned from the mlib calls but never inspected */
+  if (this->do_scale) {
+    dy = 0;
+    dst_height = this->dest_height;
+    for (;;) {
+      scale_line (pu, this->u_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      pu += this->uv_stride;
+      scale_line (pv, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      pv += this->uv_stride;
+      scale_line (py, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+      py += this->y_stride;
+      scale_line (py, this->y_buffer + this->dest_width, 
+                  this->dest_width, this->step_dx); /* second y line goes right behind the first */
+      py += this->y_stride;
+        
+      mlib_stat = mlib_VideoColorYUV2ARGB420(image,
+                                             this->y_buffer, 
+                                             this->u_buffer,
+                                             this->v_buffer,
+                                             this->dest_width & ~1, 2, /* width forced even, height of 2 lines */
+                                             this->rgb_stride,
+                                             this->dest_width,
+                                             this->dest_width >> 1);
+      dy += this->step_dy;
+      image += this->rgb_stride;
+      
+      while (--dst_height > 0 && dy < 32768) {
+        memcpy (image, (uint8_t*)image-this->rgb_stride, this->dest_width*8); /* NOTE(review): dest_width*8 here but dest_width*4 in the loop below - confirm intent */
+        dy += this->step_dy;
+        image += this->rgb_stride;
+      }
+      
+      if (dst_height <= 0) /* all output lines produced */
+        break;
+      dy -= 32768;
+      dy += this->step_dy;
+      image += this->rgb_stride;
+      
+      while (--dst_height > 0 && dy < 32768) {
+        memcpy (image, (uint8_t*)image-this->rgb_stride, this->dest_width*4);
+        dy += this->step_dy;
+        image += this->rgb_stride;
+      }
+      if (dst_height <= 0) /* all output lines produced */
+        break;
+      dy -= 32768;
+    }
+  } else { /* unscaled: one call converts source_width x source_height */
+    mlib_stat = mlib_VideoColorYUV2ARGB420(image, py, pu, pv,
+                                           this->source_width,
+                                           this->source_height,
+                                           this->rgb_stride,
+                                           this->y_stride,
+                                           this->uv_stride);
+  }
+}
+static void mlib_yuv420_abgr32 (yuv2rgb_t *this,
+                                uint8_t * image, uint8_t * py,
+                                uint8_t * pu, uint8_t * pv)
+{ /* yuv -> abgr conversion through mlib_VideoColorYUV2ABGR420().
+   * Unscaled frames are converted with a single call.  With scaling, one
+   * u line, one v line and two y lines are scaled into the line buffers and
+   * converted per loop pass; output lines are then repeated under control
+   * of the accumulator dy.
+   */
+  int dst_height;
+  int dy;
+  mlib_status mlib_stat; /* assigned from the mlib calls but never inspected */
+  if (this->do_scale) {
+    dy = 0;
+    dst_height = this->dest_height;
+    for (;;) {
+      scale_line (pu, this->u_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      pu += this->uv_stride;
+      scale_line (pv, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      pv += this->uv_stride;
+      scale_line (py, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+      py += this->y_stride;
+      scale_line (py, this->y_buffer + this->dest_width, 
+                  this->dest_width, this->step_dx); /* second y line goes right behind the first */
+      py += this->y_stride;
+        
+      mlib_stat = mlib_VideoColorYUV2ABGR420(image,
+                                             this->y_buffer, 
+                                             this->u_buffer,
+                                             this->v_buffer,
+                                             this->dest_width & ~1, 2, /* width forced even, height of 2 lines */
+                                             this->rgb_stride,
+                                             this->dest_width,
+                                             this->dest_width >> 1);
+      dy += this->step_dy;
+      image += this->rgb_stride;
+      
+      while (--dst_height > 0 && dy < 32768) {
+        memcpy (image, (uint8_t*)image-this->rgb_stride, this->dest_width*8); /* NOTE(review): dest_width*8 here but dest_width*4 in the loop below - confirm intent */
+        dy += this->step_dy;
+        image += this->rgb_stride;
+      }
+      if (dst_height <= 0) /* all output lines produced */
+        break;
+      dy -= 32768;
+      dy += this->step_dy;
+      image += this->rgb_stride;
+      
+      while (--dst_height > 0 && dy < 32768) {
+        memcpy (image, (uint8_t*)image-this->rgb_stride, this->dest_width*4);
+        dy += this->step_dy;
+        image += this->rgb_stride;
+      }
+      if (dst_height <= 0) /* all output lines produced */
+        break;
+      dy -= 32768;
+    }
+  } else { /* unscaled: one call converts source_width x source_height */
+    mlib_stat = mlib_VideoColorYUV2ABGR420(image, py, pu, pv,
+                                           this->source_width,
+                                           this->source_height,
+                                           this->rgb_stride,
+                                           this->y_stride,
+                                           this->uv_stride);
+  }
+}
+/*
+ * Install an mlib based yuv2rgb routine in the factory when one exists
+ * for its mode.  this->yuv2rgb_fun is left untouched for swapped output
+ * and for every mode other than 24 bit RGB and 32 bit RGB/BGR.
+ */
+void yuv2rgb_init_mlib (yuv2rgb_factory_t *this) {
+  /* no swapped pixel output up to now */
+  if (!this->swapped) {
+    const int mode = this->mode;
+
+    if (mode == MODE_24_RGB)
+      this->yuv2rgb_fun = mlib_yuv420_rgb24;
+    else if (mode == MODE_32_RGB)
+      this->yuv2rgb_fun = mlib_yuv420_argb32;
+    else if (mode == MODE_32_BGR)
+      this->yuv2rgb_fun = mlib_yuv420_abgr32;
+  }
+}
+        #endif/* HAVE_MLIB */
diff --git a/noncore/multimedia/opieplayer2/yuv2rgb_mmx.c b/noncore/multimedia/opieplayer2/yuv2rgb_mmx.c
new file mode 100644
index 0000000..f092e6f
--- a/dev/null
+++ b/noncore/multimedia/opieplayer2/yuv2rgb_mmx.c
@@ -0,0 +1,1047 @@
+/*
+ * yuv2rgb_mmx.c
+ * Copyright (C) 2000-2001 Silicon Integrated System Corp.
+ * All Rights Reserved.
+ *
+ * Author: Olie Lho <ollie@sis.com.tw>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifdef ARCH_X86
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include "yuv2rgb.h"
+#include "xineutils.h"
+#define CPU_MMXEXT 0
+#define CPU_MMX 1
+/* CPU_MMXEXT/CPU_MMX adaptation layer: movntq() expands to movntq_r2m when a variable named 'cpu' in the using scope equals CPU_MMXEXT, and to movq_r2m otherwise */
+        #define movntq(src,dest)\
+        do {                    \
+            if (cpu == CPU_MMXEXT)\
+                movntq_r2m (src, dest);\
+            else                \
+                movq_r2m (src, dest);\
+} while (0)
+static mmx_t mmx_subYw = {0x1010101010101010};
+static mmx_t mmx_addYw = {0x0000000000000000};
+void mmx_yuv2rgb_set_gamma(int gamma)
+{
+  /*
+   * Program the per-byte luma offsets consumed by mmx_yuv2rgb():
+   *   gamma <= 16: subtract (16 - gamma), add nothing
+   *   gamma >  16: subtract nothing, add (gamma - 16)
+   */
+  int sub_y = 0;
+  int add_y = 0;
+  if (gamma > 16)
+    add_y = gamma - 16;
+  else
+    sub_y = 16 - gamma;
+  memset (&mmx_subYw, sub_y, 8);  /* same value in all eight bytes */
+  memset (&mmx_addYw, add_y, 8);
+}
+static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* Convert 8 Y bytes at py with 4 U bytes at pu and 4 V bytes at pv; results stay in registers: mm0 = B, mm1 = R, mm2 = G. */
+    static mmx_t mmx_80w = {0x0080008000800080};        // 128 in each word, subtracted from U and V
+    static mmx_t mmx_U_green = {0xf37df37df37df37d};    // per-word multiplier: U contribution to green
+    static mmx_t mmx_U_blue = {0x4093409340934093};     // per-word multiplier: U contribution to blue
+    static mmx_t mmx_V_red = {0x3312331233123312};      // per-word multiplier: V contribution to red
+    static mmx_t mmx_V_green = {0xe5fce5fce5fce5fc};    // per-word multiplier: V contribution to green
+    static mmx_t mmx_00ffw = {0x00ff00ff00ff00ff};      // keeps the low byte of each word
+    static mmx_t mmx_Y_coeff = {0x253f253f253f253f};    // per-word multiplier applied to luma
+            movq_m2r (*py, mm6);        // mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
+            pxor_r2r (mm4, mm4);        // mm4 = 0
+            psubusb_m2r (mmx_subYw, mm6);// Y -= mmx_subYw (16 per byte until mmx_yuv2rgb_set_gamma changes it)
+    paddusb_m2r (mmx_addYw, mm6);       // Y += mmx_addYw (0 per byte until mmx_yuv2rgb_set_gamma changes it)
+            movd_m2r (*pu, mm0);        // mm0 = 00 00 00 00 u3 u2 u1 u0
+            movq_r2r (mm6, mm7);        // mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
+            pand_m2r (mmx_00ffw, mm6);  // mm6 =    Y6    Y4    Y2    Y0
+            psrlw_i2r (8, mm7);                 // mm7 =    Y7    Y5    Y3    Y1
+            movd_m2r (*pv, mm1);        // mm1 = 00 00 00 00 v3 v2 v1 v0
+            psllw_i2r (3, mm6);                 // promote precision
+            pmulhw_m2r (mmx_Y_coeff, mm6);// mm6 = luma_rgb even
+            psllw_i2r (3, mm7);                 // promote precision
+            punpcklbw_r2r (mm4, mm0);   // mm0 = u3 u2 u1 u0
+            psubsw_m2r (mmx_80w, mm0);  // u -= 128
+            punpcklbw_r2r (mm4, mm1);   // mm1 = v3 v2 v1 v0
+            pmulhw_m2r (mmx_Y_coeff, mm7);// mm7 = luma_rgb odd
+            psllw_i2r (3, mm0);                 // promote precision
+            psubsw_m2r (mmx_80w, mm1);  // v -= 128
+            movq_r2r (mm0, mm2);        // mm2 = u3 u2 u1 u0
+            psllw_i2r (3, mm1);                 // promote precision
+            movq_r2r (mm1, mm4);        // mm4 = v3 v2 v1 v0
+            pmulhw_m2r (mmx_U_blue, mm0);// mm0 = chroma_b
+    // slot
+    // slot
+            pmulhw_m2r (mmx_V_red, mm1);// mm1 = chroma_r
+            movq_r2r (mm0, mm3);        // mm3 = chroma_b
+            paddsw_r2r (mm6, mm0);      // mm0 = B6 B4 B2 B0
+            paddsw_r2r (mm7, mm3);      // mm3 = B7 B5 B3 B1
+            packuswb_r2r (mm0, mm0);    // saturate to 0-255
+            pmulhw_m2r (mmx_U_green, mm2);// mm2 = u * u_green
+            packuswb_r2r (mm3, mm3);    // saturate to 0-255
+            punpcklbw_r2r (mm3, mm0);   // mm0 = B7 B6 B5 B4 B3 B2 B1 B0
+            pmulhw_m2r (mmx_V_green, mm4);// mm4 = v * v_green
+    
+    // slot
+    // slot
+            paddsw_r2r (mm4, mm2);      // mm2 = chroma_g
+            movq_r2r (mm2, mm5);        // mm5 = chroma_g
+            movq_r2r (mm1, mm4);        // mm4 = chroma_r
+            paddsw_r2r (mm6, mm2);      // mm2 = G6 G4 G2 G0
+            packuswb_r2r (mm2, mm2);    // saturate to 0-255
+            paddsw_r2r (mm6, mm1);      // mm1 = R6 R4 R2 R0
+            packuswb_r2r (mm1, mm1);    // saturate to 0-255
+            paddsw_r2r (mm7, mm4);      // mm4 = R7 R5 R3 R1
+            packuswb_r2r (mm4, mm4);    // saturate to 0-255
+            paddsw_r2r (mm7, mm5);      // mm5 = G7 G5 G3 G1
+            packuswb_r2r (mm5, mm5);    // saturate to 0-255
+            punpcklbw_r2r (mm4, mm1);   // mm1 = R7 R6 R5 R4 R3 R2 R1 R0
+            punpcklbw_r2r (mm5, mm2);   // mm2 = G7 G6 G5 G4 G3 G2 G1 G0
+}
+// Pack the planar B/G/R bytes left in mm0/mm2/mm1 into eight 16-bit pixels and store 16 bytes at image (basic opt).
+static inline void mmx_unpack_16rgb (uint8_t * image, int cpu)
+{
+    static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8};
+    static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfc};
+    static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8};
+    /*
+     * convert RGB plane to RGB 16 bits
+     * in:  mm0 -> B, mm1 -> R, mm2 -> G
+     * out: mm0 -> pixels 0-3, mm5 -> pixels 4-7 (mm4 and mm7 are scratch)
+     * each 16-bit pixel: 5 red bits, then 6 green bits, then 5 blue bits (high to low)
+     */
+            pand_m2r (mmx_bluemask, mm0);// mm0 = b7b6b5b4b3______
+            pxor_r2r (mm4, mm4);        // mm4 = 0
+            pand_m2r (mmx_greenmask, mm2);// mm2 = g7g6g5g4g3g2____
+            psrlq_i2r (3, mm0);                 // mm0 = ______b7b6b5b4b3
+            movq_r2r (mm2, mm7);        // mm7 = g7g6g5g4g3g2____
+            movq_r2r (mm0, mm5);        // mm5 = ______b7b6b5b4b3
+            pand_m2r (mmx_redmask, mm1);// mm1 = r7r6r5r4r3______
+    punpcklbw_r2r (mm4, mm2);           // mm2 = green of pixels 0-3 widened to words
+    punpcklbw_r2r (mm1, mm0);           // mm0 = words with red in the high byte, blue in the low byte (pixels 0-3)
+    psllq_i2r (3, mm2);                 // move green above the 5 blue bits
+    punpckhbw_r2r (mm4, mm7);           // mm7 = green of pixels 4-7 widened to words
+    por_r2r (mm2, mm0);                 // mm0 = finished pixels 0-3
+    psllq_i2r (3, mm7);                 // move green above the 5 blue bits
+    movntq (mm0, *image);               // store pixels 0-3
+    punpckhbw_r2r (mm1, mm5);           // mm5 = red/blue words of pixels 4-7
+    por_r2r (mm7, mm5);                 // mm5 = finished pixels 4-7
+    // U
+    // V
+    movntq (mm5, *(image+8));           // store pixels 4-7
+}
+static inline void mmx_unpack_15rgb (uint8_t * image, int cpu)
+{   /* Pack the planar B/G/R bytes left in mm0/mm2/mm1 into eight 15-bit pixels and store 16 bytes at image. */
+    static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8};
+    static mmx_t mmx_greenmask = {0xf8f8f8f8f8f8f8f8};
+    static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8};
+    /*
+     * convert RGB plane to RGB 15 bits
+     * in:  mm0 -> B, mm1 -> R, mm2 -> G
+     * out: mm0 -> pixels 0-3, mm5 -> pixels 4-7 (mm4 and mm7 are scratch)
+     * each 16-bit pixel: one zero bit, then 5 red, 5 green, 5 blue bits (high to low)
+     */
+            pand_m2r (mmx_bluemask, mm0);// mm0 = b7b6b5b4b3______
+            pxor_r2r (mm4, mm4);        // mm4 = 0
+            pand_m2r (mmx_greenmask, mm2);// mm2 = g7g6g5g4g3______
+            psrlq_i2r (3, mm0);                 // mm0 = ______b7b6b5b4b3
+            movq_r2r (mm2, mm7);        // mm7 = g7g6g5g4g3______
+            movq_r2r (mm0, mm5);        // mm5 = ______b7b6b5b4b3
+            pand_m2r (mmx_redmask, mm1);// mm1 = r7r6r5r4r3______
+    punpcklbw_r2r (mm4, mm2);           // mm2 = green of pixels 0-3 widened to words
+    psrlq_i2r (1, mm1);                 // mm1 = __r7r6r5r4r3____ (leaves the top bit of each pixel clear)
+    punpcklbw_r2r (mm1, mm0);           // mm0 = words with red in the high byte, blue in the low byte (pixels 0-3)
+    psllq_i2r (2, mm2);                 // move green above the 5 blue bits
+    punpckhbw_r2r (mm4, mm7);           // mm7 = green of pixels 4-7 widened to words
+    por_r2r (mm2, mm0);                 // mm0 = finished pixels 0-3
+    psllq_i2r (2, mm7);                 // move green above the 5 blue bits
+    movntq (mm0, *image);               // store pixels 0-3
+    punpckhbw_r2r (mm1, mm5);           // mm5 = red/blue words of pixels 4-7
+    por_r2r (mm7, mm5);                 // mm5 = finished pixels 4-7
+    // U
+    // V
+    movntq (mm5, *(image+8));           // store pixels 4-7
+}
+static inline void mmx_unpack_32rgb (uint8_t * image, int cpu)
+{   /* Store eight 4-byte pixels (memory order B, G, R, 0), 32 bytes in total, at image. */
+    /*
+     * convert RGB plane to RGB packed format,
+     * in: mm0 -> B, mm1 -> R, mm2 -> G; mm3 is cleared to 0 here,
+     * mm4 -> GB, mm5 -> 0R pixel 4-7,
+     * mm6 -> GB, mm7 -> 0R pixel 0-3
+     */
+    pxor_r2r (mm3, mm3);
+    movq_r2r (mm0, mm6);
+    punpcklbw_r2r (mm2, mm6);           /* mm6 = G/B words, pixels 0-3 */
+    movq_r2r (mm1, mm7);
+    punpcklbw_r2r (mm3, mm7);           /* mm7 = 0/R words, pixels 0-3 */
+    movq_r2r (mm0, mm4);
+    punpcklwd_r2r (mm7, mm6);           /* mm6 = pixels 0-1 */
+    movq_r2r (mm1, mm5);
+    /* scheduling: this is hopeless */
+    movntq (mm6, *image);
+    movq_r2r (mm0, mm6);                /* rebuild the G/B words consumed above */
+    punpcklbw_r2r (mm2, mm6);
+    punpckhwd_r2r (mm7, mm6);           /* mm6 = pixels 2-3 */
+    movntq (mm6, *(image+8));
+    punpckhbw_r2r (mm2, mm4);           /* mm4 = G/B words, pixels 4-7 */
+    punpckhbw_r2r (mm3, mm5);           /* mm5 = 0/R words, pixels 4-7 */
+    punpcklwd_r2r (mm5, mm4);           /* mm4 = pixels 4-5 */
+    movntq (mm4, *(image+16));
+    movq_r2r (mm0, mm4);                /* rebuild the G/B words consumed above */
+    punpckhbw_r2r (mm2, mm4);
+    punpckhwd_r2r (mm5, mm4);           /* mm4 = pixels 6-7 */
+    movntq (mm4, *(image+24));
+}
+static inline void mmx_unpack_32bgr (uint8_t * image, int cpu)
+{
+    /*
+     * Store eight 4-byte pixels (memory order R, G, B, 0), 32 bytes, at image.
+     * in: mm0 -> B, mm1 -> R, mm2 -> G; mm3 is cleared to 0 here,
+     * mm4 -> GR, mm5 -> 0B pixel 4-7,
+     * mm6 -> GR, mm7 -> 0B pixel 0-3
+     */
+    pxor_r2r (mm3, mm3);
+    movq_r2r (mm1, mm6);
+    punpcklbw_r2r (mm2, mm6);           /* mm6 = G/R words, pixels 0-3 */
+    movq_r2r (mm0, mm7);
+    punpcklbw_r2r (mm3, mm7);           /* mm7 = 0/B words, pixels 0-3 */
+    movq_r2r (mm1, mm4);
+    punpcklwd_r2r (mm7, mm6);           /* mm6 = pixels 0-1 */
+    movq_r2r (mm0, mm5);
+    /* scheduling: this is hopeless */
+    movntq (mm6, *image);
+    movq_r2r (mm1, mm6);                /* reload R (not B): pixels 2-3 need G/R words like pixels 0-1 */
+    punpcklbw_r2r (mm2, mm6);
+    punpckhwd_r2r (mm7, mm6);           /* mm6 = pixels 2-3 */
+    movntq (mm6, *(image+8));
+    punpckhbw_r2r (mm2, mm4);           /* mm4 = G/R words, pixels 4-7 */
+    punpckhbw_r2r (mm3, mm5);           /* mm5 = 0/B words, pixels 4-7 */
+    punpcklwd_r2r (mm5, mm4);           /* mm4 = pixels 4-5 */
+    movntq (mm4, *(image+16));
+    movq_r2r (mm1, mm4);                /* reload R (not B): pixels 6-7 need G/R words like pixels 4-5 */
+    punpckhbw_r2r (mm2, mm4);
+    punpckhwd_r2r (mm5, mm4);           /* mm4 = pixels 6-7 */
+    movntq (mm4, *(image+24));
+}
+static inline void mmx_unpack_24rgb (uint8_t * image, int cpu)
+{   /* Store 24 bytes at image from the planar B/G/R bytes left in mm0/mm2/mm1. */
+    /*
+     * in: mm0 -> B, mm1 -> R, mm2 -> G; mm3 is cleared to 0 here; mm4/mm6 -> GB, mm5/mm7 -> 0R words.
+     * NOTE(review): the three stores below are the same as the first three stores of
+     * mmx_unpack_32rgb, i.e. 4-byte B,G,R,0 pixels for pixels 0-5; pixels 6-7 are never stored,
+     * while the callers advance by 24 bytes per 8 source pixels.  Packed 3-byte output looks intended - confirm.
+     */
+    pxor_r2r (mm3, mm3);
+    movq_r2r (mm0, mm6);
+    punpcklbw_r2r (mm2, mm6);           /* mm6 = G/B words, pixels 0-3 */
+    movq_r2r (mm1, mm7);
+    punpcklbw_r2r (mm3, mm7);           /* mm7 = 0/R words, pixels 0-3 */
+    movq_r2r (mm0, mm4);
+    punpcklwd_r2r (mm7, mm6);           /* mm6 = pixels 0-1 */
+    movq_r2r (mm1, mm5);
+    /* scheduling: this is hopeless */
+    movntq (mm6, *image);
+    movq_r2r (mm0, mm6);
+    punpcklbw_r2r (mm2, mm6);
+    punpckhwd_r2r (mm7, mm6);           /* mm6 = pixels 2-3 */
+    movntq (mm6, *(image+8));
+    punpckhbw_r2r (mm2, mm4);           /* mm4 = G/B words, pixels 4-7 */
+    punpckhbw_r2r (mm3, mm5);           /* mm5 = 0/R words, pixels 4-7 */
+    punpcklwd_r2r (mm5, mm4);           /* mm4 = pixels 4-5 */
+    movntq (mm4, *(image+16));
+}
+static inline void yuv420_rgb16 (yuv2rgb_t *this,
+                                 uint8_t * image,
+                                 uint8_t * py, uint8_t * pu, uint8_t * pv,
+                                 int cpu)
+{   /* Convert planar YUV 4:2:0 (py/pu/pv) into 16-bit pixels at image, 8 pixels per step; 'cpu' selects the store used by movntq(). */
+    int i;
+    int rgb_stride = this->rgb_stride;
+    int y_stride   = this->y_stride;
+    int uv_stride  = this->uv_stride;
+    int width      = this->source_width;
+    int height     = this->source_height;
+    int dst_height = this->dest_height;
+    uint8_t *img;
+    width >>= 3;                        /* number of 8-pixel groups per source row */
+    if (!this->do_scale) {
+      y_stride -= 8 * width;            /* strides now hold the bytes left over after the converted part of a row */
+      uv_stride -= 4 * width;
+      do {
+        i = width; img = image;
+        do {
+          mmx_yuv2rgb (py, pu, pv); 
+          mmx_unpack_16rgb (img, cpu); 
+          py += 8;
+          pu += 4;
+          pv += 4;
+          img += 16;                    /* 8 pixels * 2 bytes */
+        } while (--i);                  /* NOTE(review): body runs at least once, so a source width below 8 is not handled - confirm callers */
+        
+        py += y_stride;
+        image += rgb_stride;
+        if (height & 1) {               /* remaining row count is odd: move on to the next chroma row */
+          pu += uv_stride;
+          pv += uv_stride;
+        } else {                        /* otherwise rewind so the next luma row reuses this chroma row */
+          pu -= 4 * width;
+          pv -= 4 * width;
+        }
+      } while (--height);
+    } else {
+      scale_line_func_t scale_line = this->scale_line;
+      uint8_t *y_buf, *u_buf, *v_buf;
+      int      dy = 0;                  /* vertical accumulator: each 32768 in it means one step to the next source row */
+      scale_line (pu, this->u_buffer,   /* fill the u/v/y line buffers from the first source rows */
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (pv, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (py, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+      for (height = 0;; ) {             /* height is reused as the count of source rows consumed */
+        y_buf = this->y_buffer;
+        u_buf = this->u_buffer;
+        v_buf = this->v_buffer;
+        i = this->dest_width >> 3; img = image;
+        do {
+          /* printf ("i : %d\n",i); */
+          mmx_yuv2rgb (y_buf, u_buf, v_buf); 
+          mmx_unpack_16rgb (img, cpu); 
+          y_buf += 8;
+          u_buf += 4;
+          v_buf += 4;
+          img += 16;
+        } while (--i);
+        
+        dy += this->step_dy;
+        image += rgb_stride;
+        while (--dst_height > 0 && dy < 32768) { /* same source row still applies: duplicate the line just written */
+          xine_fast_memcpy (image, image-rgb_stride, this->dest_width*2); 
+          dy += this->step_dy;
+          image += rgb_stride;
+        }
+        if (dst_height <= 0)
+          break;
+        do {                            /* step one source row for every 32768 accumulated in dy */
+            dy -= 32768;
+            py += y_stride;
+            scale_line (py, this->y_buffer, 
+                        this->dest_width, this->step_dx);
+            if (height & 1) {           /* chroma rows advance only after an odd-numbered source row */
+                pu += uv_stride;
+                pv += uv_stride;
+          
+                scale_line (pu, this->u_buffer,
+                            this->dest_width >> 1, this->step_dx);
+                scale_line (pv, this->v_buffer,
+                            this->dest_width >> 1, this->step_dx);
+          
+            }
+            height++;
+        } while( dy>=32768);
+      }
+    } 
+}
+static inline void yuv420_rgb15 (yuv2rgb_t *this,
+                                 uint8_t * image,
+                                 uint8_t * py, uint8_t * pu, uint8_t * pv,
+                                 int cpu)
+{   /* Convert planar YUV 4:2:0 (py/pu/pv) into 15-bit pixels at image, 8 pixels per step; 'cpu' selects the store used by movntq(). */
+    int i;
+    int rgb_stride = this->rgb_stride;
+    int y_stride   = this->y_stride;
+    int uv_stride  = this->uv_stride;
+    int width      = this->source_width;
+    int height     = this->source_height;
+    int dst_height = this->dest_height;
+    uint8_t *img;
+    width >>= 3;                        /* number of 8-pixel groups per source row */
+    if (!this->do_scale) {
+      y_stride -= 8 * width;            /* strides now hold the bytes left over after the converted part of a row */
+      uv_stride -= 4 * width;
+      do {
+        i = width; img = image;
+        do {
+          mmx_yuv2rgb (py, pu, pv); 
+          mmx_unpack_15rgb (img, cpu); 
+          py += 8;
+          pu += 4;
+          pv += 4;
+          img += 16;                    /* 8 pixels * 2 bytes */
+        } while (--i);                  /* NOTE(review): body runs at least once, so a source width below 8 is not handled - confirm callers */
+        
+        py += y_stride;
+        image += rgb_stride;
+        if (height & 1) {               /* remaining row count is odd: move on to the next chroma row */
+          pu += uv_stride;
+          pv += uv_stride;
+        } else {                        /* otherwise rewind so the next luma row reuses this chroma row */
+          pu -= 4 * width;
+          pv -= 4 * width;
+        }
+      } while (--height);
+    } else {
+      scale_line_func_t scale_line = this->scale_line;
+      uint8_t *y_buf, *u_buf, *v_buf;
+      int      dy = 0;                  /* vertical accumulator: each 32768 in it means one step to the next source row */
+      scale_line (pu, this->u_buffer,   /* fill the u/v/y line buffers from the first source rows */
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (pv, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (py, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+      for (height = 0;; ) {             /* height is reused as the count of source rows consumed */
+        y_buf = this->y_buffer;
+        u_buf = this->u_buffer;
+        v_buf = this->v_buffer;
+        i = this->dest_width >> 3; img = image;
+        do {
+          /* printf ("i : %d\n",i); */
+          mmx_yuv2rgb (y_buf, u_buf, v_buf); 
+          mmx_unpack_15rgb (img, cpu); 
+          y_buf += 8;
+          u_buf += 4;
+          v_buf += 4;
+          img += 16;
+        } while (--i);
+        
+        dy += this->step_dy;
+        image += rgb_stride;
+        while (--dst_height > 0 && dy < 32768) { /* same source row still applies: duplicate the line just written */
+          xine_fast_memcpy (image, image-rgb_stride, this->dest_width*2); 
+          dy += this->step_dy;
+          image += rgb_stride;
+        }
+        if (dst_height <= 0)
+          break;
+        do {                            /* step one source row for every 32768 accumulated in dy */
+            dy -= 32768;
+            py += y_stride;
+            scale_line (py, this->y_buffer, 
+                        this->dest_width, this->step_dx);
+            if (height & 1) {           /* chroma rows advance only after an odd-numbered source row */
+                pu += uv_stride;
+                pv += uv_stride;
+          
+                scale_line (pu, this->u_buffer,
+                            this->dest_width >> 1, this->step_dx);
+                scale_line (pv, this->v_buffer,
+                            this->dest_width >> 1, this->step_dx);
+          
+            }
+            height++;
+        } while( dy>=32768 );
+      }
+    } 
+}
+static inline void yuv420_rgb24 (yuv2rgb_t *this,
+                                 uint8_t * image, uint8_t * py,
+                                 uint8_t * pu, uint8_t * pv, int cpu)
+{   /* Convert planar YUV 4:2:0 (py/pu/pv) for the 24-bit mode, advancing 24 output bytes per 8 source pixels; 'cpu' selects the store used by movntq(). */
+    int i;
+    int rgb_stride = this->rgb_stride;
+    int y_stride   = this->y_stride;
+    int uv_stride  = this->uv_stride;
+    int width      = this->source_width;
+    int height     = this->source_height;
+    int dst_height = this->dest_height;
+    uint8_t *img;
+    /* rgb_stride -= 4 * this->dest_width; */
+    width >>= 3;                        /* number of 8-pixel groups per source row */
+    if (!this->do_scale) {
+      y_stride -= 8 * width;            /* strides now hold the bytes left over after the converted part of a row */
+      uv_stride -= 4 * width;
+      do {
+        i = width; img = image;
+        do {
+          mmx_yuv2rgb (py, pu, pv);
+          mmx_unpack_24rgb (img, cpu);
+          py += 8;
+          pu += 4;
+          pv += 4;
+          img += 24;                    /* 8 pixels * 3 bytes */
+        } while (--i);                  /* NOTE(review): body runs at least once, so a source width below 8 is not handled - confirm callers */
+        py += y_stride;
+        image += rgb_stride;
+        if (height & 1) {               /* remaining row count is odd: move on to the next chroma row */
+          pu += uv_stride;
+          pv += uv_stride;
+        } else {                        /* otherwise rewind so the next luma row reuses this chroma row */
+          pu -= 4 * width;
+          pv -= 4 * width;
+        }
+      } while (--height);
+    } else {
+      scale_line_func_t scale_line = this->scale_line;
+      uint8_t *y_buf, *u_buf, *v_buf;
+      int      dy = 0;                  /* vertical accumulator: each 32768 in it means one step to the next source row */
+      scale_line (pu, this->u_buffer,   /* fill the u/v/y line buffers from the first source rows */
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (pv, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (py, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+      for (height = 0;; ) {             /* height is reused as the count of source rows consumed */
+        y_buf = this->y_buffer;
+        u_buf = this->u_buffer;
+        v_buf = this->v_buffer;
+        i = this->dest_width >> 3; img=image;
+        do {
+          /* printf ("i : %d\n",i); */
+          mmx_yuv2rgb (y_buf, u_buf, v_buf); 
+          mmx_unpack_24rgb (img, cpu); 
+          y_buf += 8;
+          u_buf += 4;
+          v_buf += 4;
+          img += 24;
+        } while (--i);
+        
+        dy += this->step_dy;
+        image += rgb_stride;
+        while (--dst_height > 0 && dy < 32768) { /* same source row still applies: duplicate the line just written */
+          xine_fast_memcpy (image, image-rgb_stride, this->dest_width*3);
+          dy += this->step_dy;
+          image += rgb_stride;
+        }
+        if (dst_height <= 0)
+          break;
+        do {                            /* step one source row for every 32768 accumulated in dy */
+            dy -= 32768;
+            py += y_stride;
+        
+            scale_line (py, this->y_buffer, 
+                        this->dest_width, this->step_dx);
+            if (height & 1) {           /* chroma rows advance only after an odd-numbered source row */
+                pu += uv_stride;
+                pv += uv_stride;
+          
+                scale_line (pu, this->u_buffer,
+                            this->dest_width >> 1, this->step_dx);
+                scale_line (pv, this->v_buffer,
+                            this->dest_width >> 1, this->step_dx);
+            }
+            height++;
+        } while( dy>=32768 );
+      }
+      
+    }
+}
+static inline void yuv420_argb32 (yuv2rgb_t *this,
+                                  uint8_t * image, uint8_t * py,
+                                  uint8_t * pu, uint8_t * pv, int cpu)
+{   /* Convert planar YUV 4:2:0 (py/pu/pv) into 4-byte pixels via mmx_unpack_32rgb, 8 pixels per step; 'cpu' selects the store used by movntq(). */
+    int i;
+    int rgb_stride = this->rgb_stride;
+    int y_stride   = this->y_stride;
+    int uv_stride  = this->uv_stride;
+    int width      = this->source_width;
+    int height     = this->source_height;
+    int dst_height = this->dest_height;
+    uint8_t *img;
+    /* rgb_stride -= 4 * this->dest_width; */
+    width >>= 3;                        /* number of 8-pixel groups per source row */
+    if (!this->do_scale) {
+      y_stride -= 8 * width;            /* strides now hold the bytes left over after the converted part of a row */
+      uv_stride -= 4 * width;
+      do {
+        i = width; img = image;
+        do {
+          mmx_yuv2rgb (py, pu, pv);
+          mmx_unpack_32rgb (img, cpu);
+          py += 8;
+          pu += 4;
+          pv += 4;
+          img += 32;                    /* 8 pixels * 4 bytes */
+        } while (--i);                  /* NOTE(review): body runs at least once, so a source width below 8 is not handled - confirm callers */
+        py += y_stride;
+        image += rgb_stride;
+        if (height & 1) {               /* remaining row count is odd: move on to the next chroma row */
+          pu += uv_stride;
+          pv += uv_stride;
+        } else {                        /* otherwise rewind so the next luma row reuses this chroma row */
+          pu -= 4 * width;
+          pv -= 4 * width;
+        }
+      } while (--height);
+    } else {
+      scale_line_func_t scale_line = this->scale_line;
+      uint8_t *y_buf, *u_buf, *v_buf;
+      int      dy = 0;                  /* vertical accumulator: each 32768 in it means one step to the next source row */
+      scale_line (pu, this->u_buffer,   /* fill the u/v/y line buffers from the first source rows */
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (pv, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (py, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+      for (height = 0;; ) {             /* height is reused as the count of source rows consumed */
+        y_buf = this->y_buffer;
+        u_buf = this->u_buffer;
+        v_buf = this->v_buffer;
+        i = this->dest_width >> 3; img=image;
+        do {
+          /* printf ("i : %d\n",i); */
+          mmx_yuv2rgb (y_buf, u_buf, v_buf); 
+          mmx_unpack_32rgb (img, cpu); 
+          y_buf += 8;
+          u_buf += 4;
+          v_buf += 4;
+          img += 32;
+        } while (--i);
+        
+        dy += this->step_dy;
+        image += rgb_stride;
+        while (--dst_height > 0 && dy < 32768) { /* same source row still applies: duplicate the line just written */
+          xine_fast_memcpy (image, image-rgb_stride, this->dest_width*4); 
+          dy += this->step_dy;
+          image += rgb_stride;
+        }
+        if (dst_height <= 0)
+          break;
+        do {                            /* step one source row for every 32768 accumulated in dy */
+            dy -= 32768;
+            py += y_stride;
+        
+            scale_line (py, this->y_buffer, 
+                        this->dest_width, this->step_dx);
+            if (height & 1) {           /* chroma rows advance only after an odd-numbered source row */
+                pu += uv_stride;
+                pv += uv_stride;
+          
+                scale_line (pu, this->u_buffer,
+                            this->dest_width >> 1, this->step_dx);
+                scale_line (pv, this->v_buffer,
+                            this->dest_width >> 1, this->step_dx);
+            }
+            height++;
+        } while( dy>=32768 );
+      }
+      
+    }
+}
+static inline void yuv420_abgr32 (yuv2rgb_t *this,
+                                  uint8_t * image, uint8_t * py,
+                                  uint8_t * pu, uint8_t * pv, int cpu)
+{   /* Convert planar YUV 4:2:0 (py/pu/pv) into 4-byte pixels via mmx_unpack_32bgr, 8 pixels per step; 'cpu' selects the store used by movntq(). */
+    int i;
+    int rgb_stride = this->rgb_stride;
+    int y_stride   = this->y_stride;
+    int uv_stride  = this->uv_stride;
+    int width      = this->source_width;
+    int height     = this->source_height;
+    int dst_height = this->dest_height;
+    uint8_t *img;
+    /* rgb_stride -= 4 * this->dest_width; */
+    width >>= 3;                        /* number of 8-pixel groups per source row */
+    if (!this->do_scale) {
+      y_stride -= 8 * width;            /* strides now hold the bytes left over after the converted part of a row */
+      uv_stride -= 4 * width;
+      do {
+        i = width; img = image;
+        do {
+          mmx_yuv2rgb (py, pu, pv);
+          mmx_unpack_32bgr (img, cpu);
+          py += 8;
+          pu += 4;
+          pv += 4;
+          img += 32;                    /* 8 pixels * 4 bytes */
+        } while (--i);                  /* NOTE(review): body runs at least once, so a source width below 8 is not handled - confirm callers */
+        py += y_stride;
+        image += rgb_stride;
+        if (height & 1) {               /* remaining row count is odd: move on to the next chroma row */
+          pu += uv_stride;
+          pv += uv_stride;
+        } else {                        /* otherwise rewind so the next luma row reuses this chroma row */
+          pu -= 4 * width;
+          pv -= 4 * width;
+        }
+      } while (--height);
+    } else {
+      scale_line_func_t scale_line = this->scale_line;
+      uint8_t *y_buf, *u_buf, *v_buf;
+      int      dy = 0;                  /* vertical accumulator: each 32768 in it means one step to the next source row */
+      scale_line (pu, this->u_buffer,   /* fill the u/v/y line buffers from the first source rows */
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (pv, this->v_buffer,
+                  this->dest_width >> 1, this->step_dx);
+      scale_line (py, this->y_buffer, 
+                  this->dest_width, this->step_dx);
+      for (height = 0;; ) {             /* height is reused as the count of source rows consumed */
+        y_buf = this->y_buffer;
+        u_buf = this->u_buffer;
+        v_buf = this->v_buffer;
+        i = this->dest_width >> 3; img=image;
+        do {
+          /* printf ("i : %d\n",i); */
+          mmx_yuv2rgb (y_buf, u_buf, v_buf); 
+          mmx_unpack_32bgr (img, cpu); 
+          y_buf += 8;
+          u_buf += 4;
+          v_buf += 4;
+          img += 32;
+        } while (--i);
+        
+        dy += this->step_dy;
+        image += rgb_stride;
+        while (--dst_height > 0 && dy < 32768) { /* same source row still applies: duplicate the line just written */
+          xine_fast_memcpy (image, image-rgb_stride, this->dest_width*4); 
+          dy += this->step_dy;
+          image += rgb_stride;
+        }
+        if (dst_height <= 0)
+          break;
+        do {                            /* step one source row for every 32768 accumulated in dy */
+            dy -= 32768;
+            py += y_stride;
+        
+            scale_line (py, this->y_buffer, 
+                        this->dest_width, this->step_dx);
+            if (height & 1) {           /* chroma rows advance only after an odd-numbered source row */
+                pu += uv_stride;
+                pv += uv_stride;
+          
+                scale_line (pu, this->u_buffer,
+                            this->dest_width >> 1, this->step_dx);
+                scale_line (pv, this->v_buffer,
+                            this->dest_width >> 1, this->step_dx);
+            }
+            height++;
+        } while( dy>=32768 );
+      }
+      
+    }
+}
+static void mmxext_rgb15 (yuv2rgb_t *this, uint8_t * image,
+                          uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* MMXEXT entry point (movntq stores) that yuv2rgb_init_mmxext installs for MODE_15_RGB */
+    yuv420_rgb15 (this, image, py, pu, pv, CPU_MMXEXT);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+static void mmxext_rgb16 (yuv2rgb_t *this, uint8_t * image,
+                          uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* MMXEXT entry point (movntq stores) that yuv2rgb_init_mmxext installs for MODE_16_RGB */
+    yuv420_rgb16 (this, image, py, pu, pv, CPU_MMXEXT);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+static void mmxext_rgb24 (yuv2rgb_t *this, uint8_t * image,
+                           uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* MMXEXT entry point (movntq stores) that yuv2rgb_init_mmxext installs for MODE_24_RGB */
+    yuv420_rgb24 (this, image, py, pu, pv, CPU_MMXEXT);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+static void mmxext_argb32 (yuv2rgb_t *this, uint8_t * image,
+                           uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* MMXEXT entry point (movntq stores) that yuv2rgb_init_mmxext installs for MODE_32_RGB */
+    yuv420_argb32 (this, image, py, pu, pv, CPU_MMXEXT);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+static void mmxext_abgr32 (yuv2rgb_t *this, uint8_t * image,
+                           uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* MMXEXT entry point (movntq stores) that yuv2rgb_init_mmxext installs for MODE_32_BGR */
+    yuv420_abgr32 (this, image, py, pu, pv, CPU_MMXEXT);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+static void mmx_rgb15 (yuv2rgb_t *this, uint8_t * image,
+                       uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* plain-MMX entry point (movq stores) that yuv2rgb_init_mmx installs for MODE_15_RGB */
+    yuv420_rgb15 (this, image, py, pu, pv, CPU_MMX);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+static void mmx_rgb16 (yuv2rgb_t *this, uint8_t * image,
+                       uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* plain-MMX entry point (movq stores) that yuv2rgb_init_mmx installs for MODE_16_RGB */
+    yuv420_rgb16 (this, image, py, pu, pv, CPU_MMX);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+static void mmx_rgb24 (yuv2rgb_t *this, uint8_t * image,
+                       uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* plain-MMX entry point (movq stores) that yuv2rgb_init_mmx installs for MODE_24_RGB */
+    yuv420_rgb24 (this, image, py, pu, pv, CPU_MMX);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+static void mmx_argb32 (yuv2rgb_t *this, uint8_t * image,
+                        uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* plain-MMX entry point (movq stores) that yuv2rgb_init_mmx installs for MODE_32_RGB */
+    yuv420_argb32 (this, image, py, pu, pv, CPU_MMX);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+static void mmx_abgr32 (yuv2rgb_t *this, uint8_t * image,
+                        uint8_t * py, uint8_t * pu, uint8_t * pv)
+{   /* plain-MMX entry point (movq stores) that yuv2rgb_init_mmx installs for MODE_32_BGR */
+    yuv420_abgr32 (this, image, py, pu, pv, CPU_MMX);
+            emms();/* re-initialize x86 FPU after MMX use */
+}
+void yuv2rgb_init_mmxext (yuv2rgb_factory_t *this) {
+  /*
+   * Select the MMXEXT converter that matches this->mode.  Swapped pixel
+   * output is not implemented here, and a mode without an MMXEXT routine
+   * is skipped as well; in both cases yuv2rgb_fun is left untouched.
+   */
+  if (this->swapped)
+    return;
+
+  if (this->mode == MODE_15_RGB) {
+    this->yuv2rgb_fun = mmxext_rgb15;
+  } else if (this->mode == MODE_16_RGB) {
+    this->yuv2rgb_fun = mmxext_rgb16;
+  } else if (this->mode == MODE_24_RGB) {
+    this->yuv2rgb_fun = mmxext_rgb24;
+  } else if (this->mode == MODE_32_RGB) {
+    this->yuv2rgb_fun = mmxext_argb32;
+  } else if (this->mode == MODE_32_BGR) {
+    this->yuv2rgb_fun = mmxext_abgr32;
+  }
+}
+void yuv2rgb_init_mmx (yuv2rgb_factory_t *this) {
+  /*
+   * Select the plain-MMX converter that matches this->mode.  Swapped pixel
+   * output is not implemented here, and a mode without an MMX routine is
+   * skipped as well; in both cases yuv2rgb_fun is left untouched.
+   */
+  if (this->swapped)
+    return;
+
+  if (this->mode == MODE_15_RGB) {
+    this->yuv2rgb_fun = mmx_rgb15;
+  } else if (this->mode == MODE_16_RGB) {
+    this->yuv2rgb_fun = mmx_rgb16;
+  } else if (this->mode == MODE_24_RGB) {
+    this->yuv2rgb_fun = mmx_rgb24;
+  } else if (this->mode == MODE_32_RGB) {
+    this->yuv2rgb_fun = mmx_argb32;
+  } else if (this->mode == MODE_32_BGR) {
+    this->yuv2rgb_fun = mmx_abgr32;
+  }
+}
+#endif