aboutsummaryrefslogtreecommitdiffstats
path: root/distrib/sdl-1.2.15/src/video/ps3/spulibs
diff options
context:
space:
mode:
authorSteve Kondik <shade@chemlab.org>2012-11-18 15:47:18 -0800
committerSteve Kondik <shade@chemlab.org>2012-11-18 15:47:18 -0800
commita546c7006355a7bd1df4267ee53d0bfa2c017c8c (patch)
tree01be0bf6c0d6968e1468ec9661fd52110f9b05a7 /distrib/sdl-1.2.15/src/video/ps3/spulibs
parentbaf3d7830396202df5cc47bd7bcee109c319cdb3 (diff)
parent0f809250987b64f491bd3b4b73c0f0d33036a786 (diff)
downloadexternal_qemu-a546c7006355a7bd1df4267ee53d0bfa2c017c8c.zip
external_qemu-a546c7006355a7bd1df4267ee53d0bfa2c017c8c.tar.gz
external_qemu-a546c7006355a7bd1df4267ee53d0bfa2c017c8c.tar.bz2
Merge branch 'jb-mr1-release' of https://android.googlesource.com/platform/external/qemu into mr1-staging
Change-Id: I8a4a71ac65b08e6e17f26c942f67a15b85211115
Diffstat (limited to 'distrib/sdl-1.2.15/src/video/ps3/spulibs')
-rw-r--r--distrib/sdl-1.2.15/src/video/ps3/spulibs/Makefile83
-rw-r--r--distrib/sdl-1.2.15/src/video/ps3/spulibs/bilin_scaler.c2050
-rw-r--r--distrib/sdl-1.2.15/src/video/ps3/spulibs/fb_writer.c193
-rw-r--r--distrib/sdl-1.2.15/src/video/ps3/spulibs/spu_common.h108
-rw-r--r--distrib/sdl-1.2.15/src/video/ps3/spulibs/yuv2rgb_converter.c629
5 files changed, 3063 insertions, 0 deletions
diff --git a/distrib/sdl-1.2.15/src/video/ps3/spulibs/Makefile b/distrib/sdl-1.2.15/src/video/ps3/spulibs/Makefile
new file mode 100644
index 0000000..dc580d9
--- /dev/null
+++ b/distrib/sdl-1.2.15/src/video/ps3/spulibs/Makefile
@@ -0,0 +1,83 @@
+# This Makefile is for building the CELL BE SPU libs
+# libfb_writer_spu.so, libyuv2rgb_spu.so, libbilin_scaler_spu.so
+
+# Toolchain: absolute paths to the SPU cross compiler and the PPU host tools
+SPU_GCC=/usr/bin/spu-gcc
+PPU_GCC=/usr/bin/gcc
+PPU_EMBEDSPU=/usr/bin/embedspu
+PPU_AR=/usr/bin/ar
+PPU_LD=/usr/bin/ld
+INSTALL=/usr/bin/install
+
+# SPU compiler flags: warnings, include paths, then optimisation options
+SPU_CFLAGS=-W -Wall -Winline -Wno-main -I. -I /usr/spu/include -I /opt/cell/sdk/usr/spu/include -finline-limit=10000 -ftree-vectorize -funroll-loops -fmodulo-sched -ffast-math -fPIC -O2
+# Installation directory for the libraries; usually /usr/lib, depending on your distribution
+PREFIX=/usr/lib
+
+.PHONY: all install uninstall clean
+all: libfb_writer_spu.a libfb_writer_spu.so \
+	libyuv2rgb_spu.so libyuv2rgb_spu.a \
+	libbilin_scaler_spu.so libbilin_scaler_spu.a
+
+
+# fb_writer: build the SPU executable, then embed it into an object file
+fb_writer_spu-embed.o: fb_writer.c spu_common.h
+	$(SPU_GCC) $(SPU_CFLAGS) -o fb_writer_spu $< -lm
+	$(PPU_EMBEDSPU) -m32 fb_writer_spu fb_writer_spu $@
+# shared library wrapping the embedded fb_writer SPU program
+libfb_writer_spu.so: fb_writer_spu-embed.o
+	$(PPU_LD) -shared -soname=$@ -o $@ $<
+# static archive wrapping the embedded fb_writer SPU program
+libfb_writer_spu.a: fb_writer_spu-embed.o
+	$(PPU_AR) -qcs $@ $<
+
+
+# yuv2rgb_converter: build the SPU executable, then embed it into an object file
+yuv2rgb_spu-embed.o: yuv2rgb_converter.c spu_common.h
+	$(SPU_GCC) $(SPU_CFLAGS) -o yuv2rgb_spu $< -lm
+	$(PPU_EMBEDSPU) -m32 yuv2rgb_spu yuv2rgb_spu $@
+# static archive wrapping the embedded yuv2rgb SPU program
+libyuv2rgb_spu.a: yuv2rgb_spu-embed.o
+	$(PPU_AR) -qcs $@ $<
+# shared library wrapping the embedded yuv2rgb SPU program
+libyuv2rgb_spu.so: yuv2rgb_spu-embed.o
+	$(PPU_LD) -shared -soname=$@ -o $@ $<
+
+
+# bilin_scaler: build the SPU executable, then embed it into an object file
+bilin_scaler_spu-embed.o: bilin_scaler.c spu_common.h
+	$(SPU_GCC) $(SPU_CFLAGS) -o bilin_scaler_spu $< -lm
+	$(PPU_EMBEDSPU) -m32 bilin_scaler_spu bilin_scaler_spu $@
+# static archive wrapping the embedded bilin_scaler SPU program
+libbilin_scaler_spu.a: bilin_scaler_spu-embed.o
+	$(PPU_AR) -qcs $@ $<
+# shared library wrapping the embedded bilin_scaler SPU program
+libbilin_scaler_spu.so: bilin_scaler_spu-embed.o
+	$(PPU_LD) -shared -soname=$@ -o $@ $<
+# Copy all libraries to $(PREFIX): shared objects as 0755, static archives as 0644
+install: libfb_writer_spu.a libfb_writer_spu.so \
+	libyuv2rgb_spu.so libyuv2rgb_spu.a \
+	libbilin_scaler_spu.so libbilin_scaler_spu.a
+	$(INSTALL) -c -m 0755 libfb_writer_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0644 libfb_writer_spu.a $(PREFIX)/.
+	$(INSTALL) -c -m 0755 libyuv2rgb_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0644 libyuv2rgb_spu.a $(PREFIX)/.
+	$(INSTALL) -c -m 0755 libbilin_scaler_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0644 libbilin_scaler_spu.a $(PREFIX)/.
+
+
+# Remove the installed libraries. Deliberately no prerequisites: rm -f
+# tolerates missing files, so a partial installation can still be cleaned up.
+uninstall:
+	rm -f $(PREFIX)/libfb_writer_spu.a
+	rm -f $(PREFIX)/libfb_writer_spu.so
+	rm -f $(PREFIX)/libyuv2rgb_spu.so
+	rm -f $(PREFIX)/libyuv2rgb_spu.a
+	rm -f $(PREFIX)/libbilin_scaler_spu.so
+	rm -f $(PREFIX)/libbilin_scaler_spu.a
+
+# Delete everything the build rules above create in this directory
+clean:
+	$(RM) fb_writer_spu fb_writer_spu-embed.o libfb_writer_spu.a libfb_writer_spu.so
+	$(RM) yuv2rgb_spu yuv2rgb_spu-embed.o libyuv2rgb_spu.a libyuv2rgb_spu.so
+	$(RM) bilin_scaler_spu bilin_scaler_spu-embed.o libbilin_scaler_spu.a libbilin_scaler_spu.so
diff --git a/distrib/sdl-1.2.15/src/video/ps3/spulibs/bilin_scaler.c b/distrib/sdl-1.2.15/src/video/ps3/spulibs/bilin_scaler.c
new file mode 100644
index 0000000..be9b5c6
--- /dev/null
+++ b/distrib/sdl-1.2.15/src/video/ps3/spulibs/bilin_scaler.c
@@ -0,0 +1,2050 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ * Martin Lowinski <lowinski [at] de [dot] ibm [dot] com>
+ * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ * SPE code based on research by:
+ * Rene Becker
+ * Thimo Emmerich
+ */
+
+#include "spu_common.h"
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+// Debugging
+//#define DEBUG
+
+#ifdef DEBUG
+#define deprintf(fmt, args... ) \
+ fprintf( stdout, fmt, ##args ); \
+ fflush( stdout );
+#else
+#define deprintf( fmt, args... )
+#endif
+
+struct scale_parms_t parms __attribute__((aligned(128))); /* filled by DMA in main() */
+
+/* Input planes, two buffers each: a maximum of 8 lines Y, therefore 4 lines V
+ * and 4 lines U are stored. Misaligned data may have to be retrieved, so
+ * every line is padded by 128 bytes to be able to handle this.
+ */
+unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
+unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
+unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
+
+/* output buffers for the scaled data (two each): 4 lines Y, therefore 2 lines V, 2 lines U */
+unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
+unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
+unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
+
+/* clamping bounds used by vfloat_to_vuint() for the float to int conversion */
+static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
+static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
+
+void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
+void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
+
+void scale_srcw16_dstw16();
+void scale_srcw16_dstw32();
+void scale_srcw32_dstw16();
+void scale_srcw32_dstw32();
+
+int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
+{
+ /* Entry point: fetch the scaling parameters from main memory, then run the
+  * scaler variant matching the source and destination widths. Returns 0. */
+ unsigned int src_is_w16, dst_is_w16;
+ deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
+ // the parameter block lives at address argp; pull it in via DMA and wait
+ spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
+ DMA_WAIT_TAG(TAG_INIT);
+ deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
+ parms.dst_pixel_width, parms.dst_pixel_height);
+ // a width that is not a multiple of 32 takes the w16 path
+ src_is_w16 = (parms.src_pixel_width & 0x1F) != 0;
+ dst_is_w16 = (parms.dst_pixel_width & 0x1F) != 0;
+ if(src_is_w16 && dst_is_w16) {
+ deprintf("[SPU] Using scale_srcw16_dstw16\n");
+ scale_srcw16_dstw16();
+ } else if(src_is_w16) {
+ deprintf("[SPU] Using scale_srcw16_dstw32\n");
+ scale_srcw16_dstw32();
+ } else if(dst_is_w16) {
+ deprintf("[SPU] Using scale_srcw32_dstw16\n");
+ scale_srcw32_dstw16();
+ } else {
+ deprintf("[SPU] Using scale_srcw32_dstw32\n");
+ scale_srcw32_dstw32();
+ }
+ deprintf("[SPU] bilin_scaler_spu... done!\n");
+
+ return 0;
+}
+
+
+/*
+ * vfloat_to_vuint()
+ *
+ * clamps every element of a float vector to the range [0.1, 255.0] and
+ * converts the result to an unsigned int vector
+ *
+ * @param vec_s float vector for conversion
+ * @returns converted unsigned int vector
+ */
+inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
+ // clamp from below first (elements < 0.1), then from above (elements > 255.0)
+ vector unsigned int below_min = spu_cmpgt(vec_0_1, vec_s);
+ vector float clamped = spu_sel(vec_s, vec_0_1, below_min);
+ vector unsigned int above_max = spu_cmpgt(clamped, vec_255);
+ clamped = spu_sel(clamped, vec_255, above_max);
+ return spu_convtu(clamped,0);
+}
+
+
+/*
+ * scale_srcw16_dstw16()
+ *
+ * processes an input image whose width is not a multiple of 32
+ * scaling is done to a width that is not a multiple of 32 either
+ * result (planar Y, V, U) stored in RAM at parms.dstBuffer
+ */
+void scale_srcw16_dstw16() {
+ // extract parameters
+ unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+
+ unsigned int src_width = parms.src_pixel_width;
+ unsigned int src_height = parms.src_pixel_height;
+ unsigned int dst_width = parms.dst_pixel_width;
+ unsigned int dst_height = parms.dst_pixel_height;
+
+ // YVU
+ unsigned int src_linestride_y = src_width;
+ unsigned int src_dbl_linestride_y = src_width<<1;
+ unsigned int src_linestride_vu = src_width>>1;
+ unsigned int src_dbl_linestride_vu = src_width;
+
+ // scaled YVU
+ unsigned int scaled_src_linestride_y = dst_width;
+
+ // ram addresses
+ unsigned char* src_addr_y = parms.y_plane;
+ unsigned char* src_addr_v = parms.v_plane;
+ unsigned char* src_addr_u = parms.u_plane;
+
+ // for handling misalignment, addresses are precalculated
+ unsigned char* precalc_src_addr_v = src_addr_v;
+ unsigned char* precalc_src_addr_u = src_addr_u;
+
+ unsigned int dst_picture_size = dst_width*dst_height;
+
+ // Sizes for destination
+ unsigned int dst_dbl_linestride_y = dst_width<<1;
+ unsigned int dst_dbl_linestride_vu = dst_width>>1;
+
+ // Perform address calculation for Y, V and U in main memory with dst_addr as base
+ unsigned char* dst_addr_main_memory_y = dst_addr;
+ unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+ unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+
+ // calculate scale factors
+ vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+ float y_scale = (float)src_height/(float)dst_height;
+
+ // double buffered processing
+ // buffer switching
+ unsigned int curr_src_idx = 0;
+ unsigned int curr_dst_idx = 0;
+ unsigned int next_src_idx, next_dst_idx;
+
+ // 2 lines y as output, upper and lowerline
+ unsigned int curr_interpl_y_upper = 0;
+ unsigned int next_interpl_y_upper;
+ unsigned int curr_interpl_y_lower, next_interpl_y_lower;
+ // only 1 line v/u output, both planes have the same dimension
+ unsigned int curr_interpl_vu = 0;
+ unsigned int next_interpl_vu;
+
+ // weights, calculated in every loop iteration
+ vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+ vector float vf_next_NSweight_y_upper;
+ vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+ vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+ vector float vf_next_NSweight_vu;
+
+ // line indices for the src picture
+ float curr_src_y_upper = 0.0f, next_src_y_upper;
+ float curr_src_y_lower, next_src_y_lower;
+ float curr_src_vu = 0.0f, next_src_vu;
+
+ // line indices for the dst picture
+ unsigned int dst_y=0, dst_vu=0;
+
+ // offset for the v and u plane to handle misalignment
+ unsigned int curr_lsoff_v = 0, next_lsoff_v;
+ unsigned int curr_lsoff_u = 0, next_lsoff_u;
+
+ // calculate lower line indices
+ curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
+ curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+ // lower line weight
+ vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+
+
+ // start partially double buffered processing
+ // get initial data, 2 sets of y, 1 set v, 1 set u
+ mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+ mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+ (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+ src_dbl_linestride_y,
+ RETR_BUF,
+ 0, 0 );
+ mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+ mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+
+ /* iteration loop
+ * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+ * the scaled output is 2 lines y, 1 line v, 1 line u
+ * the scaled YUV output is stored to RAM
+ */
+ for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+ dst_y = dst_vu<<1;
+
+ // calculate next indices
+ next_src_vu = ((float)dst_vu+1)*y_scale;
+ next_src_y_upper = ((float)dst_y+2)*y_scale;
+ next_src_y_lower = ((float)dst_y+3)*y_scale;
+
+ next_interpl_vu = (unsigned int) next_src_vu;
+ next_interpl_y_upper = (unsigned int) next_src_y_upper;
+ next_interpl_y_lower = (unsigned int) next_src_y_lower;
+
+ // calculate weight NORTH-SOUTH
+ vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+ vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+ vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+
+ // get next lines
+ next_src_idx = curr_src_idx^1;
+ next_dst_idx = curr_dst_idx^1;
+
+ // 4 lines y
+ mfc_get( y_plane[next_src_idx],
+ (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+ src_dbl_linestride_y,
+ RETR_BUF+next_src_idx,
+ 0, 0 );
+ mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+ (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+ src_dbl_linestride_y,
+ RETR_BUF+next_src_idx,
+ 0, 0 );
+
+ // 2 lines v
+ precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
+ next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
+ mfc_get( v_plane[next_src_idx],
+ ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
+ src_dbl_linestride_vu+(next_lsoff_v<<1),
+ RETR_BUF+next_src_idx,
+ 0, 0 );
+ // 2 lines u
+ precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
+ next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
+ mfc_get( u_plane[next_src_idx],
+ ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
+ src_dbl_linestride_vu+(next_lsoff_u<<1), // pad by the u plane's own misalignment
+ RETR_BUF+next_src_idx,
+ 0, 0 );
+
+ DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+ // scaling
+ // work line y_upper
+ bilinear_scale_line_w16( y_plane[curr_src_idx],
+ scaled_y_plane[curr_src_idx],
+ dst_width,
+ vf_x_scale,
+ vf_curr_NSweight_y_upper,
+ src_linestride_y );
+ // work line y_lower
+ bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+ scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+ dst_width,
+ vf_x_scale,
+ vf_curr_NSweight_y_lower,
+ src_linestride_y );
+ // work line v
+ bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+ scaled_v_plane[curr_src_idx],
+ dst_width>>1,
+ vf_x_scale,
+ vf_curr_NSweight_vu,
+ src_linestride_vu );
+ // work line u
+ bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+ scaled_u_plane[curr_src_idx],
+ dst_width>>1,
+ vf_x_scale,
+ vf_curr_NSweight_vu,
+ src_linestride_vu );
+
+
+ // Store the result back to main memory into a destination buffer in YUV format
+ //---------------------------------------------------------------------------------------------
+ DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+ // Perform three DMA transfers to 3 different locations in the main memory!
+ // dst_width: Pixel width of destination image
+ // dst_addr: Destination address in main memory
+ // dst_vu: Counter which is incremented one by one
+ // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+ mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+ dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+ dst_dbl_linestride_vu, // One V line of dst_width/2 bytes
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+ dst_dbl_linestride_vu, // One U line of dst_width/2 bytes
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+ //---------------------------------------------------------------------------------------------
+
+
+ // update for next cycle
+ curr_src_idx = next_src_idx;
+ curr_dst_idx = next_dst_idx;
+
+ curr_interpl_y_upper = next_interpl_y_upper;
+ curr_interpl_y_lower = next_interpl_y_lower;
+ curr_interpl_vu = next_interpl_vu;
+
+ vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper; // take over the weights computed above
+ vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
+ vf_curr_NSweight_vu = vf_next_NSweight_vu;
+
+ curr_src_y_upper = next_src_y_upper;
+ curr_src_y_lower = next_src_y_lower;
+ curr_src_vu = next_src_vu;
+
+ curr_lsoff_v = next_lsoff_v;
+ curr_lsoff_u = next_lsoff_u;
+ }
+
+
+
+ DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+ // scaling
+ // work line y_upper
+ bilinear_scale_line_w16( y_plane[curr_src_idx],
+ scaled_y_plane[curr_src_idx],
+ dst_width,
+ vf_x_scale,
+ vf_curr_NSweight_y_upper,
+ src_linestride_y );
+ // work line y_lower
+ bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+ scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+ dst_width,
+ vf_x_scale,
+ vf_curr_NSweight_y_lower,
+ src_linestride_y );
+ // work line v
+ bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+ scaled_v_plane[curr_src_idx],
+ dst_width>>1,
+ vf_x_scale,
+ vf_curr_NSweight_vu,
+ src_linestride_vu );
+ // work line u
+ bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+ scaled_u_plane[curr_src_idx],
+ dst_width>>1,
+ vf_x_scale,
+ vf_curr_NSweight_vu,
+ src_linestride_vu );
+
+
+ // Store the result back to main memory into a destination buffer in YUV format
+ //---------------------------------------------------------------------------------------------
+ DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+ // Perform three DMA transfers to 3 different locations in the main memory!
+ // dst_width: Pixel width of destination image
+ // dst_addr: Destination address in main memory
+ // dst_vu: Counter which is incremented one by one
+ // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+ mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+ dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+ dst_dbl_linestride_vu, // One V line of dst_width/2 bytes
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+ dst_dbl_linestride_vu, // One U line of dst_width/2 bytes
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ // wait for completion
+ DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+ //---------------------------------------------------------------------------------------------
+}
+
+
+/*
+ * scale_srcw16_dstw32()
+ *
+ * processes an input image whose width is not a multiple of 32
+ * scaling is done to a width that is a multiple of 32
+ * no colour conversion is done here, the output stays planar YUV
+ * result (Y, V, U planes) stored in RAM at parms.dstBuffer
+ */
+void scale_srcw16_dstw32() {
+ // extract parameters
+ unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+
+ unsigned int src_width = parms.src_pixel_width;
+ unsigned int src_height = parms.src_pixel_height;
+ unsigned int dst_width = parms.dst_pixel_width;
+ unsigned int dst_height = parms.dst_pixel_height;
+
+ // YVU
+ unsigned int src_linestride_y = src_width;
+ unsigned int src_dbl_linestride_y = src_width<<1;
+ unsigned int src_linestride_vu = src_width>>1;
+ unsigned int src_dbl_linestride_vu = src_width;
+ // scaled YVU
+ unsigned int scaled_src_linestride_y = dst_width;
+
+ // ram addresses
+ unsigned char* src_addr_y = parms.y_plane;
+ unsigned char* src_addr_v = parms.v_plane;
+ unsigned char* src_addr_u = parms.u_plane;
+
+ unsigned int dst_picture_size = dst_width*dst_height;
+
+ // Sizes for destination
+ unsigned int dst_dbl_linestride_y = dst_width<<1;
+ unsigned int dst_dbl_linestride_vu = dst_width>>1;
+
+ // Perform address calculation for Y, V and U in main memory with dst_addr as base
+ unsigned char* dst_addr_main_memory_y = dst_addr;
+ unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+ unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+
+
+ // for handling misalignment, addresses are precalculated
+ unsigned char* precalc_src_addr_v = src_addr_v;
+ unsigned char* precalc_src_addr_u = src_addr_u;
+
+ // calculate scale factors
+ vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+ float y_scale = (float)src_height/(float)dst_height;
+
+ // double buffered processing
+ // buffer switching
+ unsigned int curr_src_idx = 0;
+ unsigned int curr_dst_idx = 0;
+ unsigned int next_src_idx, next_dst_idx;
+
+ // 2 lines y as output, upper and lowerline
+ unsigned int curr_interpl_y_upper = 0;
+ unsigned int next_interpl_y_upper;
+ unsigned int curr_interpl_y_lower, next_interpl_y_lower;
+ // only 1 line v/u output, both planes have the same dimension
+ unsigned int curr_interpl_vu = 0;
+ unsigned int next_interpl_vu;
+
+ // weights, calculated in every loop iteration
+ vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+ vector float vf_next_NSweight_y_upper;
+ vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+ vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+ vector float vf_next_NSweight_vu;
+
+ // line indices for the src picture
+ float curr_src_y_upper = 0.0f, next_src_y_upper;
+ float curr_src_y_lower, next_src_y_lower;
+ float curr_src_vu = 0.0f, next_src_vu;
+
+ // line indices for the dst picture
+ unsigned int dst_y=0, dst_vu=0;
+
+ // offset for the v and u plane to handle misalignment
+ unsigned int curr_lsoff_v = 0, next_lsoff_v;
+ unsigned int curr_lsoff_u = 0, next_lsoff_u;
+
+ // calculate lower line indices
+ curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
+ curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+ // lower line weight
+ vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+
+
+ // start partially double buffered processing
+ // get initial data, 2 sets of y, 1 set v, 1 set u
+ mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+ mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+ (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+ src_dbl_linestride_y,
+ RETR_BUF,
+ 0, 0 );
+ mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+ mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+
+ // iteration loop
+ // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+ // the scaled output is 2 lines y, 1 line v, 1 line u
+ // the scaled YUV output is stored to RAM
+ for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+ dst_y = dst_vu<<1;
+
+ // calculate next indices
+ next_src_vu = ((float)dst_vu+1)*y_scale;
+ next_src_y_upper = ((float)dst_y+2)*y_scale;
+ next_src_y_lower = ((float)dst_y+3)*y_scale;
+
+ next_interpl_vu = (unsigned int) next_src_vu;
+ next_interpl_y_upper = (unsigned int) next_src_y_upper;
+ next_interpl_y_lower = (unsigned int) next_src_y_lower;
+
+ // calculate weight NORTH-SOUTH
+ vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+ vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+ vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+
+ // get next lines
+ next_src_idx = curr_src_idx^1;
+ next_dst_idx = curr_dst_idx^1;
+
+ // 4 lines y
+ mfc_get( y_plane[next_src_idx],
+ (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+ src_dbl_linestride_y,
+ RETR_BUF+next_src_idx,
+ 0, 0 );
+ mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+ (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+ src_dbl_linestride_y,
+ RETR_BUF+next_src_idx,
+ 0, 0 );
+
+ // 2 lines v
+ precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
+ next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
+ mfc_get( v_plane[next_src_idx],
+ ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
+ src_dbl_linestride_vu+(next_lsoff_v<<1),
+ RETR_BUF+next_src_idx,
+ 0, 0 );
+ // 2 lines u
+ precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
+ next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
+ mfc_get( u_plane[next_src_idx],
+ ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
+ src_dbl_linestride_vu+(next_lsoff_u<<1), // pad by the u plane's own misalignment
+ RETR_BUF+next_src_idx,
+ 0, 0 );
+
+ DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+ // scaling
+ // work line y_upper
+ bilinear_scale_line_w16( y_plane[curr_src_idx],
+ scaled_y_plane[curr_src_idx],
+ dst_width,
+ vf_x_scale,
+ vf_curr_NSweight_y_upper,
+ src_linestride_y );
+ // work line y_lower
+ bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+ scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+ dst_width,
+ vf_x_scale,
+ vf_curr_NSweight_y_lower,
+ src_linestride_y );
+ // work line v
+ bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+ scaled_v_plane[curr_src_idx],
+ dst_width>>1,
+ vf_x_scale,
+ vf_curr_NSweight_vu,
+ src_linestride_vu );
+ // work line u
+ bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+ scaled_u_plane[curr_src_idx],
+ dst_width>>1,
+ vf_x_scale,
+ vf_curr_NSweight_vu,
+ src_linestride_vu );
+
+ //---------------------------------------------------------------------------------------------
+ DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+ // Perform three DMA transfers to 3 different locations in the main memory!
+ // dst_width: Pixel width of destination image
+ // dst_addr: Destination address in main memory
+ // dst_vu: Counter which is incremented one by one
+ // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+ mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+ dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+ dst_dbl_linestride_vu, // One V line of dst_width/2 bytes
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+ dst_dbl_linestride_vu, // One U line of dst_width/2 bytes
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+ //---------------------------------------------------------------------------------------------
+
+
+ // update for next cycle
+ curr_src_idx = next_src_idx;
+ curr_dst_idx = next_dst_idx;
+
+ curr_interpl_y_upper = next_interpl_y_upper;
+ curr_interpl_y_lower = next_interpl_y_lower;
+ curr_interpl_vu = next_interpl_vu;
+
+ vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper; // take over the weights computed above
+ vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
+ vf_curr_NSweight_vu = vf_next_NSweight_vu;
+
+ curr_src_y_upper = next_src_y_upper;
+ curr_src_y_lower = next_src_y_lower;
+ curr_src_vu = next_src_vu;
+
+ curr_lsoff_v = next_lsoff_v;
+ curr_lsoff_u = next_lsoff_u;
+ }
+
+
+
+ DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+ // scaling
+ // work line y_upper
+ bilinear_scale_line_w16( y_plane[curr_src_idx],
+ scaled_y_plane[curr_src_idx],
+ dst_width,
+ vf_x_scale,
+ vf_curr_NSweight_y_upper,
+ src_linestride_y );
+ // work line y_lower
+ bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+ scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+ dst_width,
+ vf_x_scale,
+ vf_curr_NSweight_y_lower,
+ src_linestride_y );
+ // work line v
+ bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+ scaled_v_plane[curr_src_idx],
+ dst_width>>1,
+ vf_x_scale,
+ vf_curr_NSweight_vu,
+ src_linestride_vu );
+ // work line u
+ bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+ scaled_u_plane[curr_src_idx],
+ dst_width>>1,
+ vf_x_scale,
+ vf_curr_NSweight_vu,
+ src_linestride_vu );
+
+ //---------------------------------------------------------------------------------------------
+ DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+ // Perform three DMA transfers to 3 different locations in the main memory!
+ // dst_width: Pixel width of destination image
+ // dst_addr: Destination address in main memory
+ // dst_vu: Counter which is incremented one by one
+ // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+ mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+ dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+ dst_dbl_linestride_vu, // One V line of dst_width/2 bytes
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
+ (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+ dst_dbl_linestride_vu, // One U line of dst_width/2 bytes
+ STR_BUF+curr_dst_idx, // Tag
+ 0, 0 );
+
+ // wait for completion
+ DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+ //---------------------------------------------------------------------------------------------
+}
+
+
+/*
+ * scale_srcw32_dstw16()
+ *
+ * processes an input image of width 32
+ * scaling is done to a width 16
+ * the scaled Y, V and U planes are stored in RAM
+ * (no yuv2rgb conversion is performed by this function)
+ *
+ * All parameters are read from the global parms structure. Every step
+ * produces two luma lines plus one V and one U line; the source lines for
+ * the following step are requested by DMA before the current step is scaled.
+ */
+void scale_srcw32_dstw16() {
+    // extract parameters
+    unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+
+    unsigned int src_width = parms.src_pixel_width;
+    unsigned int src_height = parms.src_pixel_height;
+    unsigned int dst_width = parms.dst_pixel_width;
+    unsigned int dst_height = parms.dst_pixel_height;
+
+    // YVU source strides; the V and U lines are half as wide as a Y line
+    unsigned int src_linestride_y = src_width;
+    unsigned int src_dbl_linestride_y = src_width<<1;
+    unsigned int src_linestride_vu = src_width>>1;
+    unsigned int src_dbl_linestride_vu = src_width;
+    // scaled YVU
+    unsigned int scaled_src_linestride_y = dst_width;
+
+    // ram addresses
+    unsigned char* src_addr_y = parms.y_plane;
+    unsigned char* src_addr_v = parms.v_plane;
+    unsigned char* src_addr_u = parms.u_plane;
+
+    unsigned int dst_picture_size = dst_width*dst_height;
+
+    // Sizes for destination: two Y lines, but only ONE V/U line (half the Y width)
+    unsigned int dst_dbl_linestride_y = dst_width<<1;
+    unsigned int dst_dbl_linestride_vu = dst_width>>1;
+
+    // Perform address calculation for Y, V and U in main memory with dst_addr as base:
+    // the V plane follows the Y plane, the U plane starts a quarter picture after V
+    unsigned char* dst_addr_main_memory_y = dst_addr;
+    unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+    unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+
+    // calculate scale factors
+    vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+    float y_scale = (float)src_height/(float)dst_height;
+
+    // double buffered processing
+    // buffer switching
+    unsigned int curr_src_idx = 0;
+    unsigned int curr_dst_idx = 0;
+    unsigned int next_src_idx, next_dst_idx;
+
+    // first source line needed for the lower y line of the initial step
+    unsigned int curr_interpl_y_lower;
+    // first source line needed for upper y, lower y and v/u of the next step
+    unsigned int next_interpl_y_upper, next_interpl_y_lower, next_interpl_vu;
+
+    // NORTH-SOUTH weights of the current step and of the step being prefetched
+    vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+    vector float vf_next_NSweight_y_upper;
+    vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+    vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+    vector float vf_next_NSweight_vu;
+
+    // fractional line positions in the src picture
+    float curr_src_y_lower;
+    float next_src_y_upper, next_src_y_lower, next_src_vu;
+
+    // line indices for the dst picture
+    unsigned int dst_y, dst_vu;
+
+    // Without at least one pair of output lines there is nothing to scale, and
+    // the unsigned loop bound (dst_height>>1)-1 below would wrap around.
+    if( dst_height < 2 )
+        return;
+
+    // calculate lower line indices: dst line 1 maps to src position 1*y_scale
+    curr_src_y_lower = 1.0f*y_scale;
+    curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+    // lower line weight
+    vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+
+
+    // start partially double buffered processing
+    // get initial data, 2 sets of y, 1 set v, 1 set u
+    mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+    mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+             (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+             src_dbl_linestride_y,
+             RETR_BUF,
+             0, 0 );
+    mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+    mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+
+    // iteration loop
+    // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+    // the scaled output is 2 lines y, 1 line v, 1 line u
+    // the scaled YUV output is stored to RAM
+    // (the last step is handled after the loop because it has nothing to prefetch)
+    for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+        dst_y = dst_vu<<1;
+
+        // calculate next indices
+        next_src_vu = ((float)dst_vu+1)*y_scale;
+        next_src_y_upper = ((float)dst_y+2)*y_scale;
+        next_src_y_lower = ((float)dst_y+3)*y_scale;
+
+        next_interpl_vu = (unsigned int) next_src_vu;
+        next_interpl_y_upper = (unsigned int) next_src_y_upper;
+        next_interpl_y_lower = (unsigned int) next_src_y_lower;
+
+        // calculate weight NORTH-SOUTH (fractional part of the source position)
+        vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+        vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+        vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+
+        // get next lines
+        next_src_idx = curr_src_idx^1;
+        next_dst_idx = curr_dst_idx^1;
+
+        // 4 lines y
+        mfc_get( y_plane[next_src_idx],
+                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+                 src_dbl_linestride_y,
+                 RETR_BUF+next_src_idx,
+                 0, 0 );
+        mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+                 src_dbl_linestride_y,
+                 RETR_BUF+next_src_idx,
+                 0, 0 );
+
+        // 2 lines v
+        mfc_get( v_plane[next_src_idx],
+                 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
+                 src_dbl_linestride_vu,
+                 RETR_BUF+next_src_idx,
+                 0, 0 );
+        // 2 lines u
+        mfc_get( u_plane[next_src_idx],
+                 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
+                 src_dbl_linestride_vu,
+                 RETR_BUF+next_src_idx,
+                 0, 0 );
+
+        DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+        // scaling
+        // NOTE(review): the scaler writes scaled_*_plane[curr_src_idx] before the
+        // STR_BUF wait below; presumably the put issued from this buffer two
+        // steps ago has long finished - confirm.
+        // work line y_upper
+        bilinear_scale_line_w16( y_plane[curr_src_idx],
+                                 scaled_y_plane[curr_src_idx],
+                                 dst_width,
+                                 vf_x_scale,
+                                 vf_curr_NSweight_y_upper,
+                                 src_linestride_y );
+        // work line y_lower
+        bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                                 dst_width,
+                                 vf_x_scale,
+                                 vf_curr_NSweight_y_lower,
+                                 src_linestride_y );
+        // NOTE(review): bilinear_scale_line_w16() is documented for widths that are
+        // a multiple of 16, but dst_width>>1 need not be one here - confirm the
+        // overshoot is intended or whether bilinear_scale_line_w8() should be used.
+        // work line v
+        bilinear_scale_line_w16( v_plane[curr_src_idx],
+                                 scaled_v_plane[curr_src_idx],
+                                 dst_width>>1,
+                                 vf_x_scale,
+                                 vf_curr_NSweight_vu,
+                                 src_linestride_vu );
+        // work line u
+        bilinear_scale_line_w16( u_plane[curr_src_idx],
+                                 scaled_u_plane[curr_src_idx],
+                                 dst_width>>1,
+                                 vf_x_scale,
+                                 vf_curr_NSweight_vu,
+                                 src_linestride_vu );
+
+        //---------------------------------------------------------------------------------------------
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+        // Perform three DMA transfers to 3 different locations in the main memory!
+        // dst_width: Pixel width of destination image
+        // dst_addr: Destination address in main memory
+        // dst_vu: Counter which is incremented one by one
+        // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+        mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
+                 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+                 dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
+                 STR_BUF+curr_dst_idx, // Tag
+                 0, 0 );
+
+        mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
+                 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                 dst_dbl_linestride_vu, // One V line, half the Y width
+                 STR_BUF+curr_dst_idx, // Tag
+                 0, 0 );
+
+        mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
+                 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                 dst_dbl_linestride_vu, // One U line, half the Y width
+                 STR_BUF+curr_dst_idx, // Tag
+                 0, 0 );
+        //---------------------------------------------------------------------------------------------
+
+
+        // update for next cycle
+        curr_src_idx = next_src_idx;
+        curr_dst_idx = next_dst_idx;
+
+        // The weights computed for the prefetched lines become the current ones.
+        // (The y weights used to be assigned to themselves, so the luma lines
+        // kept the weights of the very first step.)
+        vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
+        vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
+        vf_curr_NSweight_vu = vf_next_NSweight_vu;
+    }
+
+
+    // last step: its source lines were requested by the final loop iteration
+    // (or by the initial transfers if the loop did not run)
+    DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+    // scaling
+    // work line y_upper
+    bilinear_scale_line_w16( y_plane[curr_src_idx],
+                             scaled_y_plane[curr_src_idx],
+                             dst_width,
+                             vf_x_scale,
+                             vf_curr_NSweight_y_upper,
+                             src_linestride_y );
+    // work line y_lower
+    bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                             scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                             dst_width,
+                             vf_x_scale,
+                             vf_curr_NSweight_y_lower,
+                             src_linestride_y );
+    // work line v
+    bilinear_scale_line_w16( v_plane[curr_src_idx],
+                             scaled_v_plane[curr_src_idx],
+                             dst_width>>1,
+                             vf_x_scale,
+                             vf_curr_NSweight_vu,
+                             src_linestride_vu );
+    // work line u
+    bilinear_scale_line_w16( u_plane[curr_src_idx],
+                             scaled_u_plane[curr_src_idx],
+                             dst_width>>1,
+                             vf_x_scale,
+                             vf_curr_NSweight_vu,
+                             src_linestride_vu );
+
+
+    //---------------------------------------------------------------------------------------------
+    DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+    // Perform three DMA transfers to 3 different locations in the main memory!
+    // dst_vu now holds the index of the last output line pair
+
+    mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
+             (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+             dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
+             STR_BUF+curr_dst_idx, // Tag
+             0, 0 );
+
+    mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
+             (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+             dst_dbl_linestride_vu, // One V line, half the Y width
+             STR_BUF+curr_dst_idx, // Tag
+             0, 0 );
+
+    mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
+             (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+             dst_dbl_linestride_vu, // One U line, half the Y width
+             STR_BUF+curr_dst_idx, // Tag
+             0, 0 );
+
+    // wait for completion
+    // NOTE(review): only the tag of the last step is waited for here; the puts of
+    // the previous step use the other STR_BUF tag - confirm the caller covers them.
+    DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+    //---------------------------------------------------------------------------------------------
+}
+
+
+/**
+ * scale_srcw32_dstw32()
+ *
+ * processes an input image of width 32
+ * scaling is done to a width 32
+ * the scaled Y, V and U planes are stored in RAM
+ * (no yuv2rgb conversion is performed by this function)
+ *
+ * All parameters are read from the global parms structure. Every step
+ * produces two luma lines plus one V and one U line; the source lines for
+ * the following step are requested by DMA before the current step is scaled.
+ */
+void scale_srcw32_dstw32() {
+    // extract parameters
+    unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+
+    unsigned int src_width = parms.src_pixel_width;
+    unsigned int src_height = parms.src_pixel_height;
+    unsigned int dst_width = parms.dst_pixel_width;
+    unsigned int dst_height = parms.dst_pixel_height;
+
+    // YVU source strides; the V and U lines are half as wide as a Y line
+    unsigned int src_linestride_y = src_width;
+    unsigned int src_dbl_linestride_y = src_width<<1;
+    unsigned int src_linestride_vu = src_width>>1;
+    unsigned int src_dbl_linestride_vu = src_width;
+
+    // scaled YVU
+    unsigned int scaled_src_linestride_y = dst_width;
+
+    // ram addresses
+    unsigned char* src_addr_y = parms.y_plane;
+    unsigned char* src_addr_v = parms.v_plane;
+    unsigned char* src_addr_u = parms.u_plane;
+
+    unsigned int dst_picture_size = dst_width*dst_height;
+
+    // Sizes for destination: two Y lines, but only ONE V/U line (half the Y width)
+    unsigned int dst_dbl_linestride_y = dst_width<<1;
+    unsigned int dst_dbl_linestride_vu = dst_width>>1;
+
+    // Perform address calculation for Y, V and U in main memory with dst_addr as base:
+    // the V plane follows the Y plane, the U plane starts a quarter picture after V
+    unsigned char* dst_addr_main_memory_y = dst_addr;
+    unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+    unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+
+    // calculate scale factors
+    vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+    float y_scale = (float)src_height/(float)dst_height;
+
+    // double buffered processing
+    // buffer switching
+    unsigned int curr_src_idx = 0;
+    unsigned int curr_dst_idx = 0;
+    unsigned int next_src_idx, next_dst_idx;
+
+    // first source line needed for the lower y line of the initial step
+    unsigned int curr_interpl_y_lower;
+    // first source line needed for upper y, lower y and v/u of the next step
+    unsigned int next_interpl_y_upper, next_interpl_y_lower, next_interpl_vu;
+
+    // NORTH-SOUTH weights of the current step and of the step being prefetched
+    vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+    vector float vf_next_NSweight_y_upper;
+    vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+    vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+    vector float vf_next_NSweight_vu;
+
+    // fractional line positions in the src picture
+    float curr_src_y_lower;
+    float next_src_y_upper, next_src_y_lower, next_src_vu;
+
+    // line indices for the dst picture
+    unsigned int dst_y, dst_vu;
+
+    // Without at least one pair of output lines there is nothing to scale, and
+    // the unsigned loop bound (dst_height>>1)-1 below would wrap around.
+    if( dst_height < 2 )
+        return;
+
+    // calculate lower line indices: dst line 1 maps to src position 1*y_scale
+    curr_src_y_lower = 1.0f*y_scale;
+    curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+    // lower line weight
+    vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+
+
+    // start partially double buffered processing
+    // get initial data, 2 sets of y, 1 set v, 1 set u
+    mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+    mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+             (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+             src_dbl_linestride_y,
+             RETR_BUF,
+             0, 0 );
+    mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+    mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+
+    // iteration loop
+    // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+    // the scaled output is 2 lines y, 1 line v, 1 line u
+    // the scaled YUV output is stored to RAM
+    // (the last step is handled after the loop because it has nothing to prefetch)
+    for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+        dst_y = dst_vu<<1;
+
+        // calculate next indices
+        next_src_vu = ((float)dst_vu+1)*y_scale;
+        next_src_y_upper = ((float)dst_y+2)*y_scale;
+        next_src_y_lower = ((float)dst_y+3)*y_scale;
+
+        next_interpl_vu = (unsigned int) next_src_vu;
+        next_interpl_y_upper = (unsigned int) next_src_y_upper;
+        next_interpl_y_lower = (unsigned int) next_src_y_lower;
+
+        // calculate weight NORTH-SOUTH (fractional part of the source position)
+        vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+        vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+        vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+
+        // get next lines
+        next_src_idx = curr_src_idx^1;
+        next_dst_idx = curr_dst_idx^1;
+
+        // 4 lines y
+        mfc_get( y_plane[next_src_idx],
+                 (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+                 src_dbl_linestride_y,
+                 RETR_BUF+next_src_idx,
+                 0, 0 );
+        mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+                 (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+                 src_dbl_linestride_y,
+                 RETR_BUF+next_src_idx,
+                 0, 0 );
+
+        // 2 lines v
+        mfc_get( v_plane[next_src_idx],
+                 (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
+                 src_dbl_linestride_vu,
+                 RETR_BUF+next_src_idx,
+                 0, 0 );
+        // 2 lines u
+        mfc_get( u_plane[next_src_idx],
+                 (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
+                 src_dbl_linestride_vu,
+                 RETR_BUF+next_src_idx,
+                 0, 0 );
+
+        DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+        // scaling
+        // NOTE(review): the scaler writes scaled_*_plane[curr_src_idx] before the
+        // STR_BUF wait below; presumably the put issued from this buffer two
+        // steps ago has long finished - confirm.
+        // work line y_upper
+        bilinear_scale_line_w16( y_plane[curr_src_idx],
+                                 scaled_y_plane[curr_src_idx],
+                                 dst_width,
+                                 vf_x_scale,
+                                 vf_curr_NSweight_y_upper,
+                                 src_linestride_y );
+        // work line y_lower
+        bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                                 scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                                 dst_width,
+                                 vf_x_scale,
+                                 vf_curr_NSweight_y_lower,
+                                 src_linestride_y );
+        // work line v
+        bilinear_scale_line_w16( v_plane[curr_src_idx],
+                                 scaled_v_plane[curr_src_idx],
+                                 dst_width>>1,
+                                 vf_x_scale,
+                                 vf_curr_NSweight_vu,
+                                 src_linestride_vu );
+        // work line u
+        bilinear_scale_line_w16( u_plane[curr_src_idx],
+                                 scaled_u_plane[curr_src_idx],
+                                 dst_width>>1,
+                                 vf_x_scale,
+                                 vf_curr_NSweight_vu,
+                                 src_linestride_vu );
+
+
+        // Store the result back to main memory into a destination buffer in YUV format
+        //---------------------------------------------------------------------------------------------
+        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+        // Perform three DMA transfers to 3 different locations in the main memory!
+        // dst_width: Pixel width of destination image
+        // dst_addr: Destination address in main memory
+        // dst_vu: Counter which is incremented one by one
+        // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+        mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
+                 (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+                 dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
+                 STR_BUF+curr_dst_idx, // Tag
+                 0, 0 );
+
+        mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
+                 (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                 dst_dbl_linestride_vu, // One V line, half the Y width
+                 STR_BUF+curr_dst_idx, // Tag
+                 0, 0 );
+
+        mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
+                 (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+                 dst_dbl_linestride_vu, // One U line, half the Y width
+                 STR_BUF+curr_dst_idx, // Tag
+                 0, 0 );
+        //---------------------------------------------------------------------------------------------
+
+
+        // update for next cycle
+        curr_src_idx = next_src_idx;
+        curr_dst_idx = next_dst_idx;
+
+        // The weights computed for the prefetched lines become the current ones.
+        // (The y weights used to be assigned to themselves, so the luma lines
+        // kept the weights of the very first step.)
+        vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
+        vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
+        vf_curr_NSweight_vu = vf_next_NSweight_vu;
+    }
+
+
+    // last step: its source lines were requested by the final loop iteration
+    // (or by the initial transfers if the loop did not run)
+    DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+    // scaling
+    // work line y_upper
+    bilinear_scale_line_w16( y_plane[curr_src_idx],
+                             scaled_y_plane[curr_src_idx],
+                             dst_width,
+                             vf_x_scale,
+                             vf_curr_NSweight_y_upper,
+                             src_linestride_y );
+    // work line y_lower
+    bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+                             scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+                             dst_width,
+                             vf_x_scale,
+                             vf_curr_NSweight_y_lower,
+                             src_linestride_y );
+    // work line v
+    bilinear_scale_line_w16( v_plane[curr_src_idx],
+                             scaled_v_plane[curr_src_idx],
+                             dst_width>>1,
+                             vf_x_scale,
+                             vf_curr_NSweight_vu,
+                             src_linestride_vu );
+    // work line u
+    bilinear_scale_line_w16( u_plane[curr_src_idx],
+                             scaled_u_plane[curr_src_idx],
+                             dst_width>>1,
+                             vf_x_scale,
+                             vf_curr_NSweight_vu,
+                             src_linestride_vu );
+
+
+    // Store the result back to main memory into a destination buffer in YUV format
+    //---------------------------------------------------------------------------------------------
+    DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+    // Perform three DMA transfers to 3 different locations in the main memory!
+    // dst_vu now holds the index of the last output line pair
+
+    mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
+             (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
+             dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
+             STR_BUF+curr_dst_idx, // Tag
+             0, 0 );
+
+    mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
+             (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+             dst_dbl_linestride_vu, // One V line, half the Y width
+             STR_BUF+curr_dst_idx, // Tag
+             0, 0 );
+
+    mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
+             (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+             dst_dbl_linestride_vu, // One U line, half the Y width
+             STR_BUF+curr_dst_idx, // Tag
+             0, 0 );
+
+    // wait for completion
+    // NOTE(review): only the tag of the last step is waited for here; the puts of
+    // the previous step use the other STR_BUF tag - confirm the caller covers them.
+    DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+    //---------------------------------------------------------------------------------------------
+}
+
+
+/*
+ * w8_fetch_pixels()
+ *
+ * reads one byte from each of the four addresses held in the word slots of
+ * addrs and returns them in byte positions 3, 7, 11 and 15 of an otherwise
+ * zeroed vector (i.e. the last byte of every 32 bit slot)
+ */
+static inline vector unsigned char w8_fetch_pixels( vector unsigned int addrs ) {
+    vector unsigned char pixels = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+    pixels = spu_insert( *((unsigned char*) spu_extract( addrs, 0 )), pixels, 3 );
+    pixels = spu_insert( *((unsigned char*) spu_extract( addrs, 1 )), pixels, 7 );
+    pixels = spu_insert( *((unsigned char*) spu_extract( addrs, 2 )), pixels, 11 );
+    pixels = spu_insert( *((unsigned char*) spu_extract( addrs, 3 )), pixels, 15 );
+    return pixels;
+}
+
+
+/*
+ * w8_lerp()
+ *
+ * linear interpolation per element: from + weight*(to-from)
+ */
+static inline vector float w8_lerp( vector float from, vector float to, vector float weight ) {
+    return spu_madd( weight, spu_sub( to, from ), from );
+}
+
+
+/*
+ * w8_scale_4pixels()
+ *
+ * calculates four bilinear interpolated destination pixels
+ *
+ * @param src buffer for 2 lines input
+ * @param vui_dst_x the x positions of the four destination pixels
+ * @param vf_x_scale a float vector, at each entry is the x_scale-factor
+ * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
+ * @param src_linestride the stride of the srcline
+ * @return the four results after conversion by vfloat_to_vuint()
+ */
+static inline vector unsigned int w8_scale_4pixels( unsigned char* src, vector unsigned int vui_dst_x, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
+    // source x position of every destination pixel; its integer part selects
+    // the WEST column, its fractional part is the weight EAST-WEST
+    vector float vf_src_x = spu_mul( spu_convtf( vui_dst_x, 0 ), vf_x_scale );
+    vector unsigned int vui_off_NW = spu_convtu( vf_src_x, 0 );
+    vector float vf_EWweight = spu_sub( vf_src_x, spu_convtf( vui_off_NW, 0 ) );
+
+    // offsets of the three remaining neighbours:
+    // NE = NW+1, SW = NW+src_linestride, SE = NE+src_linestride
+    vector unsigned int vui_one = { 1, 1, 1, 1 };
+    vector unsigned int vui_stride = spu_splats( src_linestride );
+    vector unsigned int vui_off_NE = spu_add( vui_off_NW, vui_one );
+    vector unsigned int vui_off_SW = spu_add( vui_stride, vui_off_NW );
+    vector unsigned int vui_off_SE = spu_add( vui_stride, vui_off_NE );
+
+    // scalar loads of the four neighbours of each pixel, then conversion to float
+    vector unsigned int vui_base = spu_splats( (unsigned int) src );
+    vector float vf_NW = spu_convtf( (vector unsigned int) w8_fetch_pixels( spu_add( vui_base, vui_off_NW ) ), 0 );
+    vector float vf_NE = spu_convtf( (vector unsigned int) w8_fetch_pixels( spu_add( vui_base, vui_off_NE ) ), 0 );
+    vector float vf_SW = spu_convtf( (vector unsigned int) w8_fetch_pixels( spu_add( vui_base, vui_off_SW ) ), 0 );
+    vector float vf_SE = spu_convtf( (vector unsigned int) w8_fetch_pixels( spu_add( vui_base, vui_off_SE ) ), 0 );
+
+    // EWtop    = NW + EWweight*(NE-NW)
+    // EWbottom = SW + EWweight*(SE-SW)
+    // result   = EWtop + NSweight*(EWbottom-EWtop)
+    vector float vf_EWtop = w8_lerp( vf_NW, vf_NE, vf_EWweight );
+    vector float vf_EWbottom = w8_lerp( vf_SW, vf_SE, vf_EWweight );
+
+    return vfloat_to_vuint( w8_lerp( vf_EWtop, vf_EWbottom, vf_NSweight ) );
+}
+
+
+/*
+ * bilinear_scale_line_w8()
+ *
+ * processes a line of yuv-input, width has to be a multiple of 8
+ * scaled yuv-output is written to local store buffer
+ * every loop pass calculates 8 destination pixels and merges them into the
+ * quadword that contains the current destination address
+ *
+ * @param src buffer for 2 lines input
+ * @param dst_ buffer for 1 line output
+ * @param dst_width the width of the destination line
+ * @param vf_x_scale a float vector, at each entry is the x_scale-factor
+ * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
+ * @param src_linestride the stride of the srcline
+ */
+void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
+
+    // x increments of the first 4 and of the next 4 pixels of a group of 8
+    vector unsigned int vui_first_four = { 0, 1, 2, 3 };
+    vector unsigned int vui_second_four = { 4, 5, 6, 7 };
+
+    // shuffle pattern that packs bytes 3,7,11,15 of both result vectors into bytes 0..7
+    vector unsigned char vuc_pack_pattern = { 0x03, 0x07, 0x0B, 0x0F,
+                                              0x13, 0x17, 0x1B, 0x1F,
+                                              0x00, 0x00, 0x00, 0x00,
+                                              0x00, 0x00, 0x00, 0x00 };
+
+    // mask (before rotation) of the bytes that keep their stored content
+    vector unsigned char vuc_keep_pattern = { 0x00, 0x00, 0x00, 0x00,
+                                              0x00, 0x00, 0x00, 0x00,
+                                              0xFF, 0xFF, 0xFF, 0xFF,
+                                              0xFF, 0xFF, 0xFF, 0xFF };
+
+    unsigned char* out = dst_;
+    unsigned int dst_x;
+
+    for( dst_x=0; dst_x<dst_width; dst_x+=8, out+=8 ) {
+        vector unsigned int vui_group_start = spu_splats( dst_x );
+
+        // interpolate pixels dst_x..dst_x+3 and dst_x+4..dst_x+7
+        vector unsigned int vui_first_result = w8_scale_4pixels( src,
+                                                                 spu_add( vui_group_start, vui_first_four ),
+                                                                 vf_x_scale,
+                                                                 vf_NSweight,
+                                                                 src_linestride );
+        vector unsigned int vui_second_result = w8_scale_4pixels( src,
+                                                                  spu_add( vui_group_start, vui_second_four ),
+                                                                  vf_x_scale,
+                                                                  vf_NSweight,
+                                                                  src_linestride );
+
+        // merge both halves into 8 consecutive bytes
+        vector unsigned char vuc_packed = spu_shuffle( (vector unsigned char) vui_first_result,
+                                                       (vector unsigned char) vui_second_result,
+                                                       vuc_pack_pattern );
+
+        // partial storing: mask and result are rotated by the offset of the
+        // storing address within its quadword, the masked stored data and the
+        // rotated result are then combined and written back
+        unsigned int rotate_by = ((unsigned int)out)&0x0F;
+        vector unsigned char vuc_kept = spu_and( *((vector unsigned char*)out),
+                                                 spu_rlqwbyte( vuc_keep_pattern, rotate_by ) );
+
+        *((vector unsigned char*)out) = spu_or( spu_rlqwbyte( vuc_packed, rotate_by ),
+                                                vuc_kept );
+    }
+}
+
+
+/*
+ * w16_gather_pixels()
+ *
+ * fetches one source byte per word slot and converts it to float
+ *
+ * @param vui_addr four local store byte addresses, one in each word slot
+ * @return the four addressed bytes, zero-extended and converted to float
+ */
+static inline vector float w16_gather_pixels( vector unsigned int vui_addr ) {
+	vector unsigned int vui_pixel = { 0, 0, 0, 0 };
+
+	// scalar load, afterwards insertion into the word slot the address came from
+	vui_pixel = spu_insert( (unsigned int) *((unsigned char*) spu_extract( vui_addr, 0 )), vui_pixel, 0 );
+	vui_pixel = spu_insert( (unsigned int) *((unsigned char*) spu_extract( vui_addr, 1 )), vui_pixel, 1 );
+	vui_pixel = spu_insert( (unsigned int) *((unsigned char*) spu_extract( vui_addr, 2 )), vui_pixel, 2 );
+	vui_pixel = spu_insert( (unsigned int) *((unsigned char*) spu_extract( vui_addr, 3 )), vui_pixel, 3 );
+
+	return spu_convtf( vui_pixel, 0 );
+}
+
+
+/*
+ * w16_interpolate4()
+ *
+ * bilinear interpolation of four horizontally adjacent destination pixels
+ *
+ * @param vui_dst_x the x positions of the four destination pixels
+ * @param vui_src_ls the local store address of the source buffer, splatted
+ * @param vui_srclinestride the stride of the srcline, splatted
+ * @param vf_x_scale a float vector, at each entry is the x_scale-factor
+ * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
+ * @return the four interpolated pixels, one per word slot, as produced by vfloat_to_vuint()
+ */
+static inline vector unsigned int w16_interpolate4( vector unsigned int vui_dst_x, vector unsigned int vui_src_ls, vector unsigned int vui_srclinestride, vector float vf_x_scale, vector float vf_NSweight ) {
+	const vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
+
+	// calculate weight EAST-WEST: the source x position minus its value
+	// after conversion to unsigned int
+	vector float vf_src_x = spu_mul( spu_convtf( vui_dst_x, 0 ), vf_x_scale );
+	vector unsigned int vui_interpl_x = spu_convtu( vf_src_x, 0 );
+	vector float vf_EWweight = spu_sub( vf_src_x, spu_convtf( vui_interpl_x, 0 ) );
+
+	// calculate each address
+	//
+	// NORTH WEST-->(src+interpl_x)
+	// NORTH EAST-->(NW+1)
+	// SOUTH WEST-->(NW+src_linestride)
+	// SOUTH EAST-->(NW+src_linestride+1)
+	vector unsigned int vui_addr_pixelNW = spu_add( vui_src_ls, vui_interpl_x );
+	vector unsigned int vui_addr_pixelNE = spu_add( vui_addr_pixelNW, vui_add_1 );
+	vector unsigned int vui_addr_pixelSW = spu_add( vui_addr_pixelNW, vui_srclinestride );
+	vector unsigned int vui_addr_pixelSE = spu_add( vui_addr_pixelNE, vui_srclinestride );
+
+	// get each pixel, already converted to float
+	vector float vf_pixel_NW = w16_gather_pixels( vui_addr_pixelNW );
+	vector float vf_pixel_NE = w16_gather_pixels( vui_addr_pixelNE );
+	vector float vf_pixel_SW = w16_gather_pixels( vui_addr_pixelSW );
+	vector float vf_pixel_SE = w16_gather_pixels( vui_addr_pixelSE );
+
+	// first linear interpolation: EWtop
+	// EWtop = NW + EWweight*(NE-NW)
+	vector float vf_EWtop = spu_madd( vf_EWweight,
+					  spu_sub( vf_pixel_NE, vf_pixel_NW ),
+					  vf_pixel_NW );
+
+	// second linear interpolation: EWbottom
+	// EWbottom = SW + EWweight*(SE-SW)
+	vector float vf_EWbottom = spu_madd( vf_EWweight,
+					     spu_sub( vf_pixel_SE, vf_pixel_SW ),
+					     vf_pixel_SW );
+
+	// third linear interpolation: the bilinear interpolated value
+	// result = EWtop + NSweight*(EWbottom-EWtop);
+	vector float vf_result = spu_madd( vf_NSweight,
+					   spu_sub( vf_EWbottom, vf_EWtop ),
+					   vf_EWtop );
+
+	// convert back: using saturated arithmetic
+	return vfloat_to_vuint( vf_result );
+}
+
+
+/*
+ * bilinear_scale_line_w16()
+ *
+ * processes a line of yuv-input, width has to be a multiple of 16
+ * scaled yuv-output is written to local store buffer
+ *
+ * every loop iteration produces 16 destination pixels as four ranges of
+ * four pixels and stores them with a single quadword store, so dst_ is
+ * expected to be 16 byte aligned
+ *
+ * @param src buffer for 2 lines input
+ * @param dst_ buffer for 1 line output
+ * @param dst_width the width of the destination line
+ * @param vf_x_scale a float vector, at each entry is the x_scale-factor
+ * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
+ * @param src_linestride the stride of the srcline
+ */
+void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
+
+	// parallelised processing
+	// first range->pixel 1 2 3 4
+	// second range->pixel 5 6 7 8
+	// third range->pixel 9 10 11 12
+	// fourth range->pixel 13 14 15 16
+	const vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
+	const vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
+	const vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
+	const vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
+
+	// merge masks: pick the least significant byte of every result word.
+	// 0x80 is the shuffle control value that yields a zero byte, so the half
+	// that is filled in by the other shuffle stays clear for the final OR
+	const vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
+									  0x13, 0x17, 0x1B, 0x1F,
+									  0x80, 0x80, 0x80, 0x80,
+									  0x80, 0x80, 0x80, 0x80 };
+
+	const vector unsigned char vuc_mask_merge_result_third_fourth = { 0x80, 0x80, 0x80, 0x80,
+									  0x80, 0x80, 0x80, 0x80,
+									  0x03, 0x07, 0x0B, 0x0F,
+									  0x13, 0x17, 0x1B, 0x1F };
+
+	// the same for every pixel of the line
+	const vector unsigned int vui_srclinestride = spu_splats( src_linestride );
+	const vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
+
+	unsigned char* dst = dst_;
+
+	unsigned int dst_x;
+	for( dst_x=0; dst_x<dst_width; dst_x+=16) {
+		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
+
+		// interpolate the four ranges
+		vector unsigned int vui_result_first_range =
+			w16_interpolate4( spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range ),
+					  vui_src_ls, vui_srclinestride, vf_x_scale, vf_NSweight );
+		vector unsigned int vui_result_second_range =
+			w16_interpolate4( spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range ),
+					  vui_src_ls, vui_srclinestride, vf_x_scale, vf_NSweight );
+		vector unsigned int vui_result_third_range =
+			w16_interpolate4( spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range ),
+					  vui_src_ls, vui_srclinestride, vf_x_scale, vf_NSweight );
+		vector unsigned int vui_result_fourth_range =
+			w16_interpolate4( spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range ),
+					  vui_src_ls, vui_srclinestride, vf_x_scale, vf_NSweight );
+
+		// merge results->first,second into bytes 0..7
+		vector unsigned char vuc_result_first_second =
+			spu_shuffle( (vector unsigned char) vui_result_first_range,
+				     (vector unsigned char) vui_result_second_range,
+				     vuc_mask_merge_result_first_second );
+
+		// merge results->third,fourth into bytes 8..15
+		vector unsigned char vuc_result_third_fourth =
+			spu_shuffle( (vector unsigned char) vui_result_third_range,
+				     (vector unsigned char) vui_result_fourth_range,
+				     vuc_mask_merge_result_third_fourth );
+
+		// store result
+		*((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
+							vuc_result_third_fourth );
+		dst += 16;
+	}
+}
+
diff --git a/distrib/sdl-1.2.15/src/video/ps3/spulibs/fb_writer.c b/distrib/sdl-1.2.15/src/video/ps3/spulibs/fb_writer.c
new file mode 100644
index 0000000..0eb51cc
--- /dev/null
+++ b/distrib/sdl-1.2.15/src/video/ps3/spulibs/fb_writer.c
@@ -0,0 +1,193 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ * Martin Lowinski <lowinski [at] de [dot] ibm [dot] com>
+ * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ * SPE code based on research by:
+ * Rene Becker
+ * Thimo Emmerich
+ */
+
+#include "spu_common.h"
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <stdio.h>
+#include <string.h>
+
+// Debugging
+//#define DEBUG
+
+#ifdef DEBUG
+#define deprintf(fmt, args... ) \
+ fprintf( stdout, fmt, ##args ); \
+ fflush( stdout );
+#else
+#define deprintf( fmt, args... )
+#endif
+
+/* Forward declaration; the argument is the first MFC tag id handed over by main() */
+void cpy_to_fb(unsigned int);
+
+/* fb_writer_spu parms */
+/* Filled by a DMA get in main() and read by cpy_to_fb(); 128 byte aligned */
+static volatile struct fb_writer_parms_t parms __attribute__ ((aligned(128)));
+
+/*
+ * Code running on SPU
+ *
+ * Mailbox driven service loop: announces SPU_READY once, then waits for
+ * commands. SPU_START is followed by a second mailbox word holding the
+ * effective address of the parameter block; the block is fetched by DMA,
+ * the image is copied to the framebuffer and SPU_FIN is reported.
+ * SPU_EXIT ends the program; any other word is ignored.
+ *
+ * Returns 0 on SPU_EXIT and also when the MFC tags cannot be reserved.
+ */
+int main(unsigned long long spe_id __attribute__ ((unused)), unsigned long long argp __attribute__ ((unused)))
+{
+	deprintf("[SPU] fb_writer_spu is up... (on SPE #%llu)\n", spe_id);
+	uint32_t parms_ea, msg;
+	unsigned int tag_base;
+
+	// send ready message
+	spu_write_out_mbox(SPU_READY);
+
+	for (;;) {
+		/* Check mailbox */
+		msg = spu_read_in_mbox();
+		deprintf("[SPU] Message is %u\n", msg);
+
+		if (msg == SPU_EXIT) {
+			deprintf("[SPU] fb_writer goes down...\n");
+			return 0;
+		}
+		if (msg != SPU_START) {
+			deprintf("[SPU] Cannot handle message\n");
+			continue;
+		}
+
+		/* Tag Manager setup */
+		tag_base = mfc_multi_tag_reserve(5);
+		if (tag_base == MFC_TAG_INVALID) {
+			deprintf("[SPU] Failed to reserve mfc tags on fb_writer\n");
+			return 0;
+		}
+
+		/* Framebuffer parms: the next mailbox word is their address */
+		parms_ea = spu_read_in_mbox();
+		deprintf("[SPU] Message on fb_writer is %u\n", parms_ea);
+		spu_mfcdma32(&parms, (unsigned int)parms_ea,
+			     sizeof(struct fb_writer_parms_t), tag_base,
+			     MFC_GET_CMD);
+		deprintf("[SPU] argp = %u\n", (unsigned int)argp);
+		DMA_WAIT_TAG(tag_base);
+
+		/* Copy parms->data to framebuffer */
+		deprintf("[SPU] Copying to framebuffer started\n");
+		cpy_to_fb(tag_base);
+		deprintf("[SPU] Copying to framebuffer done!\n");
+
+		mfc_multi_tag_release(tag_base, 5);
+		deprintf("[SPU] fb_writer_spu... done!\n");
+		/* Send FIN msg */
+		spu_write_out_mbox(SPU_FIN);
+	}
+
+	return 0;
+}
+
+/*
+ * cpy_to_fb()
+ *
+ * Copies the image described by the global parms to the framebuffer. Every
+ * line is fetched by DMA into a local store buffer and written back out by
+ * DMA to the framebuffer. Four line buffers are used in rotation, each with
+ * its own tag, and four lines are handled per loop iteration.
+ *
+ * On return no transfer issued by this function is pending any more, so the
+ * caller may release the tags right away.
+ *
+ * NOTE(review): only (bounded_input_height >> 2) * 4 lines are copied, so up
+ * to three trailing lines are dropped when the height is not a multiple of
+ * four - presumably the PPU side guarantees a suitable height; confirm.
+ * NOTE(review): in_line_size is not checked against BUFFER_SIZE - confirm
+ * that the PPU side bounds the width accordingly.
+ *
+ * @param tag_id_base first tag id of the group reserved by the caller;
+ *                    tag_id_base + 1 .. tag_id_base + 4 are used here
+ */
+void cpy_to_fb(unsigned int tag_id_base)
+{
+	unsigned int i;
+	uint8_t *in = parms.data;
+
+	/* Align fb pointer which was centered before (round down to 16 bytes) */
+	uint8_t *fb =
+	    (unsigned char *)((unsigned int)parms.center & 0xFFFFFFF0);
+
+	uint32_t bounded_input_height = parms.bounded_input_height;
+	uint32_t bounded_input_width = parms.bounded_input_width;
+	uint32_t fb_pixel_size = parms.fb_pixel_size;
+
+	uint32_t out_line_stride = parms.out_line_stride;
+	uint32_t in_line_stride = parms.in_line_stride;
+	/* number of bytes transferred per line */
+	uint32_t in_line_size = bounded_input_width * fb_pixel_size;
+
+	/* Local store buffer */
+	static volatile uint8_t buf[4][BUFFER_SIZE]
+	    __attribute__ ((aligned(128)));
+	/* do 4-times multibuffering, four lines per loop iteration */
+	for (i = 0; i < bounded_input_height >> 2; i++) {
+		/* first buffer */
+		// the put of the previous iteration must be done before buf[0] is reused
+		DMA_WAIT_TAG(tag_id_base + 1);
+		// retrieve buffer
+		spu_mfcdma32(buf[0], (unsigned int)in, in_line_size,
+			     tag_id_base + 1, MFC_GETB_CMD);
+		DMA_WAIT_TAG(tag_id_base + 1);
+		// store buffer
+		spu_mfcdma32(buf[0], (unsigned int)fb, in_line_size,
+			     tag_id_base + 1, MFC_PUTB_CMD);
+		in += in_line_stride;
+		fb += out_line_stride;
+		deprintf("[SPU] 1st buffer copied in=0x%x, fb=0x%x\n", in,
+			 fb);
+
+		/* second buffer */
+		DMA_WAIT_TAG(tag_id_base + 2);
+		// retrieve buffer
+		spu_mfcdma32(buf[1], (unsigned int)in, in_line_size,
+			     tag_id_base + 2, MFC_GETB_CMD);
+		DMA_WAIT_TAG(tag_id_base + 2);
+		// store buffer
+		spu_mfcdma32(buf[1], (unsigned int)fb, in_line_size,
+			     tag_id_base + 2, MFC_PUTB_CMD);
+		in += in_line_stride;
+		fb += out_line_stride;
+		deprintf("[SPU] 2nd buffer copied in=0x%x, fb=0x%x\n", in,
+			 fb);
+
+		/* third buffer */
+		DMA_WAIT_TAG(tag_id_base + 3);
+		// retrieve buffer
+		spu_mfcdma32(buf[2], (unsigned int)in, in_line_size,
+			     tag_id_base + 3, MFC_GETB_CMD);
+		DMA_WAIT_TAG(tag_id_base + 3);
+		// store buffer
+		spu_mfcdma32(buf[2], (unsigned int)fb, in_line_size,
+			     tag_id_base + 3, MFC_PUTB_CMD);
+		in += in_line_stride;
+		fb += out_line_stride;
+		deprintf("[SPU] 3rd buffer copied in=0x%x, fb=0x%x\n", in,
+			 fb);
+
+		/* fourth buffer */
+		DMA_WAIT_TAG(tag_id_base + 4);
+		// retrieve buffer
+		spu_mfcdma32(buf[3], (unsigned int)in, in_line_size,
+			     tag_id_base + 4, MFC_GETB_CMD);
+		DMA_WAIT_TAG(tag_id_base + 4);
+		// store buffer
+		spu_mfcdma32(buf[3], (unsigned int)fb, in_line_size,
+			     tag_id_base + 4, MFC_PUTB_CMD);
+		in += in_line_stride;
+		fb += out_line_stride;
+		deprintf("[SPU] 4th buffer copied in=0x%x, fb=0x%x\n", in,
+			 fb);
+		deprintf("[SPU] Loop #%i, bounded_input_height=%i\n", i,
+			 bounded_input_height >> 2);
+	}
+	/* drain the puts of the last iteration on all four buffer tags */
+	DMA_WAIT_TAG(tag_id_base + 1);
+	DMA_WAIT_TAG(tag_id_base + 2);
+	DMA_WAIT_TAG(tag_id_base + 3);
+	DMA_WAIT_TAG(tag_id_base + 4);
+}
+
+
diff --git a/distrib/sdl-1.2.15/src/video/ps3/spulibs/spu_common.h b/distrib/sdl-1.2.15/src/video/ps3/spulibs/spu_common.h
new file mode 100644
index 0000000..42c328c
--- /dev/null
+++ b/distrib/sdl-1.2.15/src/video/ps3/spulibs/spu_common.h
@@ -0,0 +1,108 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ * Martin Lowinski <lowinski [at] de [dot] ibm [dot] com>
+ * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ * SPE code based on research by:
+ * Rene Becker
+ * Thimo Emmerich
+ */
+
+/* Common definitions/macros for SPUs */
+
+#ifndef _SPU_COMMON_H
+#define _SPU_COMMON_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Tag management */
+#define DMA_WAIT_TAG(_tag) \
+ mfc_write_tag_mask(1<<(_tag)); \
+ mfc_read_tag_status_all();
+
+/*
+ * SPU mailbox messages
+ *
+ * As used by main() in yuv2rgb_converter.c:
+ *   SPU_READY - written by the SPU to its outbound mailbox right after start
+ *   SPU_START - read from the inbound mailbox; starts one conversion job
+ *   SPU_FIN   - written by the SPU to its outbound mailbox when a job is done
+ *   SPU_EXIT  - read from the inbound mailbox; makes the SPU program return
+ */
+#define SPU_READY 0
+#define SPU_START 1
+#define SPU_FIN 2
+#define SPU_EXIT 3
+
+/*
+ * Tags (MFC DMA tag group ids)
+ *
+ * RETR_BUF is used as a base: the yuv2rgb converter fetches with tag
+ * RETR_BUF + buf_idx (buf_idx is 0 or 1), and stores with tag STR_BUF.
+ * NOTE(review): RETR_BUF + 1 has the same value as STR_BUF, so waiting on
+ * one of them also waits on the other.
+ * TAG_INIT is not referenced in the code visible here - presumably used by
+ * the other SPU programs; verify before changing.
+ */
+#define RETR_BUF 0
+#define STR_BUF 1
+#define TAG_INIT 2
+
+/* Buffer sizes: maximum supported picture dimensions in pixels */
+#define MAX_HDTV_WIDTH 1920
+#define MAX_HDTV_HEIGHT 1080
+/* One stride of HDTV: 7680 = MAX_HDTV_WIDTH * 4 (presumably 4 bytes per pixel) */
+#define BUFFER_SIZE 7680
+
+/*
+ * fb_writer ppu/spu exchange parms
+ *
+ * Parameter block shared between the PPU and the fb_writer SPU program.
+ * The member order and sizes are part of that exchange format and must be
+ * identical on both sides.
+ * NOTE(review): member meanings are inferred from their names only; the
+ * fb_writer code that consumes them is not fully visible here - confirm.
+ */
+struct fb_writer_parms_t {
+ /* presumably the source pixel data - confirm against fb_writer.c */
+ uint8_t *data;
+ /* presumably the destination position inside the framebuffer - confirm */
+ uint8_t *center;
+ uint32_t out_line_stride;
+ uint32_t in_line_stride;
+ uint32_t bounded_input_height;
+ uint32_t bounded_input_width;
+ uint32_t fb_pixel_size;
+
+ /*
+  * This padding is to fulfill the need for 16 byte alignment. On parm change, update!
+  * With 4-byte pointers the members above occupy 28 bytes, so 4 bytes of
+  * padding bring the payload to 32.  The aligned(128) attribute below
+  * additionally rounds sizeof() of the struct up to a multiple of 128.
+  * NOTE(review): with 8-byte pointers the payload would be 40 bytes and the
+  * member offsets would differ - confirm how the PPU side is built.
+  */
+ char padding[4];
+} __attribute__((aligned(128)));
+
+/*
+ * yuv2rgb ppu/spu exchange parms
+ *
+ * Parameter block that the yuv2rgb SPU program fetches by DMA
+ * (sizeof(struct yuv2rgb_parms_t) bytes) before each conversion job.
+ * The member order and sizes are part of that exchange format.
+ */
+struct yuv2rgb_parms_t {
+ /* Source planes; the converter uses these as DMA source addresses */
+ uint8_t* y_plane;
+ uint8_t* v_plane;
+ uint8_t* u_plane;
+
+ /* Destination of the converted BGRA pixels (DMA target address) */
+ uint8_t* dstBuffer;
+
+ /* Picture dimensions in pixels */
+ unsigned int src_pixel_width;
+ unsigned int src_pixel_height;
+
+ /*
+  * This padding fills the struct up to 128 bytes. On parm change, update!
+  * The expression is 128 minus the payload size modulo 128, where the
+  * payload is 4 pointers plus 2 unsigned ints.
+  */
+ char padding[128 - ((4 * sizeof(uint8_t *) + 2 * sizeof(unsigned int)) & 0x7F)];
+} __attribute__((aligned(128)));
+
+/*
+ * bilin_scaler ppu/spu exchange parms
+ *
+ * Parameter block shared between the PPU and the bilinear scaler SPU
+ * program.  Same leading members as struct yuv2rgb_parms_t, followed by the
+ * destination dimensions.  The member order and sizes are part of the
+ * exchange format.
+ * NOTE(review): the scaler code that consumes this struct is not visible
+ * here; member meanings are taken from their names - confirm.
+ */
+struct scale_parms_t {
+ /* Source planes */
+ uint8_t* y_plane;
+ uint8_t* v_plane;
+ uint8_t* u_plane;
+
+ /* Destination buffer */
+ uint8_t* dstBuffer;
+
+ /* Source picture dimensions in pixels */
+ unsigned int src_pixel_width;
+ unsigned int src_pixel_height;
+
+ /* Destination picture dimensions in pixels */
+ unsigned int dst_pixel_width;
+ unsigned int dst_pixel_height;
+
+ /*
+  * This padding fills the struct up to 128 bytes. On parm change, update!
+  * The expression is 128 minus the payload size modulo 128, where the
+  * payload is 4 pointers plus 4 unsigned ints.
+  */
+ char padding[128 - ((4 * sizeof(uint8_t *) + 4 * sizeof(unsigned int)) & 0x7F)];
+} __attribute__((aligned(128)));
+
+#endif /* _SPU_COMMON_H */
+
+
diff --git a/distrib/sdl-1.2.15/src/video/ps3/spulibs/yuv2rgb_converter.c b/distrib/sdl-1.2.15/src/video/ps3/spulibs/yuv2rgb_converter.c
new file mode 100644
index 0000000..5e16691
--- /dev/null
+++ b/distrib/sdl-1.2.15/src/video/ps3/spulibs/yuv2rgb_converter.c
@@ -0,0 +1,629 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ * Martin Lowinski <lowinski [at] de [dot] ibm [dot] com>
+ * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ * SPE code based on research by:
+ * Rene Becker
+ * Thimo Emmerich
+ */
+
+#include "spu_common.h"
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+// Debugging
+//#define DEBUG
+
+#ifdef DEBUG
+#define deprintf(fmt, args... ) \
+ fprintf( stdout, fmt, ##args ); \
+ fflush( stdout );
+#else
+#define deprintf( fmt, args... )
+#endif
+
+/* Local copy of the job parameters; filled by DMA in main() before each job */
+struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128)));
+
+/* Two input buffers per plane (indexed by buf_idx in the converters).
+ * Each buffer holds 4 lines Y, 2 lines V and 2 lines U, so a maximum of
+ * 8 lines Y, 4 lines V, 4 lines U are stored.
+ * There might be the need to retrieve misaligned data, adjust
+ * incoming v and u plane to be able to handle this (add 128)
+ */
+unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
+unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+
+/* Output buffer: a maximum of 4 lines BGRA are stored, 4 byte per pixel */
+unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));
+
+/* Some vectors needed by the float to int conversion:
+ * vec_255 is the upper and vec_0_1 the lower bound that values are clamped
+ * to before they are converted to integers.
+ */
+static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
+static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
+
+/* Whole-picture converters; they read their input from parms_converter.
+ * main() selects the w32 variant when the width is a multiple of 32 and the
+ * w16 variant otherwise.
+ */
+void yuv_to_rgb_w16();
+void yuv_to_rgb_w32();
+
+/* Converters for two lines of YUV that already reside in local store */
+void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
+
+
+/*
+ * main()
+ *
+ * Mailbox-driven service loop of the yuv2rgb SPU program.
+ *
+ * Announces itself with SPU_READY, then repeatedly reads a command from the
+ * inbound mailbox:
+ *   SPU_EXIT  - leave the program
+ *   SPU_START - read a second mailbox word holding the address of a
+ *               struct yuv2rgb_parms_t, fetch that struct by DMA, run the
+ *               conversion and answer with SPU_FIN
+ *   other     - ignored
+ *
+ * @param spe_id only used for debug output
+ * @param argp   unused
+ * @returns 0 in all cases.  Note that when no MFC tag can be reserved the
+ *          program returns without sending SPU_FIN.
+ */
+int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
+{
+    uint32_t ea_mfc, mbox;
+    unsigned int tag_id;
+
+    deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
+    // send ready message
+    spu_write_out_mbox(SPU_READY);
+
+    for (;;) {
+        /* Check mailbox (blocks until a message is available) */
+        mbox = spu_read_in_mbox();
+        deprintf("[SPU] Message is %u\n", mbox);
+
+        if (mbox == SPU_EXIT) {
+            deprintf("[SPU] yuv2rgb_converter goes down...\n");
+            return 0;
+        }
+        if (mbox != SPU_START) {
+            deprintf("[SPU] Cannot handle message\n");
+            continue;
+        }
+
+        /* Tag Manager setup: one tag for the parameter transfer */
+        tag_id = mfc_multi_tag_reserve(1);
+        if (tag_id == MFC_TAG_INVALID) {
+            deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
+            return 0;
+        }
+
+        /* DMA transfer for the input parameters; their address arrives as
+         * the next mailbox message */
+        ea_mfc = spu_read_in_mbox();
+        deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
+        spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
+        DMA_WAIT_TAG(tag_id);
+
+        /* There are alignment issues that involve handling of special cases
+         * a width of 32 results in a width of 16 in the chrominance
+         * --> choose the proper handling to optimize the performance
+         */
+        deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
+        if (parms_converter.src_pixel_width & 0x1f) {
+            /* width is not a multiple of 32 */
+            deprintf("[SPU] Using yuv_to_rgb_w16\n");
+            yuv_to_rgb_w16();
+        } else {
+            deprintf("[SPU] Using yuv_to_rgb_w32\n");
+            yuv_to_rgb_w32();
+        }
+
+        mfc_multi_tag_release(tag_id, 1);
+        deprintf("[SPU] yuv2rgb_spu... done!\n");
+        /* Send FIN message */
+        spu_write_out_mbox(SPU_FIN);
+    }
+}
+
+
+/*
+ * float_to_char()
+ *
+ * converts a float to a character using saturated
+ * arithmetic: the value is first raised to at least 0.1 (vec_0_1), then
+ * limited to at most 255 (vec_255), and finally cast to unsigned char.
+ *
+ * @param s float for conversion
+ * @returns converted character
+ */
+static inline unsigned char float_to_char(float s) {
+    vector float clamped = spu_splats(s);
+
+    // lower bound: take vec_0_1 in every slot where 0.1 > value
+    clamped = spu_sel(clamped, vec_0_1, spu_cmpgt(vec_0_1, clamped));
+    // upper bound: take vec_255 in every slot where value > 255
+    clamped = spu_sel(clamped, vec_255, spu_cmpgt(clamped, vec_255));
+
+    return (unsigned char) spu_extract(clamped, 0);
+}
+
+
+/*
+ * vfloat_to_vuint()
+ *
+ * converts a float vector to an unsigned int vector using saturated
+ * arithmetic: every element is first raised to at least 0.1 (vec_0_1),
+ * then limited to at most 255 (vec_255), and finally converted.
+ *
+ * @param vec_s float vector for conversion
+ * @returns converted unsigned int vector
+ */
+static inline vector unsigned int vfloat_to_vuint(vector float vec_s) {
+    // lower bound: take vec_0_1 in every slot where 0.1 > value
+    vec_s = spu_sel(vec_s, vec_0_1, spu_cmpgt(vec_0_1, vec_s));
+    // upper bound: take vec_255 in every slot where value > 255
+    vec_s = spu_sel(vec_s, vec_255, spu_cmpgt(vec_s, vec_255));
+
+    return spu_convtu(vec_s, 0);
+}
+
+
+/*
+ * Common signature of yuv_to_rgb_w16_line() and yuv_to_rgb_w32_line():
+ * convert two lines of YUV held in local store into two lines of BGRA.
+ */
+typedef void (*yuv2rgb_line_fn)(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
+
+
+/*
+ * yuv_to_rgb_frame()
+ *
+ * Converts the picture described by parms_converter in blocks of four lines.
+ * For every block, 4 lines Y and 2 lines each of V and U are fetched by DMA
+ * into one of the two local input buffers, converted into the local bgra
+ * buffer and written to parms_converter.dstBuffer in two DMA transfers of
+ * two lines each (max transfer size 16384).
+ *
+ * Only complete blocks are processed: height % 4 trailing lines are not
+ * converted.  Nothing is done at all when the picture has fewer than four
+ * lines, a width of zero, or a width above MAX_HDTV_WIDTH (the local
+ * buffers are dimensioned for MAX_HDTV_WIDTH pixels per line).
+ *
+ * @param convert_two_lines converter applied to each pair of lines
+ */
+static void yuv_to_rgb_frame(yuv2rgb_line_fn convert_two_lines) {
+    // Pixel dimensions of the picture
+    const uint32_t width = parms_converter.src_pixel_width;
+    // Number of complete 4-line blocks
+    const uint32_t blocks = parms_converter.src_pixel_height >> 2;
+
+    // Guard: "blocks - 1" must never be computed for blocks == 0 (unsigned
+    // wrap-around), and the line size must fit into the local buffers
+    if (blocks == 0 || width == 0 || width > MAX_HDTV_WIDTH) {
+        deprintf("[SPU] Unsupported picture size, nothing converted\n");
+        return;
+    }
+
+    // Plane data management (addresses in main memory)
+    unsigned char* ram_addr_y = parms_converter.y_plane;
+    unsigned char* ram_addr_v = parms_converter.v_plane;
+    unsigned char* ram_addr_u = parms_converter.u_plane;
+    unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
+
+    // Strides: chroma lines have half the width of luma lines
+    const unsigned int stride_y = width;
+    const unsigned int stride_vu = width >> 1;
+
+    // Transfer sizes in bytes
+    const unsigned int size_4lines_y = stride_y << 2;
+    const unsigned int size_2lines_y = stride_y << 1;
+    const unsigned int size_2lines_vu = stride_vu << 1;
+    // 2*width*4byte_per_pixel
+    const unsigned int size_2lines_bgra = width << 3;
+
+    // Index of the input buffer that holds the block to convert next
+    unsigned int buf_idx = 0;
+    unsigned int i;
+
+    deprintf("4lines = %u\n", size_4lines_y);
+    deprintf("2lines = %u\n", size_2lines_vu);
+
+    // Fetch the first block and wait for these transfers to be completed
+    spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
+    spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+    spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+    DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+    for (i = 0; i < blocks; i++) {
+        if (i + 1 < blocks) {
+            // Fetch the following block into the other input buffer
+            const unsigned int next_idx = buf_idx ^ 1;
+
+            // 4 lines y
+            spu_mfcdma32(y_plane[next_idx], (unsigned int) ram_addr_y + size_4lines_y, size_4lines_y, RETR_BUF + next_idx, MFC_GET_CMD);
+            // 2 lines v
+            spu_mfcdma32(v_plane[next_idx], (unsigned int) ram_addr_v + size_2lines_vu, size_2lines_vu, RETR_BUF + next_idx, MFC_GET_CMD);
+            // 2 lines u
+            spu_mfcdma32(u_plane[next_idx], (unsigned int) ram_addr_u + size_2lines_vu, size_2lines_vu, RETR_BUF + next_idx, MFC_GET_CMD);
+
+            DMA_WAIT_TAG((RETR_BUF + next_idx));
+        }
+
+        // The previous block may still be on its way out of bgra: wait for
+        // that store to be completed before bgra is overwritten
+        DMA_WAIT_TAG(STR_BUF);
+
+        // Convert YUV to BGRA (first two lines)
+        convert_two_lines(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+        // Next two lines
+        convert_two_lines(y_plane[buf_idx] + size_2lines_y,
+                          v_plane[buf_idx] + stride_vu,
+                          u_plane[buf_idx] + stride_vu,
+                          bgra + size_2lines_bgra,
+                          width);
+
+        // Store converted lines in two steps->max transfer size 16384
+        spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+        ram_addr_bgra += size_2lines_bgra;
+        spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+        ram_addr_bgra += size_2lines_bgra;
+
+        // Move 4 lines
+        ram_addr_y += size_4lines_y;
+        ram_addr_v += size_2lines_vu;
+        ram_addr_u += size_2lines_vu;
+
+        buf_idx ^= 1;
+    }
+
+    // Wait for the last storing transfer to be completed
+    DMA_WAIT_TAG(STR_BUF);
+}
+
+
+/*
+ * yuv_to_rgb_w16()
+ *
+ * Converts the picture described by parms_converter using
+ * yuv_to_rgb_w16_line(); main() selects it when the width is not a
+ * multiple of 32.
+ */
+void yuv_to_rgb_w16(void) {
+    yuv_to_rgb_frame(yuv_to_rgb_w16_line);
+}
+
+
+/*
+ * yuv_to_rgb_w32()
+ *
+ * Converts the picture described by parms_converter using
+ * yuv_to_rgb_w32_line(); main() selects it when the width is a multiple
+ * of 32.
+ */
+void yuv_to_rgb_w32(void) {
+    yuv_to_rgb_frame(yuv_to_rgb_w32_line);
+}
+
+
+/* Some vectors needed by the yuv 2 rgb conversion algorithm */
+
+/* Added to the U and V values to center them around zero */
+const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
+
+/* All-zero vector; first operand of the char-to-int shuffles below */
+const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+/* Shuffle patterns used as spu_shuffle(vec_null, chars, pattern): indices
+ * 0x00 pick a zero byte from vec_null, indices 0x10..0x1F pick bytes 0..15
+ * of the character vector.  Each pattern therefore widens four consecutive
+ * bytes (first: 0-3, second: 4-7, third: 8-11, fourth: 12-15) to four
+ * 32 bit integers.
+ */
+const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
+const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
+const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
+const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
+
+/* Conversion coefficients: R = Y + 1.403 V', G = Y - 0.344 U' - 0.714 V',
+ * B = Y + 1.773 U' (U', V' being the values centered around zero)
+ */
+const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
+const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
+const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
+const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
+
+/* Alpha value 255 in bits 24..31 of every pixel; unsigned constant, because
+ * 255 << 24 does not fit into a signed int
+ */
+const vector unsigned int vec_alpha = { 255u << 24, 255u << 24, 255u << 24, 255u << 24 };
+
+/* Shuffle patterns that duplicate floats: "upper" turns (f0, f1, f2, f3)
+ * into (f0, f0, f1, f1), "lower" into (f2, f2, f3, f3), so that one chroma
+ * derived value is available for two horizontally adjacent pixels
+ */
+const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
+const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
+
+
+/*
+ * yuv2rgb_pack_bgra()
+ *
+ * Combines one luminance value with the precalculated chroma terms and packs
+ * the saturated result into one pixel: B in bits 0..7, G in bits 8..15,
+ * R in bits 16..23 and alpha 255 in bits 24..31.
+ *
+ * @param y     luminance of the pixel
+ * @param r_pre chroma term added to y for the red channel
+ * @param g_pre chroma term added to y for the green channel
+ * @param b_pre chroma term added to y for the blue channel
+ * @returns the packed pixel
+ */
+static inline unsigned int yuv2rgb_pack_bgra(unsigned char y, float r_pre, float g_pre, float b_pre) {
+    const unsigned int r = float_to_char(y + r_pre);
+    const unsigned int g = float_to_char(y + g_pre);
+    const unsigned int b = float_to_char(y + b_pre);
+
+    // unsigned constant: 255 << 24 would overflow a signed int
+    return b | (g << 8) | (r << 16) | 0xFF000000u;
+}
+
+
+/*
+ * yuv_to_rgb_w16_line()
+ *
+ * processes two lines of yuv-input, width has to be a multiple of 16
+ * (the loop itself consumes two pixels per step)
+ * two lines of yuv are taken as input
+ *
+ * @param y_addr address of the y plane in local store
+ * @param v_addr address of the v plane in local store
+ * @param u_addr address of the u plane in local store
+ * @param bgra_addr_ address of the bgra output buffer
+ * @param width the width in pixel
+ */
+void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
+    // each pixel is stored as an integer
+    unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
+
+    unsigned int x;
+    // Step through the line two pixels at a time, since each u and v value
+    // applies to 4 pixels (two high, two wide)
+    for(x = 0; x < width; x+=2) {
+        // The 2x2 pixel block: upper left/right, lower left/right
+        const unsigned char Y_1 = *(y_addr + x);
+        const unsigned char Y_2 = *(y_addr + x + 1);
+        const unsigned char Y_3 = *(y_addr + x + width);
+        const unsigned char Y_4 = *(y_addr + x + width + 1);
+        const unsigned char U = *(u_addr + (x >> 1));
+        const unsigned char V = *(v_addr + (x >> 1));
+
+        const float V_minus_128 = (float)V - 128.0f;
+        const float U_minus_128 = (float)U - 128.0f;
+
+        // Chroma terms are identical for all four pixels of the block
+        const float R_precalculate = 1.403f * V_minus_128;
+        const float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
+        const float B_precalculate = 1.773f * U_minus_128;
+
+        *(bgra_addr + x) = yuv2rgb_pack_bgra(Y_1, R_precalculate, G_precalculate, B_precalculate);
+        *(bgra_addr + x + 1) = yuv2rgb_pack_bgra(Y_2, R_precalculate, G_precalculate, B_precalculate);
+        *(bgra_addr + x + width) = yuv2rgb_pack_bgra(Y_3, R_precalculate, G_precalculate, B_precalculate);
+        *(bgra_addr + x + width + 1) = yuv2rgb_pack_bgra(Y_4, R_precalculate, G_precalculate, B_precalculate);
+    }
+}
+
+
+/*
+ * yuv_to_rgb_w32_line()
+ *
+ * processes two lines of yuv-input, width has to be a multiple of 32
+ * two lines of yuv are taken as input
+ *
+ * @param y_addr address of the y plane in local store
+ * @param v_addr address of the v plane in local store
+ * @param u_addr address of the u plane in local store
+ * @param bgra_addr_ address of the bgra output buffer
+ * @param width the width in pixel
+ */
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
+ // each pixel is stored as an integer
+ unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
+
+ unsigned int x;
+ for(x = 0; x < width; x+=32) {
+ // Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt
+
+ const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
+ const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
+ const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
+ const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
+ const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
+ const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));
+
+ const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128);
+ const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
+ const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
+ const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);
+
+ const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
+ const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
+ const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
+ const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);
+
+ vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
+ vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
+ vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
+ vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
+ vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
+ vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
+ vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
+ vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
+ vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
+ vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
+ vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
+ vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
+ vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
+ vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
+ vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
+ vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);
+
+ const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
+ const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
+ const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
+ const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);
+
+ const vector float R1_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_upper);
+ const vector float R2_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_lower);
+ const vector float R3_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_upper);
+ const vector float R4_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_lower);
+ const vector float R5_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_upper);
+ const vector float R6_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_lower);
+ const vector float R7_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_upper);
+ const vector float R8_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_lower);
+
+
+ const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
+ const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
+ const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
+ const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));
+
+ const vector float G1_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_upper);
+ const vector float G2_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_lower);
+ const vector float G3_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_upper);
+ const vector float G4_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_lower);
+ const vector float G5_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_upper);
+ const vector float G6_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_lower);
+ const vector float G7_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_upper);
+ const vector float G8_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_lower);
+
+
+ const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
+ const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
+ const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
+ const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);
+
+ const vector float B1_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_upper);
+ const vector float B2_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_lower);
+ const vector float B3_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_upper);
+ const vector float B4_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_lower);
+ const vector float B5_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_upper);
+ const vector float B6_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_lower);
+ const vector float B7_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_upper);
+ const vector float B8_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_lower);
+
+
+ const vector unsigned int R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
+ const vector unsigned int R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
+ const vector unsigned int R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
+ const vector unsigned int R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
+ const vector unsigned int R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
+ const vector unsigned int R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
+ const vector unsigned int R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
+ const vector unsigned int R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
+ const vector unsigned int R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
+ const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
+ const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
+ const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
+ const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
+ const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
+ const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
+ const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));
+
+ const vector unsigned int G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
+ const vector unsigned int G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
+ const vector unsigned int G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
+ const vector unsigned int G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
+ const vector unsigned int G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
+ const vector unsigned int G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
+ const vector unsigned int G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
+ const vector unsigned int G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
+ const vector unsigned int G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
+ const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
+ const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
+ const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
+ const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
+ const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
+ const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
+ const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));
+
+ const vector unsigned int B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
+ const vector unsigned int B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
+ const vector unsigned int B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
+ const vector unsigned int B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
+ const vector unsigned int B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
+ const vector unsigned int B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
+ const vector unsigned int B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
+ const vector unsigned int B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
+ const vector unsigned int B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
+ const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
+ const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
+ const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
+ const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
+ const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
+ const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
+ const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));
+
+ *((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha, B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1)));
+ *((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha, B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
+ *((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha, B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
+ *((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha, B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
+ *((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha, B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
+ *((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha, B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
+ *((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha, B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
+ *((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha, B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
+ *((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha, B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1)));
+ *((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
+ *((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
+ *((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
+ *((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
+ *((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
+ *((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
+ *((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
+ }
+}
+