/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 **************************************************************************/


/**
 * @file
 * YUV pixel format manipulation.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_format.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_arit.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_gather.h"
#include "lp_bld_format.h"
#include "lp_bld_init.h"
#include "lp_bld_logic.h"

/**
 * Extract Y, U, V channels from packed UYVY.
 * @param packed  is a <n x i32> vector with the packed UYVY blocks
 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
 */
static void
uyvy_to_yuv_soa(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef packed,
                LLVMValueRef i,
                LLVMValueRef *y,
                LLVMValueRef *u,
                LLVMValueRef *v)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   LLVMValueRef mask;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, packed));
   assert(lp_check_value(type, i));

   /*
    * Little endian:
    * y = (uyvy >> (16*i + 8)) & 0xff
    * u = (uyvy        ) & 0xff
    * v = (uyvy >> 16  ) & 0xff
    *
    * Big endian:
    * y = (uyvy >> (-16*i + 16)) & 0xff
    * u = (uyvy >> 24) & 0xff
    * v = (uyvy >>  8) & 0xff
    */

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * Avoid shift with per-element count.
    * No support on x86, gets translated to roughly 5 instructions
    * per element. Didn't measure performance but cuts shader size
    * by quite a bit (less difference if cpu has no sse4.1 support).
    */
   if (util_cpu_caps.has_sse2 && n > 1) {
      LLVMValueRef sel, tmp, tmp2;
      struct lp_build_context bld32;

      lp_build_context_init(&bld32, gallivm, type);

      tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 8), "");
      tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 16), "");
      sel = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(gallivm, type, 0));
      *y = lp_build_select(&bld32, sel, tmp, tmp2);
   } else
#endif
   {
      LLVMValueRef shift;
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, 16), "");
      shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(gallivm, type, 8), "");
#else
      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, -16), "");
      shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(gallivm, type, 16), "");
#endif
      *y = LLVMBuildLShr(builder, packed, shift, "");
   }

#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *u = packed;
   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 16), "");
#else
   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 24), "");
   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 8), "");
#endif

   mask = lp_build_const_int_vec(gallivm, type, 0xff);

   *y = LLVMBuildAnd(builder, *y, mask, "y");
   *u = LLVMBuildAnd(builder, *u, mask, "u");
   *v = LLVMBuildAnd(builder, *v, mask, "v");
}


/**
 * Extract Y, U, V channels from packed YUYV.
 * @param packed  is a <n x i32> vector with the packed YUYV blocks
 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
 */
static void
yuyv_to_yuv_soa(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef packed,
                LLVMValueRef i,
                LLVMValueRef *y,
                LLVMValueRef *u,
                LLVMValueRef *v)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   LLVMValueRef mask;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, packed));
   assert(lp_check_value(type, i));

   /*
   * Little endian:
    * y = (yuyv >> 16*i) & 0xff
    * u = (yuyv >> 8   ) & 0xff
    * v = (yuyv >> 24  ) & 0xff
    *
    * Big endian:
    * y = (yuyv >> (-16*i + 24) & 0xff
    * u = (yuyv >> 16)          & 0xff
    * v = (yuyv)                & 0xff
    */

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * Avoid shift with per-element count.
    * No support on x86, gets translated to roughly 5 instructions
    * per element. Didn't measure performance but cuts shader size
    * by quite a bit (less difference if cpu has no sse4.1 support).
    */
   if (util_cpu_caps.has_sse2 && n > 1) {
      LLVMValueRef sel, tmp;
      struct lp_build_context bld32;

      lp_build_context_init(&bld32, gallivm, type);

      tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 16), "");
      sel = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(gallivm, type, 0));
       *y = lp_build_select(&bld32, sel, packed, tmp);
   } else
#endif
   {
      LLVMValueRef shift;
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, 16), "");
#else
      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, -16), "");
      shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(gallivm, type, 24), "");
#endif
      *y = LLVMBuildLShr(builder, packed, shift, "");
   }

#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 8), "");
   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 24), "");
#else
   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 16), "");
   *v = packed;
#endif

   mask = lp_build_const_int_vec(gallivm, type, 0xff);

   *y = LLVMBuildAnd(builder, *y, mask, "y");
   *u = LLVMBuildAnd(builder, *u, mask, "u");
   *v = LLVMBuildAnd(builder, *v, mask, "v");
}


static INLINE void
yuv_to_rgb_soa(struct gallivm_state *gallivm,
               unsigned n,
               LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
               LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   struct lp_build_context bld;

   LLVMValueRef c0;
   LLVMValueRef c8;
   LLVMValueRef c16;
   LLVMValueRef c128;
   LLVMValueRef c255;

   LLVMValueRef cy;
   LLVMValueRef cug;
   LLVMValueRef cub;
   LLVMValueRef cvr;
   LLVMValueRef cvg;

   memset(&type, 0, sizeof type);
   type.sign = TRUE;
   type.width = 32;
   type.length = n;

   lp_build_context_init(&bld, gallivm, type);

   assert(lp_check_value(type, y));
   assert(lp_check_value(type, u));
   assert(lp_check_value(type, v));

   /*
    * Constants
    */

   c0   = lp_build_const_int_vec(gallivm, type,   0);
   c8   = lp_build_const_int_vec(gallivm, type,   8);
   c16  = lp_build_const_int_vec(gallivm, type,  16);
   c128 = lp_build_const_int_vec(gallivm, type, 128);
   c255 = lp_build_const_int_vec(gallivm, type, 255);

   cy  = lp_build_const_int_vec(gallivm, type,  298);
   cug = lp_build_const_int_vec(gallivm, type, -100);
   cub = lp_build_const_int_vec(gallivm, type,  516);
   cvr = lp_build_const_int_vec(gallivm, type,  409);
   cvg = lp_build_const_int_vec(gallivm, type, -208);

   /*
    *  y -= 16;
    *  u -= 128;
    *  v -= 128;
    */

   y = LLVMBuildSub(builder, y, c16, "");
   u = LLVMBuildSub(builder, u, c128, "");
   v = LLVMBuildSub(builder, v, c128, "");

   /*
    * r = 298 * _y            + 409 * _v + 128;
    * g = 298 * _y - 100 * _u - 208 * _v + 128;
    * b = 298 * _y + 516 * _u            + 128;
    */

   y = LLVMBuildMul(builder, y, cy, "");
   y = LLVMBuildAdd(builder, y, c128, "");

   *r = LLVMBuildMul(builder, v, cvr, "");
   *g = LLVMBuildAdd(builder,
                     LLVMBuildMul(builder, u, cug, ""),
                     LLVMBuildMul(builder, v, cvg, ""),
                     "");
   *b = LLVMBuildMul(builder, u, cub, "");

   *r = LLVMBuildAdd(builder, *r, y, "");
   *g = LLVMBuildAdd(builder, *g, y, "");
   *b = LLVMBuildAdd(builder, *b, y, "");

   /*
    * r >>= 8;
    * g >>= 8;
    * b >>= 8;
    */

   *r = LLVMBuildAShr(builder, *r, c8, "r");
   *g = LLVMBuildAShr(builder, *g, c8, "g");
   *b = LLVMBuildAShr(builder, *b, c8, "b");

   /*
    * Clamp
    */

   *r = lp_build_clamp(&bld, *r, c0, c255);
   *g = lp_build_clamp(&bld, *g, c0, c255);
   *b = lp_build_clamp(&bld, *b, c0, c255);
}


static LLVMValueRef
rgb_to_rgba_aos(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef r, LLVMValueRef g, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   LLVMValueRef a;
   LLVMValueRef rgba;

   memset(&type, 0, sizeof type);
   type.sign = TRUE;
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, r));
   assert(lp_check_value(type, g));
   assert(lp_check_value(type, b));

   /*
    * Make a 4 x unorm8 vector
    */

#ifdef PIPE_ARCH_LITTLE_ENDIAN
   r = r;
   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 8), "");
   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, type, 16), "");
   a = lp_build_const_int_vec(gallivm, type, 0xff000000);
#else
   r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type, 24), "");
   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 16), "");
   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, type, 8), "");
   a = lp_build_const_int_vec(gallivm, type, 0x000000ff);
#endif

   rgba = r;
   rgba = LLVMBuildOr(builder, rgba, g, "");
   rgba = LLVMBuildOr(builder, rgba, b, "");
   rgba = LLVMBuildOr(builder, rgba, a, "");

   rgba = LLVMBuildBitCast(builder, rgba,
                           LLVMVectorType(LLVMInt8TypeInContext(gallivm->context), 4*n), "");

   return rgba;
}


/**
 * Convert from <n x i32> packed UYVY to <4n x i8> RGBA AoS
 */
static LLVMValueRef
uyvy_to_rgba_aos(struct gallivm_state *gallivm,
                 unsigned n,
                 LLVMValueRef packed,
                 LLVMValueRef i)
{
   LLVMValueRef y, u, v;
   LLVMValueRef r, g, b;
   LLVMValueRef rgba;

   uyvy_to_yuv_soa(gallivm, n, packed, i, &y, &u, &v);
   yuv_to_rgb_soa(gallivm, n, y, u, v, &r, &g, &b);
   rgba = rgb_to_rgba_aos(gallivm, n, r, g, b);

   return rgba;
}


/**
 * Convert from <n x i32> packed YUYV to <4n x i8> RGBA AoS
 */
static LLVMValueRef
yuyv_to_rgba_aos(struct gallivm_state *gallivm,
                 unsigned n,
                 LLVMValueRef packed,
                 LLVMValueRef i)
{
   LLVMValueRef y, u, v;
   LLVMValueRef r, g, b;
   LLVMValueRef rgba;

   yuyv_to_yuv_soa(gallivm, n, packed, i, &y, &u, &v);
   yuv_to_rgb_soa(gallivm, n, y, u, v, &r, &g, &b);
   rgba = rgb_to_rgba_aos(gallivm, n, r, g, b);

   return rgba;
}


/**
 * Convert from <n x i32> packed RG_BG to <4n x i8> RGBA AoS
 */
static LLVMValueRef
rgbg_to_rgba_aos(struct gallivm_state *gallivm,
                 unsigned n,
                 LLVMValueRef packed,
                 LLVMValueRef i)
{
   LLVMValueRef r, g, b;
   LLVMValueRef rgba;

   uyvy_to_yuv_soa(gallivm, n, packed, i, &g, &r, &b);
   rgba = rgb_to_rgba_aos(gallivm, n, r, g, b);

   return rgba;
}


/**
 * Convert from <n x i32> packed GR_GB to <4n x i8> RGBA AoS
 */
static LLVMValueRef
grgb_to_rgba_aos(struct gallivm_state *gallivm,
                 unsigned n,
                 LLVMValueRef packed,
                 LLVMValueRef i)
{
   LLVMValueRef r, g, b;
   LLVMValueRef rgba;

   yuyv_to_yuv_soa(gallivm, n, packed, i, &g, &r, &b);
   rgba = rgb_to_rgba_aos(gallivm, n, r, g, b);

   return rgba;
}

/**
 * Convert from <n x i32> packed GR_BR to <4n x i8> RGBA AoS
 */
static LLVMValueRef
grbr_to_rgba_aos(struct gallivm_state *gallivm,
                 unsigned n,
                 LLVMValueRef packed,
                 LLVMValueRef i)
{
   LLVMValueRef r, g, b;
   LLVMValueRef rgba;

   uyvy_to_yuv_soa(gallivm, n, packed, i, &r, &g, &b);
   rgba = rgb_to_rgba_aos(gallivm, n, r, g, b);

   return rgba;
}


/**
 * Convert from <n x i32> packed RG_RB to <4n x i8> RGBA AoS
 */
static LLVMValueRef
rgrb_to_rgba_aos(struct gallivm_state *gallivm,
                 unsigned n,
                 LLVMValueRef packed,
                 LLVMValueRef i)
{
   LLVMValueRef r, g, b;
   LLVMValueRef rgba;

   yuyv_to_yuv_soa(gallivm, n, packed, i, &r, &g, &b);
   rgba = rgb_to_rgba_aos(gallivm, n, r, g, b);

   return rgba;
}

/**
 * @param n  is the number of pixels processed
 * @param packed  is a <n x i32> vector with the packed YUYV blocks
 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
 * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
 */
LLVMValueRef
lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
                                   const struct util_format_description *format_desc,
                                   unsigned n,
                                   LLVMValueRef base_ptr,
                                   LLVMValueRef offset,
                                   LLVMValueRef i,
                                   LLVMValueRef j)
{
   LLVMValueRef packed;
   LLVMValueRef rgba;

   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
   assert(format_desc->block.bits == 32);
   assert(format_desc->block.width == 2);
   assert(format_desc->block.height == 1);

   packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE);

   (void)j;

   switch (format_desc->format) {
   case PIPE_FORMAT_UYVY:
      rgba = uyvy_to_rgba_aos(gallivm, n, packed, i);
      break;
   case PIPE_FORMAT_YUYV:
      rgba = yuyv_to_rgba_aos(gallivm, n, packed, i);
      break;
   case PIPE_FORMAT_R8G8_B8G8_UNORM:
      rgba = rgbg_to_rgba_aos(gallivm, n, packed, i);
      break;
   case PIPE_FORMAT_G8R8_G8B8_UNORM:
      rgba = grgb_to_rgba_aos(gallivm, n, packed, i);
      break;
   case PIPE_FORMAT_G8R8_B8R8_UNORM:
      rgba = grbr_to_rgba_aos(gallivm, n, packed, i);
      break;
   case PIPE_FORMAT_R8G8_R8B8_UNORM:
      rgba = rgrb_to_rgba_aos(gallivm, n, packed, i);
      break;
   default:
      assert(0);
      rgba =  LLVMGetUndef(LLVMVectorType(LLVMInt8TypeInContext(gallivm->context), 4*n));
      break;
   }

   return rgba;
}