/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Helper functions for logical operations.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_cpu_detect.h"
#include "util/u_memory.h"
#include "util/u_debug.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_debug.h"
#include "lp_bld_logic.h"


/*
 * XXX
 *
 * Selection with vector conditional like
 *
 *    select <4 x i1> %C, %A, %B
 *
 * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
 * supported on some backends (x86) starting with llvm 3.1.
 *
 * Expanding the boolean vector to full SIMD register width, as in
 *
 *    sext <4 x i1> %C to <4 x i32>
 *
 * is valid and supported (e.g., llvm/test/CodeGen/X86/vec_compare.ll), but
 * it causes assertion failures in LLVM 2.6. It appears to work correctly on 
 * LLVM 2.7.
 */


/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func  one of PIPE_FUNC_x
 * If the ordered argument is true the function will use LLVM's ordered
 * comparisons, otherwise unordered comparisons will be used.
 * The result values will be 0 for false or ~0 for true.
 */
static LLVMValueRef
lp_build_compare_ext(struct gallivm_state *gallivm,
                     const struct lp_type type,
                     unsigned func,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     boolean ordered)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(func == PIPE_FUNC_NEVER)
      return zeros;
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

   assert(func > PIPE_FUNC_NEVER);
   assert(func < PIPE_FUNC_ALWAYS);

   if(type.floating) {
      LLVMRealPredicate op;
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = ordered ? LLVMRealOEQ : LLVMRealUEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = ordered ? LLVMRealONE : LLVMRealUNE;
         break;
      case PIPE_FUNC_LESS:
         op = ordered ? LLVMRealOLT : LLVMRealULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = ordered ? LLVMRealOLE : LLVMRealULE;
         break;
      case PIPE_FUNC_GREATER:
         op = ordered ? LLVMRealOGT : LLVMRealUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = ordered ? LLVMRealOGE : LLVMRealUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

      cond = LLVMBuildFCmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
   }
   else {
      LLVMIntPredicate op;
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = LLVMIntEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMIntNE;
         break;
      case PIPE_FUNC_LESS:
         op = type.sign ? LLVMIntSLT : LLVMIntULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = type.sign ? LLVMIntSLE : LLVMIntULE;
         break;
      case PIPE_FUNC_GREATER:
         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

      cond = LLVMBuildICmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
   }

   return res;
}

/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func  one of PIPE_FUNC_x
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_compare(struct gallivm_state *gallivm,
                 const struct lp_type type,
                 unsigned func,
                 LLVMValueRef a,
                 LLVMValueRef b)
{
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(func == PIPE_FUNC_NEVER)
      return zeros;
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

   assert(func > PIPE_FUNC_NEVER);
   assert(func < PIPE_FUNC_ALWAYS);

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * There are no unsigned integer comparison instructions in SSE.
    */

   if (!type.floating && !type.sign &&
       type.width * type.length == 128 &&
       util_cpu_caps.has_sse2 &&
       (func == PIPE_FUNC_LESS ||
        func == PIPE_FUNC_LEQUAL ||
        func == PIPE_FUNC_GREATER ||
        func == PIPE_FUNC_GEQUAL) &&
       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
                      __FUNCTION__, type.length, type.width);
   }
#endif

   return lp_build_compare_ext(gallivm, type, func, a, b, FALSE);
}

/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func  one of PIPE_FUNC_x
 * If the operands are floating point numbers, the function will use
 * ordered comparison which means that it will return true if both
 * operands are not a NaN and the specified condition evaluates to true.
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_cmp_ordered(struct lp_build_context *bld,
                     unsigned func,
                     LLVMValueRef a,
                     LLVMValueRef b)
{
   return lp_build_compare_ext(bld->gallivm, bld->type, func, a, b, TRUE);
}

/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func  one of PIPE_FUNC_x
 * If the operands are floating point numbers, the function will use
 * unordered comparison which means that it will return true if either
 * operand is a NaN or the specified condition evaluates to true.
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_cmp(struct lp_build_context *bld,
             unsigned func,
             LLVMValueRef a,
             LLVMValueRef b)
{
   return lp_build_compare(bld->gallivm, bld->type, func, a, b);
}


/**
 * Return (mask & a) | (~mask & b);
 */
LLVMValueRef
lp_build_select_bitwise(struct lp_build_context *bld,
                        LLVMValueRef mask,
                        LLVMValueRef a,
                        LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == b) {
      return a;
   }

   if(type.floating) {
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      b = LLVMBuildBitCast(builder, b, int_vec_type, "");
   }

   a = LLVMBuildAnd(builder, a, mask, "");

   /* This often gets translated to PANDN, but sometimes the NOT is
    * pre-computed and stored in another constant. The best strategy depends
    * on available registers, so it is not a big deal -- hopefully LLVM does
    * the right decision attending the rest of the program.
    */
   b = LLVMBuildAnd(builder, b, LLVMBuildNot(builder, mask, ""), "");

   res = LLVMBuildOr(builder, a, b, "");

   if(type.floating) {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }

   return res;
}


/**
 * Return mask ? a : b;
 *
 * mask is a bitwise mask, composed of 0 or ~0 for each element. Any other value
 * will yield unpredictable results.
 */
LLVMValueRef
lp_build_select(struct lp_build_context *bld,
                LLVMValueRef mask,
                LLVMValueRef a,
                LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMContextRef lc = bld->gallivm->context;
   struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == b)
      return a;

   if (type.length == 1) {
      mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (!(HAVE_LLVM == 0x0307) &&
            (LLVMIsConstant(mask) ||
             LLVMGetInstructionOpcode(mask) == LLVMSExt)) {
      /* Generate a vector select.
       *
       * Using vector selects should avoid emitting intrinsics hence avoid
       * hindering optimization passes, but vector selects weren't properly
       * supported yet for a long time, and LLVM will generate poor code when
       * the mask is not the result of a comparison.
       * Also, llvm 3.7 may miscompile them (bug 94972).
       */

      /* Convert the mask to a vector of booleans.
       *
       * XXX: In x86 the mask is controlled by the MSB, so if we shifted the
       * mask by `type.width - 1`, LLVM should realize the mask is ready.  Alas
       * what really happens is that LLVM will emit two shifts back to back.
       */
      if (0) {
         LLVMValueRef shift = LLVMConstInt(bld->int_elem_type, bld->type.width - 1, 0);
         shift = lp_build_broadcast(bld->gallivm, bld->int_vec_type, shift);
         mask = LLVMBuildLShr(builder, mask, shift, "");
      }
      LLVMTypeRef bool_vec_type = LLVMVectorType(LLVMInt1TypeInContext(lc), type.length);
      mask = LLVMBuildTrunc(builder, mask, bool_vec_type, "");

      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (((util_cpu_caps.has_sse4_1 &&
              type.width * type.length == 128) ||
             (util_cpu_caps.has_avx &&
              type.width * type.length == 256 && type.width >= 32) ||
             (util_cpu_caps.has_avx2 &&
              type.width * type.length == 256)) &&
            !LLVMIsConstant(a) &&
            !LLVMIsConstant(b) &&
            !LLVMIsConstant(mask)) {
      const char *intrinsic;
      LLVMTypeRef arg_type;
      LLVMValueRef args[3];

      /*
       *  There's only float blend in AVX but can just cast i32/i64
       *  to float.
       */
      if (type.width * type.length == 256) {
         if (type.width == 64) {
           intrinsic = "llvm.x86.avx.blendv.pd.256";
           arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
         }
         else if (type.width == 32) {
            intrinsic = "llvm.x86.avx.blendv.ps.256";
            arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
         } else {
            assert(util_cpu_caps.has_avx2);
            intrinsic = "llvm.x86.avx2.pblendvb";
            arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
         }
      }
      else if (type.floating &&
               type.width == 64) {
         intrinsic = "llvm.x86.sse41.blendvpd";
         arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
      } else if (type.floating &&
                 type.width == 32) {
         intrinsic = "llvm.x86.sse41.blendvps";
         arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 4);
      } else {
         intrinsic = "llvm.x86.sse41.pblendvb";
         arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 16);
      }

      if (arg_type != bld->int_vec_type) {
         mask = LLVMBuildBitCast(builder, mask, arg_type, "");
      }

      if (arg_type != bld->vec_type) {
         a = LLVMBuildBitCast(builder, a, arg_type, "");
         b = LLVMBuildBitCast(builder, b, arg_type, "");
      }

      args[0] = b;
      args[1] = a;
      args[2] = mask;

      res = lp_build_intrinsic(builder, intrinsic,
                               arg_type, args, ARRAY_SIZE(args), 0);

      if (arg_type != bld->vec_type) {
         res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
      }
   }
   else {
      res = lp_build_select_bitwise(bld, mask, a, b);
   }

   return res;
}


/**
 * Return mask ? a : b;
 *
 * mask is a TGSI_WRITEMASK_xxx.
 */
LLVMValueRef
lp_build_select_aos(struct lp_build_context *bld,
                    unsigned mask,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    unsigned num_channels)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const unsigned n = type.length;
   unsigned i, j;

   assert((mask & ~0xf) == 0);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == b)
      return a;
   if((mask & 0xf) == 0xf)
      return a;
   if((mask & 0xf) == 0x0)
      return b;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   /*
    * There are two major ways of accomplishing this:
    * - with a shuffle
    * - with a select
    *
    * The flip between these is empirical and might need to be adjusted.
    */
   if (n <= 4) {
      /*
       * Shuffle.
       */
      LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

      for(j = 0; j < n; j += num_channels)
         for(i = 0; i < num_channels; ++i)
            shuffles[j + i] = LLVMConstInt(elem_type,
                                           (mask & (1 << i) ? 0 : n) + j + i,
                                           0);

      return LLVMBuildShuffleVector(builder, a, b, LLVMConstVector(shuffles, n), "");
   }
   else {
      LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask, num_channels);
      return lp_build_select(bld, mask_vec, a, b);
   }
}


/**
 * Return (scalar-cast)val ? true : false;
 */
LLVMValueRef
lp_build_any_true_range(struct lp_build_context *bld,
                        unsigned real_length,
                        LLVMValueRef val)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef scalar_type;
   LLVMTypeRef true_type;

   assert(real_length <= bld->type.length);

   true_type = LLVMIntTypeInContext(bld->gallivm->context,
                                    bld->type.width * real_length);
   scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
                                      bld->type.width * bld->type.length);
   val = LLVMBuildBitCast(builder, val, scalar_type, "");
   /*
    * We're using always native types so we can use intrinsics.
    * However, if we don't do per-element calculations, we must ensure
    * the excess elements aren't used since they may contain garbage.
    */
   if (real_length < bld->type.length) {
      val = LLVMBuildTrunc(builder, val, true_type, "");
   }
   return LLVMBuildICmp(builder, LLVMIntNE,
                        val, LLVMConstNull(true_type), "");
}