diff options
12 files changed, 1439 insertions, 1404 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 4540105..cc773d7 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -72,6 +72,7 @@ #endif using namespace llvm; +using namespace SwrJit; ////////////////////////////////////////////////////////////////////////// /// @brief Contructor for JitManager. diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index f474143..7c0eaa9 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -86,7 +86,6 @@ using PassManager = llvm::legacy::PassManager; #pragma pop_macro("DEBUG") -using namespace llvm; ////////////////////////////////////////////////////////////////////////// /// JitInstructionSet /// @brief Subclass of InstructionSet that allows users to override @@ -136,7 +135,7 @@ private: -struct JitLLVMContext : LLVMContext +struct JitLLVMContext : llvm::LLVMContext { }; @@ -150,32 +149,32 @@ struct JitManager ~JitManager(){}; JitLLVMContext mContext; ///< LLVM compiler - IRBuilder<> mBuilder; ///< LLVM IR Builder - ExecutionEngine* mpExec; + llvm::IRBuilder<> mBuilder; ///< LLVM IR Builder + llvm::ExecutionEngine* mpExec; // Need to be rebuilt after a JIT and before building new IR - Module* mpCurrentModule; + llvm::Module* mpCurrentModule; bool mIsModuleFinalized; uint32_t mJitNumber; uint32_t mVWidth; // Built in types. - Type* mInt8Ty; - Type* mInt32Ty; - Type* mInt64Ty; - Type* mFP32Ty; - StructType* mV4FP32Ty; - StructType* mV4Int32Ty; + llvm::Type* mInt8Ty; + llvm::Type* mInt32Ty; + llvm::Type* mInt64Ty; + llvm::Type* mFP32Ty; + llvm::StructType* mV4FP32Ty; + llvm::StructType* mV4Int32Ty; - Type* mSimtFP32Ty; - Type* mSimtInt32Ty; + llvm::Type* mSimtFP32Ty; + llvm::Type* mSimtInt32Ty; - Type* mSimdVectorInt32Ty; - Type* mSimdVectorTy; + llvm::Type* mSimdVectorInt32Ty; + llvm::Type* mSimdVectorTy; // fetch shader types - FunctionType* mFetchShaderTy; + llvm::FunctionType* mFetchShaderTy; JitInstructionSet mArch; std::string mCore; @@ -183,6 +182,6 @@ struct JitManager void SetupNewModule(); bool SetupModuleFromIR(const uint8_t *pIR); - void DumpAsm(Function* pFunction, const char* fileName); - static void DumpToFile(Function *f, const char *fileName); + void DumpAsm(llvm::Function* pFunction, const char* fileName); + static void DumpToFile(llvm::Function *f, const char *fileName); }; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp index 940399c..1452d27 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -37,6 +37,9 @@ // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized #define QUANTIZE_THRESHOLD 2 +using namespace llvm; +using namespace SwrJit; + ////////////////////////////////////////////////////////////////////////// /// Interface to Jitting a blend shader ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index 01468c4..6ee4d85 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -30,49 +30,53 @@ #include "builder.h" -using namespace llvm; - -////////////////////////////////////////////////////////////////////////// -/// @brief Contructor for Builder. -/// @param pJitMgr - JitManager which contains modules, function passes, etc. -Builder::Builder(JitManager *pJitMgr) - : mpJitMgr(pJitMgr) +namespace SwrJit { - mVWidth = pJitMgr->mVWidth; + using namespace llvm; - mpIRBuilder = &pJitMgr->mBuilder; + ////////////////////////////////////////////////////////////////////////// + /// @brief Contructor for Builder. + /// @param pJitMgr - JitManager which contains modules, function passes, etc. + Builder::Builder(JitManager *pJitMgr) + : mpJitMgr(pJitMgr) + { + mVWidth = pJitMgr->mVWidth; - mVoidTy = Type::getVoidTy(pJitMgr->mContext); - mFP16Ty = Type::getHalfTy(pJitMgr->mContext); - mFP32Ty = Type::getFloatTy(pJitMgr->mContext); - mDoubleTy = Type::getDoubleTy(pJitMgr->mContext); - mInt1Ty = Type::getInt1Ty(pJitMgr->mContext); - mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); - mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); - mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); - mInt8PtrTy = PointerType::get(mInt8Ty, 0); - mInt16PtrTy = PointerType::get(mInt16Ty, 0); - mInt32PtrTy = PointerType::get(mInt32Ty, 0); - mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); - mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure) - mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type - mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth); - mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); - mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); - mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); - mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); - mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); - mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false); + mpIRBuilder = &pJitMgr->mBuilder; - if (sizeof(uint32_t*) == 4) - { - mIntPtrTy = mInt32Ty; - mSimdIntPtrTy = mSimdInt32Ty; - } - else - { - SWR_ASSERT(sizeof(uint32_t*) == 8); - mIntPtrTy = mInt64Ty; - mSimdIntPtrTy = mSimdInt64Ty; + mVoidTy = Type::getVoidTy(pJitMgr->mContext); + mFP16Ty = Type::getHalfTy(pJitMgr->mContext); + mFP32Ty = Type::getFloatTy(pJitMgr->mContext); + mDoubleTy = Type::getDoubleTy(pJitMgr->mContext); + mInt1Ty = Type::getInt1Ty(pJitMgr->mContext); + mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); + mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); + mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); + mInt8PtrTy = PointerType::get(mInt8Ty, 0); + mInt16PtrTy = PointerType::get(mInt16Ty, 0); + mInt32PtrTy = PointerType::get(mInt32Ty, 0); + mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); + mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure) + mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type + mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth); + mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); + mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); + mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); + mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); + mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); + mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false); + mSimdVectorTRTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(5, mSimdFP32Ty), false); + + if (sizeof(uint32_t*) == 4) + { + mIntPtrTy = mInt32Ty; + mSimdIntPtrTy = mSimdInt32Ty; + } + else + { + SWR_ASSERT(sizeof(uint32_t*) == 8); + mIntPtrTy = mInt64Ty; + mSimdIntPtrTy = mSimdInt64Ty; + } } } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index ddc32f4..515560e 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -32,47 +32,49 @@ #include "JitManager.h" #include "common/formats.h" -using namespace llvm; - -struct Builder +namespace SwrJit { - Builder(JitManager *pJitMgr); - IRBuilder<>* IRB() { return mpIRBuilder; }; - JitManager* JM() { return mpJitMgr; } + using namespace llvm; + struct Builder + { + Builder(JitManager *pJitMgr); + IRBuilder<>* IRB() { return mpIRBuilder; }; + JitManager* JM() { return mpJitMgr; } - JitManager* mpJitMgr; - IRBuilder<>* mpIRBuilder; + JitManager* mpJitMgr; + IRBuilder<>* mpIRBuilder; - uint32_t mVWidth; + uint32_t mVWidth; - // Built in types. - Type* mVoidTy; - Type* mInt1Ty; - Type* mInt8Ty; - Type* mInt16Ty; - Type* mInt32Ty; - Type* mInt64Ty; - Type* mIntPtrTy; - Type* mFP16Ty; - Type* mFP32Ty; - Type* mDoubleTy; - Type* mInt8PtrTy; - Type* mInt16PtrTy; - Type* mInt32PtrTy; - Type* mSimdFP16Ty; - Type* mSimdFP32Ty; - Type* mSimdInt1Ty; - Type* mSimdInt16Ty; - Type* mSimdInt32Ty; - Type* mSimdInt64Ty; - Type* mSimdIntPtrTy; - Type* mSimdVectorTy; - StructType* mV4FP32Ty; - StructType* mV4Int32Ty; + // Built in types. + Type* mVoidTy; + Type* mInt1Ty; + Type* mInt8Ty; + Type* mInt16Ty; + Type* mInt32Ty; + Type* mInt64Ty; + Type* mIntPtrTy; + Type* mFP16Ty; + Type* mFP32Ty; + Type* mDoubleTy; + Type* mInt8PtrTy; + Type* mInt16PtrTy; + Type* mInt32PtrTy; + Type* mSimdFP16Ty; + Type* mSimdFP32Ty; + Type* mSimdInt1Ty; + Type* mSimdInt16Ty; + Type* mSimdInt32Ty; + Type* mSimdInt64Ty; + Type* mSimdIntPtrTy; + Type* mSimdVectorTy; + Type* mSimdVectorTRTy; + StructType* mV4FP32Ty; + StructType* mV4Int32Ty; #include "builder_gen.h" #include "builder_x86.h" #include "builder_misc.h" #include "builder_math.h" - -}; + }; +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 13c1daf..d755cc3 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -30,962 +30,1051 @@ #include "builder.h" #include "common/rdtsc_buckets.h" -void __cdecl CallPrint(const char* fmt, ...); - -////////////////////////////////////////////////////////////////////////// -/// @brief Convert an IEEE 754 32-bit single precision float to an -/// 16 bit float with 5 exponent bits and a variable -/// number of mantissa bits. -/// @param val - 32-bit float -/// @todo Maybe move this outside of this file into a header? -static uint16_t Convert32To16Float(float val) -{ - uint32_t sign, exp, mant; - uint32_t roundBits; - // Extract the sign, exponent, and mantissa - uint32_t uf = *(uint32_t*)&val; - sign = (uf & 0x80000000) >> 31; - exp = (uf & 0x7F800000) >> 23; - mant = uf & 0x007FFFFF; +namespace SwrJit +{ + void __cdecl CallPrint(const char* fmt, ...); - // Check for out of range - if (std::isnan(val)) - { - exp = 0x1F; - mant = 0x200; - sign = 1; // set the sign bit for NANs - } - else if (std::isinf(val)) - { - exp = 0x1f; - mant = 0x0; - } - else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value - { - exp = 0x1E; - mant = 0x3FF; - } - else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm + ////////////////////////////////////////////////////////////////////////// + /// @brief Convert an IEEE 754 32-bit single precision float to an + /// 16 bit float with 5 exponent bits and a variable + /// number of mantissa bits. + /// @param val - 32-bit float + /// @todo Maybe move this outside of this file into a header? + static uint16_t Convert32To16Float(float val) { - mant |= 0x00800000; - for (; exp <= 0x70; mant >>= 1, exp++) - ; - exp = 0; - mant = mant >> 13; - } - else if (exp < 0x66) // Too small to represent -> Zero - { - exp = 0; - mant = 0; - } - else - { - // Saves bits that will be shifted off for rounding - roundBits = mant & 0x1FFFu; - // convert exponent and mantissa to 16 bit format - exp = exp - 0x70; - mant = mant >> 13; + uint32_t sign, exp, mant; + uint32_t roundBits; - // Essentially RTZ, but round up if off by only 1 lsb - if (roundBits == 0x1FFFu) + // Extract the sign, exponent, and mantissa + uint32_t uf = *(uint32_t*)&val; + sign = (uf & 0x80000000) >> 31; + exp = (uf & 0x7F800000) >> 23; + mant = uf & 0x007FFFFF; + + // Check for out of range + if (std::isnan(val)) { - mant++; - // check for overflow - if ((mant & 0xC00u) != 0) - exp++; - // make sure only the needed bits are used - mant &= 0x3FF; + exp = 0x1F; + mant = 0x200; + sign = 1; // set the sign bit for NANs + } + else if (std::isinf(val)) + { + exp = 0x1f; + mant = 0x0; + } + else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value + { + exp = 0x1E; + mant = 0x3FF; + } + else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm + { + mant |= 0x00800000; + for (; exp <= 0x70; mant >>= 1, exp++) + ; + exp = 0; + mant = mant >> 13; + } + else if (exp < 0x66) // Too small to represent -> Zero + { + exp = 0; + mant = 0; + } + else + { + // Saves bits that will be shifted off for rounding + roundBits = mant & 0x1FFFu; + // convert exponent and mantissa to 16 bit format + exp = exp - 0x70; + mant = mant >> 13; + + // Essentially RTZ, but round up if off by only 1 lsb + if (roundBits == 0x1FFFu) + { + mant++; + // check for overflow + if ((mant & 0xC00u) != 0) + exp++; + // make sure only the needed bits are used + mant &= 0x3FF; + } } - } - - uint32_t tmpVal = (sign << 15) | (exp << 10) | mant; - return (uint16_t)tmpVal; -} -////////////////////////////////////////////////////////////////////////// -/// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision -/// float -/// @param val - 16-bit float -/// @todo Maybe move this outside of this file into a header? -static float ConvertSmallFloatTo32(UINT val) -{ - UINT result; - if ((val & 0x7fff) == 0) - { - result = ((uint32_t)(val & 0x8000)) << 16; - } - else if ((val & 0x7c00) == 0x7c00) - { - result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000; - result |= ((uint32_t)val & 0x8000) << 16; + uint32_t tmpVal = (sign << 15) | (exp << 10) | mant; + return (uint16_t)tmpVal; } - else + + ////////////////////////////////////////////////////////////////////////// + /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision + /// float + /// @param val - 16-bit float + /// @todo Maybe move this outside of this file into a header? + static float ConvertSmallFloatTo32(UINT val) { - uint32_t sign = (val & 0x8000) << 16; - uint32_t mant = (val & 0x3ff) << 13; - uint32_t exp = (val >> 10) & 0x1f; - if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals + UINT result; + if ((val & 0x7fff) == 0) + { + result = ((uint32_t)(val & 0x8000)) << 16; + } + else if ((val & 0x7c00) == 0x7c00) + { + result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000; + result |= ((uint32_t)val & 0x8000) << 16; + } + else { - mant <<= 1; - while (mant < (0x400 << 13)) + uint32_t sign = (val & 0x8000) << 16; + uint32_t mant = (val & 0x3ff) << 13; + uint32_t exp = (val >> 10) & 0x1f; + if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals { - exp--; mant <<= 1; + while (mant < (0x400 << 13)) + { + exp--; + mant <<= 1; + } + mant &= (0x3ff << 13); } - mant &= (0x3ff << 13); + exp = ((exp - 15 + 127) & 0xff) << 23; + result = sign | exp | mant; } - exp = ((exp - 15 + 127) & 0xff) << 23; - result = sign | exp | mant; - } - return *(float*)&result; -} + return *(float*)&result; + } -Constant *Builder::C(bool i) -{ - return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); -} + Constant *Builder::C(bool i) + { + return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); + } -Constant *Builder::C(char i) -{ - return ConstantInt::get(IRB()->getInt8Ty(), i); -} + Constant *Builder::C(char i) + { + return ConstantInt::get(IRB()->getInt8Ty(), i); + } -Constant *Builder::C(uint8_t i) -{ - return ConstantInt::get(IRB()->getInt8Ty(), i); -} + Constant *Builder::C(uint8_t i) + { + return ConstantInt::get(IRB()->getInt8Ty(), i); + } -Constant *Builder::C(int i) -{ - return ConstantInt::get(IRB()->getInt32Ty(), i); -} + Constant *Builder::C(int i) + { + return ConstantInt::get(IRB()->getInt32Ty(), i); + } -Constant *Builder::C(int64_t i) -{ - return ConstantInt::get(IRB()->getInt64Ty(), i); -} + Constant *Builder::C(int64_t i) + { + return ConstantInt::get(IRB()->getInt64Ty(), i); + } -Constant *Builder::C(uint16_t i) -{ - return ConstantInt::get(mInt16Ty,i); -} + Constant *Builder::C(uint16_t i) + { + return ConstantInt::get(mInt16Ty,i); + } -Constant *Builder::C(uint32_t i) -{ - return ConstantInt::get(IRB()->getInt32Ty(), i); -} + Constant *Builder::C(uint32_t i) + { + return ConstantInt::get(IRB()->getInt32Ty(), i); + } -Constant *Builder::C(float i) -{ - return ConstantFP::get(IRB()->getFloatTy(), i); -} + Constant *Builder::C(float i) + { + return ConstantFP::get(IRB()->getFloatTy(), i); + } -Constant *Builder::PRED(bool pred) -{ - return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0)); -} + Constant *Builder::PRED(bool pred) + { + return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0)); + } -Value *Builder::VIMMED1(int i) -{ - return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); -} + Value *Builder::VIMMED1(int i) + { + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); + } -Value *Builder::VIMMED1(uint32_t i) -{ - return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); -} + Value *Builder::VIMMED1(uint32_t i) + { + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); + } -Value *Builder::VIMMED1(float i) -{ - return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); -} + Value *Builder::VIMMED1(float i) + { + return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); + } -Value *Builder::VIMMED1(bool i) -{ - return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); -} + Value *Builder::VIMMED1(bool i) + { + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); + } -Value *Builder::VUNDEF_IPTR() -{ - return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); -} + Value *Builder::VUNDEF_IPTR() + { + return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); + } -Value *Builder::VUNDEF_I() -{ - return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); -} + Value *Builder::VUNDEF_I() + { + return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); + } -Value *Builder::VUNDEF(Type *ty, uint32_t size) -{ - return UndefValue::get(VectorType::get(ty, size)); -} + Value *Builder::VUNDEF(Type *ty, uint32_t size) + { + return UndefValue::get(VectorType::get(ty, size)); + } -Value *Builder::VUNDEF_F() -{ - return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); -} + Value *Builder::VUNDEF_F() + { + return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); + } -Value *Builder::VUNDEF(Type* t) -{ - return UndefValue::get(VectorType::get(t, mVWidth)); -} + Value *Builder::VUNDEF(Type* t) + { + return UndefValue::get(VectorType::get(t, mVWidth)); + } -#if HAVE_LLVM == 0x306 -Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index) -{ - return VINSERT(vec, val, C((int64_t)index)); -} -#endif + #if HAVE_LLVM == 0x306 + Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index) + { + return VINSERT(vec, val, C((int64_t)index)); + } + #endif -Value *Builder::VBROADCAST(Value *src) -{ - // check if src is already a vector - if (src->getType()->isVectorTy()) + Value *Builder::VBROADCAST(Value *src) { - return src; + // check if src is already a vector + if (src->getType()->isVectorTy()) + { + return src; + } + + return VECTOR_SPLAT(mVWidth, src); } - return VECTOR_SPLAT(mVWidth, src); -} + uint32_t Builder::IMMED(Value* v) + { + SWR_ASSERT(isa<ConstantInt>(v)); + ConstantInt *pValConst = cast<ConstantInt>(v); + return pValConst->getZExtValue(); + } -uint32_t Builder::IMMED(Value* v) -{ - SWR_ASSERT(isa<ConstantInt>(v)); - ConstantInt *pValConst = cast<ConstantInt>(v); - return pValConst->getZExtValue(); -} + int32_t Builder::S_IMMED(Value* v) + { + SWR_ASSERT(isa<ConstantInt>(v)); + ConstantInt *pValConst = cast<ConstantInt>(v); + return pValConst->getSExtValue(); + } -int32_t Builder::S_IMMED(Value* v) -{ - SWR_ASSERT(isa<ConstantInt>(v)); - ConstantInt *pValConst = cast<ConstantInt>(v); - return pValConst->getSExtValue(); -} + Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList) + { + std::vector<Value*> indices; + for (auto i : indexList) + indices.push_back(i); + return GEPA(ptr, indices); + } -Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList) -{ - std::vector<Value*> indices; - for (auto i : indexList) - indices.push_back(i); - return GEPA(ptr, indices); -} + Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList) + { + std::vector<Value*> indices; + for (auto i : indexList) + indices.push_back(C(i)); + return GEPA(ptr, indices); + } -Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList) -{ - std::vector<Value*> indices; - for (auto i : indexList) - indices.push_back(C(i)); - return GEPA(ptr, indices); -} + LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name) + { + std::vector<Value*> valIndices; + for (auto i : indices) + valIndices.push_back(C(i)); + return LOAD(GEPA(basePtr, valIndices), name); + } -LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name) -{ - std::vector<Value*> valIndices; - for (auto i : indices) - valIndices.push_back(C(i)); - return LOAD(GEPA(basePtr, valIndices), name); -} + LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name) + { + std::vector<Value*> valIndices; + for (auto i : indices) + valIndices.push_back(i); + return LOAD(GEPA(basePtr, valIndices), name); + } -LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name) -{ - std::vector<Value*> valIndices; - for (auto i : indices) - valIndices.push_back(i); - return LOAD(GEPA(basePtr, valIndices), name); -} + StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices) + { + std::vector<Value*> valIndices; + for (auto i : indices) + valIndices.push_back(C(i)); + return STORE(val, GEPA(basePtr, valIndices)); + } -StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices) -{ - std::vector<Value*> valIndices; - for (auto i : indices) - valIndices.push_back(C(i)); - return STORE(val, GEPA(basePtr, valIndices)); -} + StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices) + { + std::vector<Value*> valIndices; + for (auto i : indices) + valIndices.push_back(i); + return STORE(val, GEPA(basePtr, valIndices)); + } -StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices) -{ - std::vector<Value*> valIndices; - for (auto i : indices) - valIndices.push_back(i); - return STORE(val, GEPA(basePtr, valIndices)); -} + CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList) + { + std::vector<Value*> args; + for (auto arg : argsList) + args.push_back(arg); + return CALLA(Callee, args); + } -CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList) -{ - std::vector<Value*> args; - for (auto arg : argsList) + #if HAVE_LLVM > 0x306 + CallInst *Builder::CALL(Value *Callee, Value* arg) + { + std::vector<Value*> args; args.push_back(arg); - return CALLA(Callee, args); -} - -#if HAVE_LLVM > 0x306 -CallInst *Builder::CALL(Value *Callee, Value* arg) -{ - std::vector<Value*> args; - args.push_back(arg); - return CALLA(Callee, args); -} + return CALLA(Callee, args); + } -CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2) -{ - std::vector<Value*> args; - args.push_back(arg1); - args.push_back(arg2); - return CALLA(Callee, args); -} + CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2) + { + std::vector<Value*> args; + args.push_back(arg1); + args.push_back(arg2); + return CALLA(Callee, args); + } -CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3) -{ - std::vector<Value*> args; - args.push_back(arg1); - args.push_back(arg2); - args.push_back(arg3); - return CALLA(Callee, args); -} -#endif - -Value *Builder::VRCP(Value *va) -{ - return FDIV(VIMMED1(1.0f), va); // 1 / a -} + CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3) + { + std::vector<Value*> args; + args.push_back(arg1); + args.push_back(arg2); + args.push_back(arg3); + return CALLA(Callee, args); + } + #endif -Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY) -{ - Value* vOut = FMADDPS(vA, vX, vC); - vOut = FMADDPS(vB, vY, vOut); - return vOut; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Generate an i32 masked load operation in LLVM IR. If not -/// supported on the underlying platform, emulate it with float masked load -/// @param src - base address pointer for the load -/// @param vMask - SIMD wide mask that controls whether to access memory load 0 -Value *Builder::MASKLOADD(Value* src,Value* mask) -{ - Value* vResult; - // use avx2 gather instruction is available - if(JM()->mArch.AVX2()) - { - Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); - vResult = CALL(func,{src,mask}); - } - else - { - // maskload intrinsic expects integer mask operand in llvm >= 3.8 -#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8) - mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth)); -#else - mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth)); -#endif - Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); - vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth)); - } - return vResult; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief insert a JIT call to CallPrint -/// - outputs formatted string to both stdout and VS output window -/// - DEBUG builds only -/// Usage example: -/// PRINT("index %d = 0x%p\n",{C(lane), pIndex}); -/// where C(lane) creates a constant value to print, and pIndex is the Value* -/// result from a GEP, printing out the pointer to memory -/// @param printStr - constant string to print, which includes format specifiers -/// @param printArgs - initializer list of Value*'s to print to std out -CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs) -{ - // push the arguments to CallPrint into a vector - std::vector<Value*> printCallArgs; - // save room for the format string. we still need to modify it for vectors - printCallArgs.resize(1); + Value *Builder::VRCP(Value *va) + { + return FDIV(VIMMED1(1.0f), va); // 1 / a + } - // search through the format string for special processing - size_t pos = 0; - std::string tempStr(printStr); - pos = tempStr.find('%', pos); - auto v = printArgs.begin(); + Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY) + { + Value* vOut = FMADDPS(vA, vX, vC); + vOut = FMADDPS(vB, vY, vOut); + return vOut; + } - while ((pos != std::string::npos) && (v != printArgs.end())) + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate an i32 masked load operation in LLVM IR. If not + /// supported on the underlying platform, emulate it with float masked load + /// @param src - base address pointer for the load + /// @param vMask - SIMD wide mask that controls whether to access memory load 0 + Value *Builder::MASKLOADD(Value* src,Value* mask) { - Value* pArg = *v; - Type* pType = pArg->getType(); + Value* vResult; + // use avx2 gather instruction is available + if(JM()->mArch.AVX2()) + { + Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); + vResult = CALL(func,{src,mask}); + } + else + { + // maskload intrinsic expects integer mask operand in llvm >= 3.8 + #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8) + mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth)); + #else + mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth)); + #endif + Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); + vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth)); + } + return vResult; + } - if (pType->isVectorTy()) + ////////////////////////////////////////////////////////////////////////// + /// @brief insert a JIT call to CallPrint + /// - outputs formatted string to both stdout and VS output window + /// - DEBUG builds only + /// Usage example: + /// PRINT("index %d = 0x%p\n",{C(lane), pIndex}); + /// where C(lane) creates a constant value to print, and pIndex is the Value* + /// result from a GEP, printing out the pointer to memory + /// @param printStr - constant string to print, which includes format specifiers + /// @param printArgs - initializer list of Value*'s to print to std out + CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs) + { + // push the arguments to CallPrint into a vector + std::vector<Value*> printCallArgs; + // save room for the format string. we still need to modify it for vectors + printCallArgs.resize(1); + + // search through the format string for special processing + size_t pos = 0; + std::string tempStr(printStr); + pos = tempStr.find('%', pos); + auto v = printArgs.begin(); + + while ((pos != std::string::npos) && (v != printArgs.end())) { - Type* pContainedType = pType->getContainedType(0); + Value* pArg = *v; + Type* pType = pArg->getType(); - if (toupper(tempStr[pos + 1]) == 'X') + if (pType->isVectorTy()) { - tempStr[pos] = '0'; - tempStr[pos + 1] = 'x'; - tempStr.insert(pos + 2, "%08X "); - pos += 7; - - printCallArgs.push_back(VEXTRACT(pArg, C(0))); + Type* pContainedType = pType->getContainedType(0); - std::string vectorFormatStr; - for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i) + if (toupper(tempStr[pos + 1]) == 'X') { - vectorFormatStr += "0x%08X "; - printCallArgs.push_back(VEXTRACT(pArg, C(i))); + tempStr[pos] = '0'; + tempStr[pos + 1] = 'x'; + tempStr.insert(pos + 2, "%08X "); + pos += 7; + + printCallArgs.push_back(VEXTRACT(pArg, C(0))); + + std::string vectorFormatStr; + for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i) + { + vectorFormatStr += "0x%08X "; + printCallArgs.push_back(VEXTRACT(pArg, C(i))); + } + + tempStr.insert(pos, vectorFormatStr); + pos += vectorFormatStr.size(); } - - tempStr.insert(pos, vectorFormatStr); - pos += vectorFormatStr.size(); - } - else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy())) - { - uint32_t i = 0; - for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++) + else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy())) { - tempStr.insert(pos, std::string("%f ")); - pos += 3; + uint32_t i = 0; + for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++) + { + tempStr.insert(pos, std::string("%f ")); + pos += 3; + printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); + } printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); } - printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); + else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy())) + { + uint32_t i = 0; + for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++) + { + tempStr.insert(pos, std::string("%d ")); + pos += 3; + printCallArgs.push_back(VEXTRACT(pArg, C(i))); + } + printCallArgs.push_back(VEXTRACT(pArg, C(i))); + } } - else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy())) + else { - uint32_t i = 0; - for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++) + if (toupper(tempStr[pos + 1]) == 'X') { - tempStr.insert(pos, std::string("%d ")); + tempStr[pos] = '0'; + tempStr.insert(pos + 1, "x%08"); + printCallArgs.push_back(pArg); pos += 3; - printCallArgs.push_back(VEXTRACT(pArg, C(i))); } - printCallArgs.push_back(VEXTRACT(pArg, C(i))); + // for %f we need to cast float Values to doubles so that they print out correctly + else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy())) + { + printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext))); + pos++; + } + else + { + printCallArgs.push_back(pArg); + } } + + // advance to the next arguement + v++; + pos = tempStr.find('%', ++pos); } - else + + // create global variable constant string + Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true); + GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr"); + JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); + + // get a pointer to the first character in the constant string array + std::vector<Constant*> geplist{C(0),C(0)}; + #if HAVE_LLVM == 0x306 + Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); + #else + Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); + #endif + + // insert the pointer to the format string in the argument vector + printCallArgs[0] = strGEP; + + // get pointer to CallPrint function and insert decl into the module if needed + std::vector<Type*> args; + args.push_back(PointerType::get(mInt8Ty,0)); + FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true); + Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); + + // if we haven't yet added the symbol to the symbol table + if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) { - if (toupper(tempStr[pos + 1]) == 'X') - { - tempStr[pos] = '0'; - tempStr.insert(pos + 1, "x%08"); - printCallArgs.push_back(pArg); - pos += 3; - } - // for %f we need to cast float Values to doubles so that they print out correctly - else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy())) - { - printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext))); - pos++; - } - else - { - printCallArgs.push_back(pArg); - } + sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint); } - // advance to the next arguement - v++; - pos = tempStr.find('%', ++pos); + // insert a call to CallPrint + return CALLA(callPrintFn,printCallArgs); } - // create global variable constant string - Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true); - GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr"); - JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); - - // get a pointer to the first character in the constant string array - std::vector<Constant*> geplist{C(0),C(0)}; -#if HAVE_LLVM == 0x306 - Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); -#else - Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); -#endif - - // insert the pointer to the format string in the argument vector - printCallArgs[0] = strGEP; - - // get pointer to CallPrint function and insert decl into the module if needed - std::vector<Type*> args; - args.push_back(PointerType::get(mInt8Ty,0)); - FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true); - Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); - - // if we haven't yet added the symbol to the symbol table - if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) + ////////////////////////////////////////////////////////////////////////// + /// @brief Wrapper around PRINT with initializer list. + CallInst* Builder::PRINT(const std::string &printStr) { - sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint); + return PRINT(printStr, {}); } - // insert a call to CallPrint - return CALLA(callPrintFn,printCallArgs); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Wrapper around PRINT with initializer list. -CallInst* Builder::PRINT(const std::string &printStr) -{ - return PRINT(printStr, {}); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Generate a masked gather operation in LLVM IR. If not -/// supported on the underlying platform, emulate it with loads -/// @param vSrc - SIMD wide value that will be loaded if mask is invalid -/// @param pBase - Int8* base VB address pointer value -/// @param vIndices - SIMD wide value of VB byte offsets -/// @param vMask - SIMD wide mask that controls whether to access memory or the src values -/// @param scale - value to scale indices by -Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) -{ - Value* vGather; - - // use avx2 gather instruction if available - if(JM()->mArch.AVX2()) - { - // force mask to <N x float>, required by vgather - vMask = BITCAST(vMask, mSimdFP32Ty); - vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale); - } - else + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a masked gather operation in LLVM IR. If not + /// supported on the underlying platform, emulate it with loads + /// @param vSrc - SIMD wide value that will be loaded if mask is invalid + /// @param pBase - Int8* base VB address pointer value + /// @param vIndices - SIMD wide value of VB byte offsets + /// @param vMask - SIMD wide mask that controls whether to access memory or the src values + /// @param scale - value to scale indices by + Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) { - Value* pStack = STACKSAVE(); + Value* vGather; - // store vSrc on the stack. this way we can select between a valid load address and the vSrc address - Value* vSrcPtr = ALLOCA(vSrc->getType()); - STORE(vSrc, vSrcPtr); - - vGather = VUNDEF_F(); - Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); - Value *vOffsets = MUL(vIndices,vScaleVec); - Value *mask = MASK(vMask); - for(uint32_t i = 0; i < mVWidth; ++i) + // use avx2 gather instruction if available + if(JM()->mArch.AVX2()) { - // single component byte index - Value *offset = VEXTRACT(vOffsets,C(i)); - // byte pointer to component - Value *loadAddress = GEP(pBase,offset); - loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); - // pointer to the value to load if we're masking off a component - Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); - Value *selMask = VEXTRACT(mask,C(i)); - // switch in a safe address to load if we're trying to access a vertex - Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); - Value *val = LOAD(validAddress); - vGather = VINSERT(vGather,val,C(i)); + // force mask to <N x float>, required by vgather + vMask = BITCAST(vMask, mSimdFP32Ty); + vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale); } - STACKRESTORE(pStack); - } + else + { + Value* pStack = STACKSAVE(); - return vGather; -} + // store vSrc on the stack. this way we can select between a valid load address and the vSrc address + Value* vSrcPtr = ALLOCA(vSrc->getType()); + STORE(vSrc, vSrcPtr); -////////////////////////////////////////////////////////////////////////// -/// @brief Generate a masked gather operation in LLVM IR. If not -/// supported on the underlying platform, emulate it with loads -/// @param vSrc - SIMD wide value that will be loaded if mask is invalid -/// @param pBase - Int8* base VB address pointer value -/// @param vIndices - SIMD wide value of VB byte offsets -/// @param vMask - SIMD wide mask that controls whether to access memory or the src values -/// @param scale - value to scale indices by -Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) -{ - Value* vGather; + vGather = VUNDEF_F(); + Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); + Value *vOffsets = MUL(vIndices,vScaleVec); + Value *mask = MASK(vMask); + for(uint32_t i = 0; i < mVWidth; ++i) + { + // single component byte index + Value *offset = VEXTRACT(vOffsets,C(i)); + // byte pointer to component + Value *loadAddress = GEP(pBase,offset); + loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); + // pointer to the value to load if we're masking off a component + Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); + Value *selMask = VEXTRACT(mask,C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); + Value *val = LOAD(validAddress); + vGather = VINSERT(vGather,val,C(i)); + } + STACKRESTORE(pStack); + } - // use avx2 gather instruction if available - if(JM()->mArch.AVX2()) - { - vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale); + return vGather; } - else - { - Value* pStack = STACKSAVE(); - // store vSrc on the stack. this way we can select between a valid load address and the vSrc address - Value* vSrcPtr = ALLOCA(vSrc->getType()); - STORE(vSrc, vSrcPtr); + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a masked gather operation in LLVM IR. If not + /// supported on the underlying platform, emulate it with loads + /// @param vSrc - SIMD wide value that will be loaded if mask is invalid + /// @param pBase - Int8* base VB address pointer value + /// @param vIndices - SIMD wide value of VB byte offsets + /// @param vMask - SIMD wide mask that controls whether to access memory or the src values + /// @param scale - value to scale indices by + Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) + { + Value* vGather; - vGather = VUNDEF_I(); - Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); - Value *vOffsets = MUL(vIndices, vScaleVec); - Value *mask = MASK(vMask); - for(uint32_t i = 0; i < mVWidth; ++i) + // use avx2 gather instruction if available + if(JM()->mArch.AVX2()) { - // single component byte index - Value *offset = VEXTRACT(vOffsets, C(i)); - // byte pointer to component - Value *loadAddress = GEP(pBase, offset); - loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); - // pointer to the value to load if we're masking off a component - Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); - Value *selMask = VEXTRACT(mask, C(i)); - // switch in a safe address to load if we're trying to access a vertex - Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); - Value *val = LOAD(validAddress, C(0)); - vGather = VINSERT(vGather, val, C(i)); + vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale); } + else + { + Value* pStack = STACKSAVE(); - STACKRESTORE(pStack); - } - return vGather; -} + // store vSrc on the stack. this way we can select between a valid load address and the vSrc address + Value* vSrcPtr = ALLOCA(vSrc->getType()); + STORE(vSrc, vSrcPtr); -////////////////////////////////////////////////////////////////////////// -/// @brief convert x86 <N x float> mask to llvm <N x i1> mask -Value* Builder::MASK(Value* vmask) -{ - Value* src = BITCAST(vmask, mSimdInt32Ty); - return ICMP_SLT(src, VIMMED1(0)); -} + vGather = VUNDEF_I(); + Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); + Value *vOffsets = MUL(vIndices, vScaleVec); + Value *mask = MASK(vMask); + for(uint32_t i = 0; i < mVWidth; ++i) + { + // single component byte index + Value *offset = VEXTRACT(vOffsets, C(i)); + // byte pointer to component + Value *loadAddress = GEP(pBase, offset); + loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); + // pointer to the value to load if we're masking off a component + Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); + Value *selMask = VEXTRACT(mask, C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); + Value *val = LOAD(validAddress, C(0)); + vGather = VINSERT(vGather, val, C(i)); + } -////////////////////////////////////////////////////////////////////////// -/// @brief convert llvm <N x i1> mask to x86 <N x i32> mask -Value* Builder::VMASK(Value* mask) -{ - return S_EXT(mask, mSimdInt32Ty); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Generate a VPSHUFB operation in LLVM IR. If not -/// supported on the underlying platform, emulate it -/// @param a - 256bit SIMD(32x8bit) of 8bit integer values -/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values -/// Byte masks in lower 128 lane of b selects 8 bit values from lower -/// 128bits of a, and vice versa for the upper lanes. If the mask -/// value is negative, '0' is inserted. -Value *Builder::PSHUFB(Value* a, Value* b) -{ - Value* res; - // use avx2 pshufb instruction if available - if(JM()->mArch.AVX2()) + STACKRESTORE(pStack); + } + return vGather; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief convert x86 <N x float> mask to llvm <N x i1> mask + Value* Builder::MASK(Value* vmask) { - res = VPSHUFB(a, b); + Value* src = BITCAST(vmask, mSimdInt32Ty); + return ICMP_SLT(src, VIMMED1(0)); } - else + + ////////////////////////////////////////////////////////////////////////// + /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask + Value* Builder::VMASK(Value* mask) { - Constant* cB = dyn_cast<Constant>(b); - // number of 8 bit elements in b - uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements(); - // output vector - Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms)); + return S_EXT(mask, mSimdInt32Ty); + } - // insert an 8 bit value from the high and low lanes of a per loop iteration - numElms /= 2; - for(uint32_t i = 0; i < numElms; i++) + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPSHUFB operation in LLVM IR. If not + /// supported on the underlying platform, emulate it + /// @param a - 256bit SIMD(32x8bit) of 8bit integer values + /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values + /// Byte masks in lower 128 lane of b selects 8 bit values from lower + /// 128bits of a, and vice versa for the upper lanes. If the mask + /// value is negative, '0' is inserted. + Value *Builder::PSHUFB(Value* a, Value* b) + { + Value* res; + // use avx2 pshufb instruction if available + if(JM()->mArch.AVX2()) + { + res = VPSHUFB(a, b); + } + else { - ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i)); - ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms)); + Constant* cB = dyn_cast<Constant>(b); + // number of 8 bit elements in b + uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements(); + // output vector + Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms)); + + // insert an 8 bit value from the high and low lanes of a per loop iteration + numElms /= 2; + for(uint32_t i = 0; i < numElms; i++) + { + ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i)); + ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms)); - // extract values from constant mask - char valLow128bLane = (char)(cLow128b->getSExtValue()); - char valHigh128bLane = (char)(cHigh128b->getSExtValue()); + // extract values from constant mask + char valLow128bLane = (char)(cLow128b->getSExtValue()); + char valHigh128bLane = (char)(cHigh128b->getSExtValue()); - Value* insertValLow128b; - Value* insertValHigh128b; + Value* insertValLow128b; + Value* insertValHigh128b; - // if the mask value is negative, insert a '0' in the respective output position - // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector - insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); - insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); + // if the mask value is negative, insert a '0' in the respective output position + // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector + insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); + insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); - vShuf = VINSERT(vShuf, insertValLow128b, i); - vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); + vShuf = VINSERT(vShuf, insertValLow128b, i); + vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); + } + res = vShuf; } - res = vShuf; + return res; } - return res; -} -////////////////////////////////////////////////////////////////////////// -/// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 -/// bits)in LLVM IR. If not supported on the underlying platform, emulate it -/// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only -/// lower 8 values are used. -Value *Builder::PMOVSXBD(Value* a) -{ - // llvm-3.9 removed the pmovsxbd intrinsic -#if HAVE_LLVM < 0x309 - // use avx2 byte sign extend instruction if available - if(JM()->mArch.AVX2()) + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 + /// bits)in LLVM IR. If not supported on the underlying platform, emulate it + /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only + /// lower 8 values are used. + Value *Builder::PMOVSXBD(Value* a) { - Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd); - return CALL(pmovsxbd, std::initializer_list<Value*>{a}); - } - else -#endif - { - // VPMOVSXBD output type - Type* v8x32Ty = VectorType::get(mInt32Ty, 8); - // Extract 8 values from 128bit lane and sign extend - return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); + // llvm-3.9 removed the pmovsxbd intrinsic + #if HAVE_LLVM < 0x309 + // use avx2 byte sign extend instruction if available + if(JM()->mArch.AVX2()) + { + Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd); + return CALL(pmovsxbd, std::initializer_list<Value*>{a}); + } + else + #endif + { + // VPMOVSXBD output type + Type* v8x32Ty = VectorType::get(mInt32Ty, 8); + // Extract 8 values from 128bit lane and sign extend + return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); + } } -} -////////////////////////////////////////////////////////////////////////// -/// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 -/// bits)in LLVM IR. If not supported on the underlying platform, emulate it -/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. -Value *Builder::PMOVSXWD(Value* a) -{ - // llvm-3.9 removed the pmovsxwd intrinsic -#if HAVE_LLVM < 0x309 - // use avx2 word sign extend if available - if(JM()->mArch.AVX2()) - { - Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd); - return CALL(pmovsxwd, std::initializer_list<Value*>{a}); - } - else -#endif - { - // VPMOVSXWD output type - Type* v8x32Ty = VectorType::get(mInt32Ty, 8); - // Extract 8 values from 128bit lane and sign extend - return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Generate a VPERMD operation (shuffle 32 bit integer values -/// across 128 bit lanes) in LLVM IR. If not supported on the underlying -/// platform, emulate it -/// @param a - 256bit SIMD lane(8x32bit) of integer values. -/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values -Value *Builder::PERMD(Value* a, Value* idx) -{ - Value* res; - // use avx2 permute instruction if available - if(JM()->mArch.AVX2()) + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 + /// bits)in LLVM IR. If not supported on the underlying platform, emulate it + /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. + Value *Builder::PMOVSXWD(Value* a) { - res = VPERMD(a, idx); + // llvm-3.9 removed the pmovsxwd intrinsic + #if HAVE_LLVM < 0x309 + // use avx2 word sign extend if available + if(JM()->mArch.AVX2()) + { + Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd); + return CALL(pmovsxwd, std::initializer_list<Value*>{a}); + } + else + #endif + { + // VPMOVSXWD output type + Type* v8x32Ty = VectorType::get(mInt32Ty, 8); + // Extract 8 values from 128bit lane and sign extend + return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); + } } - else + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPERMD operation (shuffle 32 bit integer values + /// across 128 bit lanes) in LLVM IR. If not supported on the underlying + /// platform, emulate it + /// @param a - 256bit SIMD lane(8x32bit) of integer values. + /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values + Value *Builder::PERMD(Value* a, Value* idx) { - if (isa<Constant>(idx)) + Value* res; + // use avx2 permute instruction if available + if(JM()->mArch.AVX2()) { - res = VSHUFFLE(a, a, idx); + res = VPERMD(a, idx); } else { - res = VUNDEF_I(); - for (uint32_t l = 0; l < JM()->mVWidth; ++l) + if (isa<Constant>(idx)) + { + res = VSHUFFLE(a, a, idx); + } + else { - Value* pIndex = VEXTRACT(idx, C(l)); - Value* pVal = VEXTRACT(a, pIndex); - res = VINSERT(res, pVal, C(l)); + res = VUNDEF_I(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } } } + return res; } - return res; -} -////////////////////////////////////////////////////////////////////////// -/// @brief Generate a VPERMPS operation (shuffle 32 bit float values -/// across 128 bit lanes) in LLVM IR. If not supported on the underlying -/// platform, emulate it -/// @param a - 256bit SIMD lane(8x32bit) of float values. -/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values -Value *Builder::PERMPS(Value* a, Value* idx) -{ - Value* res; - // use avx2 permute instruction if available - if (JM()->mArch.AVX2()) - { - // llvm 3.6.0 swapped the order of the args to vpermd - res = VPERMPS(idx, a); - } - else + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPERMPS operation (shuffle 32 bit float values + /// across 128 bit lanes) in LLVM IR. If not supported on the underlying + /// platform, emulate it + /// @param a - 256bit SIMD lane(8x32bit) of float values. + /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values + Value *Builder::PERMPS(Value* a, Value* idx) { - if (isa<Constant>(idx)) + Value* res; + // use avx2 permute instruction if available + if (JM()->mArch.AVX2()) { - res = VSHUFFLE(a, a, idx); + // llvm 3.6.0 swapped the order of the args to vpermd + res = VPERMPS(idx, a); } else { - res = VUNDEF_F(); - for (uint32_t l = 0; l < JM()->mVWidth; ++l) + if (isa<Constant>(idx)) { - Value* pIndex = VEXTRACT(idx, C(l)); - Value* pVal = VEXTRACT(a, pIndex); - res = VINSERT(res, pVal, C(l)); + res = VSHUFFLE(a, a, idx); + } + else + { + res = VUNDEF_F(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } } } - } - return res; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) -/// in LLVM IR. If not supported on the underlying platform, emulate it -/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. -Value *Builder::CVTPH2PS(Value* a) -{ - if (JM()->mArch.F16C()) - { - return VCVTPH2PS(a); + return res; } - else - { - FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty); - Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy)); - if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr) + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) + /// in LLVM IR. If not supported on the underlying platform, emulate it + /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. + Value *Builder::CVTPH2PS(Value* a) + { + if (JM()->mArch.F16C()) { - sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32); + return VCVTPH2PS(a); } - - Value* pResult = UndefValue::get(mSimdFP32Ty); - for (uint32_t i = 0; i < mVWidth; ++i) + else { - Value* pSrc = VEXTRACT(a, C(i)); - Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc}); - pResult = VINSERT(pResult, pConv, C(i)); - } + FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty); + Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy)); - return pResult; - } -} + if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr) + { + sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32); + } -////////////////////////////////////////////////////////////////////////// -/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion) -/// in LLVM IR. If not supported on the underlying platform, emulate it -/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. -Value *Builder::CVTPS2PH(Value* a, Value* rounding) -{ - if (JM()->mArch.F16C()) - { - return VCVTPS2PH(a, rounding); - } - else - { - // call scalar C function for now - FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); - Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy)); + Value* pResult = UndefValue::get(mSimdFP32Ty); + for (uint32_t i = 0; i < mVWidth; ++i) + { + Value* pSrc = VEXTRACT(a, C(i)); + Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc}); + pResult = VINSERT(pResult, pConv, C(i)); + } - if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr) - { - sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float); + return pResult; } + } - Value* pResult = UndefValue::get(mSimdInt16Ty); - for (uint32_t i = 0; i < mVWidth; ++i) + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion) + /// in LLVM IR. If not supported on the underlying platform, emulate it + /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. + Value *Builder::CVTPS2PH(Value* a, Value* rounding) + { + if (JM()->mArch.F16C()) { - Value* pSrc = VEXTRACT(a, C(i)); - Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); - pResult = VINSERT(pResult, pConv, C(i)); + return VCVTPS2PH(a, rounding); } + else + { + // call scalar C function for now + FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); + Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy)); - return pResult; - } -} + if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr) + { + sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float); + } -Value *Builder::PMAXSD(Value* a, Value* b) -{ - // llvm-3.9 removed the pmax intrinsics -#if HAVE_LLVM >= 0x309 - Value* cmp = ICMP_SGT(a, b); - return SELECT(cmp, a, b); -#else - if (JM()->mArch.AVX2()) - { - Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d); - return CALL(pmaxsd, {a, b}); + Value* pResult = UndefValue::get(mSimdInt16Ty); + for (uint32_t i = 0; i < mVWidth; ++i) + { + Value* pSrc = VEXTRACT(a, C(i)); + Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); + pResult = VINSERT(pResult, pConv, C(i)); + } + + return pResult; + } } - else + + Value *Builder::PMAXSD(Value* a, Value* b) { - // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources - Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd); + // llvm-3.9 removed the pmax intrinsics + #if HAVE_LLVM >= 0x309 + Value* cmp = ICMP_SGT(a, b); + return SELECT(cmp, a, b); + #else + if (JM()->mArch.AVX2()) + { + Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d); + return CALL(pmaxsd, {a, b}); + } + else + { + // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources + Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd); - // low 128 - Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); - Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); - Value* resLo = CALL(pmaxsd, {aLo, bLo}); + // low 128 + Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); + Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); + Value* resLo = CALL(pmaxsd, {aLo, bLo}); - // high 128 - Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); - Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); - Value* resHi = CALL(pmaxsd, {aHi, bHi}); + // high 128 + Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); + Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); + Value* resHi = CALL(pmaxsd, {aHi, bHi}); - // combine - Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); - result = VINSERTI128(result, resHi, C((uint8_t)1)); + // combine + Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); + result = VINSERTI128(result, resHi, C((uint8_t)1)); - return result; + return result; + } + #endif } -#endif -} -Value *Builder::PMINSD(Value* a, Value* b) -{ - // llvm-3.9 removed the pmin intrinsics -#if HAVE_LLVM >= 0x309 - Value* cmp = ICMP_SLT(a, b); - return SELECT(cmp, a, b); -#else - if (JM()->mArch.AVX2()) - { - Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d); - return CALL(pminsd, {a, b}); - } - else + Value *Builder::PMINSD(Value* a, Value* b) { - // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources - Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd); + // llvm-3.9 removed the pmin intrinsics + #if HAVE_LLVM >= 0x309 + Value* cmp = ICMP_SLT(a, b); + return SELECT(cmp, a, b); + #else + if (JM()->mArch.AVX2()) + { + Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d); + return CALL(pminsd, {a, b}); + } + else + { + // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources + Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd); - // low 128 - Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); - Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); - Value* resLo = CALL(pminsd, {aLo, bLo}); + // low 128 + Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); + Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); + Value* resLo = CALL(pminsd, {aLo, bLo}); - // high 128 - Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); - Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); - Value* resHi = CALL(pminsd, {aHi, bHi}); + // high 128 + Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); + Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); + Value* resHi = CALL(pminsd, {aHi, bHi}); - // combine - Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); - result = VINSERTI128(result, resHi, C((uint8_t)1)); + // combine + Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); + result = VINSERTI128(result, resHi, C((uint8_t)1)); - return result; + return result; + } + #endif } -#endif -} -void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, - Value* mask, Value* vGatherComponents[], bool bPackedOutput) -{ - const SWR_FORMAT_INFO &info = GetFormatInfo(format); - if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) + void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) { - // ensure our mask is the correct type - mask = BITCAST(mask, mSimdFP32Ty); - GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + const SWR_FORMAT_INFO &info = GetFormatInfo(format); + if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) + { + // ensure our mask is the correct type + mask = BITCAST(mask, mSimdFP32Ty); + GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + } + else + { + // ensure our mask is the correct type + mask = BITCAST(mask, mSimdInt32Ty); + GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + } } - else + + void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) { - // ensure our mask is the correct type - mask = BITCAST(mask, mSimdInt32Ty); - GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + switch(info.bpp / info.numComps) + { + case 16: + { + Value* vGatherResult[2]; + Value *vMask; + + // TODO: vGatherMaskedVal + Value* vGatherMaskedVal = VIMMED1((float)0); + + // always have at least one component out of x or y to fetch + + // save mask as it is zero'd out after each gather + vMask = mask; + + vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + + // if we have at least one component out of x or y to fetch + if(info.numComps > 2) + { + // offset base to the next components(zw) in the vertex to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + vMask = mask; + + vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } + else + { + vGatherResult[1] = vGatherMaskedVal; + } + + // Shuffle gathered components into place, each row is a component + Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + } + break; + case 32: + { + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); + } + + for(uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // save mask as it is zero'd out after each gather + Value *vMask = mask; + + // Gather a SIMD of components + vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + + // offset base to the next component to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + } + } + break; + default: + SWR_ASSERT(0, "Invalid float format"); + break; + } } -} -void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* mask, Value* vGatherComponents[], bool bPackedOutput) -{ - switch(info.bpp / info.numComps) + void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) { - case 16: + switch (info.bpp / info.numComps) { + case 8: + { + Value* vGatherMaskedVal = VIMMED1((int32_t)0); + Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1)); + // e.g. result of an 8x32bit integer gather for 8bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw + + Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + } + break; + case 16: + { Value* vGatherResult[2]; Value *vMask; // TODO: vGatherMaskedVal - Value* vGatherMaskedVal = VIMMED1((float)0); + Value* vGatherMaskedVal = VIMMED1((int32_t)0); // always have at least one component out of x or y to fetch // save mask as it is zero'd out after each gather vMask = mask; - vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy @@ -998,7 +1087,7 @@ void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt pSrcBase = GEP(pSrcBase, C((char)4)); vMask = mask; - vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); // e.g. result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw @@ -1006,617 +1095,532 @@ void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt } else { - vGatherResult[1] = vGatherMaskedVal; + vGatherResult[1] = vGatherMaskedVal; } // Shuffle gathered components into place, each row is a component - Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } - break; - case 32: - { - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); - } + Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - for(uint32_t i = 0; i < info.numComps; i++) + } + break; + case 32: { - uint32_t swizzleIndex = info.swizzle[i]; + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherComponents[i] = VIMMED1((int)info.defaults[i]); + } - // save mask as it is zero'd out after each gather - Value *vMask = mask; + for(uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; - // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + // save mask as it is zero'd out after each gather + Value *vMask = mask; - // offset base to the next component to gather - pSrcBase = GEP(pSrcBase, C((char)4)); + // Gather a SIMD of components + vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + + // offset base to the next component to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + } } - } - break; - default: - SWR_ASSERT(0, "Invalid float format"); + break; + default: + SWR_ASSERT(0, "unsupported format"); break; + } } -} -void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* mask, Value* vGatherComponents[], bool bPackedOutput) -{ - switch (info.bpp / info.numComps) + void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) { - case 8: - { - Value* vGatherMaskedVal = VIMMED1((int32_t)0); - Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1)); - // e.g. result of an 8x32bit integer gather for 8bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw + // cast types + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } - break; - case 16: - { - Value* vGatherResult[2]; - Value *vMask; + // input could either be float or int vector; do shuffle work in int + vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); + vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); - // TODO: vGatherMaskedVal - Value* vGatherMaskedVal = VIMMED1((int32_t)0); - - // always have at least one component out of x or y to fetch + if(bPackedOutput) + { + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - // save mask as it is zero'd out after each gather - vMask = mask; + // shuffle mask + Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); - // e.g. result of first 8x32bit integer gather for 16bit components + Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); + // after PERMD: move and pack xy components into each 128bit lane // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // + // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy - // if we have at least one component out of x or y to fetch - if(info.numComps > 2) + // do the same for zw components + Value* vi128ZW = nullptr; + if(info.numComps > 2) { - // offset base to the next components(zw) in the vertex to gather - pSrcBase = GEP(pSrcBase, C((char)4)); - vMask = mask; - - vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); - // e.g. result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); + vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); } - else + + for(uint32_t i = 0; i < 4; i++) { - vGatherResult[1] = vGatherMaskedVal; - } + uint32_t swizzleIndex = info.swizzle[i]; + // todo: fixed for packed + Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); + if(i >= info.numComps) + { + // set the default component val + vGatherOutput[swizzleIndex] = vGatherMaskedVal; + continue; + } + + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; - // Shuffle gathered components into place, each row is a component - Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + // extract packed component 128 bit lanes + vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); + } } - break; - case 32: + else { + // pshufb masks for each component + Value* vConstMask[2]; + // x/z shuffle mask + vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + + // y/w shuffle mask + vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); + + + // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits // apply defaults for (uint32_t i = 0; i < 4; ++i) { - vGatherComponents[i] = VIMMED1((int)info.defaults[i]); + vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); } for(uint32_t i = 0; i < info.numComps; i++) { uint32_t swizzleIndex = info.swizzle[i]; - // save mask as it is zero'd out after each gather - Value *vMask = mask; - - // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + // select correct constMask for x/z or y/w pshufb + uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + uint32_t selectedGather = (i < 2) ? 0 : 1; - // offset base to the next component to gather - pSrcBase = GEP(pSrcBase, C((char)4)); + vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); + // after pshufb mask for x channel; z uses the same shuffle from the second gather + // 256i - 0 1 2 3 4 5 6 7 + // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 } } - break; - default: - SWR_ASSERT(0, "unsupported format"); - break; } -} - -void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) -{ - // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - - // input could either be float or int vector; do shuffle work in int - vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); - vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); - - if(bPackedOutput) - { - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - - // shuffle mask - Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - - Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - // after PERMD: move and pack xy components into each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy - - // do the same for zw components - Value* vi128ZW = nullptr; - if(info.numComps > 2) - { - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); - vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - } - for(uint32_t i = 0; i < 4; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - // todo: fixed for packed - Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); - if(i >= info.numComps) - { - // set the default component val - vGatherOutput[swizzleIndex] = vGatherMaskedVal; - continue; - } - - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; - - // extract packed component 128 bit lanes - vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); - } - - } - else + void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) { - // pshufb masks for each component - Value* vConstMask[2]; - // x/z shuffle mask - vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); - - // y/w shuffle mask - vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); + // cast types + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits - - // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits - // apply defaults - for (uint32_t i = 0; i < 4; ++i) + if(bPackedOutput) { - vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); - } - - for(uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // select correct constMask for x/z or y/w pshufb - uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - uint32_t selectedGather = (i < 2) ? 0 : 1; - - vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); - // after pshufb mask for x channel; z uses the same shuffle from the second gather + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + // shuffle mask + Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + // after pshufb: group components together in each 128bit lane // 256i - 0 1 2 3 4 5 6 7 - // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 - } - } -} + // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww -void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) -{ - // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits - - if(bPackedOutput) - { - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - // shuffle mask - Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww - - Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); - // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) - - // do the same for zw components - Value* vi128ZW = nullptr; - if(info.numComps > 2) - { - vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); - } + Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); + // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) - // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex - for(uint32_t i = 0; i < 4; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - // todo: fix for packed - Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); - if(i >= info.numComps) + // do the same for zw components + Value* vi128ZW = nullptr; + if(info.numComps > 2) { - // set the default component val - vGatherOutput[swizzleIndex] = vGatherMaskedVal; - continue; + vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); } - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; + // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + for(uint32_t i = 0; i < 4; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + // todo: fix for packed + Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); + if(i >= info.numComps) + { + // set the default component val + vGatherOutput[swizzleIndex] = vGatherMaskedVal; + continue; + } + + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; - // sign extend - vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); - } - } - // else zero extend - else{ - // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); + // sign extend + vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); + } } - - for(uint32_t i = 0; i < info.numComps; i++){ - uint32_t swizzleIndex = info.swizzle[i]; - - // pshufb masks for each component - Value* vConstMask; - switch(i) + // else zero extend + else{ + // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits + // apply defaults + for (uint32_t i = 0; i < 4; ++i) { - case 0: - // x shuffle mask - vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, - 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); - break; - case 1: - // y shuffle mask - vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, - 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); - break; - case 2: - // z shuffle mask - vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, - 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); - break; - case 3: - // w shuffle mask - vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, - 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); - break; - default: - vConstMask = nullptr; - break; + vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); } - vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); - // after pshufb for x channel - // 256i - 0 1 2 3 4 5 6 7 - // x000 x000 x000 x000 x000 x000 x000 x000 + for(uint32_t i = 0; i < info.numComps; i++){ + uint32_t swizzleIndex = info.swizzle[i]; + + // pshufb masks for each component + Value* vConstMask; + switch(i) + { + case 0: + // x shuffle mask + vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, + 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); + break; + case 1: + // y shuffle mask + vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, + 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); + break; + case 2: + // z shuffle mask + vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, + 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); + break; + case 3: + // w shuffle mask + vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, + 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); + break; + default: + vConstMask = nullptr; + break; + } + + vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + // after pshufb for x channel + // 256i - 0 1 2 3 4 5 6 7 + // x000 x000 x000 x000 x000 x000 x000 x000 + } } } -} -// Helper function to create alloca in entry block of function -Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType) -{ - auto saveIP = IRB()->saveIP(); - IRB()->SetInsertPoint(&pFunc->getEntryBlock(), - pFunc->getEntryBlock().begin()); - Value* pAlloca = ALLOCA(pType); - IRB()->restoreIP(saveIP); - return pAlloca; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief emulates a scatter operation. -/// @param pDst - pointer to destination -/// @param vSrc - vector of src data to scatter -/// @param vOffsets - vector of byte offsets from pDst -/// @param vMask - mask of valid lanes -void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) -{ - /* Scatter algorithm + // Helper function to create alloca in entry block of function + Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType) + { + auto saveIP = IRB()->saveIP(); + IRB()->SetInsertPoint(&pFunc->getEntryBlock(), + pFunc->getEntryBlock().begin()); + Value* pAlloca = ALLOCA(pType); + IRB()->restoreIP(saveIP); + return pAlloca; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief emulates a scatter operation. + /// @param pDst - pointer to destination + /// @param vSrc - vector of src data to scatter + /// @param vOffsets - vector of byte offsets from pDst + /// @param vMask - mask of valid lanes + void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) + { + /* Scatter algorithm - while(Index = BitScanForward(mask)) - srcElem = srcVector[Index] - offsetElem = offsetVector[Index] - *(pDst + offsetElem) = srcElem - Update mask (&= ~(1<<Index) + while(Index = BitScanForward(mask)) + srcElem = srcVector[Index] + offsetElem = offsetVector[Index] + *(pDst + offsetElem) = srcElem + Update mask (&= ~(1<<Index) - */ + */ - BasicBlock* pCurBB = IRB()->GetInsertBlock(); - Function* pFunc = pCurBB->getParent(); - Type* pSrcTy = vSrc->getType()->getVectorElementType(); + BasicBlock* pCurBB = IRB()->GetInsertBlock(); + Function* pFunc = pCurBB->getParent(); + Type* pSrcTy = vSrc->getType()->getVectorElementType(); - // Store vectors on stack - if (pScatterStackSrc == nullptr) - { - // Save off stack allocations and reuse per scatter. Significantly reduces stack - // requirements for shaders with a lot of scatters. - pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty); - pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty); - } + // Store vectors on stack + if (pScatterStackSrc == nullptr) + { + // Save off stack allocations and reuse per scatter. Significantly reduces stack + // requirements for shaders with a lot of scatters. + pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty); + pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty); + } - Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0)); - Value* pOffsetsArrayPtr = pScatterStackOffsets; - STORE(vSrc, pSrcArrayPtr); - STORE(vOffsets, pOffsetsArrayPtr); + Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0)); + Value* pOffsetsArrayPtr = pScatterStackOffsets; + STORE(vSrc, pSrcArrayPtr); + STORE(vOffsets, pOffsetsArrayPtr); - // Cast to pointers for random access - pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0)); - pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0)); + // Cast to pointers for random access + pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0)); + pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0)); - Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty)); + Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty)); - // Get cttz function - Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty }); + // Get cttz function + Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty }); - // Setup loop basic block - BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc); + // Setup loop basic block + BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc); - // compute first set bit - Value* pIndex = CALL(pfnCttz, { pMask, C(false) }); + // compute first set bit + Value* pIndex = CALL(pfnCttz, { pMask, C(false) }); - Value* pIsUndef = ICMP_EQ(pIndex, C(32)); + Value* pIsUndef = ICMP_EQ(pIndex, C(32)); - // Split current block - BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode()); + // Split current block + BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode()); - // Remove unconditional jump created by splitBasicBlock - pCurBB->getTerminator()->eraseFromParent(); + // Remove unconditional jump created by splitBasicBlock + pCurBB->getTerminator()->eraseFromParent(); - // Add terminator to end of original block - IRB()->SetInsertPoint(pCurBB); + // Add terminator to end of original block + IRB()->SetInsertPoint(pCurBB); - // Add conditional branch - COND_BR(pIsUndef, pPostLoop, pLoop); + // Add conditional branch + COND_BR(pIsUndef, pPostLoop, pLoop); - // Add loop basic block contents - IRB()->SetInsertPoint(pLoop); - PHINode* pIndexPhi = PHI(mInt32Ty, 2); - PHINode* pMaskPhi = PHI(mInt32Ty, 2); + // Add loop basic block contents + IRB()->SetInsertPoint(pLoop); + PHINode* pIndexPhi = PHI(mInt32Ty, 2); + PHINode* pMaskPhi = PHI(mInt32Ty, 2); - pIndexPhi->addIncoming(pIndex, pCurBB); - pMaskPhi->addIncoming(pMask, pCurBB); + pIndexPhi->addIncoming(pIndex, pCurBB); + pMaskPhi->addIncoming(pMask, pCurBB); - // Extract elements for this index - Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi }); - Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi }); + // Extract elements for this index + Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi }); + Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi }); - // GEP to this offset in dst - Value* pCurDst = GEP(pDst, pOffsetElem); - pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0)); - STORE(pSrcElem, pCurDst); + // GEP to this offset in dst + Value* pCurDst = GEP(pDst, pOffsetElem); + pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0)); + STORE(pSrcElem, pCurDst); - // Update the mask - Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi))); + // Update the mask + Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi))); - // Terminator - Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) }); + // Terminator + Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) }); - pIsUndef = ICMP_EQ(pNewIndex, C(32)); - COND_BR(pIsUndef, pPostLoop, pLoop); + pIsUndef = ICMP_EQ(pNewIndex, C(32)); + COND_BR(pIsUndef, pPostLoop, pLoop); - // Update phi edges - pIndexPhi->addIncoming(pNewIndex, pLoop); - pMaskPhi->addIncoming(pNewMask, pLoop); + // Update phi edges + pIndexPhi->addIncoming(pNewIndex, pLoop); + pMaskPhi->addIncoming(pNewMask, pLoop); - // Move builder to beginning of post loop - IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin()); -} + // Move builder to beginning of post loop + IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin()); + } -Value* Builder::VABSPS(Value* a) -{ - Value* asInt = BITCAST(a, mSimdInt32Ty); - Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); - return result; -} + Value* Builder::VABSPS(Value* a) + { + Value* asInt = BITCAST(a, mSimdInt32Ty); + Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); + return result; + } -Value *Builder::ICLAMP(Value* src, Value* low, Value* high) -{ - Value *lowCmp = ICMP_SLT(src, low); - Value *ret = SELECT(lowCmp, low, src); + Value *Builder::ICLAMP(Value* src, Value* low, Value* high) + { + Value *lowCmp = ICMP_SLT(src, low); + Value *ret = SELECT(lowCmp, low, src); - Value *highCmp = ICMP_SGT(ret, high); - ret = SELECT(highCmp, high, ret); + Value *highCmp = ICMP_SGT(ret, high); + ret = SELECT(highCmp, high, ret); - return ret; -} + return ret; + } -Value *Builder::FCLAMP(Value* src, Value* low, Value* high) -{ - Value *lowCmp = FCMP_OLT(src, low); - Value *ret = SELECT(lowCmp, low, src); + Value *Builder::FCLAMP(Value* src, Value* low, Value* high) + { + Value *lowCmp = FCMP_OLT(src, low); + Value *ret = SELECT(lowCmp, low, src); - Value *highCmp = FCMP_OGT(ret, high); - ret = SELECT(highCmp, high, ret); + Value *highCmp = FCMP_OGT(ret, high); + ret = SELECT(highCmp, high, ret); - return ret; -} + return ret; + } -Value *Builder::FCLAMP(Value* src, float low, float high) -{ - Value* result = VMAXPS(src, VIMMED1(low)); - result = VMINPS(result, VIMMED1(high)); + Value *Builder::FCLAMP(Value* src, float low, float high) + { + Value* result = VMAXPS(src, VIMMED1(low)); + result = VMINPS(result, VIMMED1(high)); - return result; -} + return result; + } -////////////////////////////////////////////////////////////////////////// -/// @brief save/restore stack, providing ability to push/pop the stack and -/// reduce overall stack requirements for temporary stack use -Value* Builder::STACKSAVE() -{ - Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); -#if HAVE_LLVM == 0x306 - return CALL(pfnStackSave); -#else - return CALLA(pfnStackSave); -#endif -} - -void Builder::STACKRESTORE(Value* pSaved) -{ - Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); - CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved}); -} + ////////////////////////////////////////////////////////////////////////// + /// @brief save/restore stack, providing ability to push/pop the stack and + /// reduce overall stack requirements for temporary stack use + Value* Builder::STACKSAVE() + { + Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); + #if HAVE_LLVM == 0x306 + return CALL(pfnStackSave); + #else + return CALLA(pfnStackSave); + #endif + } -Value *Builder::FMADDPS(Value* a, Value* b, Value* c) -{ - Value* vOut; - // use FMADs if available - if(JM()->mArch.AVX2()) + void Builder::STACKRESTORE(Value* pSaved) + { + Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); + CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved}); + } + + Value *Builder::FMADDPS(Value* a, Value* b, Value* c) { - vOut = VFMADDPS(a, b, c); + Value* vOut; + // use FMADs if available + if(JM()->mArch.AVX2()) + { + vOut = VFMADDPS(a, b, c); + } + else + { + vOut = FADD(FMUL(a, b), c); + } + return vOut; } - else + + Value* Builder::POPCNT(Value* a) { - vOut = FADD(FMUL(a, b), c); + Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); + return CALL(pCtPop, std::initializer_list<Value*>{a}); } - return vOut; -} -Value* Builder::POPCNT(Value* a) -{ - Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); - return CALL(pCtPop, std::initializer_list<Value*>{a}); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief C functions called by LLVM IR -////////////////////////////////////////////////////////////////////////// - -////////////////////////////////////////////////////////////////////////// -/// @brief called in JIT code, inserted by PRINT -/// output to both stdout and visual studio debug console -void __cdecl CallPrint(const char* fmt, ...) -{ - va_list args; - va_start(args, fmt); - vprintf(fmt, args); + ////////////////////////////////////////////////////////////////////////// + /// @brief C functions called by LLVM IR + ////////////////////////////////////////////////////////////////////////// -#if defined( _WIN32 ) - char strBuf[1024]; - vsnprintf_s(strBuf, _TRUNCATE, fmt, args); - OutputDebugString(strBuf); -#endif + ////////////////////////////////////////////////////////////////////////// + /// @brief called in JIT code, inserted by PRINT + /// output to both stdout and visual studio debug console + void __cdecl CallPrint(const char* fmt, ...) + { + va_list args; + va_start(args, fmt); + vprintf(fmt, args); - va_end(args); -} + #if defined( _WIN32 ) + char strBuf[1024]; + vsnprintf_s(strBuf, _TRUNCATE, fmt, args); + OutputDebugString(strBuf); + #endif -Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) -{ -#if HAVE_LLVM == 0x306 - Function *func = - Intrinsic::getDeclaration(JM()->mpCurrentModule, - Intrinsic::x86_avx_vextractf128_si_256); - return CALL(func, {a, imm8}); -#else - bool flag = !imm8->isZeroValue(); - SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < mVWidth / 2; i++) { - idx.push_back(C(flag ? i + mVWidth / 2 : i)); - } - return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); -#endif -} - -Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) -{ -#if HAVE_LLVM == 0x306 - Function *func = - Intrinsic::getDeclaration(JM()->mpCurrentModule, - Intrinsic::x86_avx_vinsertf128_si_256); - return CALL(func, {a, b, imm8}); -#else - bool flag = !imm8->isZeroValue(); - SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < mVWidth; i++) { - idx.push_back(C(i)); - } - Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); - - SmallVector<Constant*,8> idx2; - for (unsigned i = 0; i < mVWidth / 2; i++) { - idx2.push_back(C(flag ? i : i + mVWidth)); - } - for (unsigned i = mVWidth / 2; i < mVWidth; i++) { - idx2.push_back(C(flag ? i + mVWidth / 2 : i)); - } - return VSHUFFLE(a, inter, ConstantVector::get(idx2)); -#endif -} - -// rdtsc buckets macros -void Builder::RDTSC_START(Value* pBucketMgr, Value* pId) -{ - // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into - // buckets framework when single threaded - if (KNOB_SINGLE_THREADED) - { - std::vector<Type*> args{ - PointerType::get(mInt32Ty, 0), // pBucketMgr - mInt32Ty // id - }; - - FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); - Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); - if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr) - { - sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket); + va_end(args); + } + + Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) + { + #if HAVE_LLVM == 0x306 + Function *func = + Intrinsic::getDeclaration(JM()->mpCurrentModule, + Intrinsic::x86_avx_vextractf128_si_256); + return CALL(func, {a, imm8}); + #else + bool flag = !imm8->isZeroValue(); + SmallVector<Constant*,8> idx; + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx.push_back(C(flag ? i + mVWidth / 2 : i)); } + return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); + #endif + } - CALL(pFunc, { pBucketMgr, pId }); + Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) + { + #if HAVE_LLVM == 0x306 + Function *func = + Intrinsic::getDeclaration(JM()->mpCurrentModule, + Intrinsic::x86_avx_vinsertf128_si_256); + return CALL(func, {a, b, imm8}); + #else + bool flag = !imm8->isZeroValue(); + SmallVector<Constant*,8> idx; + for (unsigned i = 0; i < mVWidth; i++) { + idx.push_back(C(i)); + } + Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); + + SmallVector<Constant*,8> idx2; + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx2.push_back(C(flag ? i : i + mVWidth)); + } + for (unsigned i = mVWidth / 2; i < mVWidth; i++) { + idx2.push_back(C(flag ? i + mVWidth / 2 : i)); + } + return VSHUFFLE(a, inter, ConstantVector::get(idx2)); + #endif } -} -void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) -{ - // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into - // buckets framework when single threaded - if (KNOB_SINGLE_THREADED) - { - std::vector<Type*> args{ - PointerType::get(mInt32Ty, 0), // pBucketMgr - mInt32Ty // id - }; - - FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); - Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); - if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr) + // rdtsc buckets macros + void Builder::RDTSC_START(Value* pBucketMgr, Value* pId) + { + // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into + // buckets framework when single threaded + if (KNOB_SINGLE_THREADED) { - sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket); + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); } + } - CALL(pFunc, { pBucketMgr, pId }); + void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) + { + // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into + // buckets framework when single threaded + if (KNOB_SINGLE_THREADED) + { + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); + } } -} +}
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 986eced..bdd818b 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -35,6 +35,8 @@ #include <tuple> //#define FETCH_DUMP_VERTEX 1 +using namespace llvm; +using namespace SwrJit; bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py index 9c00f22..c6d0941 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py @@ -259,7 +259,11 @@ def generate_gen_cpp(functions, output_file): output_lines += [ '#include \"builder.h\"', - '' + '', + 'namespace SwrJit', + '{', + ' using namespace llvm;', + '', ] for func in functions: @@ -277,14 +281,14 @@ def generate_gen_cpp(functions, output_file): first_arg = False output_lines += [ - '//////////////////////////////////////////////////////////////////////////', - '%sBuilder::%s(%s)' % (func['return'], name, func['args_nodefs']), - '{', - ' return IRB()->%s(%s);' % (func['name'], func_args), - '}', + ' //////////////////////////////////////////////////////////////////////////', + ' %sBuilder::%s(%s)' % (func['return'], name, func['args_nodefs']), + ' {', + ' return IRB()->%s(%s);' % (func['name'], func_args), + ' }', '', ] - + output_lines.append('}') output_file.write('\n'.join(output_lines) + '\n') """ @@ -326,7 +330,11 @@ def generate_x86_cpp(output_file): output_lines += [ '#include \"builder.h\"', - '' + '', + 'namespace SwrJit', + '{', + ' using namespace llvm;', + '', ] for inst in intrinsics: @@ -344,10 +352,10 @@ def generate_x86_cpp(output_file): first = False output_lines += [ - '//////////////////////////////////////////////////////////////////////////', - 'Value *Builder::%s(%s)' % (inst[0], args), - '{', - ' Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::%s);' % inst[1], + ' //////////////////////////////////////////////////////////////////////////', + ' Value *Builder::%s(%s)' % (inst[0], args), + ' {', + ' Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::%s);' % inst[1], ] if inst[0] == "VPERMD": rev_args = '' @@ -360,21 +368,22 @@ def generate_x86_cpp(output_file): output_lines += [ '#if (HAVE_LLVM == 0x306) && (LLVM_VERSION_PATCH == 0)', - ' return CALL(func, std::initializer_list<Value*>{%s});' % rev_args, + ' return CALL(func, std::initializer_list<Value*>{%s});' % rev_args, '#else', ] output_lines += [ - ' return CALL(func, std::initializer_list<Value*>{%s});' % pass_args, + ' return CALL(func, std::initializer_list<Value*>{%s});' % pass_args, ] if inst[0] == "VPERMD": output_lines += [ '#endif', ] output_lines += [ - '}', + ' }', '', ] + output_lines.append('}') output_file.write('\n'.join(output_lines) + '\n') """ diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py index d6babd3..e88158c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py @@ -59,6 +59,10 @@ header = r""" #pragma once +namespace SwrJit +{ + using namespace llvm; + """ """ @@ -120,7 +124,7 @@ def gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_a elif is_array: llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count) - return [' members.push_back( %s ); // %s' % (llvm_type, name)] + return [' members.push_back( %s ); // %s' % (llvm_type, name)] """ """ @@ -151,12 +155,12 @@ def gen_llvm_types(input_file, output_file): struct_name = match.group(3).strip() output_lines += [ - '//////////////////////////////////////////////////////////////////////////', - '/// Generate LLVM type information for %s' % struct_name, - 'INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name), - '{', - ' LLVMContext& ctx = pJitMgr->mContext;', - ' std::vector<Type*> members;', + ' //////////////////////////////////////////////////////////////////////////', + ' /// Generate LLVM type information for %s' % struct_name, + ' INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name), + ' {', + ' LLVMContext& ctx = pJitMgr->mContext;', + ' std::vector<Type*> members;', '', ] @@ -309,16 +313,17 @@ def gen_llvm_types(input_file, output_file): if (end_of_struct): output_lines += [ '', - ' return StructType::get(ctx, members, false);', - '}', + ' return StructType::get(ctx, members, false);', + ' }', '', ] for i in range(len(llvm_args)): - output_lines.append('static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i)) + output_lines.append(' static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i)) output_lines.append('') + output_lines.append('}') output_file.write('\n'.join(output_lines) + '\n') """ diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp index 289422b..c4fb372 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -36,6 +36,9 @@ #include <sstream> #include <unordered_set> +using namespace llvm; +using namespace SwrJit; + ////////////////////////////////////////////////////////////////////////// /// Interface to Jitting a fetch shader ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp index ecb4545..38a916e 100644 --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -44,6 +44,8 @@ #include "swr_state.h" #include "swr_screen.h" +using namespace SwrJit; + static unsigned locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info); diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp index 180ade1..6eb5ea6 100644 --- a/src/gallium/drivers/swr/swr_tex_sample.cpp +++ b/src/gallium/drivers/swr/swr_tex_sample.cpp @@ -60,6 +60,7 @@ #include "swr_tex_sample.h" #include "swr_context_llvm.h" +using namespace SwrJit; /** * This provides the bridge between the sampler state store in |