diff options
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/jitter')
11 files changed, 246 insertions, 92 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 734c897..de856c4 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -47,6 +47,10 @@ #include "llvm/Analysis/CFGPrinter.h" #include "llvm/IRReader/IRReader.h" +#if LLVM_USE_INTEL_JITEVENTS +#include "llvm/ExecutionEngine/JITEventListener.h" +#endif + #include "core/state.h" #include "common/containers.hpp" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index c974a61..4ffb0fb 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -53,6 +53,10 @@ #include "llvm/Config/config.h" #endif +#ifndef HAVE_LLVM +#define HAVE_LLVM (LLVM_VERSION_MAJOR << 8) || LLVM_VERSION_MINOR +#endif + #include "llvm/IR/Verifier.h" #include "llvm/ExecutionEngine/MCJIT.h" #include "llvm/Support/FileSystem.h" @@ -60,11 +64,10 @@ #include "llvm/Analysis/Passes.h" -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 #include "llvm/PassManager.h" #else #include "llvm/IR/LegacyPassManager.h" -using namespace llvm::legacy; #endif #include "llvm/CodeGen/Passes.h" @@ -166,7 +169,6 @@ struct JitManager FunctionType* mTrinaryFPTy; FunctionType* mUnaryIntTy; FunctionType* mBinaryIntTy; - FunctionType* mTrinaryIntTy; Type* mSimtFP32Ty; Type* mSimtInt32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp index 954524a..a64f860 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -576,9 +576,12 @@ struct BlendJit : public Builder src1[i] = LOAD(pSrc1, { i }); } Value* currentMask = VIMMED1(-1); - if(state.desc.alphaToCoverageEnable) + if (state.desc.alphaToCoverageEnable) { - currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty); + Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); + uint32_t bits = (1 << state.desc.numSamples) - 1; + currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); + currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty); } // alpha test @@ -702,6 +705,12 @@ struct BlendJit : public Builder currentMask = AND(sampleMask, currentMask); } + if (state.desc.alphaToCoverageEnable) + { + Value* sampleMasked = SHL(C(1), sampleNum); + currentMask = AND(currentMask, VBROADCAST(sampleMasked)); + } + if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || state.desc.oMaskEnable) { @@ -717,7 +726,13 @@ struct BlendJit : public Builder JitManager::DumpToFile(blendFunc, ""); - FunctionPassManager passes(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); passes.add(createCFGSimplificationPass()); passes.add(createEarlyCSEPass()); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index c15bdf1..757ea3f 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -38,6 +38,8 @@ using namespace llvm; Builder::Builder(JitManager *pJitMgr) : mpJitMgr(pJitMgr) { + mVWidth = pJitMgr->mVWidth; + mpIRBuilder = &pJitMgr->mBuilder; mVoidTy = Type::getVoidTy(pJitMgr->mContext); @@ -48,14 +50,18 @@ Builder::Builder(JitManager *pJitMgr) mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); + mInt8PtrTy = PointerType::get(mInt8Ty, 0); + mInt16PtrTy = PointerType::get(mInt16Ty, 0); + mInt32PtrTy = PointerType::get(mInt32Ty, 0); mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure) mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type - mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth); - mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth); - mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth); - mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth); - mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth); + mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); + mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); + mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); + mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); + mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); + mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false); if (sizeof(uint32_t*) == 4) { diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index 4921661..239ef2a 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -43,6 +43,8 @@ struct Builder JitManager* mpJitMgr; IRBuilder<>* mpIRBuilder; + uint32_t mVWidth; + // Built in types. Type* mVoidTy; Type* mInt1Ty; @@ -54,12 +56,16 @@ struct Builder Type* mFP16Ty; Type* mFP32Ty; Type* mDoubleTy; + Type* mInt8PtrTy; + Type* mInt16PtrTy; + Type* mInt32PtrTy; Type* mSimdFP16Ty; Type* mSimdFP32Ty; Type* mSimdInt16Ty; Type* mSimdInt32Ty; Type* mSimdInt64Ty; Type* mSimdIntPtrTy; + Type* mSimdVectorTy; StructType* mV4FP32Ty; StructType* mV4Int32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 5394fc7..486dad8 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -28,6 +28,8 @@ * ******************************************************************************/ #include "builder.h" +#include "common/rdtsc_buckets.h" + #include "llvm/Support/DynamicLibrary.h" void __cdecl CallPrint(const char* fmt, ...); @@ -189,32 +191,32 @@ Constant *Builder::PRED(bool pred) Value *Builder::VIMMED1(int i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VIMMED1(uint32_t i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VIMMED1(float i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); } Value *Builder::VIMMED1(bool i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VUNDEF_IPTR() { - return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth)); + return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); } Value *Builder::VUNDEF_I() { - return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth)); + return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); } Value *Builder::VUNDEF(Type *ty, uint32_t size) @@ -224,15 +226,15 @@ Value *Builder::VUNDEF(Type *ty, uint32_t size) Value *Builder::VUNDEF_F() { - return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth)); + return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } Value *Builder::VUNDEF(Type* t) { - return UndefValue::get(VectorType::get(t, JM()->mVWidth)); + return UndefValue::get(VectorType::get(t, mVWidth)); } -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index) { return VINSERT(vec, val, C((int64_t)index)); @@ -247,7 +249,7 @@ Value *Builder::VBROADCAST(Value *src) return src; } - return VECTOR_SPLAT(JM()->mVWidth, src); + return VECTOR_SPLAT(mVWidth, src); } uint32_t Builder::IMMED(Value* v) @@ -257,6 +259,13 @@ uint32_t Builder::IMMED(Value* v) return pValConst->getZExtValue(); } +int32_t Builder::S_IMMED(Value* v) +{ + SWR_ASSERT(isa<ConstantInt>(v)); + ConstantInt *pValConst = cast<ConstantInt>(v); + return pValConst->getSExtValue(); +} + Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList) { std::vector<Value*> indices; @@ -342,8 +351,8 @@ Value *Builder::MASKLOADD(Value* src,Value* mask) else { Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); - Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth)); - vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth)); + Value* fMask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth)); + vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth)); } return vResult; } @@ -512,7 +521,7 @@ CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list // get a pointer to the first character in the constant string array std::vector<Constant*> geplist{C(0),C(0)}; -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); #else Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); @@ -575,7 +584,7 @@ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); Value *vOffsets = MUL(vIndices,vScaleVec); Value *mask = MASK(vMask); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index Value *offset = VEXTRACT(vOffsets,C(i)); @@ -625,7 +634,7 @@ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); Value *vOffsets = MUL(vIndices, vScaleVec); Value *mask = MASK(vMask); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index Value *offset = VEXTRACT(vOffsets, C(i)); @@ -774,12 +783,61 @@ Value *Builder::PERMD(Value* a, Value* idx) } else { - res = VSHUFFLE(a, a, idx); + if (isa<Constant>(idx)) + { + res = VSHUFFLE(a, a, idx); + } + else + { + res = VUNDEF_I(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } + } } return res; } ////////////////////////////////////////////////////////////////////////// +/// @brief Generate a VPERMPS operation (shuffle 32 bit float values +/// across 128 bit lanes) in LLVM IR. If not supported on the underlying +/// platform, emulate it +/// @param a - 256bit SIMD lane(8x32bit) of float values. +/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values +Value *Builder::PERMPS(Value* a, Value* idx) +{ + Value* res; + // use avx2 permute instruction if available + if (JM()->mArch.AVX2()) + { + // llvm 3.6.0 swapped the order of the args to vpermd + res = VPERMPS(idx, a); + } + else + { + if (isa<Constant>(idx)) + { + res = VSHUFFLE(a, a, idx); + } + else + { + res = VUNDEF_F(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } + } + } + + return res; +} + +////////////////////////////////////////////////////////////////////////// /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) /// in LLVM IR. If not supported on the underlying platform, emulate it /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. @@ -800,7 +858,7 @@ Value *Builder::CVTPH2PS(Value* a) } Value* pResult = UndefValue::get(mSimdFP32Ty); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc}); @@ -833,7 +891,7 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding) } Value* pResult = UndefValue::get(mSimdInt16Ty); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); @@ -1085,8 +1143,8 @@ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) { // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits // input could either be float or int vector; do shuffle work in int vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); @@ -1094,7 +1152,7 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp if(bPackedOutput) { - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, @@ -1179,12 +1237,12 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) { // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits if(bPackedOutput) { - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); @@ -1286,16 +1344,18 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) { Value* pStack = STACKSAVE(); + Type* pSrcTy = vSrc->getType()->getVectorElementType(); + // allocate tmp stack for masked off lanes - Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType()); + Value* vTmpPtr = ALLOCA(pSrcTy); Value *mask = MASK(vMask); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value *offset = VEXTRACT(vOffsets, C(i)); // byte pointer to component Value *storeAddress = GEP(pDst, offset); - storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0)); + storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0)); Value *selMask = VEXTRACT(mask, C(i)); Value *srcElem = VEXTRACT(vSrc, C(i)); // switch in a safe address to load if we're trying to access a vertex @@ -1349,7 +1409,7 @@ Value *Builder::FCLAMP(Value* src, float low, float high) Value* Builder::STACKSAVE() { Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 return CALL(pfnStackSave); #else return CALLA(pfnStackSave); @@ -1401,11 +1461,13 @@ void __cdecl CallPrint(const char* fmt, ...) vsnprintf_s(strBuf, _TRUNCATE, fmt, args); OutputDebugString(strBuf); #endif + + va_end(args); } Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) { -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_si_256); @@ -1413,8 +1475,8 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) #else bool flag = !imm8->isZeroValue(); SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { - idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx.push_back(C(flag ? i + mVWidth / 2 : i)); } return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); #endif @@ -1422,7 +1484,7 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) { -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_si_256); @@ -1430,18 +1492,54 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) #else bool flag = !imm8->isZeroValue(); SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < JM()->mVWidth; i++) { + for (unsigned i = 0; i < mVWidth; i++) { idx.push_back(C(i)); } Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); SmallVector<Constant*,8> idx2; - for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { - idx2.push_back(C(flag ? i : i + JM()->mVWidth)); + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx2.push_back(C(flag ? i : i + mVWidth)); } - for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) { - idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + for (unsigned i = mVWidth / 2; i < mVWidth; i++) { + idx2.push_back(C(flag ? i + mVWidth / 2 : i)); } return VSHUFFLE(a, inter, ConstantVector::get(idx2)); #endif } + +// rdtsc buckets macros +void Builder::RDTSC_START(Value* pBucketMgr, Value* pId) +{ + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); +} + +void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) +{ + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); +} + diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 48e0558..f43ef69 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -59,7 +59,7 @@ Value *VUNDEF_F(); Value *VUNDEF_I(); Value *VUNDEF(Type* ty, uint32_t size); Value *VUNDEF_IPTR(); -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Value *VINSERT(Value *vec, Value *val, uint64_t index); #endif Value *VBROADCAST(Value *src); @@ -67,6 +67,7 @@ Value *VRCP(Value *va); Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY); uint32_t IMMED(Value* i); +int32_t S_IMMED(Value* i); Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList); Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList); @@ -115,6 +116,7 @@ Value *PSHUFB(Value* a, Value* b); Value *PMOVSXBD(Value* a); Value *PMOVSXWD(Value* a); Value *PERMD(Value* a, Value* idx); +Value *PERMPS(Value* a, Value* idx); Value *CVTPH2PS(Value* a); Value *CVTPS2PH(Value* a, Value* rounding); Value *PMAXSD(Value* a, Value* b); @@ -147,3 +149,7 @@ Value* INT3() { return INTERRUPT(C((uint8_t)3)); } Value *VEXTRACTI128(Value* a, Constant* imm8); Value *VINSERTI128(Value* a, Value* b, Constant* imm8); + +// rdtsc buckets macros +void RDTSC_START(Value* pBucketMgr, Value* pId); +void RDTSC_STOP(Value* pBucketMgr, Value* pId); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index c5a180e..2c2c56b 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -105,7 +105,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) std::vector<Value*> vtxInputIndices(2, C(0)); // GEP pVtxOut = GEP(pVtxOut, C(0)); - pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0)); + pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); // SWR_FETCH_CONTEXT::pStreams Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams}); @@ -174,7 +174,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) verifyFunction(*fetch); - FunctionPassManager setupPasses(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + setupPasses(JM()->mpCurrentModule); ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) setupPasses.add(createBreakCriticalEdgesPass()); @@ -186,7 +191,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) JitManager::DumpToFile(fetch, "se"); - FunctionPassManager optPasses(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + optPasses(JM()->mpCurrentModule); ///@todo Haven't touched these either. Need to remove some of these and add others. optPasses.add(createCFGSimplificationPass()); @@ -220,8 +230,8 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet SWRL::UncheckedFixedVector<Value*, 16> vectors; - std::vector<Constant*> pMask(JM()->mVWidth); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + std::vector<Constant*> pMask(mVWidth); + for(uint32_t i = 0; i < mVWidth; ++i) { pMask[i] = (C(i < 4 ? i : 4)); } @@ -254,7 +264,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride); // Load from the stream. - for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane) + for(uint32_t lane = 0; lane < mVWidth; ++lane) { // Get index Value* index = VEXTRACT(vIndices, C(lane)); @@ -380,44 +390,44 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet vectors.push_back(wvec); } - std::vector<Constant*> v01Mask(JM()->mVWidth); - std::vector<Constant*> v23Mask(JM()->mVWidth); - std::vector<Constant*> v02Mask(JM()->mVWidth); - std::vector<Constant*> v13Mask(JM()->mVWidth); + std::vector<Constant*> v01Mask(mVWidth); + std::vector<Constant*> v23Mask(mVWidth); + std::vector<Constant*> v02Mask(mVWidth); + std::vector<Constant*> v13Mask(mVWidth); // Concatenate the vectors together. elements[0] = VUNDEF_F(); elements[1] = VUNDEF_F(); elements[2] = VUNDEF_F(); elements[3] = VUNDEF_F(); - for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b) + for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b) { v01Mask[4 * b + 0] = C(0 + 4 * b); v01Mask[4 * b + 1] = C(1 + 4 * b); - v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); - v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth); + v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); + v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth); v23Mask[4 * b + 0] = C(2 + 4 * b); v23Mask[4 * b + 1] = C(3 + 4 * b); - v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth); - v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth); + v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); v02Mask[4 * b + 0] = C(0 + 4 * b); v02Mask[4 * b + 1] = C(2 + 4 * b); - v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); - v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth); + v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); + v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth); v13Mask[4 * b + 0] = C(1 + 4 * b); v13Mask[4 * b + 1] = C(3 + 4 * b); - v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth); - v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth); + v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); - std::vector<Constant*> iMask(JM()->mVWidth); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + std::vector<Constant*> iMask(mVWidth); + for(uint32_t i = 0; i < mVWidth; ++i) { if(((4 * b) <= i) && (i < (4 * (b + 1)))) { - iMask[i] = C(i % 4 + JM()->mVWidth); + iMask[i] = C(i % 4 + mVWidth); } else { @@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex) STORE(C((uint8_t)0), pZeroIndex); // Load a SIMD of index pointers - for(int64_t lane = 0; lane < JM()->mVWidth; lane++) + for(int64_t lane = 0; lane < mVWidth; lane++) { // Calculate the address of the requested index Value *pIndex = GEP(pIndices, C(lane)); @@ -840,7 +850,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) STORE(C((uint16_t)0), pZeroIndex); // Load a SIMD of index pointers - for(int64_t lane = 0; lane < JM()->mVWidth; lane++) + for(int64_t lane = 0; lane < mVWidth; lane++) { // Calculate the address of the requested index Value *pIndex = GEP(pIndices, C(lane)); @@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) const uint32_t (&swizzle)[4] = std::get<9>(args); // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + Type* vGatherTy = mSimdInt32Ty; + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits // have to do extra work for sign extending if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){ - Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask, including any swizzling const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; @@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) Value* (&vVertexElements)[4] = std::get<8>(args); // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits // have to do extra work for sign extending if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)|| @@ -1149,7 +1159,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py index 1814b7c..e73b232 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py @@ -27,7 +27,7 @@ import json as JSON import operator header = r"""/**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -84,16 +84,16 @@ inst_aliases = { } intrinsics = [ - ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]], + ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]], ["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]], - ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]], - ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]], - ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]], - ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]], - ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]], - ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]], - ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]], - ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]], + ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]], + ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]], + ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]], + ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]], + ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]], + ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]], + ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]], + ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]], ["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]], ["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]], ["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]], @@ -103,6 +103,7 @@ intrinsics = [ ["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]], # sign extend packed 8bit components ["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]], # sign extend packed 16bit components ["VPERMD", "x86_avx2_permd", ["idx", "a"]], + ["VPERMPS", "x86_avx2_permps", ["idx", "a"]], ["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]], ["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]], ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py index 7bba435..0b53a92 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py @@ -28,7 +28,7 @@ import operator header = r""" /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp index 6c5f22b..36baa8d 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -293,7 +293,13 @@ struct StreamOutJit : public Builder JitManager::DumpToFile(soFunc, "SoFunc"); - FunctionPassManager passes(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); passes.add(createCFGSimplificationPass()); passes.add(createEarlyCSEPass()); |