11 files changed, 246 insertions, 92 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 734c897..de856c4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -47,6 +47,10 @@
 #include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/IRReader/IRReader.h"
 
+#if LLVM_USE_INTEL_JITEVENTS
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#endif
+
 #include "core/state.h"
 #include "common/containers.hpp"
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c974a61..4ffb0fb 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -53,6 +53,10 @@
 #include "llvm/Config/config.h"
 #endif
 
+#ifndef HAVE_LLVM
+#define HAVE_LLVM (LLVM_VERSION_MAJOR << 8) || LLVM_VERSION_MINOR
+#endif
+
 #include "llvm/IR/Verifier.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/Support/FileSystem.h"
@@ -60,11 +64,10 @@
 
 #include "llvm/Analysis/Passes.h"
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
 #include "llvm/PassManager.h"
 #else
 #include "llvm/IR/LegacyPassManager.h"
-using namespace llvm::legacy;
 #endif
 
 #include "llvm/CodeGen/Passes.h"
@@ -166,7 +169,6 @@ struct JitManager
     FunctionType* mTrinaryFPTy;
     FunctionType* mUnaryIntTy;
     FunctionType* mBinaryIntTy;
-    FunctionType* mTrinaryIntTy;
 
     Type* mSimtFP32Ty;
     Type* mSimtInt32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 954524a..a64f860 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -576,9 +576,12 @@ struct BlendJit : public Builder
             src1[i] = LOAD(pSrc1, { i });
         }
         Value* currentMask = VIMMED1(-1);
-        if(state.desc.alphaToCoverageEnable)
+        if (state.desc.alphaToCoverageEnable)
         {
-            currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty);
+            Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
+            uint32_t bits = (1 << state.desc.numSamples) - 1;
+            currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
+            currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
         }
 
         // alpha test
@@ -702,6 +705,12 @@ struct BlendJit : public Builder
             currentMask = AND(sampleMask, currentMask);
         }
 
+        if (state.desc.alphaToCoverageEnable)
+        {
+            Value* sampleMasked = SHL(C(1), sampleNum);
+            currentMask = AND(currentMask, VBROADCAST(sampleMasked));
+        }
+
         if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
            state.desc.oMaskEnable)
         {
@@ -717,7 +726,13 @@ struct BlendJit : public Builder
 
         JitManager::DumpToFile(blendFunc, "");
 
-        FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            passes(JM()->mpCurrentModule);
+
         passes.add(createBreakCriticalEdgesPass());
         passes.add(createCFGSimplificationPass());
         passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index c15bdf1..757ea3f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -38,6 +38,8 @@ using namespace llvm;
 Builder::Builder(JitManager *pJitMgr)
     : mpJitMgr(pJitMgr)
 {
+    mVWidth = pJitMgr->mVWidth;
+
     mpIRBuilder = &pJitMgr->mBuilder;
 
     mVoidTy = Type::getVoidTy(pJitMgr->mContext);
@@ -48,14 +50,18 @@ Builder::Builder(JitManager *pJitMgr)
     mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
     mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
     mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
+    mInt8PtrTy = PointerType::get(mInt8Ty, 0);
+    mInt16PtrTy = PointerType::get(mInt16Ty, 0);
+    mInt32PtrTy = PointerType::get(mInt32Ty, 0);
     mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
     mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
     mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
-    mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth);
-    mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth);
-    mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth);
-    mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth);
-    mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth);
+    mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
+    mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+    mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
+    mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
+    mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+    mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false);
 
     if (sizeof(uint32_t*) == 4)
     {
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 4921661..239ef2a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -43,6 +43,8 @@ struct Builder
     JitManager* mpJitMgr;
     IRBuilder<>* mpIRBuilder;
 
+    uint32_t             mVWidth;
+
     // Built in types.
     Type*                mVoidTy;
     Type*                mInt1Ty;
@@ -54,12 +56,16 @@ struct Builder
     Type*                mFP16Ty;
     Type*                mFP32Ty;
     Type*                mDoubleTy;
+    Type*                mInt8PtrTy;
+    Type*                mInt16PtrTy;
+    Type*                mInt32PtrTy;
     Type*                mSimdFP16Ty;
     Type*                mSimdFP32Ty;
     Type*                mSimdInt16Ty;
     Type*                mSimdInt32Ty;
     Type*                mSimdInt64Ty;
     Type*                mSimdIntPtrTy;
+    Type*                mSimdVectorTy;
     StructType*          mV4FP32Ty;
     StructType*          mV4Int32Ty;
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 5394fc7..486dad8 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -28,6 +28,8 @@
 * 
 ******************************************************************************/
 #include "builder.h"
+#include "common/rdtsc_buckets.h"
+
 #include "llvm/Support/DynamicLibrary.h"
 
 void __cdecl CallPrint(const char* fmt, ...);
@@ -189,32 +191,32 @@ Constant *Builder::PRED(bool pred)
 
 Value *Builder::VIMMED1(int i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 }
 
 Value *Builder::VIMMED1(uint32_t i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 }
 
 Value *Builder::VIMMED1(float i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
 }
 
 Value *Builder::VIMMED1(bool i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 }
 
 Value *Builder::VUNDEF_IPTR()
 {
-    return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth));
+    return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
 }
 
 Value *Builder::VUNDEF_I()
 {
-    return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth));
+    return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
 }
 
 Value *Builder::VUNDEF(Type *ty, uint32_t size)
@@ -224,15 +226,15 @@ Value *Builder::VUNDEF(Type *ty, uint32_t size)
 
 Value *Builder::VUNDEF_F()
 {
-    return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth));
+    return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
 }
 
 Value *Builder::VUNDEF(Type* t)
 {
-    return UndefValue::get(VectorType::get(t, JM()->mVWidth));
+    return UndefValue::get(VectorType::get(t, mVWidth));
 }
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
 {
     return VINSERT(vec, val, C((int64_t)index));
@@ -247,7 +249,7 @@ Value *Builder::VBROADCAST(Value *src)
         return src;
     }
 
-    return VECTOR_SPLAT(JM()->mVWidth, src);
+    return VECTOR_SPLAT(mVWidth, src);
 }
 
 uint32_t Builder::IMMED(Value* v)
@@ -257,6 +259,13 @@ uint32_t Builder::IMMED(Value* v)
     return pValConst->getZExtValue();
 }
 
+int32_t Builder::S_IMMED(Value* v)
+{
+    SWR_ASSERT(isa<ConstantInt>(v));
+    ConstantInt *pValConst = cast<ConstantInt>(v);
+    return pValConst->getSExtValue();
+}
+
 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
 {
     std::vector<Value*> indices;
@@ -342,8 +351,8 @@ Value *Builder::MASKLOADD(Value* src,Value* mask)
     else
     {
         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
-        Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
-        vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth));
+        Value* fMask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
+        vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
     }
     return vResult;
 }
@@ -512,7 +521,7 @@ CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list
 
     // get a pointer to the first character in the constant string array
     std::vector<Constant*> geplist{C(0),C(0)};
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
 #else
     Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
@@ -575,7 +584,7 @@ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
         Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
         Value *vOffsets = MUL(vIndices,vScaleVec);
         Value *mask = MASK(vMask);
-        for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for(uint32_t i = 0; i < mVWidth; ++i)
         {
             // single component byte index
             Value *offset = VEXTRACT(vOffsets,C(i));
@@ -625,7 +634,7 @@ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
         Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
         Value *vOffsets = MUL(vIndices, vScaleVec);
         Value *mask = MASK(vMask);
-        for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for(uint32_t i = 0; i < mVWidth; ++i)
         {
             // single component byte index
             Value *offset = VEXTRACT(vOffsets, C(i));
@@ -774,12 +783,61 @@ Value *Builder::PERMD(Value* a, Value* idx)
     }
     else
     {
-        res = VSHUFFLE(a, a, idx);
+        if (isa<Constant>(idx))
+        {
+            res = VSHUFFLE(a, a, idx);
+        }
+        else
+        {
+            res = VUNDEF_I();
+            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+            {
+                Value* pIndex = VEXTRACT(idx, C(l));
+                Value* pVal = VEXTRACT(a, pIndex);
+                res = VINSERT(res, pVal, C(l));
+            }
+        }
     }
     return res;
 }
 
 //////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPERMPS operation (shuffle 32 bit float values 
+/// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
+/// platform, emulate it
+/// @param a - 256bit SIMD lane(8x32bit) of float values.
+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+Value *Builder::PERMPS(Value* a, Value* idx)
+{
+    Value* res;
+    // use avx2 permute instruction if available
+    if (JM()->mArch.AVX2())
+    {
+        // llvm 3.6.0 swapped the order of the args to vpermd
+        res = VPERMPS(idx, a);
+    }
+    else
+    {
+        if (isa<Constant>(idx))
+        {
+            res = VSHUFFLE(a, a, idx);
+        }
+        else
+        {
+            res = VUNDEF_F();
+            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+            {
+                Value* pIndex = VEXTRACT(idx, C(l));
+                Value* pVal = VEXTRACT(a, pIndex);
+                res = VINSERT(res, pVal, C(l));
+            }
+        }
+    }
+
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
 /// in LLVM IR.  If not supported on the underlying platform, emulate it
 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
@@ -800,7 +858,7 @@ Value *Builder::CVTPH2PS(Value* a)
         }
 
         Value* pResult = UndefValue::get(mSimdFP32Ty);
-        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for (uint32_t i = 0; i < mVWidth; ++i)
         {
             Value* pSrc = VEXTRACT(a, C(i));
             Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
@@ -833,7 +891,7 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding)
         }
 
         Value* pResult = UndefValue::get(mSimdInt16Ty);
-        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for (uint32_t i = 0; i < mVWidth; ++i)
         {
             Value* pSrc = VEXTRACT(a, C(i));
             Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
@@ -1085,8 +1143,8 @@ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt
 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
 {
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
     // input could either be float or int vector; do shuffle work in int
     vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
@@ -1094,7 +1152,7 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
 
     if(bPackedOutput) 
     {
-        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
         // shuffle mask
         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
@@ -1179,12 +1237,12 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
 {
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty =  VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
 
     if(bPackedOutput)
     {
-        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
         // shuffle mask
         Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                      0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
@@ -1286,16 +1344,18 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
 {
     Value* pStack = STACKSAVE();
 
+    Type* pSrcTy = vSrc->getType()->getVectorElementType();
+
     // allocate tmp stack for masked off lanes
-    Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType());
+    Value* vTmpPtr = ALLOCA(pSrcTy);
 
     Value *mask = MASK(vMask);
-    for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+    for (uint32_t i = 0; i < mVWidth; ++i)
     {
         Value *offset = VEXTRACT(vOffsets, C(i));
         // byte pointer to component
         Value *storeAddress = GEP(pDst, offset);
-        storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0));
+        storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
         Value *selMask = VEXTRACT(mask, C(i));
         Value *srcElem = VEXTRACT(vSrc, C(i));
         // switch in a safe address to load if we're trying to access a vertex 
@@ -1349,7 +1409,7 @@ Value *Builder::FCLAMP(Value* src, float low, float high)
 Value* Builder::STACKSAVE()
 {
     Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     return CALL(pfnStackSave);
 #else
     return CALLA(pfnStackSave);
@@ -1401,11 +1461,13 @@ void __cdecl CallPrint(const char* fmt, ...)
     vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
     OutputDebugString(strBuf);
 #endif
+
+    va_end(args);
 }
 
 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     Function *func =
         Intrinsic::getDeclaration(JM()->mpCurrentModule,
                                   Intrinsic::x86_avx_vextractf128_si_256);
@@ -1413,8 +1475,8 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 #else
     bool flag = !imm8->isZeroValue();
     SmallVector<Constant*,8> idx;
-    for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
-        idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+    for (unsigned i = 0; i < mVWidth / 2; i++) {
+        idx.push_back(C(flag ? i + mVWidth / 2 : i));
     }
     return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
 #endif
@@ -1422,7 +1484,7 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 
 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
 {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     Function *func =
         Intrinsic::getDeclaration(JM()->mpCurrentModule,
                                   Intrinsic::x86_avx_vinsertf128_si_256);
@@ -1430,18 +1492,54 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
 #else
     bool flag = !imm8->isZeroValue();
     SmallVector<Constant*,8> idx;
-    for (unsigned i = 0; i < JM()->mVWidth; i++) {
+    for (unsigned i = 0; i < mVWidth; i++) {
         idx.push_back(C(i));
     }
     Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
 
     SmallVector<Constant*,8> idx2;
-    for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
-        idx2.push_back(C(flag ? i : i + JM()->mVWidth));
+    for (unsigned i = 0; i < mVWidth / 2; i++) {
+        idx2.push_back(C(flag ? i : i + mVWidth));
     }
-    for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) {
-        idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+    for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
+        idx2.push_back(C(flag ? i + mVWidth / 2 : i));
     }
     return VSHUFFLE(a, inter, ConstantVector::get(idx2));
 #endif
 }
+
+// rdtsc buckets macros
+void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
+{
+    std::vector<Type*> args{
+        PointerType::get(mInt32Ty, 0),   // pBucketMgr
+        mInt32Ty                        // id
+    };
+
+    FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
+    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
+    if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
+    {
+        sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
+    }
+
+    CALL(pFunc, { pBucketMgr, pId });
+}
+
+void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
+{
+    std::vector<Type*> args{
+        PointerType::get(mInt32Ty, 0),   // pBucketMgr
+        mInt32Ty                        // id
+    };
+
+    FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
+    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
+    if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
+    {
+        sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
+    }
+
+    CALL(pFunc, { pBucketMgr, pId });
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 48e0558..f43ef69 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -59,7 +59,7 @@ Value *VUNDEF_F();
 Value *VUNDEF_I();
 Value *VUNDEF(Type* ty, uint32_t size);
 Value *VUNDEF_IPTR();
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
 Value *VINSERT(Value *vec, Value *val, uint64_t index);
 #endif
 Value *VBROADCAST(Value *src);
@@ -67,6 +67,7 @@ Value *VRCP(Value *va);
 Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
 
 uint32_t IMMED(Value* i);
+int32_t S_IMMED(Value* i);
 
 Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
 Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
@@ -115,6 +116,7 @@ Value *PSHUFB(Value* a, Value* b);
 Value *PMOVSXBD(Value* a);
 Value *PMOVSXWD(Value* a);
 Value *PERMD(Value* a, Value* idx);
+Value *PERMPS(Value* a, Value* idx);
 Value *CVTPH2PS(Value* a);
 Value *CVTPS2PH(Value* a, Value* rounding);
 Value *PMAXSD(Value* a, Value* b);
@@ -147,3 +149,7 @@ Value* INT3() { return INTERRUPT(C((uint8_t)3)); }
 
 Value *VEXTRACTI128(Value* a, Constant* imm8);
 Value *VINSERTI128(Value* a, Value* b, Constant* imm8);
+
+// rdtsc buckets macros
+void RDTSC_START(Value* pBucketMgr, Value* pId);
+void RDTSC_STOP(Value* pBucketMgr, Value* pId);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index c5a180e..2c2c56b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -105,7 +105,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     std::vector<Value*>    vtxInputIndices(2, C(0));
     // GEP
     pVtxOut = GEP(pVtxOut, C(0));
-    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0));
+    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
 
     // SWR_FETCH_CONTEXT::pStreams
     Value*    streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
@@ -174,7 +174,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
 
     verifyFunction(*fetch);
 
-    FunctionPassManager setupPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            setupPasses(JM()->mpCurrentModule);
 
     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
     setupPasses.add(createBreakCriticalEdgesPass());
@@ -186,7 +191,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
 
     JitManager::DumpToFile(fetch, "se");
 
-    FunctionPassManager optPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            optPasses(JM()->mpCurrentModule);
 
     ///@todo Haven't touched these either. Need to remove some of these and add others.
     optPasses.add(createCFGSimplificationPass());
@@ -220,8 +230,8 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
 
     SWRL::UncheckedFixedVector<Value*, 16>    vectors;
 
-    std::vector<Constant*>    pMask(JM()->mVWidth);
-    for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+    std::vector<Constant*>    pMask(mVWidth);
+    for(uint32_t i = 0; i < mVWidth; ++i)
     {
         pMask[i] = (C(i < 4 ? i : 4));
     }
@@ -254,7 +264,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
         Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride);
 
         // Load from the stream.
-        for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane)
+        for(uint32_t lane = 0; lane < mVWidth; ++lane)
         {
             // Get index
             Value* index = VEXTRACT(vIndices, C(lane));
@@ -380,44 +390,44 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
             vectors.push_back(wvec);
         }
 
-        std::vector<Constant*>        v01Mask(JM()->mVWidth);
-        std::vector<Constant*>        v23Mask(JM()->mVWidth);
-        std::vector<Constant*>        v02Mask(JM()->mVWidth);
-        std::vector<Constant*>        v13Mask(JM()->mVWidth);
+        std::vector<Constant*>        v01Mask(mVWidth);
+        std::vector<Constant*>        v23Mask(mVWidth);
+        std::vector<Constant*>        v02Mask(mVWidth);
+        std::vector<Constant*>        v13Mask(mVWidth);
 
         // Concatenate the vectors together.
         elements[0] = VUNDEF_F(); 
         elements[1] = VUNDEF_F(); 
         elements[2] = VUNDEF_F(); 
         elements[3] = VUNDEF_F(); 
-        for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b)
+        for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
         {
             v01Mask[4 * b + 0] = C(0 + 4 * b);
             v01Mask[4 * b + 1] = C(1 + 4 * b);
-            v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
-            v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth);
+            v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+            v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
 
             v23Mask[4 * b + 0] = C(2 + 4 * b);
             v23Mask[4 * b + 1] = C(3 + 4 * b);
-            v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth);
-            v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+            v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
+            v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 
             v02Mask[4 * b + 0] = C(0 + 4 * b);
             v02Mask[4 * b + 1] = C(2 + 4 * b);
-            v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
-            v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth);
+            v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+            v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
 
             v13Mask[4 * b + 0] = C(1 + 4 * b);
             v13Mask[4 * b + 1] = C(3 + 4 * b);
-            v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth);
-            v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+            v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
+            v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 
-            std::vector<Constant*>    iMask(JM()->mVWidth);
-            for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+            std::vector<Constant*>    iMask(mVWidth);
+            for(uint32_t i = 0; i < mVWidth; ++i)
             {
                 if(((4 * b) <= i) && (i < (4 * (b + 1))))
                 {
-                    iMask[i] = C(i % 4 + JM()->mVWidth);
+                    iMask[i] = C(i % 4 + mVWidth);
                 }
                 else
                 {
@@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint8_t)0), pZeroIndex);
 
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -840,7 +850,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint16_t)0), pZeroIndex);
 
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
     const uint32_t (&swizzle)[4] = std::get<9>(args);
 
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty =  VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+    Type* vGatherTy = mSimdInt32Ty;
+    Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
 
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
-        Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane
-        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
         // shuffle mask, including any swizzling
         const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
@@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
     Value* (&vVertexElements)[4] = std::get<8>(args);
 
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
@@ -1149,7 +1159,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
 
         Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
-        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
         // shuffle mask
         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
index 1814b7c..e73b232 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
@@ -27,7 +27,7 @@ import json as JSON
 import operator
 
 header = r"""/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -84,16 +84,16 @@ inst_aliases = {
 }
 
 intrinsics = [
-	    ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
+        ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
         ["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]],
-	    ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
-	    ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
-	    ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
-	    ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
-	    ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
-	    ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
-	    ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
-	    ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
+        ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
+        ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
+        ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
+        ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
+        ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
+        ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
+        ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
+        ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
         ["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]],
         ["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]],
         ["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]],
@@ -103,6 +103,7 @@ intrinsics = [
         ["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]],  # sign extend packed 8bit components
         ["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]],  # sign extend packed 16bit components
         ["VPERMD", "x86_avx2_permd", ["idx", "a"]],
+        ["VPERMPS", "x86_avx2_permps", ["idx", "a"]],
         ["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]],
         ["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]],
         ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
index 7bba435..0b53a92 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
@@ -28,7 +28,7 @@ import operator
 
 header = r"""
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 6c5f22b..36baa8d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -293,7 +293,13 @@ struct StreamOutJit : public Builder
 
         JitManager::DumpToFile(soFunc, "SoFunc");
 
-        FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            passes(JM()->mpCurrentModule);
+
         passes.add(createBreakCriticalEdgesPass());
         passes.add(createCFGSimplificationPass());
         passes.add(createEarlyCSEPass());