aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/X86/X86ISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp1258
1 files changed, 844 insertions, 414 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6866be7..8b92e70 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25,7 +25,6 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -77,119 +76,6 @@ static cl::opt<int> ReciprocalEstimateRefinementSteps(
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
-static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl,
- unsigned vectorWidth) {
- assert((vectorWidth == 128 || vectorWidth == 256) &&
- "Unsupported vector width");
- EVT VT = Vec.getValueType();
- EVT ElVT = VT.getVectorElementType();
- unsigned Factor = VT.getSizeInBits()/vectorWidth;
- EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
- VT.getVectorNumElements()/Factor);
-
- // Extract from UNDEF is UNDEF.
- if (Vec.getOpcode() == ISD::UNDEF)
- return DAG.getUNDEF(ResultVT);
-
- // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
- unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
-
- // This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
- * ElemsPerChunk);
-
- // If the input is a buildvector just emit a smaller one.
- if (Vec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
- ElemsPerChunk));
-
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
-}
-
-/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
-/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
-/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
-/// instructions or a simple subregister reference. Idx is an index in the
-/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
-/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
- assert((Vec.getValueType().is256BitVector() ||
- Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
-}
-
-/// Generate a DAG to grab 256-bits from a 512-bit vector.
-static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
- assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
-}
-
-static SDValue InsertSubVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl, unsigned vectorWidth) {
- assert((vectorWidth == 128 || vectorWidth == 256) &&
- "Unsupported vector width");
- // Inserting UNDEF is Result
- if (Vec.getOpcode() == ISD::UNDEF)
- return Result;
- EVT VT = Vec.getValueType();
- EVT ElVT = VT.getVectorElementType();
- EVT ResultVT = Result.getValueType();
-
- // Insert the relevant vectorWidth bits.
- unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
-
- // This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
- * ElemsPerChunk);
-
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
-}
-
-/// Generate a DAG to put 128-bits into a vector > 128 bits. This
-/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
-/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
-/// simple superregister reference. Idx is an index in the 128 bits
-/// we want. It need not be aligned to a 128-bit boundary. That makes
-/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG,SDLoc dl) {
- assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
-}
-
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
- assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
-}
-
-/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
-/// instructions. This is used because creating CONCAT_VECTOR nodes of
-/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
-/// large BUILD_VECTORS.
-static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
-}
-
-static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
-}
-
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -871,35 +757,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// MMX-sized vectors (other than x86mmx) are expected to be expanded
// into smaller operations.
- setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
- setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
- setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
- setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
- setOperationAction(ISD::AND, MVT::v8i8, Expand);
- setOperationAction(ISD::AND, MVT::v4i16, Expand);
- setOperationAction(ISD::AND, MVT::v2i32, Expand);
- setOperationAction(ISD::AND, MVT::v1i64, Expand);
- setOperationAction(ISD::OR, MVT::v8i8, Expand);
- setOperationAction(ISD::OR, MVT::v4i16, Expand);
- setOperationAction(ISD::OR, MVT::v2i32, Expand);
- setOperationAction(ISD::OR, MVT::v1i64, Expand);
- setOperationAction(ISD::XOR, MVT::v8i8, Expand);
- setOperationAction(ISD::XOR, MVT::v4i16, Expand);
- setOperationAction(ISD::XOR, MVT::v2i32, Expand);
- setOperationAction(ISD::XOR, MVT::v1i64, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
+ for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
+ setOperationAction(ISD::MULHS, MMXTy, Expand);
+ setOperationAction(ISD::AND, MMXTy, Expand);
+ setOperationAction(ISD::OR, MMXTy, Expand);
+ setOperationAction(ISD::XOR, MMXTy, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand);
+ setOperationAction(ISD::SELECT, MMXTy, Expand);
+ setOperationAction(ISD::BITCAST, MMXTy, Expand);
+ }
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
- setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
- setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
- setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
- setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
- setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
- setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
- setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
- setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
@@ -1065,27 +932,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-
- setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
- setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
- setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+ for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ }
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1474,7 +1327,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
@@ -1576,6 +1428,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, MVT::v32i16, Legal);
setOperationAction(ISD::SUB, MVT::v64i8, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
const MVT VT = (MVT::SimpleValueType)i;
@@ -1599,7 +1455,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
setOperationAction(ISD::AND, MVT::v8i32, Legal);
setOperationAction(ISD::OR, MVT::v8i32, Legal);
@@ -3189,7 +3048,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3906,21 +3765,6 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
return true;
}
-/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
-/// the two vector operands have swapped position.
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
- unsigned NumElems) {
- for (unsigned i = 0; i != NumElems; ++i) {
- int idx = Mask[i];
- if (idx < 0)
- continue;
- else if (idx < (int)NumElems)
- Mask[i] = idx + NumElems;
- else
- Mask[i] = idx - NumElems;
- }
-}
-
/// isVEXTRACTIndex - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
/// suitable for instruction that extract 128 or 256 bit vectors
@@ -4083,9 +3927,13 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
} else if (VT.getScalarType() == MVT::i1) {
- assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
+
+ assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
+ && "Unexpected vector type");
+ assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
+ && "Unexpected vector type");
SDValue Cst = DAG.getConstant(0, MVT::i1);
- SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
} else
llvm_unreachable("Unexpected vector type");
@@ -4093,6 +3941,162 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
+/// Extract one vectorWidth-bit chunk (128 or 256 bits) from Vec. The result
+/// has Vec's element type and VT.getVectorNumElements()/Factor elements.
+/// IdxVal is an element index into Vec; it is rounded down to the first
+/// element of the chunk that contains it. An UNDEF input produces UNDEF, a
+/// BUILD_VECTOR input produces a smaller BUILD_VECTOR made of the chunk's
+/// operands, and any other input produces an EXTRACT_SUBVECTOR node.
+static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl,
+ unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ // Number of vectorWidth-bit chunks that make up the source vector.
+ unsigned Factor = VT.getSizeInBits()/vectorWidth;
+ EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+ VT.getVectorNumElements()/Factor);
+
+ // Extract from UNDEF is UNDEF.
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return DAG.getUNDEF(ResultVT);
+
+ // Number of elements in one vectorWidth-bit chunk.
+ unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+
+ // This is the index of the first element of the vectorWidth-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
+ * ElemsPerChunk);
+
+ // If the input is a buildvector just emit a smaller one.
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+ makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
+ ElemsPerChunk));
+
+ // Otherwise generate an EXTRACT_SUBVECTOR at the normalized index.
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
+}
+
+/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. IdxVal is an element
+/// index within the 128 bits we want. It need not be aligned to a 128-bit
+/// boundary. That makes lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ // Only 256-bit and 512-bit sources are accepted.
+ assert((Vec.getValueType().is256BitVector() ||
+ Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
+}
+
+/// Generate a DAG to grab 256-bits from a 512-bit vector. IdxVal is an
+/// element index within the 256 bits we want; ExtractSubVector rounds it
+/// down to the start of the containing 256-bit chunk.
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+/// Insert the vectorWidth-bit (128 or 256) vector Vec into Result and return
+/// the new value, which has Result's type. IdxVal is an element index; it is
+/// rounded down to the first element of the vectorWidth-bit chunk that
+/// contains it. If Vec is UNDEF, Result is returned unchanged; otherwise an
+/// INSERT_SUBVECTOR node is generated.
+static SDValue InsertSubVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl, unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
+ // Inserting UNDEF is Result
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return Result;
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ EVT ResultVT = Result.getValueType();
+
+ // Number of elements in one vectorWidth-bit chunk.
+ unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+
+ // This is the index of the first element of the vectorWidth-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
+ * ElemsPerChunk);
+
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
+}
+
+/// Generate a DAG to put 128-bits into a vector > 128 bits. This
+/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference. IdxVal is an element index within the
+/// 128 bits we want to write. It need not be aligned to a 128-bit boundary.
+/// That makes lowering INSERT_VECTOR_ELT operations easier.
+///
+/// Inserting at IdxVal == 0 into a non-undef 256-bit Result is emitted as an
+/// X86ISD::BLENDI of Result with the widened Vec instead of an
+/// INSERT_SUBVECTOR. Every other case is forwarded to InsertSubVector().
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG, SDLoc dl) {
+  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+  // For insertion into the zero index (low half) of a 256-bit vector, it is
+  // more efficient to generate a blend with immediate instead of an insert*128.
+  // We are still creating an INSERT_SUBVECTOR below with an undef node to
+  // extend the subvector to the size of the result vector. Make sure that
+  // we are not recursing on that node by checking for undef here.
+  if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
+      Result.getOpcode() != ISD::UNDEF) {
+    EVT ResultVT = Result.getValueType();
+    SDValue ZeroIndex = DAG.getIntPtrConstant(0);
+    SDValue Undef = DAG.getUNDEF(ResultVT);
+    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+                                 Vec, ZeroIndex);
+
+    // The blend instruction, and therefore its mask, depend on the data type.
+    MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
+    if (ScalarType.isFloatingPoint()) {
+      // Choose either vblendps (float) or vblendpd (double).
+      unsigned ScalarSize = ScalarType.getSizeInBits();
+      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+      // The mask covers the elements of the low 128 bits: two doubles (0x03)
+      // or four floats (0x0f).
+      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
+      SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
+      return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
+    }
+
+    const X86Subtarget &Subtarget =
+        static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+    // AVX2 is needed for 256-bit integer blend support.
+    // Integers must be cast to 32-bit because there is only vpblendd;
+    // vpblendw can't be used for this because it has a handicapped mask.
+
+    // If we don't have AVX2, then cast to float. Using a wrong domain blend
+    // is still more efficient than using the wrong domain vinsertf128 that
+    // will be created by InsertSubVector().
+    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+    SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
+    // Both blend inputs must have the blend's own type. Result is still typed
+    // as ResultVT here (e.g. v4i64, v16i16 or v32i8), so it has to be bitcast
+    // to CastVT just like Vec256; otherwise the BLENDI node is built with
+    // mismatched operand types.
+    SDValue CastResult = DAG.getNode(ISD::BITCAST, dl, CastVT, Result);
+    Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
+    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, CastResult, Vec256, Mask);
+    return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
+  }
+
+  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+/// Generate a DAG to put the 256-bit vector Vec into Result. IdxVal is an
+/// element index within the 256 bits we want to write; InsertSubVector
+/// rounds it down to the start of the containing 256-bit chunk.
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+}
+
+/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+///
+/// V1 is inserted at element 0 of an UNDEF value of type VT and V2 at element
+/// NumElems/2. NOTE(review): NumElems is presumably the element count of VT;
+/// confirm against callers.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
+/// Concat two 256-bit vectors into a vector of type VT: V1 is inserted at
+/// element 0 of an UNDEF value and V2 at element NumElems/2.
+/// NOTE(review): NumElems is presumably the element count of VT; confirm
+/// against callers.
+static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
/// getOnesVector - Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
@@ -5567,8 +5571,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
}
- SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
- if (Broadcast.getNode())
+ if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
return Broadcast;
unsigned EVTBits = ExtVT.getSizeInBits();
@@ -5635,12 +5638,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
- if (VT.is256BitVector() || VT.is512BitVector()) {
+ if (VT.is512BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
Item, DAG.getIntPtrConstant(0));
}
- assert(VT.is128BitVector() && "Expected an SSE value type!");
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
@@ -5742,24 +5746,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
- if (EVTBits == 8 && NumElems == 16) {
- SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
- Subtarget, *this);
- if (V.getNode()) return V;
- }
+ if (EVTBits == 8 && NumElems == 16)
+ if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+ Subtarget, *this))
+ return V;
- if (EVTBits == 16 && NumElems == 8) {
- SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
- Subtarget, *this);
- if (V.getNode()) return V;
- }
+ if (EVTBits == 16 && NumElems == 8)
+ if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+ Subtarget, *this))
+ return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
- if (EVTBits == 32 && NumElems == 4) {
- SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
- if (V.getNode())
+ if (EVTBits == 32 && NumElems == 4)
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
return V;
- }
// If element VT is == 32 bits, turn it into a number of shuffles.
SmallVector<SDValue, 8> V(NumElems);
@@ -5807,13 +5807,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
V[i] = Op.getOperand(i);
// Check for elements which are consecutive loads.
- SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
- if (LD.getNode())
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
return LD;
// Check for a build vector from mostly shuffle plus few inserting.
- SDValue Sh = buildFromShuffleMostly(Op, DAG);
- if (Sh.getNode())
+ if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
@@ -5893,8 +5891,64 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
-static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
+/// Lower a CONCAT_VECTORS node; LowerCONCAT_VECTORS dispatches here when the
+/// result element type is i1. The operand count must be a power of two.
+///
+/// With more than two operands, the operands are split into a low and a high
+/// half, each half is concatenated into a HalfVT vector, and the two halves
+/// are concatenated again. With exactly two operands, each operand is widened
+/// to ResVT by inserting it at index 0 of UNDEF; the second operand is shifted
+/// left by NumElems/2 with X86ISD::VSHLI, the first goes through a
+/// VSHLI/VSRLI pair, and the two are ORed. An all-zero BUILD_VECTOR operand
+/// lets the other (already shifted) operand be returned directly.
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG & DAG) {
+ SDLoc dl(Op);
+ MVT ResVT = Op.getSimpleValueType();
+ unsigned NumOfOperands = Op.getNumOperands();
+
+ assert(isPowerOf2_32(NumOfOperands) &&
+ "Unexpected number of operands in CONCAT_VECTORS");
+
+ // More than two operands: build the low and high halves separately and
+ // emit a two-operand CONCAT_VECTORS of the halves.
+ if (NumOfOperands > 2) {
+ MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
+ ResVT.getVectorNumElements()/2);
+ SmallVector<SDValue, 2> Ops;
+ for (unsigned i = 0; i < NumOfOperands/2; i++)
+ Ops.push_back(Op.getOperand(i));
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+ Ops.clear();
+ for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
+ Ops.push_back(Op.getOperand(i));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+ }
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // Both halves are zero: the whole result is a zero vector.
+ if (IsZeroV1 && IsZeroV2)
+ return getZeroVector(ResVT, Subtarget, DAG, dl);
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0);
+ SDValue Undef = DAG.getUNDEF(ResVT);
+ unsigned NumElems = ResVT.getVectorNumElements();
+ // Shift amount: half of the result's element count.
+ SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
+
+ // Widen V2 to ResVT and shift it into the upper half.
+ V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
+ V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
+ if (IsZeroV1)
+ return V2;
+
+ V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+ // Zero the upper bits of V1
+ V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
+ V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
+ if (IsZeroV2)
+ return V1;
+ return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT.getVectorElementType() == MVT::i1)
+ return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
+
assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
(VT.is512BitVector() && (Op.getNumOperands() == 2 ||
Op.getNumOperands() == 4)));
@@ -6935,8 +6989,8 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
"a sorted mask where the broadcast "
"comes from V1.");
- // Go up the chain of (vector) values to try and find a scalar load that
- // we can combine with the broadcast.
+ // Go up the chain of (vector) values to find a scalar load that we can
+ // combine with the broadcast.
for (;;) {
switch (V.getOpcode()) {
case ISD::CONCAT_VECTORS: {
@@ -6973,12 +7027,12 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
- // If the scalar isn't a load we can't broadcast from it in AVX1, only with
- // AVX2.
+ // If the scalar isn't a load, we can't broadcast from it in AVX1.
+ // Only AVX2 has register broadcasts.
if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
return SDValue();
} else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
- // We can't broadcast from a vector register w/o AVX2, and we can only
+ // We can't broadcast from a vector register without AVX2, and we can only
// broadcast from the zero-element of a vector register.
return SDValue();
}
@@ -7689,10 +7743,18 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
+///
+/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
+/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
+/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
+/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
- SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
+ SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
+ MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+
+ assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
@@ -7845,9 +7907,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ V = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
+ DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
// Adjust the mask to match the new locations of A and B.
@@ -7859,8 +7921,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
- return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
- Mask);
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
+ DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
@@ -8083,15 +8145,15 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ V = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
+ DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
// At this point, each half should contain all its inputs, and we can then
@@ -8105,7 +8167,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DAG));
// Do a half shuffle with the high mask after shifting its values down.
@@ -8113,7 +8175,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DAG));
return V;
@@ -8232,8 +8294,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Mask, Subtarget, DAG))
return Rotate;
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, V1, Mask, Subtarget,
- DAG);
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG);
}
assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
@@ -8946,7 +9008,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
int LaneSize = Mask.size() / 2;
// If there are only inputs from one 128-bit lane, splitting will in fact be
- // less expensive. The flags track wether the given lane contains an element
+ // less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
bool LaneCrossing[2] = {false, false};
for (int i = 0, Size = Mask.size(); i < Size; ++i)
@@ -8986,34 +9048,78 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
+ // TODO: If minimizing size and one of the inputs is a zero vector and the
+ // zero vector has only one use, we could use a VPERM2X128 to save the
+ // instruction bytes needed to explicitly generate the zero vector.
+
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
- // Check for patterns which can be matched with a single insert of a 128-bit
- // subvector.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
- }
- if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) {
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
- DAG.getIntPtrConstant(2));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // If either input operand is a zero vector, use VPERM2X128 because its mask
+ // allows us to replace the zero input with an implicit zero.
+ if (!IsV1Zero && !IsV2Zero) {
+ // Check for patterns which can be matched with a single insert of a 128-bit
+ // subvector.
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
+ if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
+ }
+
+ // Otherwise form a 128-bit permutation. After accounting for undefs,
+ // convert the 64-bit shuffle mask selection values into 128-bit
+ // selection bits by dividing the indexes by 2 and shifting into positions
+ // defined by a vperm2*128 instruction's immediate control byte.
+
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
+
+ int MaskLO = Mask[0];
+ if (MaskLO == SM_SentinelUndef)
+ MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
+
+ int MaskHI = Mask[2];
+ if (MaskHI == SM_SentinelUndef)
+ MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
+
+ unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
+
+ // If either input is a zero vector, replace it with an undef input.
+ // Shuffle mask values < 4 are selecting elements of V1.
+ // Shuffle mask values >= 4 are selecting elements of V2.
+ // Adjust each half of the permute mask by clearing the half that was
+ // selecting the zero vector and setting the zero mask bit.
+ if (IsV1Zero) {
+ V1 = DAG.getUNDEF(VT);
+ if (MaskLO < 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI < 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
+ }
+ if (IsV2Zero) {
+ V2 = DAG.getUNDEF(VT);
+ if (MaskLO >= 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI >= 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
}
- // Otherwise form a 128-bit permutation.
- // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
- unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getConstant(PermMask, MVT::i8));
}
@@ -9326,6 +9432,15 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // If we have a single input to the zero element, insert that into V1 if we
+ // can do so cheaply.
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 8; });
+ if (NumV2Elements == 1 && Mask[0] >= 8)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
@@ -9557,6 +9672,15 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
Mask, DAG);
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v16 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
@@ -10118,8 +10242,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
- SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG);
- if (BlendOp.getNode())
+ if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
return BlendOp;
// Variable blends are only legal from SSE4.1 onward.
@@ -10421,17 +10544,31 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
- // Get the desired 128-bit vector half.
+ // With a 256-bit vector, we can insert into the zero element efficiently
+ // using a blend if we have AVX or AVX2 and the right data type.
+ if (VT.is256BitVector() && IdxVal == 0) {
+ // TODO: It is worthwhile to cast integer to floating point and back
+ // and incur a domain crossing penalty if that's what we'll end up
+ // doing anyway after extracting to a 128-bit vector.
+ if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+ (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
+ SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ N2 = DAG.getIntPtrConstant(1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
+ }
+ }
+
+ // Get the desired 128-bit vector chunk.
SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
- // Insert the element into the desired half.
+ // Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getConstant(IdxIn128, MVT::i32));
- // Insert the changed part back to the 256-bit vector
+ // Insert the changed part back into the bigger vector
return Insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
@@ -10456,16 +10593,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
if (EltVT == MVT::f32) {
- // Bits [7:6] of the constant are the source select. This will always be
- // zero here. The DAG Combiner may combine an extract_elt index into
- // these
- // bits. For example (insert (extract, 3), 2) could be matched by
- // putting
- // the '3' into bits [7:6] of X86ISD::INSERTPS.
- // Bits [5:4] of the constant are the destination select. This is the
- // value of the incoming immediate.
- // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into
+ // these bits. For example (insert (extract, 3), 2) could be matched by
+ // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool MinSize = F->hasFnAttribute(Attribute::MinSize);
+ if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ // If this is an insertion of 32-bits into the low 32-bits of
+ // a vector, we prefer to generate a blend with immediate rather
+ // than an insertps. Blends are simpler operations in hardware and so
+ // will always have equal or better performance than insertps.
+ // But if optimizing for size and there's a load folding opportunity,
+ // generate insertps because blendps does not have a 32-bit memory
+ // operand form.
+ N2 = DAG.getIntPtrConstant(1);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ }
N2 = DAG.getIntPtrConstant(IdxVal << 4);
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
@@ -10593,6 +10743,37 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ if (OpVT.getVectorElementType() == MVT::i1) {
+ if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal
+ return Op;
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0);
+ SDValue Undef = DAG.getUNDEF(OpVT);
+ unsigned NumElems = OpVT.getVectorNumElements();
+ SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
+
+ if (IdxVal == OpVT.getVectorNumElements() / 2) {
+ // Zero upper bits of the Vec
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+
+ SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+ SubVec, ZeroIdx);
+ Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
+ return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
+ }
+ if (IdxVal == 0) {
+ SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+ SubVec, ZeroIdx);
+ // Zero upper bits of the Vec2
+ Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
+ Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits);
+ // Zero lower bits of the Vec
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+ // Merge them together
+ return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
+ }
+ }
return SDValue();
}
@@ -13149,9 +13330,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op1.getValueType();
SDValue CC;
- // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
- // are available. Otherwise fp cmovs get lowered into a less efficient branch
- // sequence later on.
+ // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+ // are available or VBLENDV if AVX is available.
+ // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
(Subtarget->hasSSE1() && VT == MVT::f32)) &&
@@ -13166,8 +13347,42 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(SSECC, MVT::i8));
return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
}
+
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getConstant(SSECC, MVT::i8));
+
+ // If we have AVX, we can use a variable vector select (VBLENDV) instead
+ // of 3 logic instructions for size savings and potentially speed.
+ // Unfortunately, there is no scalar form of VBLENDV.
+
+ // If either operand is a constant, don't try this. We can expect to
+ // optimize away at least one of the logic instructions later in that
+ // case, so that sequence would be faster than a variable blend.
+
+ // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
+ // uses XMM0 as the selection register. That may need just as many
+ // instructions as the AND/ANDN/OR sequence due to register moves, so
+ // don't bother.
+
+ if (Subtarget->hasAVX() &&
+ !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
+
+ // Convert to vectors, do a VSELECT, and convert back to scalar.
+ // All of the conversions should be optimized away.
+
+ EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+ SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
+ SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
+ SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
+
+ EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+ VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
+
+ SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ VSel, DAG.getIntPtrConstant(0));
+ }
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
@@ -14595,6 +14810,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ // Operands intentionally swapped. Mask is last operand to intrinsic,
+ // but second operand for node/instruction.
+ return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(1));
+
case Intrinsic::x86_avx512_mask_valign_q_512:
case Intrinsic::x86_avx512_mask_valign_d_512:
// Vector source operands are swapped.
@@ -16039,21 +16261,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
- SDValue V;
assert(VT.isVector() && "Custom lowering only for vector shifts!");
assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
- V = LowerScalarImmediateShift(Op, DAG, Subtarget);
- if (V.getNode())
+ if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
- V = LowerScalarVariableShift(Op, DAG, Subtarget);
- if (V.getNode())
+ if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
return Op;
+
// AVX2 has VPSLLV/VPSRAV/VPSRLV.
if (Subtarget->hasInt256()) {
if (Op.getOpcode() == ISD::SRL &&
@@ -16068,6 +16288,17 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
return Op;
}
+ // 2i64 vector logical shifts can efficiently avoid scalarization - do the
+ // shifts per-lane and then shuffle the partial results back together.
+ if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
+ // Splat the shift amounts so the scalar shifts above will catch it.
+ SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+ SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+ SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
+ SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
+ return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
+ }
+
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// Do this only if the vector shift count is a constant build_vector.
@@ -16238,7 +16469,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
- }
+ }
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.is256BitVector()) {
@@ -16254,12 +16485,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
SDValue Amt1, Amt2;
if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
// Constant shift amount
- SmallVector<SDValue, 4> Amt1Csts;
- SmallVector<SDValue, 4> Amt2Csts;
- for (unsigned i = 0; i != NumElems/2; ++i)
- Amt1Csts.push_back(Amt->getOperand(i));
- for (unsigned i = NumElems/2; i != NumElems; ++i)
- Amt2Csts.push_back(Amt->getOperand(i));
+ SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
+ ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
+ ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
@@ -16386,14 +16614,17 @@ bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return needsCmpXchgNb(PTy->getElementType());
}
-bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+TargetLoweringBase::AtomicRMWExpansionKind
+X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
// and default to library calls otherwise.
- if (MemType->getPrimitiveSizeInBits() > NativeWidth)
- return needsCmpXchgNb(MemType);
+ if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
+ return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg
+ : AtomicRMWExpansionKind::None;
+ }
AtomicRMWInst::BinOp Op = AI->getOperation();
switch (Op) {
@@ -16403,13 +16634,14 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
// It's better to use xadd, xsub or xchg for these in all cases.
- return false;
+ return AtomicRMWExpansionKind::None;
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
// If the atomicrmw's result isn't actually used, we can just add a "lock"
// prefix to a normal instruction for these operations.
- return !AI->use_empty();
+ return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg
+ : AtomicRMWExpansionKind::None;
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
@@ -16417,7 +16649,7 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::UMin:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
- return true;
+ return AtomicRMWExpansionKind::CmpXChg;
}
}
@@ -16874,7 +17106,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
- case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
@@ -17719,7 +17951,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
// 9 ) EFLAGS (implicit-def)
assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
- assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
+ static_assert(X86::AddrNumOperands == 5,
+ "VAARG_64 assumes 5 address operands");
unsigned DestReg = MI->getOperand(0).getReg();
MachineOperand &Base = MI->getOperand(1);
@@ -18095,6 +18328,92 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// fallthrough --> copy0MBB
MachineBasicBlock *thisMBB = BB;
MachineFunction *F = BB->getParent();
+
+ // We also lower double CMOVs:
+ // (CMOV (CMOV F, T, cc1), T, cc2)
+ // to two successive branches. For that, we look for another CMOV as the
+ // following instruction.
+ //
+ // Without this, we would add a PHI between the two jumps, which ends up
+ // creating a few copies all around. For instance, for
+ //
+ // (sitofp (zext (fcmp une)))
+ //
+ // we would generate:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // movaps %xmm0, %xmm1
+ // jne .LBB5_2
+ // xorps %xmm1, %xmm1
+ // .LBB5_2:
+ // jp .LBB5_4
+ // movaps %xmm1, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ // because this custom-inserter would have generated:
+ //
+ // A
+ // | \
+ // | B
+ // | /
+ // C
+ // | \
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // B: empty
+ // C: Z = PHI [X, A], [Y, B]
+ // D: empty
+ // E: PHI [X, C], [Z, D]
+ //
+ // If we lower both CMOVs in a single step, we can instead generate:
+ //
+ // A
+ // | \
+ // | C
+ // | /|
+ // |/ |
+ // | |
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // D: empty
+ // E: PHI [X, A], [X, C], [Y, D]
+ //
+ // Which, in our sitofp/fcmp example, gives us something like:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // jne .LBB5_4
+ // jp .LBB5_4
+ // xorps %xmm0, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ MachineInstr *NextCMOV = nullptr;
+ MachineBasicBlock::iterator NextMIIt =
+ std::next(MachineBasicBlock::iterator(MI));
+ if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
+ NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
+ NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
+ NextCMOV = &*NextMIIt;
+
+ MachineBasicBlock *jcc1MBB = nullptr;
+
+ // If we have a double CMOV, we lower it to two successive branches to
+ // the same block. EFLAGS is used by both, so mark it as live in the second.
+ if (NextCMOV) {
+ jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, jcc1MBB);
+ jcc1MBB->addLiveIn(X86::EFLAGS);
+ }
+
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, copy0MBB);
@@ -18103,8 +18422,10 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
- if (!MI->killsRegister(X86::EFLAGS) &&
- !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
+
+ MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
+ if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
sinkMBB->addLiveIn(X86::EFLAGS);
}
@@ -18115,7 +18436,19 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
- BB->addSuccessor(copy0MBB);
+ if (NextCMOV) {
+ // The fallthrough block may be jcc1MBB, if we have a double CMOV.
+ BB->addSuccessor(jcc1MBB);
+
+ // In that case, jcc1MBB will itself fall through to copy0MBB, and
+ // jump to the sinkMBB.
+ jcc1MBB->addSuccessor(copy0MBB);
+ jcc1MBB->addSuccessor(sinkMBB);
+ } else {
+ BB->addSuccessor(copy0MBB);
+ }
+
+ // The true block target of the first (or only) branch is always sinkMBB.
BB->addSuccessor(sinkMBB);
// Create the conditional branch instruction.
@@ -18123,6 +18456,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+ if (NextCMOV) {
+ unsigned Opc2 = X86::GetCondBranchFromCond(
+ (X86::CondCode)NextCMOV->getOperand(3).getImm());
+ BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
+ }
+
// copy0MBB:
// %FalseValue = ...
// # fallthrough to sinkMBB
@@ -18131,10 +18470,22 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(X86::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ MachineInstrBuilder MIB =
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ // If we have a double CMOV, the second Jcc provides the same incoming
+ // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
+ if (NextCMOV) {
+ MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
+ // Copy the PHI result to the register defined by the second CMOV.
+ BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
+ DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
+ .addReg(MI->getOperand(0).getReg());
+ NextCMOV->eraseFromParent();
+ }
MI->eraseFromParent(); // The pseudo instruction is gone now.
return sinkMBB;
@@ -18218,7 +18569,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
// Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
- Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
@@ -18303,7 +18654,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
- Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
@@ -19132,9 +19483,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// Note that even with AVX we prefer the PSHUFD form of shuffle for integer
// vectors because it can have a load folded into it that UNPCK cannot. This
// doesn't preclude something switching to the shorter encoding post-RA.
- if (FloatDomain) {
- if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
- bool Lo = Mask.equals(0, 0);
+ //
+ // FIXME: Should teach these routines about AVX vector widths.
+ if (FloatDomain && VT.getSizeInBits() == 128) {
+ if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
+ bool Lo = Mask.equals({0, 0});
unsigned Shuffle;
MVT ShuffleVT;
// Check if we have SSE3 which will let us use MOVDDUP. That instruction
@@ -19163,8 +19516,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
return true;
}
if (Subtarget->hasSSE3() &&
- (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
- bool Lo = Mask.equals(0, 0, 2, 2);
+ (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
+ bool Lo = Mask.equals({0, 0, 2, 2});
unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
MVT ShuffleVT = MVT::v4f32;
if (Depth == 1 && Root->getOpcode() == Shuffle)
@@ -19177,8 +19530,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
/*AddTo*/ true);
return true;
}
- if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
- bool Lo = Mask.equals(0, 0, 1, 1);
+ if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
+ bool Lo = Mask.equals({0, 0, 1, 1});
unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
MVT ShuffleVT = MVT::v4f32;
if (Depth == 1 && Root->getOpcode() == Shuffle)
@@ -19196,12 +19549,12 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
// variants as none of these have single-instruction variants that are
// superior to the UNPCK formulation.
- if (!FloatDomain &&
- (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
- Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
- Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
- Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
- 15))) {
+ if (!FloatDomain && VT.getSizeInBits() == 128 &&
+ (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
+ Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
+ Mask.equals(
+ {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
bool Lo = Mask[0] == 0;
unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
if (Depth == 1 && Root->getOpcode() == Shuffle)
@@ -19237,9 +19590,9 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// in practice PSHUFB tends to be *very* fast so we're more aggressive.
if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
SmallVector<SDValue, 16> PSHUFBMask;
- assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
- int Ratio = 16 / Mask.size();
- for (unsigned i = 0; i < 16; ++i) {
+ int NumBytes = VT.getSizeInBits() / 8;
+ int Ratio = NumBytes / Mask.size();
+ for (int i = 0; i < NumBytes; ++i) {
if (Mask[i / Ratio] == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
@@ -19249,12 +19602,13 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
: 255;
PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
}
- Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
+ Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input);
DCI.AddToWorklist(Op.getNode());
SDValue PSHUFBMaskOp =
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
+ DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
- Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
+ Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
DCI.AddToWorklist(Op.getNode());
DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
/*AddTo*/ true);
@@ -19312,10 +19666,6 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return false; // Bail if we hit a non-vector.
- // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
- // version should be added.
- if (VT.getSizeInBits() != 128)
- return false;
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
@@ -19418,12 +19768,26 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
bool IsUnary;
- bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+ bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
+ // If we have more than 128-bits, only the low 128-bits of shuffle mask
+ // matter. Check that the upper masks are repeats and remove them.
+ if (VT.getSizeInBits() > 128) {
+ int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+ for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+ for (int j = 0; j < LaneElts; ++j)
+ assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
+ "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+ Mask.resize(LaneElts);
+ }
+
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
@@ -19496,7 +19860,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
- if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+ if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
+ V.getSimpleValueType().getScalarType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
@@ -19670,8 +20035,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
- assert(VT == MVT::v8i16);
- (void)VT;
+ assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
return SDValue(); // We combined away this shuffle, so we're done.
@@ -19679,17 +20043,18 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations. Note that it has to at least flip the
// dwords as otherwise it would have been removed as a no-op.
- if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
+ if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
int DMask[] = {0, 1, 2, 3};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+ MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
DCI.AddToWorklist(V.getNode());
- V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+ V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DAG));
DCI.AddToWorklist(V.getNode());
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
}
// Look for shuffle patterns which can be implemented as a single unpack.
@@ -19717,18 +20082,14 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
int MappedMask[8];
for (int i = 0; i < 8; ++i)
MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
- const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
- const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
- if (std::equal(std::begin(MappedMask), std::end(MappedMask),
- std::begin(UnpackLoMask)) ||
- std::equal(std::begin(MappedMask), std::end(MappedMask),
- std::begin(UnpackHiMask))) {
+ if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+ V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
DCI.AddToWorklist(V.getNode());
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
- DL, MVT::v8i16, V, V);
+ DL, VT, V, V);
}
}
}
@@ -19876,10 +20237,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // Only handle 128 wide vector from here on.
- if (!VT.is128BitVector())
- return SDValue();
-
// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
@@ -20987,6 +21344,49 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
return SDValue();
}
+/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS, looking
+/// through an X86ISD::CMP against constant zero. On success, return true and set
+/// CC0/CC1 (the two condition codes), Flags (the shared flags operand) and isAnd.
+/// Matches e.g. (X86or (X86setcc) (X86setcc)), (X86cmp (and (X86setcc) (X86setcc)), 0).
+static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
+ X86::CondCode &CC1, SDValue &Flags,
+ bool &isAnd) {
+ if (Cond->getOpcode() == X86ISD::CMP) { // Look through a compare against zero.
+ ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
+ if (!CondOp1C || !CondOp1C->isNullValue())
+ return false;
+
+ Cond = Cond->getOperand(0);
+ }
+
+ isAnd = false; // Default to OR; the AND cases below flip this.
+
+ SDValue SetCC0, SetCC1;
+ switch (Cond->getOpcode()) {
+ default: return false; // isAnd was already written; CC0/CC1/Flags are untouched.
+ case ISD::AND:
+ case X86ISD::AND:
+ isAnd = true;
+ // Fall through: AND and OR share the operand extraction below.
+ case ISD::OR:
+ case X86ISD::OR:
+ SetCC0 = Cond->getOperand(0);
+ SetCC1 = Cond->getOperand(1);
+ break;
+ };
+
+ // Both operands must be X86ISD::SETCC nodes whose operand 1 (the flags) is the same value.
+ if (SetCC0.getOpcode() != X86ISD::SETCC ||
+ SetCC1.getOpcode() != X86ISD::SETCC ||
+ SetCC0->getOperand(1) != SetCC1->getOperand(1))
+ return false;
+
+ CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); // Operand 0 is read as the condition code.
+ CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
+ Flags = SetCC0->getOperand(1); // Same value as SetCC1's operand 1 (checked above).
+ return true;
+}
+
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -21156,6 +21556,44 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // Fold and/or of setcc's to double CMOV:
+ // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
+ // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
+ //
+ // This combine lets us generate:
+ // cmovcc1 (jcc1 if we don't have CMOV)
+ // cmovcc2 (same)
+ // instead of:
+ // setcc1
+ // setcc2
+ // and/or
+ // cmovne (jne if we don't have CMOV)
+ // When we can't use the CMOV instruction, it might increase branch
+ // mispredicts.
+ // When we can use CMOV, or when there is no mispredict, this improves
+ // throughput and reduces register pressure.
+ //
+ if (CC == X86::COND_NE) {
+ SDValue Flags;
+ X86::CondCode CC0, CC1;
+ bool isAndSetCC;
+ if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
+ if (isAndSetCC) {
+ std::swap(FalseOp, TrueOp);
+ CC0 = X86::GetOppositeBranchCondition(CC0);
+ CC1 = X86::GetOppositeBranchCondition(CC1);
+ }
+
+ SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8),
+ Flags};
+ SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
+ SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, MVT::i8), Flags};
+ SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
+ return CMOV;
+ }
+ }
+
return SDValue();
}
@@ -21166,24 +21604,16 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
default: return SDValue();
// SSE/AVX/AVX2 blend intrinsics.
case Intrinsic::x86_avx2_pblendvb:
- case Intrinsic::x86_avx2_pblendw:
- case Intrinsic::x86_avx2_pblendd_128:
- case Intrinsic::x86_avx2_pblendd_256:
// Don't try to simplify this intrinsic if we don't have AVX2.
if (!Subtarget->hasAVX2())
return SDValue();
// FALL-THROUGH
- case Intrinsic::x86_avx_blend_pd_256:
- case Intrinsic::x86_avx_blend_ps_256:
case Intrinsic::x86_avx_blendv_pd_256:
case Intrinsic::x86_avx_blendv_ps_256:
// Don't try to simplify this intrinsic if we don't have AVX.
if (!Subtarget->hasAVX())
return SDValue();
// FALL-THROUGH
- case Intrinsic::x86_sse41_pblendw:
- case Intrinsic::x86_sse41_blendpd:
- case Intrinsic::x86_sse41_blendps:
case Intrinsic::x86_sse41_blendvps:
case Intrinsic::x86_sse41_blendvpd:
case Intrinsic::x86_sse41_pblendvb: {
@@ -21640,7 +22070,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
// an and with a mask.
// We'd like to try to combine that into a shuffle with zero
// plus a bitcast, removing the and.
- if (N0.getOpcode() != ISD::BITCAST ||
+ if (N0.getOpcode() != ISD::BITCAST ||
N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
@@ -21670,7 +22100,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
unsigned ResSize = N1.getValueType().getScalarSizeInBits();
// Make sure the splat matches the mask we expect
- if (SplatBitSize > ResSize ||
+ if (SplatBitSize > ResSize ||
(SplatValue + 1).exactLogBase2() != (int)SrcSize)
return SDValue();
@@ -21724,12 +22154,10 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
- SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget);
- if (Zext.getNode())
+ if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
return Zext;
- SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
- if (R.getNode())
+ if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
return R;
EVT VT = N->getValueType(0);
@@ -22521,7 +22949,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// If A and B occur in reverse order in RHS, then "swap" them (which means
// rewriting the mask).
if (A != C)
- CommuteVectorShuffleMask(RMask, NumElts);
+ ShuffleVectorSDNode::commuteMask(RMask);
// At this point LHS and RHS are equivalent to
// LHS = VECTOR_SHUFFLE A, B, LMask
@@ -22630,7 +23058,7 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
-
+
return SDValue();
}
@@ -22864,45 +23292,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
- LHS.getValueType(), RHS, LHS.getOperand(1));
- return DAG.getSetCC(SDLoc(N), N->getValueType(0),
- addV, DAG.getConstant(0, addV.getValueType()), CC);
+ SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS,
+ LHS.getOperand(1));
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
+ DAG.getConstant(0, addV.getValueType()), CC);
}
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
- RHS.getValueType(), LHS, RHS.getOperand(1));
- return DAG.getSetCC(SDLoc(N), N->getValueType(0),
- addV, DAG.getConstant(0, addV.getValueType()), CC);
+ SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS,
+ RHS.getOperand(1));
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
+ DAG.getConstant(0, addV.getValueType()), CC);
}
- if (VT.getScalarType() == MVT::i1) {
- bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
- (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
- bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
- if (!IsSEXT0 && !IsVZero0)
- return SDValue();
- bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
- (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ if (VT.getScalarType() == MVT::i1 &&
+ (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
+ bool IsSEXT0 =
+ (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
- if (!IsSEXT1 && !IsVZero1)
- return SDValue();
+ if (!IsSEXT0 || !IsVZero1) {
+ // Swap the operands and update the condition code.
+ std::swap(LHS, RHS);
+ CC = ISD::getSetCCSwappedOperands(CC);
+
+ IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+ }
if (IsSEXT0 && IsVZero1) {
- assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
- if (CC == ISD::SETEQ)
+ assert(VT == LHS.getOperand(0).getValueType() &&
+ "Uexpected operand type");
+ if (CC == ISD::SETGT)
+ return DAG.getConstant(0, VT);
+ if (CC == ISD::SETLE)
+ return DAG.getConstant(1, VT);
+ if (CC == ISD::SETEQ || CC == ISD::SETGE)
return DAG.getNOT(DL, LHS.getOperand(0), VT);
+
+ assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+ "Unexpected condition code!");
return LHS.getOperand(0);
}
- if (IsSEXT1 && IsVZero0) {
- assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
- if (CC == ISD::SETEQ)
- return DAG.getNOT(DL, RHS.getOperand(0), VT);
- return RHS.getOperand(0);
- }
}
return SDValue();
@@ -22940,7 +23374,7 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
// countS and just gets an f32 from that address.
unsigned DestIndex =
cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
+
Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
// Create this as a scalar to vector to match the instruction pattern.
@@ -22964,7 +23398,7 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
// pattern-matching possibilities related to scalar math ops in SSE/AVX.
// x86InstrInfo knows how to commute this back after instruction selection
// if it would help register allocation.
-
+
// TODO: If optimizing for size or a processor that doesn't suffer from
// partial register update stalls, this should be transformed into a MOVSD
// instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
@@ -23503,27 +23937,23 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
-namespace {
- // Helper to match a string separated by whitespace.
- bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
- s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
-
- for (unsigned i = 0, e = args.size(); i != e; ++i) {
- StringRef piece(*args[i]);
- if (!s.startswith(piece)) // Check if the piece matches.
- return false;
+// Helper to match a string separated by whitespace: true iff S is exactly Pieces, in order, each followed by spaces/tabs or the end of S.
+static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
+ S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
- s = s.substr(piece.size());
- StringRef::size_type pos = s.find_first_not_of(" \t");
- if (pos == 0) // We matched a prefix.
- return false;
+ for (StringRef Piece : Pieces) {
+ if (!S.startswith(Piece)) // Check if the piece matches.
+ return false;
- s = s.substr(pos);
- }
+ S = S.substr(Piece.size()); // Drop the matched piece.
+ StringRef::size_type Pos = S.find_first_not_of(" \t");
+ if (Pos == 0) // No whitespace follows, so Piece only matched a prefix of a longer token.
+ return false;
- return s.empty();
+ S = S.substr(Pos); // Skip the whitespace separating this piece from the next.
}
- const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
+
+ return S.empty(); // Anything left over after the last piece is a mismatch.
}
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
@@ -23563,12 +23993,12 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
// ops instead of emitting the bswap asm. For now, we don't support 486 or
// lower so don't worry about this.
// bswap $0
- if (matchAsm(AsmPieces[0], "bswap", "$0") ||
- matchAsm(AsmPieces[0], "bswapl", "$0") ||
- matchAsm(AsmPieces[0], "bswapq", "$0") ||
- matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
- matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
- matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
+ if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
return IntrinsicLowering::LowerToByteSwap(CI);
@@ -23577,8 +24007,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
// rorw $$8, ${0:w} --> llvm.bswap.i16
if (CI->getType()->isIntegerTy(16) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
- (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
- matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
+ (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
+ matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
@@ -23590,9 +24020,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
case 3:
if (CI->getType()->isIntegerTy(32) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
- matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
- matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
- matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
+ matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
+ matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
+ matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
@@ -23607,9 +24037,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
- if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
- matchAsm(AsmPieces[1], "bswap", "%edx") &&
- matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
+ if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
+ matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
+ matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
return IntrinsicLowering::LowerToByteSwap(CI);
}
}