Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  7836
1 file changed, 2921 insertions, 4915 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f05b6c6..6866be7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15,6 +15,7 @@
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
+#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
@@ -66,11 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
"rather than promotion."),
cl::Hidden);
-static cl::opt<bool> ExperimentalVectorShuffleLowering(
- "x86-experimental-vector-shuffle-lowering", cl::init(true),
- cl::desc("Enable an experimental vector shuffle lowering code path."),
- cl::Hidden);
-
static cl::opt<int> ReciprocalEstimateRefinementSteps(
"x86-recip-refinement-steps", cl::init(1),
cl::desc("Specify the number of Newton-Raphson iterations applied to the "
@@ -107,21 +103,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+ makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
- VecIdx);
-
- return Result;
-
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
+
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
-/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes
+/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, SDLoc dl) {
@@ -158,25 +151,23 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
* ElemsPerChunk);
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
- VecIdx);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
+
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
-/// we want. It need not be aligned to a 128-bit bounday. That makes
+/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl) {
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl) {
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
@@ -199,44 +190,23 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
-// FIXME: This should stop caching the target machine as soon as
-// we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
- : TargetLowering(TM) {
- Subtarget = &TM.getSubtarget<X86Subtarget>();
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
TD = getDataLayout();
- resetOperationActions();
-}
-
-void X86TargetLowering::resetOperationActions() {
- const TargetMachine &TM = getTargetMachine();
- static bool FirstTimeThrough = true;
-
- // If none of the target options have changed, then we don't need to reset the
- // operation actions.
- if (!FirstTimeThrough && TO == TM.Options) return;
-
- if (!FirstTimeThrough) {
- // Reinitialize the actions.
- initActions();
- FirstTimeThrough = false;
- }
-
- TO = TM.Options;
-
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
- // X86 is weird, it always uses i8 for shift amounts and setcc results.
+ // X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
- // For 64-bit since we have so many registers use the ILP scheduler, for
- // 32-bit code use the register pressure specific scheduling.
+ // For 64-bit, since we have so many registers, use the ILP scheduler.
+ // For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
if (Subtarget->isAtom())
setSchedulingPreference(Sched::ILP);
@@ -244,14 +214,14 @@ void X86TargetLowering::resetOperationActions() {
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
- const X86RegisterInfo *RegInfo =
- TM.getSubtarget<X86Subtarget>().getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
- // Bypass expensive divides on Atom when compiling with O2
- if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
- addBypassSlowDiv(32, 8);
- if (Subtarget->is64Bit())
+ // Bypass expensive divides on Atom when compiling with O2.
+ if (TM.getOptLevel() >= CodeGenOpt::Default) {
+ if (Subtarget->hasSlowDivide32())
+ addBypassSlowDiv(32, 8);
+ if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
addBypassSlowDiv(64, 16);
}
@@ -296,7 +266,8 @@ void X86TargetLowering::resetOperationActions() {
if (Subtarget->is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
@@ -521,7 +492,9 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
@@ -805,9 +778,7 @@ void X86TargetLowering::resetOperationActions() {
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
- for (int i = MVT::FIRST_VECTOR_VALUETYPE;
- i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
+ for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::ADD , VT, Expand);
setOperationAction(ISD::SUB , VT, Expand);
setOperationAction(ISD::FADD, VT, Expand);
@@ -876,18 +847,19 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
- for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
- setTruncStoreAction(VT,
- (MVT::SimpleValueType)InnerVT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(InnerVT, VT, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
- // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
- // we have to deal with them whether we ask for Expansion or not. Setting
- // Expand causes its own optimisation problems though, so leave them legal.
- if (VT.getVectorElementType() == MVT::i1)
- setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+ // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
+ // types; we have to deal with them whether we ask for Expansion or not.
+ // Setting Expand causes its own optimisation problems though, so leave
+ // them legal.
+ if (VT.getVectorElementType() == MVT::i1)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+ }
}
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
@@ -942,6 +914,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
@@ -991,6 +964,14 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+ // Only provide customized ctpop vector bit twiddling for vector types we
+ // know to perform better than using the popcnt instructions on each vector
+ // element. If popcnt isn't supported, always provide the custom version.
+ if (!Subtarget->hasPOPCNT()) {
+ setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
+ }
+
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
@@ -1002,6 +983,7 @@ void X86TargetLowering::resetOperationActions() {
continue;
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
@@ -1009,20 +991,24 @@ void X86TargetLowering::resetOperationActions() {
// memory vector types which we can load as a scalar (or sequence of
// scalars) and extend in-register to a legal 128-bit vector type. For sext
// loads these must work with a single scalar load.
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
+ }
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
@@ -1070,7 +1056,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
@@ -1103,20 +1090,32 @@ void X86TargetLowering::resetOperationActions() {
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
- // There is no BLENDI for byte vectors. We don't need to custom lower
- // some vselects for now.
+ // We directly match byte blends in the backend as they match the VSELECT
+ // condition form.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+ }
+
+ // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
// i8 and i16 vectors are custom because the source register and source
// memory operand types are not the same width. f32 vectors are
@@ -1212,7 +1211,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
@@ -1232,11 +1232,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
-
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
@@ -1280,12 +1275,34 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
- setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
- setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
-
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+
+ // Only provide customized ctpop vector bit twiddling for vector types we
+ // know to perform better than using the popcnt instructions on each
+ // vector element. If popcnt isn't supported, always provide the custom
+ // version.
+ if (!Subtarget->hasPOPCNT())
+ setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
+
+ // Custom CTPOP always performs better on natively supported v8i32
+ setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+
+ // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
@@ -1314,21 +1331,23 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SRA, MVT::v8i32, Custom);
// Custom lower several nodes for 256-bit types.
- for (int i = MVT::FIRST_VECTOR_VALUETYPE;
- i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
+ for (MVT VT : MVT::vector_valuetypes()) {
+ if (VT.getScalarSizeInBits() >= 32) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
- if (VT.is128BitVector())
+ if (VT.is128BitVector()) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+ }
// Do not attempt to custom lower other non-256-bit vectors
if (!VT.is256BitVector())
continue;
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
@@ -1336,6 +1355,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
+ if (Subtarget->hasInt256())
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
@@ -1367,12 +1390,14 @@ void X86TargetLowering::resetOperationActions() {
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
+
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Custom);
setOperationAction(ISD::XOR, MVT::i1, Legal);
setOperationAction(ISD::OR, MVT::i1, Legal);
setOperationAction(ISD::AND, MVT::i1, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
@@ -1434,6 +1459,17 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v16f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v8f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v8f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal);
+
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
@@ -1486,16 +1522,13 @@ void X86TargetLowering::resetOperationActions() {
}
// Custom lower several nodes.
- for (int i = MVT::FIRST_VECTOR_VALUETYPE;
- i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
+ for (MVT VT : MVT::vector_valuetypes()) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
// Extract subvector is special because the value type
// (result) is 256/128-bit but the source is 512-bit wide.
- if (VT.is128BitVector() || VT.is256BitVector())
+ if (VT.is128BitVector() || VT.is256BitVector()) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+ }
if (VT.getVectorElementType() == MVT::i1)
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
@@ -1511,12 +1544,14 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
}
}
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
- // Do not attempt to promote non-256-bit vectors
+ // Do not attempt to promote non-512-bit vectors.
if (!VT.is512BitVector())
continue;
@@ -1536,17 +1571,22 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v64i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v32i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v64i8, Legal);
+ setOperationAction(ISD::MUL, MVT::v32i16, Legal);
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
const MVT VT = (MVT::SimpleValueType)i;
const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- // Do not attempt to promote non-256-bit vectors
+ // Do not attempt to promote non-512-bit vectors.
if (!VT.is512BitVector())
continue;
- if ( EltSize < 32) {
+ if (EltSize < 32) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Legal);
}
@@ -1560,14 +1600,13 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
- }
- // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
- // of this type with custom code.
- for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
- VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
- setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
- Custom);
+ setOperationAction(ISD::AND, MVT::v8i32, Legal);
+ setOperationAction(ISD::OR, MVT::v8i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v8i32, Legal);
+ setOperationAction(ISD::AND, MVT::v4i32, Legal);
+ setOperationAction(ISD::OR, MVT::v4i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v4i32, Legal);
}
// We want to custom lower some of our intrinsics.
@@ -1607,9 +1646,8 @@ void X86TargetLowering::resetOperationActions() {
setLibcallName(RTLIB::SINCOS_F32, "sincosf");
setLibcallName(RTLIB::SINCOS_F64, "sincos");
if (Subtarget->isTargetDarwin()) {
- // For MacOSX, we don't want to the normal expansion of a libcall to
- // sincos. We want to issue a libcall to __sincos_stret to avoid memory
- // traffic.
+ // For MacOSX, we don't want the normal expansion of a libcall to sincos.
+ // We want to issue a libcall to __sincos_stret to avoid memory traffic.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
@@ -1627,6 +1665,7 @@ void X86TargetLowering::resetOperationActions() {
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
@@ -1640,7 +1679,9 @@ void X86TargetLowering::resetOperationActions() {
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -1650,11 +1691,10 @@ void X86TargetLowering::resetOperationActions() {
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::BUILD_VECTOR);
- if (Subtarget->is64Bit())
- setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
- computeRegisterProperties();
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// On Darwin, -Os means optimize for size without hurting performance,
// do not reduce the limit.
@@ -1668,7 +1708,7 @@ void X86TargetLowering::resetOperationActions() {
// Predictable cmov don't hurt on atom because it's in-order.
PredictableSelectIsExpensive = !Subtarget->isAtom();
-
+ EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.
verifyIntrinsicTables();
@@ -1676,8 +1716,7 @@ void X86TargetLowering::resetOperationActions() {
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
- return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
- Subtarget->is64Bit();
+ return Subtarget->isTargetMachO() && Subtarget->is64Bit();
}
TargetLoweringBase::LegalizeTypeAction
@@ -1733,7 +1772,7 @@ EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
return VT.changeVectorElementTypeToInteger();
}
-/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
if (MaxAlign == 16)
@@ -1758,7 +1797,7 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
}
}
-/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
@@ -1777,7 +1816,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
return Align;
}
-/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero that means it's safe to destination
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
@@ -1796,8 +1835,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
MachineFunction &MF) const {
const Function *F = MF.getFunction();
if ((!IsMemset || ZeroMemset) &&
- !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoImplicitFloat)) {
+ !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
((DstAlign == 0 || DstAlign >= 16) &&
@@ -1843,7 +1881,7 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return true;
}
-/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
@@ -1869,8 +1907,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
-/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
-/// jumptable.
+/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget->is64Bit())
@@ -1880,9 +1917,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
return Table;
}
-/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
-/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
-/// MCExpr.
+/// This returns the relocation base for the given PIC jumptable,
+/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
@@ -1894,14 +1930,14 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}
-// FIXME: Why this routine is here? Move to RegInfo!
-std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(MVT VT) const{
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
- return TargetLowering::findRepresentativeClass(VT);
+ return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
@@ -1994,7 +2030,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
SDValue ValToCopy = OutVals[i];
EVT ValVT = ValToCopy.getValueType();
- // Promote values to the appropriate types
+ // Promote values to the appropriate types.
if (VA.getLocInfo() == CCValAssign::SExt)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
@@ -2005,7 +2041,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
- "Unexpected FP-extend for return value.");
+ "Unexpected FP-extend for return value.");
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
@@ -2060,14 +2096,15 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// Win32 requires us to put the sret argument to %eax as well.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
- if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
- MachineFunction &MF = DAG.getMachineFunction();
- X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- unsigned Reg = FuncInfo->getSRetReturnReg();
- assert(Reg &&
- "SRetReturnReg should have been set in LowerFormalArguments().");
- SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+ //
+ // Checking Function.hasStructRetAttr() here is insufficient because the IR
+ // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+ // false, then an sret argument may be implicitly inserted in the SelDAG. In
+ // either case FuncInfo->setSRetReturnReg() will have been called.
+ if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+ assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
+ "No need for an sret register");
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
unsigned RetValReg
= (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
@@ -2141,7 +2178,7 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
return VT.bitsLT(MinVT) ? MinVT : VT;
}
-/// LowerCallResult - Lower the result values of a call into the
+/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
@@ -2221,8 +2258,7 @@ callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
return StackStructReturn;
}
-/// ArgsAreStructReturn - Determines whether a function uses struct
-/// return semantics.
+/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
if (Ins.empty())
@@ -2236,10 +2272,9 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
return StackStructReturn;
}
-/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
-/// by "Src" to address "Dst" with size and alignment information specified by
-/// the specific parameter attribute. The copy will be passed as a byval
-/// function parameter.
+/// Make a copy of an aggregate at address specified by "Src" to address
+/// "Dst" with size and alignment information specified by the specific
+/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
@@ -2251,7 +2286,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
MachinePointerInfo(), MachinePointerInfo());
}
-/// IsTailCallConvention - Return true if the calling convention is one that
+/// Return true if the calling convention is one that
/// supports tail call optimization.
static bool IsTailCallConvention(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
@@ -2276,7 +2311,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
return true;
}
-/// FuncIsMadeTailCallSafe - Return true if the function is being made into
+/// Return true if the function is being made into
/// a tailcall target by changing its ABI.
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
bool GuaranteedTailCallOpt) {
@@ -2356,8 +2391,7 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
}
const Function *Fn = MF.getFunction();
- bool NoImplicitFloatOps = Fn->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+ bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
@@ -2523,18 +2557,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MFI->CreateFixedObject(1, StackSize, true));
}
+ // Figure out if XMM registers are in use.
+ assert(!(MF.getTarget().Options.UseSoftFloat &&
+ Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ "SSE register cannot be used when SSE is disabled!");
+
// 64-bit calling conventions support varargs and register parameters, so we
- // have to do extra work to spill them in the prologue or forward them to
- // musttail calls.
- if (Is64Bit && isVarArg &&
- (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
+ // have to do extra work to spill them in the prologue.
+ if (Is64Bit && isVarArg && MFI->hasVAStart()) {
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
- unsigned NumIntRegs =
- CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
- unsigned NumXMMRegs =
- CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
@@ -2557,90 +2592,99 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
}
}
- // Store them to the va_list returned by va_start.
- if (MFI->hasVAStart()) {
- if (IsWin64) {
- const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
- // Get to the caller-allocated home save location. Add 8 to account
- // for the return address.
- int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
- FuncInfo->setRegSaveFrameIndex(
+ if (IsWin64) {
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
- // Fixup to set vararg frame on shadow area (4 x i64).
- if (NumIntRegs < 4)
- FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
- } else {
- // For X86-64, if there are vararg parameters that are passed via
- // registers, then we must store them to their spots on the stack so
- // they may be loaded by deferencing the result of va_next.
- FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
- FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
- FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
- ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
- }
-
- // Store the integer parameter registers.
- SmallVector<SDValue, 8> MemOps;
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy());
- unsigned Offset = FuncInfo->getVarArgsGPOffset();
- for (SDValue Val : LiveGPRs) {
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
- DAG.getIntPtrConstant(Offset));
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo::getFixedStack(
- FuncInfo->getRegSaveFrameIndex(), Offset),
- false, false, 0);
- MemOps.push_back(Store);
- Offset += 8;
- }
-
- if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
- // Now store the XMM (fp + vector) parameter registers.
- SmallVector<SDValue, 12> SaveXMMOps;
- SaveXMMOps.push_back(Chain);
- SaveXMMOps.push_back(ALVal);
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getRegSaveFrameIndex()));
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getVarArgsFPOffset()));
- SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
- LiveXMMRegs.end());
- MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
- MVT::Other, SaveXMMOps));
- }
-
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
} else {
- // Add all GPRs, al, and XMMs to the list of forwards. We will add then
- // to the liveout set on a musttail call.
- assert(MFI->hasMustTailInVarArgFunc());
- auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
- typedef X86MachineFunctionInfo::Forward Forward;
-
- for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
- unsigned VReg =
- MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
- Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
- Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
- }
-
- if (!ArgXMMs.empty()) {
- unsigned ALVReg =
- MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
- Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
- Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
-
- for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
- unsigned VReg =
- MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
- Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
- Forwards.push_back(
- Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
- }
- }
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by dereferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
+ ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy());
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (SDValue Val : LiveGPRs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(Offset));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ FuncInfo->getRegSaveFrameIndex(), Offset),
+ false, false, 0);
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
+ // Now store the XMM (fp + vector) parameter registers.
+ SmallVector<SDValue, 12> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+ SaveXMMOps.push_back(ALVal);
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getRegSaveFrameIndex()));
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getVarArgsFPOffset()));
+ SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+ LiveXMMRegs.end());
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+ MVT::Other, SaveXMMOps));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ }
+
+ if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
+ // Find the largest legal vector type.
+ MVT VecVT = MVT::Other;
+ // FIXME: Only some x86_32 calling conventions support AVX512.
+ if (Subtarget->hasAVX512() &&
+ (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
+ CallConv == CallingConv::Intel_OCL_BI)))
+ VecVT = MVT::v16f32;
+ else if (Subtarget->hasAVX())
+ VecVT = MVT::v8f32;
+ else if (Subtarget->hasSSE2())
+ VecVT = MVT::v4f32;
+
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
+ RegParmTypes.push_back(IntVT);
+ if (VecVT != MVT::Other)
+ RegParmTypes.push_back(VecVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+ // Conservatively forward AL on x86_64, since it might be used for varargs.
+ if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+ unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+ Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+ }
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &F : Forwards) {
+ // FIXME: Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+ F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
+ Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
}
}
@@ -2688,7 +2732,7 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
false, false, 0);
}
-/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
+/// Emit a load of return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
@@ -2705,7 +2749,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
return SDValue(OutRetAddr.getNode(), 1);
}
-/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
+/// Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx,
@@ -2838,8 +2882,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -2952,7 +2995,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
- unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
@@ -2960,7 +3003,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(NumXMMRegs, MVT::i8)));
}
- if (Is64Bit && isVarArg && IsMustTail) {
+ if (isVarArg && IsMustTail) {
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
@@ -3044,10 +3087,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
- } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ } else if (Callee->getOpcode() == ISD::GlobalAddress) {
// If the callee is a GlobalAddress node (quite common, every direct call
// is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
// it.
+ GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
// We should use extra load for direct calls to dllimported functions in
// non-JIT mode.
@@ -3073,11 +3117,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// unless we're building with the leopard linker or later, which
// automatically synthesizes these stubs.
OpFlags = X86II::MO_DARWIN_STUB;
- } else if (Subtarget->isPICStyleRIPRel() &&
- isa<Function>(GV) &&
- cast<Function>(GV)->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NonLazyBind)) {
+ } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
+ cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
// If the function is marked as non-lazy, generate an indirect call
// which loads from the GOT directly. This avoids runtime overhead
// at the cost of eager binding (and one extra byte of encoding).
@@ -3117,7 +3158,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
OpFlags);
- } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
+ } else if (Subtarget->isTarget64BitILP32() &&
+ Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
@@ -3146,7 +3188,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3235,11 +3277,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
@@ -3276,7 +3315,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
return false;
} else {
unsigned Opcode = Def->getOpcode();
- if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
@@ -3341,6 +3381,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
+ // Win64 functions have extra shadow space for argument homing. Don't do the
+ // sibcall if the caller and callee have mismatched expectations for this
+ // space.
+ if (IsCalleeWin64 != IsCallerWin64)
+ return false;
+
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
@@ -3352,8 +3398,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
@@ -3465,8 +3510,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -3494,7 +3538,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs =
- (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+ (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3563,17 +3607,6 @@ static bool isTargetShuffle(unsigned Opcode) {
}
static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
- SDValue V1, SelectionDAG &DAG) {
- switch(Opc) {
- default: llvm_unreachable("Unknown x86 shuffle node");
- case X86ISD::MOVSHDUP:
- case X86ISD::MOVSLDUP:
- case X86ISD::MOVDDUP:
- return DAG.getNode(Opc, dl, VT, V1);
- }
-}
-
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, unsigned TargetMask,
SelectionDAG &DAG) {
switch(Opc) {
@@ -3588,20 +3621,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
}
static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
- SDValue V1, SDValue V2, unsigned TargetMask,
- SelectionDAG &DAG) {
- switch(Opc) {
- default: llvm_unreachable("Unknown x86 shuffle node");
- case X86ISD::PALIGNR:
- case X86ISD::VALIGN:
- case X86ISD::SHUFP:
- case X86ISD::VPERM2X128:
- return DAG.getNode(Opc, dl, VT, V1, V2,
- DAG.getConstant(TargetMask, MVT::i8));
- }
-}
-
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, SDValue V2, SelectionDAG &DAG) {
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3620,8 +3639,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
@@ -3661,7 +3679,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
// For kernel code model we know that all object resist in the negative half
// of 32bits address space. We may not accept negative offsets, since they may
// be just off and we may accept pretty large positive ones.
- if (M == CodeModel::Kernel && Offset > 0)
+ if (M == CodeModel::Kernel && Offset >= 0)
return true;
return false;
@@ -3823,6 +3841,18 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
return false;
}
+bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtTy,
+ EVT NewVT) const {
+ // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
+ // relocation must target a movq or addq instruction: don't let the load shrink.
+ SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
+ if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
+ if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
+ return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+ return true;
+}
+
/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
@@ -3835,6 +3865,24 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+ return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+ // Speculate cttz only if we can directly use TZCNT.
+ return Subtarget->hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+ // Speculate ctlz only if we can directly use LZCNT.
+ return Subtarget->hasLZCNT();
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -3849,7 +3897,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
-/// sequential range (L, L+Pos]. or is undef.
+/// sequential range (Low, Low+Size], or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
unsigned Pos, unsigned Size, int Low) {
for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
@@ -3858,176 +3906,6 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
return true;
}
-/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFD. That is, it doesn't reference the other
-/// operand - by default will match for first operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
- bool TestSecondOperand = false) {
- if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
- VT != MVT::v2f64 && VT != MVT::v2i64)
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
- unsigned Lo = TestSecondOperand ? NumElems : 0;
- unsigned Hi = Lo + NumElems;
-
- for (unsigned i = 0; i < NumElems; ++i)
- if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
- return false;
-
- return true;
-}
-
-/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
- if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
- return false;
-
- // Lower quadword copied in order or undef.
- if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
- return false;
-
- // Upper quadword shuffled.
- for (unsigned i = 4; i != 8; ++i)
- if (!isUndefOrInRange(Mask[i], 4, 8))
- return false;
-
- if (VT == MVT::v16i16) {
- // Lower quadword copied in order or undef.
- if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
- return false;
-
- // Upper quadword shuffled.
- for (unsigned i = 12; i != 16; ++i)
- if (!isUndefOrInRange(Mask[i], 12, 16))
- return false;
- }
-
- return true;
-}
-
-/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
- if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
- return false;
-
- // Upper quadword copied in order.
- if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
- return false;
-
- // Lower quadword shuffled.
- for (unsigned i = 0; i != 4; ++i)
- if (!isUndefOrInRange(Mask[i], 0, 4))
- return false;
-
- if (VT == MVT::v16i16) {
- // Upper quadword copied in order.
- if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
- return false;
-
- // Lower quadword shuffled.
- for (unsigned i = 8; i != 12; ++i)
- if (!isUndefOrInRange(Mask[i], 8, 12))
- return false;
- }
-
- return true;
-}
-
-/// \brief Return true if the mask specifies a shuffle of elements that is
-/// suitable for input to intralane (palignr) or interlane (valign) vector
-/// right-shift.
-static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
- unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- // Do not handle 64-bit element shuffles with palignr.
- if (NumLaneElts == 2)
- return false;
-
- for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
- unsigned i;
- for (i = 0; i != NumLaneElts; ++i) {
- if (Mask[i+l] >= 0)
- break;
- }
-
- // Lane is all undef, go to next lane
- if (i == NumLaneElts)
- continue;
-
- int Start = Mask[i+l];
-
- // Make sure its in this lane in one of the sources
- if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
- !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
- return false;
-
- // If not lane 0, then we must match lane 0
- if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
- return false;
-
- // Correct second source to be contiguous with first source
- if (Start >= (int)NumElts)
- Start -= NumElts - NumLaneElts;
-
- // Make sure we're shifting in the right direction.
- if (Start <= (int)(i+l))
- return false;
-
- Start -= i;
-
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != NumLaneElts; ++i) {
- int Idx = Mask[i+l];
-
- // Make sure its in this lane
- if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
- !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
- return false;
-
- // If not lane 0, then we must match lane 0
- if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
- return false;
-
- if (Idx >= (int)NumElts)
- Idx -= NumElts - NumLaneElts;
-
- if (!isUndefOrEqual(Idx, Start+i))
- return false;
-
- }
- }
-
- return true;
-}
-
-/// \brief Return true if the node specifies a shuffle of elements that is
-/// suitable for input to PALIGNR.
-static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
- const X86Subtarget *Subtarget) {
- if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
- (VT.is256BitVector() && !Subtarget->hasInt256()) ||
- VT.is512BitVector())
- // FIXME: Add AVX512BW.
- return false;
-
- return isAlignrMask(Mask, VT, false);
-}
-
-/// \brief Return true if the node specifies a shuffle of elements that is
-/// suitable for input to VALIGN.
-static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
- const X86Subtarget *Subtarget) {
- // FIXME: Add AVX512VL.
- if (!VT.is512BitVector() || !Subtarget->hasAVX512())
- return false;
- return isAlignrMask(Mask, VT, true);
-}
-
/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
@@ -4043,664 +3921,6 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
}
}
-/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128/256-bit
-/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
-/// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
-
- unsigned NumElems = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElems = NumElems/NumLanes;
-
- if (NumLaneElems != 2 && NumLaneElems != 4)
- return false;
-
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- bool symetricMaskRequired =
- (VT.getSizeInBits() >= 256) && (EltSize == 32);
-
- // VSHUFPSY divides the resulting vector into 4 chunks.
- // The sources are also splitted into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
- // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9
- //
- // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
- // Y3..Y0, Y3..Y0, X3..X0, X3..X0
- //
- // VSHUFPDY divides the resulting vector into 4 chunks.
- // The sources are also splitted into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X3 X2 X1 X0
- // SRC2 => Y3 Y2 Y1 Y0
- //
- // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
- //
- SmallVector<int, 4> MaskVal(NumLaneElems, -1);
- unsigned HalfLaneElems = NumLaneElems/2;
- for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
- for (unsigned i = 0; i != NumLaneElems; ++i) {
- int Idx = Mask[i+l];
- unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
- if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
- return false;
- // For VSHUFPSY, the mask of the second half must be the same as the
- // first but with the appropriate offsets. This works in the same way as
- // VPERMILPS works with masks.
- if (!symetricMaskRequired || Idx < 0)
- continue;
- if (MaskVal[i] < 0) {
- MaskVal[i] = Idx - l;
- continue;
- }
- if ((signed)(Idx - l) != MaskVal[i])
- return false;
- }
- }
-
- return true;
-}
-
-/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 4)
- return false;
-
- // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
- return isUndefOrEqual(Mask[0], 6) &&
- isUndefOrEqual(Mask[1], 7) &&
- isUndefOrEqual(Mask[2], 2) &&
- isUndefOrEqual(Mask[3], 3);
-}
-
-/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
-/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
-/// <2, 3, 2, 3>
-static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 4)
- return false;
-
- return isUndefOrEqual(Mask[0], 2) &&
- isUndefOrEqual(Mask[1], 3) &&
- isUndefOrEqual(Mask[2], 2) &&
- isUndefOrEqual(Mask[3], 3);
-}
-
-/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 2 && NumElems != 4)
- return false;
-
- for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i + NumElems))
- return false;
-
- for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
-
- return true;
-}
-
-/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
-static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 2 && NumElems != 4)
- return false;
-
- for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
-
- for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i + e], i + NumElems))
- return false;
-
- return true;
-}
-
-/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to INSERTPS.
-/// i. e: If all but one element come from the same vector.
-static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
- // TODO: Deal with AVX's VINSERTPS
- if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
- return false;
-
- unsigned CorrectPosV1 = 0;
- unsigned CorrectPosV2 = 0;
- for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
- if (Mask[i] == -1) {
- ++CorrectPosV1;
- ++CorrectPosV2;
- continue;
- }
-
- if (Mask[i] == i)
- ++CorrectPosV1;
- else if (Mask[i] == i + 4)
- ++CorrectPosV2;
- }
-
- if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
- // We have 3 elements (undefs count as elements from any vector) from one
- // vector, and one from another.
- return true;
-
- return false;
-}
-
-//
-// Some special combinations that can be optimized.
-//
-static
-SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
- SelectionDAG &DAG) {
- MVT VT = SVOp->getSimpleValueType(0);
- SDLoc dl(SVOp);
-
- if (VT != MVT::v8i32 && VT != MVT::v8f32)
- return SDValue();
-
- ArrayRef<int> Mask = SVOp->getMask();
-
- // These are the special masks that may be optimized.
- static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
- static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
- bool MatchEvenMask = true;
- bool MatchOddMask = true;
- for (int i=0; i<8; ++i) {
- if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
- MatchEvenMask = false;
- if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
- MatchOddMask = false;
- }
-
- if (!MatchEvenMask && !MatchOddMask)
- return SDValue();
-
- SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
-
- SDValue Op0 = SVOp->getOperand(0);
- SDValue Op1 = SVOp->getOperand(1);
-
- if (MatchEvenMask) {
- // Shift the second operand right to 32 bits.
- static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
- Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
- } else {
- // Shift the first operand left to 32 bits.
- static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
- Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
- }
- static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
- return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
-}
-
-/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
- bool HasInt256, bool V2IsSplat = false) {
-
- assert(VT.getSizeInBits() >= 128 &&
- "Unsupported vector type for unpckl");
-
- unsigned NumElts = VT.getVectorNumElements();
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
- return false;
-
- assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
- "Unsupported vector type for unpckh");
-
- // AVX defines UNPCK* to operate independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (V2IsSplat) {
- if (!isUndefOrEqual(BitI1, NumElts))
- return false;
- } else {
- if (!isUndefOrEqual(BitI1, j + NumElts))
- return false;
- }
- }
- }
-
- return true;
-}
-
-/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
- bool HasInt256, bool V2IsSplat = false) {
- assert(VT.getSizeInBits() >= 128 &&
- "Unsupported vector type for unpckh");
-
- unsigned NumElts = VT.getVectorNumElements();
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
- return false;
-
- assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
- "Unsupported vector type for unpckh");
-
- // AVX defines UNPCK* to operate independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (V2IsSplat) {
- if (isUndefOrEqual(BitI1, NumElts))
- return false;
- } else {
- if (!isUndefOrEqual(BitI1, j+NumElts))
- return false;
- }
- }
- }
- return true;
-}
-
-/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
-/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
-/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
- unsigned NumElts = VT.getVectorNumElements();
- bool Is256BitVec = VT.is256BitVector();
-
- if (VT.is512BitVector())
- return false;
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Unsupported vector type for unpckh");
-
- if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
- return false;
-
- // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
- // FIXME: Need a better way to get rid of this, there's no latency difference
- // between UNPCKLPD and MOVDDUP, the later should always be checked first and
- // the former later. We should also remove the "_undef" special mask.
- if (NumElts == 4 && Is256BitVec)
- return false;
-
- // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
- // independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
-
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (!isUndefOrEqual(BitI1, j))
- return false;
- }
- }
-
- return true;
-}
-
-/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
-/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
-/// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
- unsigned NumElts = VT.getVectorNumElements();
-
- if (VT.is512BitVector())
- return false;
-
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Unsupported vector type for unpckh");
-
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
- return false;
-
- // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
- // independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (!isUndefOrEqual(BitI1, j))
- return false;
- }
- }
- return true;
-}
-
-// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
-// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
-static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
- if (!VT.is512BitVector())
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned HalfSize = NumElts/2;
- if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
- if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
- *Imm = 1;
- return true;
- }
- }
- if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
- if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
- *Imm = 0;
- return true;
- }
- }
- return false;
-}
-
-/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVSS,
-/// MOVSD, and MOVD, i.e. setting the lowest element.
-static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
- if (VT.getVectorElementType().getSizeInBits() < 32)
- return false;
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
-
- if (!isUndefOrEqual(Mask[0], NumElts))
- return false;
-
- for (unsigned i = 1; i != NumElts; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
-
- return true;
-}
-
-/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
-/// as permutations between 128-bit chunks or halves. As an example: this
-/// shuffle bellow:
-/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
-/// The first half comes from the second half of V1 and the second half from the
-/// the second half of V2.
-static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
- if (!HasFp256 || !VT.is256BitVector())
- return false;
-
- // The shuffle result is divided into half A and half B. In total the two
- // sources have 4 halves, namely: C, D, E, F. The final values of A and
- // B must come from C, D, E or F.
- unsigned HalfSize = VT.getVectorNumElements()/2;
- bool MatchA = false, MatchB = false;
-
- // Check if A comes from one of C, D, E, F.
- for (unsigned Half = 0; Half != 4; ++Half) {
- if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
- MatchA = true;
- break;
- }
- }
-
- // Check if B comes from one of C, D, E, F.
- for (unsigned Half = 0; Half != 4; ++Half) {
- if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
- MatchB = true;
- break;
- }
- }
-
- return MatchA && MatchB;
-}
-
-/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
-static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
- MVT VT = SVOp->getSimpleValueType(0);
-
- unsigned HalfSize = VT.getVectorNumElements()/2;
-
- unsigned FstHalf = 0, SndHalf = 0;
- for (unsigned i = 0; i < HalfSize; ++i) {
- if (SVOp->getMaskElt(i) > 0) {
- FstHalf = SVOp->getMaskElt(i)/HalfSize;
- break;
- }
- }
- for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
- if (SVOp->getMaskElt(i) > 0) {
- SndHalf = SVOp->getMaskElt(i)/HalfSize;
- break;
- }
- }
-
- return (FstHalf | (SndHalf << 4));
-}
-
-// Symetric in-lane mask. Each lane has 4 elements (for imm8)
-static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- if (EltSize < 32)
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
- Imm8 = 0;
- if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
- for (unsigned i = 0; i != NumElts; ++i) {
- if (Mask[i] < 0)
- continue;
- Imm8 |= Mask[i] << (i*2);
- }
- return true;
- }
-
- unsigned LaneSize = 4;
- SmallVector<int, 4> MaskVal(LaneSize, -1);
-
- for (unsigned l = 0; l != NumElts; l += LaneSize) {
- for (unsigned i = 0; i != LaneSize; ++i) {
- if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
- return false;
- if (Mask[i+l] < 0)
- continue;
- if (MaskVal[i] < 0) {
- MaskVal[i] = Mask[i+l] - l;
- Imm8 |= MaskVal[i] << (i*2);
- continue;
- }
- if (Mask[i+l] != (signed)(MaskVal[i]+l))
- return false;
- }
- }
- return true;
-}
-
-/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
-/// Note that VPERMIL mask matching is different depending whether theunderlying
-/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
-/// to the same elements of the low, but to the higher half of the source.
-/// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- if (VT.getSizeInBits() < 256 || EltSize < 32)
- return false;
- bool symetricMaskRequired = (EltSize == 32);
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned LaneSize = NumElts/NumLanes;
- // 2 or 4 elements in one lane
-
- SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
- for (unsigned l = 0; l != NumElts; l += LaneSize) {
- for (unsigned i = 0; i != LaneSize; ++i) {
- if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
- return false;
- if (symetricMaskRequired) {
- if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
- ExpectedMaskVal[i] = Mask[i+l] - l;
- continue;
- }
- if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
- return false;
- }
- }
- }
- return true;
-}
-
-/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
-/// of what x86 movss want. X86 movs requires the lowest element to be lowest
-/// element of vector 2 and the other elements to come from vector 1 in order.
-static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
- bool V2IsSplat = false, bool V2IsUndef = false) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumOps = VT.getVectorNumElements();
- if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
- return false;
-
- if (!isUndefOrEqual(Mask[0], 0))
- return false;
-
- for (unsigned i = 1; i != NumOps; ++i)
- if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
- (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
- (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
- return false;
-
- return true;
-}
-
-/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
-/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
-static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
- const X86Subtarget *Subtarget) {
- if (!Subtarget->hasSSE3())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if ((VT.is128BitVector() && NumElems != 4) ||
- (VT.is256BitVector() && NumElems != 8) ||
- (VT.is512BitVector() && NumElems != 16))
- return false;
-
- // "i+1" is the value the indexed mask element must have
- for (unsigned i = 0; i != NumElems; i += 2)
- if (!isUndefOrEqual(Mask[i], i+1) ||
- !isUndefOrEqual(Mask[i+1], i+1))
- return false;
-
- return true;
-}
-
-/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
-/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
-static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
- const X86Subtarget *Subtarget) {
- if (!Subtarget->hasSSE3())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if ((VT.is128BitVector() && NumElems != 4) ||
- (VT.is256BitVector() && NumElems != 8) ||
- (VT.is512BitVector() && NumElems != 16))
- return false;
-
- // "i" is the value the indexed mask element must have
- for (unsigned i = 0; i != NumElems; i += 2)
- if (!isUndefOrEqual(Mask[i], i) ||
- !isUndefOrEqual(Mask[i+1], i))
- return false;
-
- return true;
-}
-
-/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// version of MOVDDUP.
-static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
- if (!HasFp256 || !VT.is256BitVector())
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
- if (NumElts != 4)
- return false;
-
- for (unsigned i = 0; i != NumElts/2; ++i)
- if (!isUndefOrEqual(Mask[i], 0))
- return false;
- for (unsigned i = NumElts/2; i != NumElts; ++i)
- if (!isUndefOrEqual(Mask[i], NumElts/2))
- return false;
- return true;
-}
-
-/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128-bit
-/// version of MOVDDUP.
-static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned e = VT.getVectorNumElements() / 2;
- for (unsigned i = 0; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
- for (unsigned i = 0; i != e; ++i)
- if (!isUndefOrEqual(Mask[e+i], i))
- return false;
- return true;
-}
-
/// isVEXTRACTIndex - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
/// suitable for instructions that extract 128- or 256-bit vectors
@@ -4754,125 +3974,6 @@ bool X86::isVEXTRACT256Index(SDNode *N) {
return isVEXTRACTIndex(N, 256);
}
-/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
-/// Handles 128-bit and 256-bit.
-static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getSimpleValueType(0);
-
- assert((VT.getSizeInBits() >= 128) &&
- "Unsupported vector type for PSHUF/SHUFP");
-
- // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
- // independently on 128-bit lanes.
- unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
- "Only supports 2, 4 or 8 elements per lane");
-
- unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
- unsigned Mask = 0;
- for (unsigned i = 0; i != NumElts; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt < 0) continue;
- Elt &= NumLaneElts - 1;
- unsigned ShAmt = (i << Shift) % 8;
- Mask |= Elt << ShAmt;
- }
-
- return Mask;
-}
-
-/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
-static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getSimpleValueType(0);
-
- assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
- "Unsupported vector type for PSHUFHW");
-
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned Mask = 0;
- for (unsigned l = 0; l != NumElts; l += 8) {
- // 8 nodes per lane, but we only care about the last 4.
- for (unsigned i = 0; i < 4; ++i) {
- int Elt = N->getMaskElt(l+i+4);
- if (Elt < 0) continue;
- Elt &= 0x3; // only 2-bits.
- Mask |= Elt << (i * 2);
- }
- }
-
- return Mask;
-}
-
-/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
-static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getSimpleValueType(0);
-
- assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
- "Unsupported vector type for PSHUFHW");
-
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned Mask = 0;
- for (unsigned l = 0; l != NumElts; l += 8) {
- // 8 nodes per lane, but we only care about the first 4.
- for (unsigned i = 0; i < 4; ++i) {
- int Elt = N->getMaskElt(l+i);
- if (Elt < 0) continue;
- Elt &= 0x3; // only 2-bits
- Mask |= Elt << (i * 2);
- }
- }
-
- return Mask;
-}
-
-/// \brief Return the appropriate immediate to shuffle the specified
-/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
-/// VALIGN (if Interlane is true) instructions.
-static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
- bool InterLane) {
- MVT VT = SVOp->getSimpleValueType(0);
- unsigned EltSize = InterLane ? 1 :
- VT.getVectorElementType().getSizeInBits() >> 3;
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- int Val = 0;
- unsigned i;
- for (i = 0; i != NumElts; ++i) {
- Val = SVOp->getMaskElt(i);
- if (Val >= 0)
- break;
- }
- if (Val >= (int)NumElts)
- Val -= NumElts - NumLaneElts;
-
- assert(Val - i > 0 && "PALIGNR imm should be positive");
- return (Val - i) * EltSize;
-}
-
-/// \brief Return the appropriate immediate to shuffle the specified
-/// VECTOR_SHUFFLE mask with the PALIGNR instruction.
-static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
- return getShuffleAlignrImmediate(SVOp, false);
-}
-
-/// \brief Return the appropriate immediate to shuffle the specified
-/// VECTOR_SHUFFLE mask with the VALIGN instruction.
-static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
- return getShuffleAlignrImmediate(SVOp, true);
-}
-
-
static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
@@ -4947,119 +4048,6 @@ bool X86::isZeroNode(SDValue Elt) {
return false;
}
-/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
-/// match movhlps. The lower half elements should come from upper half of
-/// V1 (and in order), and the upper half elements should come from the upper
-/// half of V2 (and in order).
-static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
- if (VT.getVectorNumElements() != 4)
- return false;
- for (unsigned i = 0, e = 2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i+2))
- return false;
- for (unsigned i = 2; i != 4; ++i)
- if (!isUndefOrEqual(Mask[i], i+4))
- return false;
- return true;
-}
-
-/// isScalarLoadToVector - Returns true if the node is a scalar load that
-/// is promoted to a vector. It also returns the LoadSDNode by reference if
-/// required.
-static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
- if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
- return false;
- N = N->getOperand(0).getNode();
- if (!ISD::isNON_EXTLoad(N))
- return false;
- if (LD)
- *LD = cast<LoadSDNode>(N);
- return true;
-}
-
-// Test whether the given value is a vector value which will be legalized
-// into a load.
-static bool WillBeConstantPoolLoad(SDNode *N) {
- if (N->getOpcode() != ISD::BUILD_VECTOR)
- return false;
-
- // Check for any non-constant elements.
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
- switch (N->getOperand(i).getNode()->getOpcode()) {
- case ISD::UNDEF:
- case ISD::ConstantFP:
- case ISD::Constant:
- break;
- default:
- return false;
- }
-
- // Vectors of all-zeros and all-ones are materialized with special
- // instructions rather than being loaded.
- return !ISD::isBuildVectorAllZeros(N) &&
- !ISD::isBuildVectorAllOnes(N);
-}
-
-/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
-/// match movlp{s|d}. The lower half elements should come from lower half of
-/// V1 (and in order), and the upper half elements should come from the upper
-/// half of V2 (and in order). And since V1 will become the source of the
-/// MOVLP, it must be either a vector load or a scalar load to vector.
-static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
- ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
- return false;
- // Is V2 is a vector load, don't do this transformation. We will try to use
- // load folding shufps op.
- if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 2 && NumElems != 4)
- return false;
- for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
- for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i+NumElems))
- return false;
- return true;
-}
-
-/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
-/// to an zero vector.
-/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
-static bool isZeroShuffle(ShuffleVectorSDNode *N) {
- SDValue V1 = N->getOperand(0);
- SDValue V2 = N->getOperand(1);
- unsigned NumElems = N->getValueType(0).getVectorNumElements();
- for (unsigned i = 0; i != NumElems; ++i) {
- int Idx = N->getMaskElt(i);
- if (Idx >= (int)NumElems) {
- unsigned Opc = V2.getOpcode();
- if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
- continue;
- if (Opc != ISD::BUILD_VECTOR ||
- !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
- return false;
- } else if (Idx >= 0) {
- unsigned Opc = V1.getOpcode();
- if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
- continue;
- if (Opc != ISD::BUILD_VECTOR ||
- !X86::isZeroNode(V1.getOperand(Idx)))
- return false;
- }
- }
- return true;
-}
-
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
@@ -5131,16 +4119,6 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
-/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
-/// that point to V2 points to its first element.
-static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
- for (unsigned i = 0; i != NumElems; ++i) {
- if (Mask[i] > (int)NumElems) {
- Mask[i] = NumElems;
- }
- }
-}
-
/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
@@ -5177,92 +4155,6 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
-// a generic shuffle instruction because the target has no such instructions.
-// Generate shuffles which repeat i16 and i8 several times until they can be
-// represented by v4f32 and then be manipulated by target suported shuffles.
-static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
- MVT VT = V.getSimpleValueType();
- int NumElems = VT.getVectorNumElements();
- SDLoc dl(V);
-
- while (NumElems > 4) {
- if (EltNo < NumElems/2) {
- V = getUnpackl(DAG, dl, VT, V, V);
- } else {
- V = getUnpackh(DAG, dl, VT, V, V);
- EltNo -= NumElems/2;
- }
- NumElems >>= 1;
- }
- return V;
-}
-
-/// getLegalSplat - Generate a legal splat with supported x86 shuffles
-static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
- MVT VT = V.getSimpleValueType();
- SDLoc dl(V);
-
- if (VT.is128BitVector()) {
- V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
- int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
- V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
- &SplatMask[0]);
- } else if (VT.is256BitVector()) {
- // To use VPERMILPS to splat scalars, the second half of indicies must
- // refer to the higher part, which is a duplication of the lower one,
- // because VPERMILPS can only handle in-lane permutations.
- int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
- EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
-
- V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
- V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
- &SplatMask[0]);
- } else
- llvm_unreachable("Vector size not supported");
-
- return DAG.getNode(ISD::BITCAST, dl, VT, V);
-}
-
-/// PromoteSplat - Splat is promoted to target supported vector shuffles.
-static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
- MVT SrcVT = SV->getSimpleValueType(0);
- SDValue V1 = SV->getOperand(0);
- SDLoc dl(SV);
-
- int EltNo = SV->getSplatIndex();
- int NumElems = SrcVT.getVectorNumElements();
- bool Is256BitVec = SrcVT.is256BitVector();
-
- assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
- "Unknown how to promote splat for type");
-
- // Extract the 128-bit part containing the splat element and update
- // the splat element index when it refers to the higher register.
- if (Is256BitVec) {
- V1 = Extract128BitVector(V1, EltNo, DAG, dl);
- if (EltNo >= NumElems/2)
- EltNo -= NumElems/2;
- }
-
- // All i16 and i8 vector types can't be used directly by a generic shuffle
- // instruction because the target has no such instruction. Generate shuffles
- // which repeat i16 and i8 several times until they fit in i32, and then can
- // be manipulated by target suported shuffles.
- MVT EltVT = SrcVT.getVectorElementType();
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- V1 = PromoteSplati8i16(V1, DAG, EltNo);
-
- // Recreate the 256-bit vector and place the same 128-bit vector
- // into the low and high part. This is necessary because we want
- // to use VPERM* to shuffle the vectors
- if (Is256BitVec) {
- V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
- }
-
- return getLegalSplat(DAG, V1, EltNo);
-}
-
/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector of zero or undef vector. This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
@@ -5394,13 +4286,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
return false;
if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
- // FIXME: Support AVX-512 here.
- Type *Ty = C->getType();
- if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
- Ty->getVectorNumElements() != 32))
- return false;
-
DecodePSHUFBMask(C, Mask);
+ if (Mask.empty())
+ return false;
break;
}
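// Note: DecodePSHUFBMask presumably leaves the mask empty when the constant
// cannot be decoded, so an empty mask here simply means the PSHUFB control
// value is unknown and the lookup fails.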
@@ -5412,16 +4300,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
IsUnary = true;
break;
case X86ISD::MOVSS:
- case X86ISD::MOVSD: {
- // The index 0 always comes from the first element of the second source,
- // this is why MOVSS and MOVSD are used in the first place. The other
- // elements come from the other positions of the first source vector
- Mask.push_back(NumElems);
- for (unsigned i = 1; i != NumElems; ++i) {
- Mask.push_back(i);
- }
+ case X86ISD::MOVSD:
+ DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
break;
- }
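// Note: the decoded scalar-move mask is <NumElts, 1, 2, ...>: element 0 is
// taken from the first element of the second source and the remaining
// elements pass through from the first source in order.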
case X86ISD::VPERM2X128:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
@@ -5429,11 +4310,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
break;
case X86ISD::MOVSLDUP:
DecodeMOVSLDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVSHDUP:
DecodeMOVSHDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVDDUP:
+ DecodeMOVDDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
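// Background: these decode to unary masks: MOVSLDUP duplicates the even
// elements (<0,0,2,2,...>), MOVSHDUP the odd elements (<1,1,3,3,...>), and
// MOVDDUP the low f64 of each 128-bit lane, so all three read only their
// first operand, hence IsUnary.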
case X86ISD::MOVLHPD:
case X86ISD::MOVLPD:
case X86ISD::MOVLPS:
@@ -5517,148 +4403,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
return SDValue();
}
-/// getNumOfConsecutiveZeros - Return the number of elements of a vector
-/// shuffle operation which come from a consecutively from a zero. The
-/// search can start in two different directions, from left or right.
-/// We count undefs as zeros until PreferredNum is reached.
-static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
- unsigned NumElems, bool ZerosFromLeft,
- SelectionDAG &DAG,
- unsigned PreferredNum = -1U) {
- unsigned NumZeros = 0;
- for (unsigned i = 0; i != NumElems; ++i) {
- unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
- SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
- if (!Elt.getNode())
- break;
-
- if (X86::isZeroNode(Elt))
- ++NumZeros;
- else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
- NumZeros = std::min(NumZeros + 1, PreferredNum);
- else
- break;
- }
-
- return NumZeros;
-}
-
-/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
-/// correspond consecutively to elements from one of the vector operands,
-/// starting from its index OpIdx. Also tell OpNum which source vector operand.
-static
-bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
- unsigned MaskI, unsigned MaskE, unsigned OpIdx,
- unsigned NumElems, unsigned &OpNum) {
- bool SeenV1 = false;
- bool SeenV2 = false;
-
- for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
- int Idx = SVOp->getMaskElt(i);
- // Ignore undef indicies
- if (Idx < 0)
- continue;
-
- if (Idx < (int)NumElems)
- SeenV1 = true;
- else
- SeenV2 = true;
-
- // Only accept consecutive elements from the same vector
- if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
- return false;
- }
-
- OpNum = SeenV1 ? 0 : 1;
- return true;
-}
-
-/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
-/// logical left shift of a vector.
-static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
- bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
- unsigned NumElems =
- SVOp->getSimpleValueType(0).getVectorNumElements();
- unsigned NumZeros = getNumOfConsecutiveZeros(
- SVOp, NumElems, false /* check zeros from right */, DAG,
- SVOp->getMaskElt(0));
- unsigned OpSrc;
-
- if (!NumZeros)
- return false;
-
- // Considering the elements in the mask that are not consecutive zeros,
- // check if they consecutively come from only one of the source vectors.
- //
- // V1 = {X, A, B, C} 0
- // \ \ \ /
- // vector_shuffle V1, V2 <1, 2, 3, X>
- //
- if (!isShuffleMaskConsecutive(SVOp,
- 0, // Mask Start Index
- NumElems-NumZeros, // Mask End Index(exclusive)
- NumZeros, // Where to start looking in the src vector
- NumElems, // Number of elements in vector
- OpSrc)) // Which source operand ?
- return false;
-
- isLeft = false;
- ShAmt = NumZeros;
- ShVal = SVOp->getOperand(OpSrc);
- return true;
-}
-
-/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
-/// logical left shift of a vector.
-static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
- bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
- unsigned NumElems =
- SVOp->getSimpleValueType(0).getVectorNumElements();
- unsigned NumZeros = getNumOfConsecutiveZeros(
- SVOp, NumElems, true /* check zeros from left */, DAG,
- NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
- unsigned OpSrc;
-
- if (!NumZeros)
- return false;
-
- // Considering the elements in the mask that are not consecutive zeros,
- // check if they consecutively come from only one of the source vectors.
- //
- // 0 { A, B, X, X } = V2
- // / \ / /
- // vector_shuffle V1, V2 <X, X, 4, 5>
- //
- if (!isShuffleMaskConsecutive(SVOp,
- NumZeros, // Mask Start Index
- NumElems, // Mask End Index(exclusive)
- 0, // Where to start looking in the src vector
- NumElems, // Number of elements in vector
- OpSrc)) // Which source operand ?
- return false;
-
- isLeft = true;
- ShAmt = NumZeros;
- ShVal = SVOp->getOperand(OpSrc);
- return true;
-}
-
-/// isVectorShift - Returns true if the shuffle can be implemented as a
-/// logical left or right shift of a vector.
-static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
- bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
- // Although the logic below support any bitwidth size, there are no
- // shift instructions which handle more than 128-bit vectors.
- if (!SVOp->getSimpleValueType(0).is128BitVector())
- return false;
-
- if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
- isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
- return true;
-
- return false;
-}
-
/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
@@ -5744,19 +4488,19 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget,
const TargetLowering &TLI) {
// Find all zeroable elements.
- bool Zeroable[4];
+ std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
SDValue Elt = Op->getOperand(i);
Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
}
- assert(std::count_if(&Zeroable[0], &Zeroable[4],
- [](bool M) { return !M; }) > 1 &&
+ assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
// We only know how to deal with build_vector nodes where elements are either
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
- for (int i=0; i < 4; ++i) {
+ unsigned FirstNonZeroIdx;
+ for (unsigned i=0; i < 4; ++i) {
if (Zeroable[i])
continue;
SDValue Elt = Op->getOperand(i);
@@ -5767,8 +4511,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
MVT VT = Elt.getOperand(0).getSimpleValueType();
if (!VT.is128BitVector())
return SDValue();
- if (!FirstNonZero.getNode())
+ if (!FirstNonZero.getNode()) {
FirstNonZero = Elt;
+ FirstNonZeroIdx = i;
+ }
}
assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
@@ -5807,14 +4553,14 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
return SDValue();
SDValue V2 = Elt.getOperand(0);
- if (Elt == FirstNonZero)
+ if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
V1 = SDValue();
bool CanFold = true;
for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
if (Zeroable[i])
continue;
-
+
SDValue Current = Op->getOperand(i);
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
@@ -5833,10 +4579,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
- unsigned ZMask = 0;
- for (int i = 0; i < 4; ++i)
- if (Zeroable[i])
- ZMask |= 1 << i;
+ unsigned ZMask = Zeroable.to_ulong();
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
@@ -5845,19 +4588,19 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
}
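// Note: the INSERTPS immediate packs three fields: bits [7:6] select the
// source element (EltMaskIdx), bits [5:4] select the destination lane
// (EltIdx), and bits [3:0] form the zero mask, which here is simply the low
// four bits of the Zeroable bitset (Zeroable.to_ulong()).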
-/// getVShift - Return a vector logical shift node.
-///
+/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, SDLoc dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
- EVT ShVT = MVT::v2i64;
+ MVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+ MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
+ assert(NumBits % 8 == 0 && "Only support byte sized shifts");
+ SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(Opc, dl, ShVT, SrcOp,
- DAG.getConstant(NumBits,
- TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
+ DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
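// Note: VSHLDQ/VSRLDQ map to PSLLDQ/PSRLDQ, which shift the full 128-bit
// register by a byte count, so the bit amount supplied by callers is
// converted to bytes (NumBits / 8) before building the node.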
static SDValue
@@ -5924,9 +4667,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, false, 0);
- SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != NumElems; ++i)
- Mask.push_back(EltNo);
+ SmallVector<int, 8> Mask(NumElems, EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
}
@@ -5934,19 +4675,18 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
return SDValue();
}
-/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
-/// vector of type 'VT', see if the elements can be replaced by a single large
-/// load which has the same value as a build_vector whose operands are 'elts'.
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
-static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {
- EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
LoadSDNode *LDBase = nullptr;
@@ -5957,7 +4697,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
// non-consecutive, bail out.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Elts[i];
-
+ // Look through a bitcast.
+ if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+ Elt = Elt.getOperand(0);
if (!Elt.getNode() ||
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
return SDValue();
@@ -5972,7 +4714,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
continue;
LoadSDNode *LD = cast<LoadSDNode>(Elt);
- if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+ EVT LdVT = Elt.getValueType();
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+ return SDValue();
+ if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
return SDValue();
LastLoadedElt = i;
}
@@ -5981,6 +4728,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ EVT EltVT = LDBase->getValueType(0);
+ // Ensure that the input vector size for the merged loads matches the
+ // cumulative size of the input elements.
+ if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+ return SDValue();
if (isAfterLegalize &&
!DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
@@ -5988,15 +4741,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
SDValue NewLd = SDValue();
- if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
- NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), 0);
NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), LDBase->getAlignment());
+ LDBase->getPointerInfo(), LDBase->isVolatile(),
+ LDBase->isNonTemporal(), LDBase->isInvariant(),
+ LDBase->getAlignment());
if (LDBase->hasAnyUseOfValue(1)) {
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
@@ -6009,7 +4757,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
return NewLd;
}
- if (NumElems == 4 && LastLoadedElt == 1 &&
+
+  // TODO: The code below fires only for loading the low v2i32 / v2f32
+  // of a v4i32 / v4f32. It's probably worth generalizing.
+ EVT EltVT = VT.getVectorElementType();
+ if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
@@ -6134,8 +4886,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
const Function *F = DAG.getMachineFunction().getFunction();
- bool OptForSize = F->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -6183,7 +4934,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
if (!IsLoad)
return SDValue();
- if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (Subtarget->hasVLX() && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
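// Note: AVX-512VL extends the EVEX broadcast forms to 128-bit and 256-bit
// vectors, so with VLX a 64-bit scalar load can be broadcast directly even
// when the destination is narrower than 256 bits.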
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
@@ -6339,8 +5091,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
AllContants = false;
NonConstIdx = idx;
NumNonConsts++;
- }
- else {
+ } else {
NumConsts++;
if (cast<ConstantSDNode>(In)->getZExtValue())
Immediate |= (1ULL << idx);
@@ -6363,7 +5114,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
MVT::getIntegerVT(VT.getSizeInBits()));
DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
}
- else
+ else
DstVec = DAG.getUNDEF(VT);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
Op.getOperand(NonConstIdx),
@@ -6386,7 +5137,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
/// \brief Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
-///
+///
/// This is a helper function of PerformBUILD_VECTORCombine.
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
@@ -6407,7 +5158,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
"Invalid Vector in input!");
-
+
bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
bool CanFold = true;
unsigned ExpectedVExtractIdx = BaseIdx;
@@ -6476,13 +5227,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
}
/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
-/// a concat_vector.
+/// a concat_vector.
///
/// This is a helper function of PerformBUILD_VECTORCombine.
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
-/// horizontal binary operations.
+/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
@@ -6566,7 +5317,7 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
bool AddFound = false;
bool SubFound = false;
- for (unsigned i = 0, e = NumElts; i != e; i++) {
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
@@ -6676,18 +5427,18 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
// Try to match an SSE3 float HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
-
+
if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
} else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
// Try to match an SSSE3 integer HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
-
+
if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
}
-
+
if (!Subtarget->hasAVX())
return SDValue();
@@ -6738,7 +5489,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
// Do this only if the target has AVX2.
if (Subtarget->hasAVX2())
return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
-
+
// Do not try to expand this build_vector into a pair of horizontal
// add/sub if we can emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
@@ -6863,32 +5614,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
EVT VecVT = MVT::v4i32;
- unsigned VecElts = 4;
// Truncate the value (which may itself be a constant) to i32, and
// convert it to a vector with movd (S2V+shuffle to zero extend).
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
-
- // If using the new shuffle lowering, just directly insert this.
- if (ExperimentalVectorShuffleLowering)
- return DAG.getNode(
- ISD::BITCAST, dl, VT,
- getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
-
- Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
-
- // Now we have our 32-bit value zero extended in the low element of
- // a vector. If Idx != 0, swizzle it into place.
- if (Idx != 0) {
- SmallVector<int, 4> Mask;
- Mask.push_back(Idx);
- for (unsigned i = 1; i != VecElts; ++i)
- Mask.push_back(i);
- Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
- &Mask[0]);
- }
- return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+ return DAG.getNode(
+ ISD::BITCAST, dl, VT,
+ getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
}
}
@@ -6948,17 +5681,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
-
- // If using the new shuffle lowering, just directly insert this.
- if (ExperimentalVectorShuffleLowering)
- return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
-
- // Turn it into a shuffle of zero and zero-extended scalar to vector.
- Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
- SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i != NumElems; ++i)
- MaskVec.push_back(i == Idx ? 0 : 1);
- return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+ return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
}
}
@@ -6982,12 +5705,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
- // For AVX-length vectors, build the individual 128-bit pieces and use
+ // For AVX-length vectors, see if we can use a vector load to get all of the
+ // elements, otherwise build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.is256BitVector() || VT.is512BitVector()) {
- SmallVector<SDValue, 64> V;
- for (unsigned i = 0; i != NumElems; ++i)
- V.push_back(Op.getOperand(i));
+ SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
+
+ // Check for a build vector of consecutive loads.
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
+ return LD;
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
@@ -7091,7 +5817,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41()) {
+ if (Subtarget->hasSSE41()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -7271,38 +5997,40 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return true;
}
-// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
-// 2013 will allow us to use it as a non-type template parameter.
-namespace {
-
-/// \brief Implementation of the \c isShuffleEquivalent variadic functor.
-///
-/// See its documentation for details.
-bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
- if (Mask.size() != Args.size())
- return false;
- for (int i = 0, e = Mask.size(); i < e; ++i) {
- assert(*Args[i] >= 0 && "Arguments must be positive integers!");
- if (Mask[i] != -1 && Mask[i] != *Args[i])
- return false;
- }
- return true;
-}
-
-} // namespace
-
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
-/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
+/// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
-static const VariadicFunction1<
- bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
+static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ ArrayRef<int> ExpectedMask) {
+ if (Mask.size() != ExpectedMask.size())
+ return false;
+
+ int Size = Mask.size();
+
+ // If the values are build vectors, we can look through them to find
+ // equivalent inputs that make the shuffles equivalent.
+ auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
+ auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
+ auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+ auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+ if (!MaskBV || !ExpectedBV ||
+ MaskBV->getOperand(Mask[i] % Size) !=
+ ExpectedBV->getOperand(ExpectedMask[i] % Size))
+ return false;
+ }
+
+ return true;
+}
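Editorial aside (not part of the patch): ignoring the build_vector look-through, the core comparison the rewritten isShuffleEquivalent performs can be modelled in a few lines of standalone C++; the helper name below is made up for illustration.

#include <cassert>
#include <vector>

// Sketch of the mask-equivalence test: a mask entry of -1 (undef) matches
// anything, every other entry must match the expected value exactly.
static bool masksEquivalent(const std::vector<int> &Mask,
                            const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (size_t i = 0, e = Mask.size(); i != e; ++i)
    if (Mask[i] != -1 && Mask[i] != Expected[i])
      return false;
  return true;
}

int main() {
  assert(masksEquivalent({-1, 2, 1, -1}, {3, 2, 1, 0})); // undefs match anything
  assert(!masksEquivalent({0, 2, 1, 0}, {3, 2, 1, 0}));  // element 0 differs
  return 0;
}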
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
@@ -7328,6 +6056,37 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
return DAG.getConstant(Imm, MVT::i8);
}
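Editorial note (not part of the patch): the immediate returned by getV4X86ShuffleImm8ForMask packs one 2-bit selector per lane, lane i occupying bits [2*i+1 : 2*i]. A standalone sketch of that packing, which simplifies undef lanes to selector 0:

#include <array>
#include <cassert>

// Pack a 4-lane shuffle mask into the PSHUFD/SHUFPS/VPERMILPS immediate.
// Undef (-1) lanes may pick any selector; this sketch defaults them to 0.
static unsigned shuffleImm8(std::array<int, 4> Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i] == -1 ? 0 : Mask[i];
    assert(M >= 0 && M < 4 && "Selector out of range");
    Imm |= static_cast<unsigned>(M) << (2 * i);
  }
  return Imm;
}

int main() {
  assert(shuffleImm8({0, 1, 2, 3}) == 0xE4); // identity mask
  assert(shuffleImm8({3, 2, 1, 0}) == 0x1B); // reversed mask
  return 0;
}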
+/// \brief Try to emit a blend instruction for a shuffle using bit math.
+///
+/// This is used as a fallback approach when first class blend instructions are
+/// unavailable. Currently it is only suitable for integer vectors, but could
+/// be generalized for floating point vectors if desirable.
+static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT.isInteger() && "Only supports integer vector types!");
+ MVT EltVT = VT.getScalarType();
+ int NumEltBits = EltVT.getSizeInBits();
+ SDValue Zero = DAG.getConstant(0, EltVT);
+ SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
+ SmallVector<SDValue, 16> MaskOps;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
+ return SDValue(); // Shuffled input!
+ MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
+ }
+
+ SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
+ V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
+ // We have to cast V2 around.
+ MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ V2 = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
+ DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
+ DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
+ return DAG.getNode(ISD::OR, DL, VT, V1, V2);
+}
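Editorial note (not part of the patch): the bit-blend fallback above reduces to (V1 & M) | (V2 & ~M) with a per-element mask M that is all-ones for lanes taken from V1. A scalar model of that identity, with illustrative names:

#include <cassert>
#include <cstdint>
#include <vector>

// Scalar model of the bit-blend fallback: M is all-ones where the shuffle
// takes the lane from V1 and zero where it takes it from V2, then the result
// is (V1 & M) | (V2 & ~M). Undef (-1) lanes default to V1 here, matching the
// "Mask[i] < Size" test used above.
static std::vector<uint32_t> bitBlend(const std::vector<uint32_t> &V1,
                                      const std::vector<uint32_t> &V2,
                                      const std::vector<int> &Mask) {
  int Size = static_cast<int>(Mask.size());
  std::vector<uint32_t> Out(Mask.size());
  for (int i = 0; i < Size; ++i) {
    // Only pure blends are representable: lane i of either input, or undef.
    assert(Mask[i] == -1 || Mask[i] == i || Mask[i] == i + Size);
    uint32_t M = Mask[i] < Size ? ~0u : 0u;
    Out[i] = (V1[i] & M) | (V2[i] & ~M);
  }
  return Out;
}

int main() {
  std::vector<uint32_t> A = {1, 2, 3, 4}, B = {5, 6, 7, 8};
  std::vector<uint32_t> R = bitBlend(A, B, {0, 5, 2, 7}); // A, B, A, B
  assert(R[0] == 1 && R[1] == 6 && R[2] == 3 && R[3] == 8);
  return 0;
}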
+
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
@@ -7338,7 +6097,6 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
-
unsigned BlendMask = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= Size) {
@@ -7415,11 +6173,17 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
}
}
// FALLTHROUGH
+ case MVT::v16i8:
case MVT::v32i8: {
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
+ "256-bit byte-blends require AVX2 support!");
+
// Scale the blend by the number of bytes per element.
- int Scale = VT.getScalarSizeInBits() / 8;
- assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+ int Scale = VT.getScalarSizeInBits() / 8;
+
+ // This form of blend is always done on bytes. Compute the byte vector
+ // type.
+ MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
@@ -7432,19 +6196,19 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
- SDValue VSELECTMask[32];
+ SmallVector<SDValue, 32> VSELECTMask;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
- VSELECTMask[Scale * i + j] =
+ VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
- : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
+ : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+ V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
return DAG.getNode(
ISD::BITCAST, DL, VT,
- DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
+ DAG.getNode(ISD::VSELECT, DL, BlendVT,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
V1, V2));
}
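Editorial note (not part of the patch): the VSELECT mask built above replicates each element-level selector across the bytes of that element. A small model of that expansion follows; unlike the DAG code, which emits undef bytes for undef lanes, this sketch simply defaults undef lanes to V1.

#include <cassert>
#include <cstdint>
#include <vector>

// Expand an element-level blend mask to one selector byte per element byte:
// 0xFF selects the byte from V1, 0x00 selects it from V2.
static std::vector<uint8_t> expandBlendMask(const std::vector<int> &Mask,
                                            int Scale) {
  std::vector<uint8_t> Bytes;
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i)
    for (int j = 0; j < Scale; ++j)
      Bytes.push_back(Mask[i] < Size ? 0xFF : 0x00);
  return Bytes;
}

int main() {
  // A v8i16 blend mask {0, 9, 2, 11, 4, 13, 6, 15} scaled by 2 bytes/element.
  std::vector<uint8_t> Bytes = expandBlendMask({0, 9, 2, 11, 4, 13, 6, 15}, 2);
  assert(Bytes.size() == 16 && Bytes[0] == 0xFF && Bytes[2] == 0x00);
  return 0;
}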
@@ -7453,12 +6217,45 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
}
}
-/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
-/// unblended shuffles followed by an unshuffled blend.
+/// \brief Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
+static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // We build up the blend mask while checking whether a blend is a viable way
+ // to reduce the shuffle.
+ SmallVector<int, 32> BlendMask(Mask.size(), -1);
+ SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+ if (BlendMask[Mask[i] % Size] == -1)
+ BlendMask[Mask[i] % Size] = Mask[i];
+ else if (BlendMask[Mask[i] % Size] != Mask[i])
+ return SDValue(); // Can't blend in the needed input!
+
+ PermuteMask[i] = Mask[i] % Size;
+ }
+
+ SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
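Editorial note (not part of the patch): at the mask level, the decomposition above first blends so that lane (M % Size) of an intermediate vector holds element M, then permutes lanes of that single intermediate into their final positions. A standalone sketch with illustrative names:

#include <cassert>
#include <optional>
#include <vector>

struct BlendPermute {
  std::vector<int> Blend, Permute;
};

// Split a two-input shuffle mask into a lane-aligned blend mask followed by a
// single-input permute mask, or fail if both inputs need the same blend lane.
static std::optional<BlendPermute> decompose(const std::vector<int> &Mask) {
  int Size = static_cast<int>(Mask.size());
  BlendPermute BP{std::vector<int>(Size, -1), std::vector<int>(Size, -1)};
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;
    if (BP.Blend[Slot] == -1)
      BP.Blend[Slot] = Mask[i];
    else if (BP.Blend[Slot] != Mask[i])
      return std::nullopt; // Both inputs want the same blend lane.
    BP.Permute[i] = Slot;
  }
  return BP;
}

int main() {
  // {4, 1, 6, 3} blends lane-aligned elements and permutes trivially.
  auto BP = decompose({4, 1, 6, 3});
  assert(BP && BP->Blend[0] == 4 && BP->Permute[0] == 0);
  // {0, 4, 2, 3} fails: elements 0 and 4 both want blend slot 0.
  assert(!decompose({0, 4, 2, 3}));
  return 0;
}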
+
+/// \brief Generic routine to decompose a shuffle and blend into independent
+/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
-/// operations.
+/// operations. It will try to pick the best arrangement of shuffles and
+/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
SDValue V1,
SDValue V2,
@@ -7478,6 +6275,16 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
BlendMask[i] = i + Size;
}
+ // Try to lower with the simpler initial blend strategy unless one of the
+ // input shuffles would be a no-op. We prefer to shuffle inputs as the
+ // shuffle may be able to fold with a load or other benefit. However, when
+ // we'll have to do 2x as many shuffles in order to achieve this, blending
+ // first is a better strategy.
+ if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
+ if (SDValue BlendPerm =
+ lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ return BlendPerm;
+
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
@@ -7492,15 +6299,13 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
-///
+///
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
-///
+///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-///
-/// Note that this only handles 128-bit vector widths currently.
static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SDValue V2,
ArrayRef<int> Mask,
@@ -7508,6 +6313,10 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ int NumElts = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+
// We need to detect various ways of spelling a rotation:
// [11, 12, 13, 14, 15, 0, 1, 2]
// [-1, 12, 13, 14, -1, -1, 1, -1]
@@ -7517,44 +6326,52 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
// [-1, 4, 5, 6, -1, -1, -1, -1]
int Rotation = 0;
SDValue Lo, Hi;
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] == -1)
- continue;
- assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
+ for (int l = 0; l < NumElts; l += NumLaneElts) {
+ for (int i = 0; i < NumLaneElts; ++i) {
+ if (Mask[l + i] == -1)
+ continue;
+ assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
- // Based on the mod-Size value of this mask element determine where
- // a rotated vector would have started.
- int StartIdx = i - (Mask[i] % Size);
- if (StartIdx == 0)
- // The identity rotation isn't interesting, stop.
- return SDValue();
+ // Get the mod-Size index and lane correct it.
+ int LaneIdx = (Mask[l + i] % NumElts) - l;
+ // Make sure it was in this lane.
+ if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
+ return SDValue();
- // If we found the tail of a vector the rotation must be the missing
- // front. If we found the head of a vector, it must be how much of the head.
- int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
+ // Determine where a rotated vector would have started.
+ int StartIdx = i - LaneIdx;
+ if (StartIdx == 0)
+ // The identity rotation isn't interesting, stop.
+ return SDValue();
- if (Rotation == 0)
- Rotation = CandidateRotation;
- else if (Rotation != CandidateRotation)
- // The rotations don't match, so we can't match this mask.
- return SDValue();
+ // If we found the tail of a vector the rotation must be the missing
+ // front. If we found the head of a vector, it must be how much of the
+ // head.
+ int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
- // Compute which value this mask is pointing at.
- SDValue MaskV = Mask[i] < Size ? V1 : V2;
-
- // Compute which of the two target values this index should be assigned to.
- // This reflects whether the high elements are remaining or the low elements
- // are remaining.
- SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
-
- // Either set up this value if we've not encountered it before, or check
- // that it remains consistent.
- if (!TargetV)
- TargetV = MaskV;
- else if (TargetV != MaskV)
- // This may be a rotation, but it pulls from the inputs in some
- // unsupported interleaving.
- return SDValue();
+ if (Rotation == 0)
+ Rotation = CandidateRotation;
+ else if (Rotation != CandidateRotation)
+ // The rotations don't match, so we can't match this mask.
+ return SDValue();
+
+ // Compute which value this mask is pointing at.
+ SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
+
+ // Compute which of the two target values this index should be assigned
+ // to. This reflects whether the high elements are remaining or the low
+ // elements are remaining.
+ SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+ // Either set up this value if we've not encountered it before, or check
+ // that it remains consistent.
+ if (!TargetV)
+ TargetV = MaskV;
+ else if (TargetV != MaskV)
+ // This may be a rotation, but it pulls from the inputs in some
+ // unsupported interleaving.
+ return SDValue();
+ }
}
// Check that we successfully analyzed the mask, and normalize the results.
@@ -7565,26 +6382,27 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
else if (!Hi)
Hi = Lo;
- assert(VT.getSizeInBits() == 128 &&
- "Rotate-based lowering only supports 128-bit lowering!");
- assert(Mask.size() <= 16 &&
- "Can shuffle at most 16 bytes in a 128-bit vector!");
-
// The actual rotate instruction rotates bytes, so we need to scale the
- // rotation based on how many bytes are in the vector.
- int Scale = 16 / Mask.size();
+ // rotation based on how many bytes are in the vector lane.
+ int Scale = 16 / NumLaneElts;
- // SSSE3 targets can use the palignr instruction
+ // SSSE3 targets can use the palignr instruction.
if (Subtarget->hasSSSE3()) {
- // Cast the inputs to v16i8 to match PALIGNR.
- Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
- Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
+ // Cast the inputs to i8 vector of correct length to match PALIGNR.
+ MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
+ Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
+ DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
DAG.getConstant(Rotation * Scale, MVT::i8)));
}
+ assert(VT.getSizeInBits() == 128 &&
+ "Rotate-based lowering only supports 128-bit lowering!");
+ assert(Mask.size() <= 16 &&
+ "Can shuffle at most 16 bytes in a 128-bit vector!");
+
// Default SSE2 implementation
int LoByteShift = 16 - Rotation * Scale;
int HiByteShift = Rotation * Scale;
@@ -7594,9 +6412,9 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
- DAG.getConstant(8 * LoByteShift, MVT::i8));
+ DAG.getConstant(LoByteShift, MVT::i8));
SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
- DAG.getConstant(8 * HiByteShift, MVT::i8));
+ DAG.getConstant(HiByteShift, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
}
@@ -7613,6 +6431,11 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue V1, SDValue V2) {
SmallBitVector Zeroable(Mask.size(), false);
+ while (V1.getOpcode() == ISD::BITCAST)
+ V1 = V1->getOperand(0);
+ while (V2.getOpcode() == ISD::BITCAST)
+ V2 = V2->getOperand(0);
+
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
@@ -7624,10 +6447,10 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
continue;
}
- // If this is an index into a build_vector node, dig out the input value and
- // use it.
+ // If this is an index into a build_vector node (which has the same number
+ // of elements), dig out the input value and use it.
SDValue V = M < Size ? V1 : V2;
- if (V.getOpcode() != ISD::BUILD_VECTOR)
+ if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
continue;
SDValue Input = V.getOperand(M % Size);
@@ -7640,85 +6463,133 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
return Zeroable;
}
-/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
-///
-/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
-/// byte-shift instructions. The mask must consist of a shifted sequential
-/// shuffle from one of the input vectors and zeroable elements for the
-/// remaining 'shifted in' elements.
+/// \brief Try to emit a bitmask instruction for a shuffle.
///
-/// Note that this only handles 128-bit vector widths currently.
-static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
- assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ MVT EltVT = VT.getScalarType();
+ int NumEltBits = EltVT.getSizeInBits();
+ MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+ SDValue Zero = DAG.getConstant(0, IntEltVT);
+ SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
+ if (EltVT.isFloatingPoint()) {
+ Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
+ AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
+ }
+ SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Zeroable[i])
+ continue;
+ if (Mask[i] % Size != i)
+ return SDValue(); // Not a blend.
+ if (!V)
+ V = Mask[i] < Size ? V1 : V2;
+ else if (V != (Mask[i] < Size ? V1 : V2))
+ return SDValue(); // Can only let one input through the mask.
+
+ VMaskOps[i] = AllOnes;
+ }
+ if (!V)
+ return SDValue(); // No non-zeroable elements!
+
+ SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+ V = DAG.getNode(VT.isFloatingPoint()
+ ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
+ DL, VT, V, VMask);
+ return V;
+}
+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
+/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
+/// matches elements from one of the input vectors shuffled to the left or
+/// right with zeroable elements 'shifted in'. It handles both the strictly
+/// bit-wise element shifts and the byte shift across an entire 128-bit double
+/// quad word lane.
+///
+/// PSLL : (little-endian) left bit shift.
+/// [ zz, 0, zz, 2 ]
+/// [ -1, 4, zz, -1 ]
+/// PSRL : (little-endian) right bit shift.
+/// [ 1, zz, 3, zz]
+/// [ -1, -1, 7, zz]
+/// PSLLDQ : (little-endian) left byte shift
+/// [ zz, 0, 1, 2, 3, 4, 5, 6]
+/// [ zz, zz, -1, -1, 2, 3, 4, -1]
+/// [ zz, zz, zz, zz, zz, zz, -1, 1]
+/// PSRLDQ : (little-endian) right byte shift
+/// [ 5, 6, 7, zz, zz, zz, zz, zz]
+/// [ -1, 5, 6, 7, zz, zz, zz, zz]
+/// [ 1, 2, -1, -1, -1, -1, zz, zz]
+static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Size = Mask.size();
- int Scale = 16 / Size;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+ for (int i = 0; i < Size; i += Scale)
+ for (int j = 0; j < Shift; ++j)
+ if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+ return false;
- auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset,
- ArrayRef<int> Mask) {
- for (int i = StartIndex; i < EndIndex; i++) {
- if (Mask[i] < 0)
- continue;
- if (i + Base != Mask[i] - MaskOffset)
- return false;
- }
return true;
};
- for (int Shift = 1; Shift < Size; Shift++) {
- int ByteShift = Shift * Scale;
-
- // PSRLDQ : (little-endian) right byte shift
- // [ 5, 6, 7, zz, zz, zz, zz, zz]
- // [ -1, 5, 6, 7, zz, zz, zz, zz]
- // [ 1, 2, -1, -1, -1, -1, zz, zz]
- bool ZeroableRight = true;
- for (int i = Size - Shift; i < Size; i++) {
- ZeroableRight &= Zeroable[i];
- }
-
- if (ZeroableRight) {
- bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask);
- bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask);
-
- if (ValidShiftRight1 || ValidShiftRight2) {
- // Cast the inputs to v2i64 to match PSRLDQ.
- SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
- SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
- DAG.getConstant(ByteShift * 8, MVT::i8));
- return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
- }
+ auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
+ for (int i = 0; i != Size; i += Scale) {
+ unsigned Pos = Left ? i + Shift : i;
+ unsigned Low = Left ? i : i + Shift;
+ unsigned Len = Scale - Shift;
+ if (!isSequentialOrUndefInRange(Mask, Pos, Len,
+ Low + (V == V1 ? 0 : Size)))
+ return SDValue();
}
- // PSLLDQ : (little-endian) left byte shift
- // [ zz, 0, 1, 2, 3, 4, 5, 6]
- // [ zz, zz, -1, -1, 2, 3, 4, -1]
- // [ zz, zz, zz, zz, zz, zz, -1, 1]
- bool ZeroableLeft = true;
- for (int i = 0; i < Shift; i++) {
- ZeroableLeft &= Zeroable[i];
- }
+ int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
+ bool ByteShift = ShiftEltBits > 64;
+ unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+ : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+ int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
- if (ZeroableLeft) {
- bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask);
- bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask);
+ // Normalize the scale for byte shifts to still produce an i64 element
+ // type.
+ Scale = ByteShift ? Scale / 2 : Scale;
- if (ValidShiftLeft1 || ValidShiftLeft2) {
- // Cast the inputs to v2i64 to match PSLLDQ.
- SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
- SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
- DAG.getConstant(ByteShift * 8, MVT::i8));
- return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
- }
- }
- }
+ // We need to round trip through the appropriate type for the shift.
+ MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+ MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+ "Illegal integer vector type");
+ V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+ V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+ };
+
+ // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+ // keep doubling the size of the integer elements up to that. We can
+ // then shift the elements of the integer vector by whole multiples of
+ // their width within the elements of the larger integer vector. Test each
+ // multiple to see if we can find a match with the moved element indices
+ // and that the shifted in elements are all zeroable.
+ for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; ++Shift)
+ for (bool Left : {true, false})
+ if (CheckZeros(Shift, Scale, Left))
+ for (SDValue V : {V1, V2})
+ if (SDValue Match = MatchShift(Shift, Scale, Left, V))
+ return Match;
+
+ // No viable shift lowering found.
return SDValue();
}
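Editorial note (not part of the patch): the shape of the test performed by CheckZeros and MatchShift for the left-shift case can be seen in a few lines of standalone code. Within each group of Scale elements, a left shift by Shift leaves the low Shift lanes zeroable and the remaining lanes sequential; names are illustrative.

#include <cassert>
#include <vector>

// Does the mask describe, within every group of Scale lanes, a left shift by
// Shift elements of a single input with zeros shifted into the low lanes?
static bool matchesLeftShift(const std::vector<int> &Mask,
                             const std::vector<bool> &Zeroable, int Scale,
                             int Shift) {
  int Size = static_cast<int>(Mask.size());
  for (int g = 0; g < Size; g += Scale) {
    for (int j = 0; j < Shift; ++j) // shifted-in lanes must be zeroable
      if (!Zeroable[g + j])
        return false;
    for (int j = Shift; j < Scale; ++j) // remaining lanes are sequential
      if (Mask[g + j] != -1 && Mask[g + j] != g + j - Shift)
        return false;
  }
  return true;
}

int main() {
  // The PSLL example from the comment above: [zz, 0, zz, 2] is a left shift
  // by one element within each 64-bit pair of a v4i32.
  std::vector<int> Mask = {-1, 0, -1, 2};
  std::vector<bool> Zeroable = {true, false, true, false};
  assert(matchesLeftShift(Mask, Zeroable, /*Scale=*/2, /*Shift=*/1));
  assert(!matchesLeftShift({0, 1, 2, 3}, Zeroable, 2, 1)); // identity mask
  return 0;
}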
@@ -7728,10 +6599,11 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
+ SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
- int EltBits = VT.getSizeInBits() / NumElements;
+ int NumElements = VT.getVectorNumElements();
+ int EltBits = VT.getScalarSizeInBits();
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
@@ -7739,10 +6611,8 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget->hasSSE41()) {
- MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
- InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
}
@@ -7800,7 +6670,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
}
-/// \brief Try to lower a vector shuffle as a zero extension on any micrarch.
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
@@ -7818,7 +6688,10 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Bits = VT.getSizeInBits();
- int NumElements = Mask.size();
+ int NumElements = VT.getVectorNumElements();
+ assert(VT.getScalarSizeInBits() <= 32 &&
+ "Exceeds 32-bit integer zero extension limit");
+ assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
@@ -7829,11 +6702,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
if (Mask[i] == -1)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
- // Each of the extend elements needs to be zeroable.
+ // Each of the extended elements needs to be zeroable.
if (!Zeroable[i])
return SDValue();
- // We no lorger are in the anyext case.
+ // We no longer are in the anyext case.
AnyExt = false;
continue;
}
@@ -7847,7 +6720,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
return SDValue(); // Flip-flopping inputs.
if (Mask[i] % NumElements != i / Scale)
- return SDValue(); // Non-consecutive strided elemenst.
+ return SDValue(); // Non-consecutive strided elements.
}
// If we fail to find an input, we have a zero-shuffle which should always
@@ -7857,7 +6730,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
return SDValue();
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
+ DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
@@ -7869,11 +6742,34 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
- "The input vector size must be divisble by the extended size.");
+ "The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
+ // General extends failed, but 128-bit vectors may be able to use MOVQ.
+ if (Bits != 128)
+ return SDValue();
+
+ // Returns one of the source operands if the shuffle can be reduced to a
+ // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
+ auto CanZExtLowHalf = [&]() {
+ for (int i = NumElements / 2; i != NumElements; ++i)
+ if (!Zeroable[i])
+ return SDValue();
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+ return V1;
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+ return V2;
+ return SDValue();
+ };
+
+ if (SDValue V = CanZExtLowHalf()) {
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+ }
+
// No viable ext lowering found.
return SDValue();
}
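Editorial note (not part of the patch): the per-element test performed by the Lower lambda above reduces to a simple predicate on the mask. With extension factor Scale, position i must hold source element i/Scale when i is a multiple of Scale, and a zeroable lane otherwise. A standalone sketch:

#include <cassert>
#include <vector>

// Check whether a mask describes a zero extension by a factor of Scale.
static bool isZExtMask(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, int Scale) {
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] == -1)
      continue; // Undef fits anywhere.
    if (i % Scale != 0) {
      if (!Zeroable[i])
        return false; // A lane zeros must extend into.
      continue;
    }
    if (Mask[i] % Size != i / Scale)
      return false; // Not the expected strided source element.
  }
  return true;
}

int main() {
  // v8i16-style mask [0, zz, 1, zz, 2, zz, 3, zz]: zero extend the low four
  // 16-bit elements to 32 bits. Indices 8..15 here address an all-zero V2.
  std::vector<int> Mask = {0, 8, 1, 8, 2, 8, 3, 8};
  std::vector<bool> Zeroable = {false, true, false, true,
                                false, true, false, true};
  assert(isZExtMask(Mask, Zeroable, /*Scale=*/2));
  assert(!isZExtMask({1, 8, 0, 8, 2, 8, 3, 8}, Zeroable, 2)); // wrong stride
  return 0;
}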
@@ -7916,7 +6812,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
- MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
MVT ExtVT = VT;
@@ -7983,6 +6879,10 @@ static SDValue lowerVectorShuffleAsElementInsertion(
ExtVT, V1, V2);
}
+ // This lowering only works for the low element with floating point vectors.
+ if (VT.isFloatingPoint() && V2Index != 0)
+ return SDValue();
+
V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
@@ -8001,7 +6901,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
DAG.getConstant(
- V2Index * EltVT.getSizeInBits(),
+ V2Index * EltVT.getSizeInBits() / 8,
DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
}
@@ -8014,7 +6914,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
@@ -8086,6 +6986,199 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
}
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ unsigned ZMask = 0;
+ int V1DstIndex = -1;
+ int V2DstIndex = -1;
+ bool V1UsedInPlace = false;
+
+ for (int i = 0; i < 4; ++i) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
+
+ // Flag if we use any V1 inputs in place.
+ if (i == Mask[i]) {
+ V1UsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (V1DstIndex != -1 || V2DstIndex != -1)
+ return SDValue();
+
+ if (Mask[i] < 4) {
+ // V1 input out of place for insertion.
+ V1DstIndex = i;
+ } else {
+ // V2 input for insertion.
+ V2DstIndex = i;
+ }
+ }
+
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (V1DstIndex == -1 && V2DstIndex == -1)
+ return SDValue();
+
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned V2SrcIndex = 0;
+ if (V1DstIndex != -1) {
+ // If we have a V1 input out of place, we use V1 as the V2 element insertion
+ // and don't use the original V2 at all.
+ V2SrcIndex = Mask[V1DstIndex];
+ V2DstIndex = V1DstIndex;
+ V2 = V1;
+ } else {
+ V2SrcIndex = Mask[V2DstIndex] - 4;
+ }
+
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!V1UsedInPlace)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
+ unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+ // Insert the V2 element into the desired position.
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, MVT::i8));
+}
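Editorial note (not part of the patch): the InsertPSMask computed above follows the INSERTPS immediate layout, with the source lane of the inserted operand in bits 7:6, the destination lane in bits 5:4 and the zero mask in bits 3:0. A tiny standalone sketch of that packing:

#include <cassert>

// Assemble the 8-bit INSERTPS control: CountS in bits 7:6, CountD in bits
// 5:4, ZMask in bits 3:0.
static unsigned insertPSImm(unsigned SrcLane, unsigned DstLane,
                            unsigned ZeroMask) {
  assert(SrcLane < 4 && DstLane < 4 && ZeroMask < 16);
  return (SrcLane << 6) | (DstLane << 4) | ZeroMask;
}

int main() {
  // Insert lane 2 of the second operand into lane 1 of the result, zeroing
  // lane 3.
  assert(insertPSImm(2, 1, 0x8) == 0x98);
  return 0;
}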
+
+/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up with alternating between
+/// the two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
+static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!VT.isFloatingPoint() &&
+ "This routine only supports integer vectors.");
+ assert(!isSingleInputShuffleMask(Mask) &&
+ "This routine should only be used when blending two inputs.");
+ assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+ int Size = Mask.size();
+
+ int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
+ return M >= 0 && M % Size < Size / 2;
+ });
+ int NumHiInputs = std::count_if(
+ Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
+
+ bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+ auto TryUnpack = [&](MVT UnpackVT, int Scale) {
+ SmallVector<int, 32> V1Mask(Mask.size(), -1);
+ SmallVector<int, 32> V2Mask(Mask.size(), -1);
+
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ // Each element of the unpack contains Scale elements from this mask.
+ int UnpackIdx = i / Scale;
+
+ // We only handle the case where V1 feeds the first slots of the unpack.
+ // We rely on canonicalization to ensure this is the case.
+ if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+ return SDValue();
+
+ // Setup the mask for this input. The indexing is tricky as we have to
+ // handle the unpack stride.
+ SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+ VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+ Mask[i] % Size;
+ }
+
+ // If we will have to shuffle both inputs to use the unpack, check whether
+ // we can just unpack first and shuffle the result. If so, skip this unpack.
+ if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
+ !isNoopShuffleMask(V2Mask))
+ return SDValue();
+
+ // Shuffle the inputs into place.
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+ // Cast the inputs to the type we will use to unpack them.
+ V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);
+
+ // Unpack the inputs and cast the result back to the desired type.
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
+ DL, UnpackVT, V1, V2));
+ };
+
+ // We try each unpack from the largest to the smallest to try and find one
+ // that fits this mask.
+ int OrigNumElements = VT.getVectorNumElements();
+ int OrigScalarSize = VT.getScalarSizeInBits();
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
+ int Scale = ScalarSize / OrigScalarSize;
+ int NumElements = OrigNumElements / Scale;
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
+ if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
+ return Unpack;
+ }
+
+ // If none of the unpack-rooted lowerings worked (or were profitable) try an
+ // initial unpack.
+ if (NumLoInputs == 0 || NumHiInputs == 0) {
+ assert((NumLoInputs > 0 || NumHiInputs > 0) &&
+ "We have to have *some* inputs!");
+ int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
+
+ // FIXME: We could consider the total complexity of the permute of each
+ // possible unpacking. Or at the least we should consider how many
+ // half-crossings are created.
+ // FIXME: We could consider commuting the unpacks.
+
+ SmallVector<int, 32> PermMask;
+ PermMask.assign(Size, -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
+
+ PermMask[i] =
+ 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
+ }
+ return DAG.getVectorShuffle(
+ VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
+ DL, VT, V1, V2),
+ DAG.getUNDEF(VT), PermMask);
+ }
+
+ return SDValue();
+}
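Editorial note (not part of the patch): the routine above relies on the interleaving behaviour of UNPCKL/UNPCKH; permuting each input first lets a single unpack realize an alternating mask. As a reminder of that behaviour, a scalar model of unpcklps on four 32-bit lanes:

#include <cassert>
#include <vector>

// Scalar model of UNPCKL on four 32-bit lanes: interleave the low halves of
// the two inputs (UNPCKH does the same with the high halves).
static std::vector<int> unpckl(const std::vector<int> &A,
                               const std::vector<int> &B) {
  return {A[0], B[0], A[1], B[1]};
}

int main() {
  std::vector<int> V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
  // The shuffle mask {0, 4, 1, 5} is exactly unpcklps V1, V2.
  std::vector<int> R = unpckl(V1, V2);
  assert(R[0] == 10 && R[1] == 20 && R[2] == 11 && R[3] == 21);
  return 0;
}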
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -8105,6 +7198,11 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (isSingleInputShuffleMask(Mask)) {
+ // Use low duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3())
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction.
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
@@ -8122,29 +7220,24 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
assert(Mask[1] >= 2 && "Non-canonicalized blend!");
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 2))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 3))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
-
// If we have a single input, insert that into V1 if we can do so cheaply.
if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
return Insertion;
}
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
- if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {1, 3}))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
@@ -8158,6 +7251,12 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Subtarget, DAG))
return Blend;
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
DAG.getConstant(SHUFPDMask, MVT::i8));
@@ -8182,7 +7281,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -8198,37 +7297,60 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
}
+ assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+ assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+
+ // If we have a blend of two PACKUS operations and the blend aligns with the
+ // low and high halves, we can just merge the PACKUS operations. This is
+ // particularly important as it lets us merge shuffles that this routine itself
+ // creates.
+ auto GetPackNode = [](SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
- // If we have a single input from V2 insert that into V1 if we can do so
- // cheaply.
- if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
- return Insertion;
- // Try inverting the insertion since for v2 masks it is easy to do and we
- // can't reliably sort the mask one way or the other.
- int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
- Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
- return Insertion;
- }
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 2))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 3))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+ return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
+ };
+ if (SDValue V1Pack = GetPackNode(V1))
+ if (SDValue V2Pack = GetPackNode(V2))
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
+ Mask[0] == 0 ? V1Pack.getOperand(0)
+ : V1Pack.getOperand(1),
+ Mask[1] == 2 ? V2Pack.getOperand(0)
+ : V2Pack.getOperand(1)));
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return Shift;
- if (Subtarget->hasSSE41())
+ // When loading a scalar and then shuffling it into a vector we can often do
+ // the insertion cheaply.
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
+ return Insertion;
+
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v2i64, V1, V2, Mask, DAG))
- return Shift;
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8237,6 +7359,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
+ Mask, DAG);
+
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
@@ -8247,6 +7375,24 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
+/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+ // This routine only handles 128-bit shufps.
+ assert(Mask.size() == 4 && "Unsupported mask size!");
+
+ // To lower with a single SHUFPS we need to have the low half and high half
+ // each requiring a single input.
+ if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
+ return false;
+ if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
+ return false;
+
+ return true;
+}
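Editorial note (not part of the patch): the reason each half of the mask must draw from a single input is the SHUFPS operand model, in which the low two result lanes are always selected from the first operand and the high two from the second. A scalar model, with illustrative names:

#include <array>
#include <cassert>

// Scalar model of SHUFPS: result lanes 0-1 are chosen from A and lanes 2-3
// from B, each by a 2-bit field of the immediate.
static std::array<float, 4> shufps(std::array<float, 4> A,
                                   std::array<float, 4> B, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}

int main() {
  std::array<float, 4> V1 = {0, 1, 2, 3}, V2 = {4, 5, 6, 7};
  // Mask {0, 2, 4, 6}: low half from V1, high half from V2 -> one SHUFPS.
  std::array<float, 4> R = shufps(V1, V2, 0 | (2 << 2) | (0 << 4) | (2 << 6));
  assert(R[0] == 0 && R[1] == 2 && R[2] == 4 && R[3] == 6);
  return 0;
}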
+
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
@@ -8358,10 +7504,18 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3()) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+ }
+
if (Subtarget->hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
@@ -8375,70 +7529,41 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-
// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
Mask, Subtarget, DAG))
return V;
- if (Subtarget->hasSSE41())
+ if (Subtarget->hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Check for whether we can use INSERTPS to perform the blend. We only use
- // INSERTPS when the V1 elements are already in the correct locations
- // because otherwise we can just always use two SHUFPS instructions which
- // are much smaller to encode than a SHUFPS and an INSERTPS.
- if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
- int V2Index =
- std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
- Mask.begin();
-
- // When using INSERTPS we can zero any lane of the destination. Collect
- // the zero inputs into a mask and drop them from the lanes of V1 which
- // actually need to be present as inputs to the INSERTPS.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
- // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
- bool InsertNeedsShuffle = false;
- unsigned ZMask = 0;
- for (int i = 0; i < 4; ++i)
- if (i != V2Index) {
- if (Zeroable[i]) {
- ZMask |= 1 << i;
- } else if (Mask[i] != i) {
- InsertNeedsShuffle = true;
- break;
- }
- }
-
- // We don't want to use INSERTPS or other insertion techniques if it will
- // require shuffling anyways.
- if (!InsertNeedsShuffle) {
- // If all of V1 is zeroable, replace it with undef.
- if ((ZMask | 1 << V2Index) == 0xF)
- V1 = DAG.getUNDEF(MVT::v4f32);
-
- unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ // Use INSERTPS if we can complete the shuffle efficiently.
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+ return V;
- // Insert the V2 element into the desired position.
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, MVT::i8));
- }
+ if (!isSingleSHUFPSMask(Mask))
+ if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+ DL, MVT::v4f32, V1, V2, Mask, DAG))
+ return BlendPerm;
}
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
+
// Otherwise fall back to a SHUFPS lowering strategy.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
@@ -8470,7 +7595,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -8481,36 +7606,47 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
- if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
Mask = UnpackLoMask;
- else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+ else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Shift;
+
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return V;
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
-
- if (Subtarget->hasSSE41())
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v4i32, V1, V2, Mask, DAG))
- return Shift;
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8519,6 +7655,17 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
+ Mask, DAG);
+
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Unpack;
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
@@ -8542,7 +7689,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
-static SDValue lowerV8I16SingleInputVectorShuffle(
+static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
@@ -8570,27 +7717,6 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
- // Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
- Mask, Subtarget, DAG))
- return Broadcast;
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
- if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
-
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v8i16, V, V, Mask, DAG))
- return Shift;
-
- // Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
- return Rotate;
-
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
@@ -8993,158 +8119,56 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
return V;
}
-/// \brief Detect whether the mask pattern should be lowered through
-/// interleaving.
-///
-/// This essentially tests whether viewing the mask as an interleaving of two
-/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
-/// lowering it through interleaving is a significantly better strategy.
-static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
- int NumEvenInputs[2] = {0, 0};
- int NumOddInputs[2] = {0, 0};
- int NumLoInputs[2] = {0, 0};
- int NumHiInputs[2] = {0, 0};
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] < 0)
- continue;
-
- int InputIdx = Mask[i] >= Size;
-
- if (i < Size / 2)
- ++NumLoInputs[InputIdx];
- else
- ++NumHiInputs[InputIdx];
-
- if ((i % 2) == 0)
- ++NumEvenInputs[InputIdx];
- else
- ++NumOddInputs[InputIdx];
- }
-
- // The minimum number of cross-input results for both the interleaved and
- // split cases. If interleaving results in fewer cross-input results, return
- // true.
- int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
- NumEvenInputs[0] + NumOddInputs[1]);
- int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
- NumLoInputs[0] + NumHiInputs[1]);
- return InterleavedCrosses < SplitCrosses;
-}
-
-/// \brief Blend two v8i16 vectors using a naive unpack strategy.
-///
-/// This strategy only works when the inputs from each vector fit into a single
-/// half of that vector, and generally there are not so many inputs as to leave
-/// the in-place shuffles required highly constrained (and thus expensive). It
-/// shifts all the inputs into a single side of both input vectors and then
-/// uses an unpack to interleave these inputs in a single vector. At that
-/// point, we will fall back on the generic single input shuffle lowering.
-static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
- SDValue V2,
- MutableArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
- assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
- SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
- for (int i = 0; i < 8; ++i)
- if (Mask[i] >= 0 && Mask[i] < 4)
- LoV1Inputs.push_back(i);
- else if (Mask[i] >= 4 && Mask[i] < 8)
- HiV1Inputs.push_back(i);
- else if (Mask[i] >= 8 && Mask[i] < 12)
- LoV2Inputs.push_back(i);
- else if (Mask[i] >= 12)
- HiV2Inputs.push_back(i);
-
- int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
- int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
- (void)NumV1Inputs;
- (void)NumV2Inputs;
- assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
- assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
- assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
-
- bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
- HiV1Inputs.size() + HiV2Inputs.size();
-
- auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
- ArrayRef<int> HiInputs, bool MoveToLo,
- int MaskOffset) {
- ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
- ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
- if (BadInputs.empty())
- return V;
-
- int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int MoveOffset = MoveToLo ? 0 : 4;
+/// \brief Helper to form a PSHUFB-based shuffle+blend.
+static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, bool &V1InUse,
+ bool &V2InUse) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V1Mask[16];
+ SDValue V2Mask[16];
+ V1InUse = false;
+ V2InUse = false;
- if (GoodInputs.empty()) {
- for (int BadInput : BadInputs) {
- MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
- Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
- }
+ int Size = Mask.size();
+ int Scale = 16 / Size;
+ for (int i = 0; i < 16; ++i) {
+ if (Mask[i / Scale] == -1) {
+ V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
} else {
- if (GoodInputs.size() == 2) {
- // If the low inputs are spread across two dwords, pack them into
- // a single dword.
- MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
- MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
- Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
- Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
- } else {
- // Otherwise pin the good inputs.
- for (int GoodInput : GoodInputs)
- MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
- }
-
- if (BadInputs.size() == 2) {
- // If we have two bad inputs then there may be either one or two good
- // inputs fixed in place. Find a fixed input, and then find the *other*
- // two adjacent indices by using modular arithmetic.
- int GoodMaskIdx =
- std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
- [](int M) { return M >= 0; }) -
- std::begin(MoveMask);
- int MoveMaskIdx =
- ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
- assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
- assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
- MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
- MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
- Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
- Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
- } else {
- assert(BadInputs.size() == 1 && "All sizes handled");
- int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
- std::end(MoveMask), -1) -
- std::begin(MoveMask);
- MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
- Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
- }
- }
-
- return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
- MoveMask);
- };
- V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
- /*MaskOffset*/ 0);
- V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
- /*MaskOffset*/ 8);
-
- // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
- // cross-half traffic in the final shuffle.
+ const int ZeroMask = 0x80;
+ int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
+ : ZeroMask;
+ int V2Idx = Mask[i / Scale] < Size
+ ? ZeroMask
+ : (Mask[i / Scale] - Size) * Scale + i % Scale;
+ if (Zeroable[i / Scale])
+ V1Idx = V2Idx = ZeroMask;
+ V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
+ V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
+ V1InUse |= (ZeroMask != V1Idx);
+ V2InUse |= (ZeroMask != V2Idx);
+ }
+ }
+
+ if (V1InUse)
+ V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+ if (V2InUse)
+ V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
- // Munge the mask to be a single-input mask after the unpack merges the
- // results.
- for (int &M : Mask)
- if (M != -1)
- M = 2 * (M % 4) + (M / 8);
+ // If we need shuffled inputs from both, blend the two.
+ SDValue V;
+ if (V1InUse && V2InUse)
+ V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+ else
+ V = V1InUse ? V1 : V2;
- return DAG.getVectorShuffle(
- MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
- DL, MVT::v8i16, V1, V2),
- DAG.getUNDEF(MVT::v8i16), Mask);
+ // Cast the result back to the correct type.
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
}
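// Editor's illustration (hedged standalone sketch, not part of the patch
// above): the loop in lowerVectorShuffleAsPSHUFB scales an element-level
// shuffle mask to per-byte PSHUFB control values. A control byte of 0x80
// writes a zero, and element indices >= Size select from V2. The example mask
// below is hypothetical; undef (-1) lanes are simply shown as zeroing controls.
#include <cstdio>
#include <vector>

int main() {
  const int ZeroMask = 0x80;
  std::vector<int> Mask = {0, 9, 2, 11, 4, 13, 6, 15}; // a v8i16-style mask
  int Size = static_cast<int>(Mask.size());
  int Scale = 16 / Size; // bytes per element
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / Scale];
    int V1Idx = (M >= 0 && M < Size) ? M * Scale + i % Scale : ZeroMask;
    int V2Idx = (M >= Size) ? (M - Size) * Scale + i % Scale : ZeroMask;
    std::printf("byte %2d: V1 control 0x%02x, V2 control 0x%02x\n", i, V1Idx,
                V2Idx);
  }
  return 0;
}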
/// \brief Generic lowering of 8-lane i16 shuffles.
@@ -9181,85 +8205,95 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return ZExt;
auto isV1 = [](int M) { return M >= 0 && M < 8; };
+ (void)isV1;
auto isV2 = [](int M) { return M >= 8; };
- int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
- if (NumV2Inputs == 0)
- return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+ if (NumV2Inputs == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
+ if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, V1, Mask, Subtarget,
+ DAG);
+ }
- assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
- "to be V1-input shuffles.");
+ assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
+ "All single-input shuffles should be canonicalized to be V1-input "
+ "shuffles.");
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return V;
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
- if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
-
- if (Subtarget->hasSSE41())
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v8i16, V1, V2, Mask, DAG))
- return Shift;
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;
- if (NumV1Inputs + NumV2Inputs <= 4)
- return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
-
- // Check whether an interleaving lowering is likely to be more efficient.
- // This isn't perfect but it is a strong heuristic that tends to work well on
- // the kinds of shuffles that show up in practice.
- //
- // FIXME: Handle 1x, 2x, and 4x interleaving.
- if (shouldLowerAsInterleaving(Mask)) {
- // FIXME: Figure out whether we should pack these into the low or high
- // halves.
+ if (SDValue BitBlend =
+ lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return BitBlend;
- int EMask[8], OMask[8];
- for (int i = 0; i < 4; ++i) {
- EMask[i] = Mask[2*i];
- OMask[i] = Mask[2*i + 1];
- EMask[i + 4] = -1;
- OMask[i + 4] = -1;
- }
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Unpack;
- SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
- SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
-
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
- }
-
- int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
-
- for (int i = 0; i < 4; ++i) {
- LoBlendMask[i] = Mask[i];
- HiBlendMask[i] = Mask[i + 4];
+ // If we can't directly blend but can use PSHUFB, that will be better as it
+ // can both shuffle and set up the inefficient blend.
+ if (!IsBlendSupported && Subtarget->hasSSSE3()) {
+ bool V1InUse, V2InUse;
+ return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
+ V1InUse, V2InUse);
}
- SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
- SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
- LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
- HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
-
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+  // We can always bit-blend if we have to, so the fallback strategy is to
+ // decompose into single-input permutes and blends.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+ Mask, DAG);
}
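// Editor's illustration (hedged standalone sketch, not part of the patch
// above): the fallback named lowerVectorShuffleAsDecomposedShuffleBlend splits
// a two-input mask into one single-input permute mask per operand plus a blend
// selector. The mask below is a hypothetical v8i16-style example.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {0, 9, 10, 3, 12, 5, 6, 15};
  int Size = static_cast<int>(Mask.size());
  std::vector<int> V1Mask(Size, -1), V2Mask(Size, -1), BlendMask(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue; // undef stays undef everywhere
    if (Mask[i] < Size) {
      V1Mask[i] = Mask[i]; // lane i is produced by permuting V1
      BlendMask[i] = i;
    } else {
      V2Mask[i] = Mask[i] - Size; // lane i is produced by permuting V2
      BlendMask[i] = Size + i;
    }
  }
  for (int i = 0; i < Size; ++i)
    std::printf("lane %d: V1Mask=%2d V2Mask=%2d Blend=%2d\n", i, V1Mask[i],
                V2Mask[i], BlendMask[i]);
  return 0;
}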
/// \brief Check whether a compaction lowering can be done by dropping even
@@ -9345,40 +8379,31 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> OrigMask = SVOp->getMask();
- assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return ZExt;
- int MaskStorage[16] = {
- OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
- OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
- OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
- OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
- MutableArrayRef<int> Mask(MaskStorage);
- MutableArrayRef<int> LoMask = Mask.slice(0, 8);
- MutableArrayRef<int> HiMask = Mask.slice(8, 8);
-
int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -9475,36 +8500,17 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return V;
}
- // Check whether an interleaving lowering is likely to be more efficient.
- // This isn't perfect but it is a strong heuristic that tends to work well on
- // the kinds of shuffles that show up in practice.
- //
- // FIXME: We need to handle other interleaving widths (i16, i32, ...).
- if (shouldLowerAsInterleaving(Mask)) {
- int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
- return (M >= 0 && M < 8) || (M >= 16 && M < 24);
- });
- int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
- return (M >= 8 && M < 16) || M >= 24;
- });
- int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1};
- int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1};
- bool UnpackLo = NumLoHalf >= NumHiHalf;
- MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
- MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
- for (int i = 0; i < 8; ++i) {
- TargetEMask[i] = Mask[2 * i];
- TargetOMask[i] = Mask[2 * i + 1];
- }
-
- SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
- SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
-
- return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
- MVT::v16i8, Evens, Odds);
- }
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // High half.
+ 4, 20, 5, 21, 6, 22, 7, 23}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+ 8, 24, 9, 25, 10, 26, 11, 27,
+ // High half.
+ 12, 28, 13, 29, 14, 30, 15, 31}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
@@ -9520,33 +8526,47 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget->hasSSSE3()) {
- SDValue V1Mask[16];
- SDValue V2Mask[16];
- for (int i = 0; i < 16; ++i)
- if (Mask[i] == -1) {
- V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
- } else {
- V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
- V2Mask[i] =
- DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
- }
- V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
- if (isSingleInputShuffleMask(Mask))
- return V1; // Single inputs are easy.
+ bool V1InUse = false;
+ bool V2InUse = false;
- // Otherwise, blend the two.
- V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
- return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+ SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
+ DAG, V1InUse, V2InUse);
+
+ // If both V1 and V2 are in use and we can use a direct blend or an unpack,
+ // do so. This avoids using them to handle blends-with-zero which is
+ // important as a single pshufb is significantly faster for that.
+ if (V1InUse && V2InUse) {
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return Blend;
+
+ // We can use an unpack to do the blending rather than an or in some
+    // cases. Even though the or may be (very marginally) more efficient, we
+    // prefer this lowering because there are common cases where part of
+ // the complexity of the shuffles goes away when we do the final blend as
+ // an unpack.
+ // FIXME: It might be worth trying to detect if the unpack-feeding
+ // shuffles will both be pshufb, in which case we shouldn't bother with
+ // this.
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Unpack;
+ }
+
+ return PSHUFB;
}
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return V;
+ if (SDValue BitBlend =
+ lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return BitBlend;
+
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
@@ -9585,72 +8605,58 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Result;
}
- int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ // Handle multi-input cases by blending single-input shuffles.
+ if (NumV2Elements > 0)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
+ Mask, DAG);
- auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
- MutableArrayRef<int> V1HalfBlendMask,
- MutableArrayRef<int> V2HalfBlendMask) {
- for (int i = 0; i < 8; ++i)
- if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
- V1HalfBlendMask[i] = HalfMask[i];
- HalfMask[i] = i;
- } else if (HalfMask[i] >= 16) {
- V2HalfBlendMask[i] = HalfMask[i] - 16;
- HalfMask[i] = i + 8;
- }
- };
- buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
- buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+ // The fallback path for single-input shuffles widens this into two v8i16
+ // vectors with unpacks, shuffles those, and then pulls them back together
+ // with a pack.
+ SDValue V = V1;
- SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+ int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] >= 0)
+ (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
- auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
- MutableArrayRef<int> HiBlendMask) {
- SDValue V1, V2;
- // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
- // them out and avoid using UNPCK{L,H} to extract the elements of V as
- // i16s.
- if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
- [](int M) { return M >= 0 && M % 2 == 1; }) &&
- std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
- [](int M) { return M >= 0 && M % 2 == 1; })) {
- // Use a mask to drop the high bytes.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
- V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
- DAG.getConstant(0x00FF, MVT::v8i16));
-
- // This will be a single vector shuffle instead of a blend so nuke V2.
- V2 = DAG.getUNDEF(MVT::v8i16);
-
- // Squash the masks to point directly into V1.
- for (int &M : LoBlendMask)
- if (M >= 0)
- M /= 2;
- for (int &M : HiBlendMask)
- if (M >= 0)
- M /= 2;
- } else {
- // Otherwise just unpack the low half of V into V1 and the high half into
- // V2 so that we can blend them as i16s.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
- }
+ SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
- SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
- SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
- return std::make_pair(BlendedLo, BlendedHi);
- };
- SDValue V1Lo, V1Hi, V2Lo, V2Hi;
- std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
- std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+ SDValue VLoHalf, VHiHalf;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
+ DAG.getConstant(0x00FF, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
+ VHiHalf = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into VLoHalf.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+    // Otherwise just unpack the low half of V into VLoHalf and the high half
+    // into VHiHalf so that we can blend them as i16s.
+ VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
- SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
- SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
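// Editor's illustration (hedged standalone sketch, not part of the patch
// above): the single-input v16i8 fallback just above splits the byte mask into
// two 8-element halves; when no odd source byte is referenced, every index can
// be halved and applied to the AND-masked i16 value instead of unpacking
// against zero. The mask below is a hypothetical example taking the cheap path.
#include <algorithm>
#include <cstdio>
#include <iterator>

int main() {
  int Mask[16] = {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1};
  int Lo[8], Hi[8];
  std::fill(std::begin(Lo), std::end(Lo), -1);
  std::fill(std::begin(Hi), std::end(Hi), -1);
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? Lo[i] : Hi[i % 8]) = Mask[i];

  auto NoOddBytes = [](const int (&M)[8]) {
    return std::none_of(std::begin(M), std::end(M),
                        [](int E) { return E >= 0 && E % 2 == 1; });
  };
  if (NoOddBytes(Lo) && NoOddBytes(Hi)) {
    for (int &E : Lo)
      if (E >= 0)
        E /= 2; // point straight at the low i16 element
    for (int &E : Hi)
      if (E >= 0)
        E /= 2;
  }
  for (int i = 0; i < 8; ++i)
    std::printf("Lo[%d]=%2d  Hi[%d]=%2d\n", i, Lo[i], i, Hi[i]);
  return 0;
}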
@@ -9736,7 +8742,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
return true;
}
-/// \brief Generic routine to split ector shuffle into half-sized shuffles.
+/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
@@ -9757,14 +8763,43 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
MVT ScalarVT = VT.getScalarType();
MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
- SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
- DAG.getIntPtrConstant(0));
- SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
- DAG.getIntPtrConstant(SplitNumElements));
- SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
- DAG.getIntPtrConstant(0));
- SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
- DAG.getIntPtrConstant(SplitNumElements));
+ // Rather than splitting build-vectors, just build two narrower build
+ // vectors. This helps shuffling with splats and zeros.
+ auto SplitVector = [&](SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V->getOperand(0);
+
+ MVT OrigVT = V.getSimpleValueType();
+ int OrigNumElements = OrigVT.getVectorNumElements();
+ int OrigSplitNumElements = OrigNumElements / 2;
+ MVT OrigScalarVT = OrigVT.getScalarType();
+ MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
+
+ SDValue LoV, HiV;
+
+ auto *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV) {
+ LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(0));
+ HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(OrigSplitNumElements));
+ } else {
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (int i = 0; i < OrigSplitNumElements; ++i) {
+ LoOps.push_back(BV->getOperand(i));
+ HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
+ }
+ LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
+ HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
+ }
+ return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
+ DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
+ };
+
+ SDValue LoV1, HiV1, LoV2, HiV2;
+ std::tie(LoV1, HiV1) = SplitVector(V1);
+ std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
@@ -9960,15 +8995,15 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
VT.getVectorNumElements() / 2);
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
- if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
- isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
- if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) {
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
@@ -9983,6 +9018,104 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
DAG.getConstant(PermMask, MVT::i8));
}
+/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// shuffling each lane.
+///
+/// This will only succeed when the result of fixing the 128-bit lanes results
+/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
+/// each 128-bit lanes. This handles many cases where we can quickly blend away
+/// the lane crosses early and then use simpler shuffles within each lane.
+///
+/// FIXME: It might be worthwhile at some point to support this without
+/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
+/// in x86 only floating point has interesting non-repeating shuffles, and even
+/// those are still *marginally* more expensive.
+static SDValue lowerVectorShuffleByMerging128BitLanes(
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ assert(!isSingleInputShuffleMask(Mask) &&
+ "This is only useful with multiple inputs.");
+
+ int Size = Mask.size();
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ int NumLanes = Size / LaneSize;
+ assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
+
+ // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
+ // check whether the in-128-bit lane shuffles share a repeating pattern.
+ SmallVector<int, 4> Lanes;
+ Lanes.resize(NumLanes, -1);
+ SmallVector<int, 4> InLaneMask;
+ InLaneMask.resize(LaneSize, -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ int j = i / LaneSize;
+
+ if (Lanes[j] < 0) {
+ // First entry we've seen for this lane.
+ Lanes[j] = Mask[i] / LaneSize;
+ } else if (Lanes[j] != Mask[i] / LaneSize) {
+ // This doesn't match the lane selected previously!
+ return SDValue();
+ }
+
+ // Check that within each lane we have a consistent shuffle mask.
+ int k = i % LaneSize;
+ if (InLaneMask[k] < 0) {
+ InLaneMask[k] = Mask[i] % LaneSize;
+ } else if (InLaneMask[k] != Mask[i] % LaneSize) {
+ // This doesn't fit a repeating in-lane mask.
+ return SDValue();
+ }
+ }
+
+ // First shuffle the lanes into place.
+ MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
+ VT.getSizeInBits() / 64);
+ SmallVector<int, 8> LaneMask;
+ LaneMask.resize(NumLanes * 2, -1);
+ for (int i = 0; i < NumLanes; ++i)
+ if (Lanes[i] >= 0) {
+      LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
+      LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
+ }
+
+ V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
+ SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
+
+ // Cast it back to the type we actually want.
+ LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
+
+ // Now do a simple shuffle that isn't lane crossing.
+ SmallVector<int, 8> NewMask;
+ NewMask.resize(Size, -1);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
+ assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
+ "Must not introduce lane crosses at this point!");
+
+ return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
+}
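// Editor's illustration (hedged standalone sketch, not part of the patch
// above): the lane-merging lowering only fires when a wide mask factors into a
// per-128-bit-lane selection plus one repeating in-lane pattern. The mask
// below is a hypothetical v8f32-style example (LaneSize == 4).
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {6, 7, 4, 5, 10, 11, 8, 9};
  int Size = static_cast<int>(Mask.size());
  int LaneSize = 4;
  int NumLanes = Size / LaneSize;
  std::vector<int> Lanes(NumLanes, -1), InLane(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int &L = Lanes[i / LaneSize];
    if (L < 0) {
      L = Mask[i] / LaneSize;
    } else if (L != Mask[i] / LaneSize) {
      std::puts("lane selection is inconsistent; lowering would bail out");
      return 1;
    }
    int &E = InLane[i % LaneSize];
    if (E < 0) {
      E = Mask[i] % LaneSize;
    } else if (E != Mask[i] % LaneSize) {
      std::puts("in-lane pattern does not repeat; lowering would bail out");
      return 1;
    }
  }
  for (int i = 0; i < NumLanes; ++i)
    std::printf("result lane %d <- source lane %d\n", i, Lanes[i]);
  for (int i = 0; i < LaneSize; ++i)
    std::printf("in-lane element %d <- offset %d\n", i, InLane[i]);
  return 0;
}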
+
+/// \brief Test whether the specified input (0 or 1) is in-place blended by the
+/// given mask.
+///
+/// This returns true if the elements from a particular input are already in the
+/// slots required by the given mask and require no permutation.
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
+ assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
+ return false;
+
+ return true;
+}
+
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -10004,10 +9137,14 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
Mask, Subtarget, DAG))
return Broadcast;
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
    // Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
@@ -10029,10 +9166,14 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
@@ -10040,7 +9181,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
if (NumV2Elements == 1 && Mask[0] >= 4)
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
@@ -10067,6 +9208,16 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG.getConstant(SHUFPDMask, MVT::i8));
}
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either input is already in place,
+  // we will be able to shuffle the other input even across lanes in a single
+  // instruction, so skip this pattern.
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
  // If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget->hasAVX2())
@@ -10102,7 +9253,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10123,12 +9274,6 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
}
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
}
// AVX2 provides a direct instruction for permuting a single input across
@@ -10137,6 +9282,31 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
+
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either input is already in place,
+  // we will be able to shuffle the other input even across lanes in a single
+  // instruction, so skip this pattern.
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// Otherwise fall back on generic blend lowering.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
Mask, DAG);
@@ -10161,7 +9331,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10171,15 +9341,26 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
if (isSingleInputShuffleMask(Mask))
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends. We also need to squash the
@@ -10214,6 +9395,12 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG);
}
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget->hasAVX2())
@@ -10239,12 +9426,19 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10259,12 +9453,25 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
}
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return Shift;
+
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
// If the shuffle patterns aren't repeated but it is a single input, directly
// generate a cross-lane VPERMD instruction.
if (isSingleInputShuffleMask(Mask)) {
@@ -10277,6 +9484,12 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
}
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// Otherwise fall back on generic blend lowering.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
Mask, DAG);
@@ -10297,36 +9510,53 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
Mask, Subtarget, DAG))
return Broadcast;
- // There are no generalized cross-lane shuffle operations available on i16
- // element types.
- if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
- Mask, DAG);
-
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask,
- // First 128-bit lane:
- 0, 16, 1, 17, 2, 18, 3, 19,
- // Second 128-bit lane:
- 8, 24, 9, 25, 10, 26, 11, 27))
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane:
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // Second 128-bit lane:
+ 8, 24, 9, 25, 10, 26, 11, 27}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
- if (isShuffleEquivalent(Mask,
- // First 128-bit lane:
- 4, 20, 5, 21, 6, 22, 7, 23,
- // Second 128-bit lane:
- 12, 28, 13, 29, 14, 30, 15, 31))
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane:
+ 4, 20, 5, 21, 6, 22, 7, 23,
+ // Second 128-bit lane:
+ 12, 28, 13, 29, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
if (isSingleInputShuffleMask(Mask)) {
+ // There are no generalized cross-lane shuffle operations available on i16
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+ Mask, DAG);
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
@@ -10347,6 +9577,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
}
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// Otherwise fall back on generic lowering.
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
@@ -10366,17 +9602,18 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
- // There are no generalized cross-lane shuffle operations available on i8
- // element types.
- if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
- Mask, DAG);
-
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Blend;
@@ -10385,21 +9622,37 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Note that these are repeated 128-bit lane unpacks, not unpacks across all
// 256-bit lanes.
if (isShuffleEquivalent(
- Mask,
- // First 128-bit lane:
- 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
- // Second 128-bit lane:
- 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
+ V1, V2, Mask,
+ {// First 128-bit lane:
+ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+ // Second 128-bit lane:
+ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
if (isShuffleEquivalent(
- Mask,
- // First 128-bit lane:
- 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
- // Second 128-bit lane:
- 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
+ V1, V2, Mask,
+ {// First 128-bit lane:
+ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+ // Second 128-bit lane:
+ 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
if (isSingleInputShuffleMask(Mask)) {
+ // There are no generalized cross-lane shuffle operations available on i8
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
+ Mask, DAG);
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 32; ++i)
PSHUFBMask[i] =
@@ -10412,6 +9665,12 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
}
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// Otherwise fall back on generic lowering.
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
@@ -10478,6 +9737,13 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
}
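// Editor's illustration (hedged standalone sketch, not part of the patch
// above): the two v8f64 patterns matched above are just the per-128-bit-lane
// unpacks written out for four lanes of two doubles each.
#include <cstdio>

int main() {
  const int Size = 8, NumLanes = 4;
  std::printf("UNPCKL mask: ");
  for (int Lane = 0; Lane < NumLanes; ++Lane)
    std::printf("%d %d ", 2 * Lane, Size + 2 * Lane);
  std::printf("\nUNPCKH mask: ");
  for (int Lane = 0; Lane < NumLanes; ++Lane)
    std::printf("%d %d ", 2 * Lane + 1, Size + 2 * Lane + 1);
  std::printf("\n");
  return 0;
}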
@@ -10493,6 +9759,20 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane.
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ // Second 128-bit lane.
+ 8, 24, 9, 25, 12, 28, 13, 29}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane.
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ // Second 128-bit lane.
+ 10, 26, 11, 27, 14, 30, 15, 31}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
}
@@ -10508,6 +9788,13 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
}
@@ -10523,6 +9810,20 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane.
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ // Second 128-bit lane.
+ 8, 24, 9, 25, 12, 28, 13, 29}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane.
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ // Second 128-bit lane.
+ 10, 26, 11, 27, 14, 30, 15, 31}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
}
@@ -10574,8 +9875,8 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
"Cannot lower 512-bit vectors w/ basic ISA!");
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
return Broadcast;
  // Dispatch to each element type for lowering. If we don't have support for
@@ -10651,6 +9952,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
}
+ // We actually see shuffles that are entirely re-arrangements of a set of
+ // zero inputs. This mostly happens while decomposing complex shuffles into
+ // simple ones. Directly lower these as a buildvector of zeros.
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ if (Zeroable.all())
+ return getZeroVector(VT, Subtarget, DAG, dl);
+
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
@@ -10690,7 +9998,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// When the number of V1 and V2 elements are the same, try to minimize the
// number of uses of V2 in the low half of the vector. When that is tied,
// ensure that the sum of indices for V1 is equal to or lower than the sum
- // indices for V2.
+    // of indices for V2. When those are equal, try to ensure that the number
+    // of odd indices for V1 is lower than the number of odd indices for V2.
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
for (int M : SVOp->getMask().slice(0, NumElements / 2))
@@ -10707,8 +10016,18 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
SumV2Indices += i;
else if (SVOp->getMask()[i] >= 0)
SumV1Indices += i;
- if (SumV2Indices < SumV1Indices)
+ if (SumV2Indices < SumV1Indices) {
return DAG.getCommutedVectorShuffle(*SVOp);
+ } else if (SumV2Indices == SumV1Indices) {
+ int NumV1OddIndices = 0, NumV2OddIndices = 0;
+ for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+ if (SVOp->getMask()[i] >= NumElements)
+ NumV2OddIndices += i % 2;
+ else if (SVOp->getMask()[i] >= 0)
+ NumV1OddIndices += i % 2;
+ if (NumV2OddIndices < NumV1OddIndices)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+ }
}
}
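// Editor's illustration (hedged standalone sketch, not part of the patch
// above): the new tie-break added in this hunk. When the index sums for V1 and
// V2 match, the shuffle is commuted if V2 supplies fewer odd-position elements
// than V1. The mask below is a hypothetical 8-element example; indices >= 8
// come from V2 and -1 means undef.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {-1, 1, 8, -1, 10, 5, -1, -1};
  int NumElements = 8;
  int SumV1 = 0, SumV2 = 0, OddV1 = 0, OddV2 = 0;
  for (int i = 0; i < NumElements; ++i) {
    if (Mask[i] >= NumElements) {
      SumV2 += i;
      OddV2 += i % 2;
    } else if (Mask[i] >= 0) {
      SumV1 += i;
      OddV1 += i % 2;
    }
  }
  bool Commute = SumV2 < SumV1 || (SumV2 == SumV1 && OddV2 < OddV1);
  std::printf("SumV1=%d SumV2=%d OddV1=%d OddV2=%d -> commute=%s\n", SumV1,
              SumV2, OddV1, OddV2, Commute ? "yes" : "no");
  return 0;
}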
@@ -10727,1586 +10046,6 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
llvm_unreachable("Unimplemented!");
}
-
-//===----------------------------------------------------------------------===//
-// Legacy vector shuffle lowering
-//
-// This code is the legacy code handling vector shuffles until the above
-// replaces its functionality and performance.
-//===----------------------------------------------------------------------===//
-
-static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
- bool hasInt256, unsigned *MaskOut = nullptr) {
- MVT EltVT = VT.getVectorElementType();
-
- // There is no blend with immediate in AVX-512.
- if (VT.is512BitVector())
- return false;
-
- if (!hasSSE41 || EltVT == MVT::i8)
- return false;
- if (!hasInt256 && VT == MVT::v16i16)
- return false;
-
- unsigned MaskValue = 0;
- unsigned NumElems = VT.getVectorNumElements();
- // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
- unsigned NumLanes = (NumElems - 1) / 8 + 1;
- unsigned NumElemsInLane = NumElems / NumLanes;
-
- // Blend for v16i16 should be symmetric for both lanes.
- for (unsigned i = 0; i < NumElemsInLane; ++i) {
-
- int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
- int EltIdx = MaskVals[i];
-
- if ((EltIdx < 0 || EltIdx == (int)i) &&
- (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
- continue;
-
- if (((unsigned)EltIdx == (i + NumElems)) &&
- (SndLaneEltIdx < 0 ||
- (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
- MaskValue |= (1 << i);
- else
- return false;
- }
-
- if (MaskOut)
- *MaskOut = MaskValue;
- return true;
-}
-
-// Try to lower a shuffle node into a simple blend instruction.
-// This function assumes isBlendMask returns true for this
- // ShuffleVectorSDNode.
-static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
- unsigned MaskValue,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- MVT VT = SVOp->getSimpleValueType(0);
- MVT EltVT = VT.getVectorElementType();
- assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
-                    Subtarget->hasInt256()) &&
-        "Trying to lower a VECTOR_SHUFFLE to a Blend but with the wrong mask");
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- unsigned NumElems = VT.getVectorNumElements();
-
- // Convert i32 vectors to floating point if it is not AVX2.
- // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
- MVT BlendVT = VT;
- if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
- BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
- NumElems);
- V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
- V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
- }
-
- SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
- DAG.getConstant(MaskValue, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
-}
-
-/// In vector type \p VT, return true if the element at index \p InputIdx
-/// falls on a different 128-bit lane than \p OutputIdx.
-static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
- unsigned OutputIdx) {
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
-}
-
-/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
-/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
-/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
-/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
-/// zero.
-static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
- SelectionDAG &DAG) {
- MVT VT = V1.getSimpleValueType();
- assert(VT.is128BitVector() || VT.is256BitVector());
-
- MVT EltVT = VT.getVectorElementType();
- unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
- unsigned NumElts = VT.getVectorNumElements();
-
- SmallVector<SDValue, 32> PshufbMask;
- for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
- int InputIdx = MaskVals[OutputIdx];
- unsigned InputByteIdx;
-
- if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
- InputByteIdx = 0x80;
- else {
- // Cross lane is not allowed.
- if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
- return SDValue();
- InputByteIdx = InputIdx * EltSizeInBytes;
- // Index is a byte offset within the 128-bit lane.
- InputByteIdx &= 0xf;
- }
-
- for (unsigned j = 0; j < EltSizeInBytes; ++j) {
- PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
- if (InputByteIdx != 0x80)
- ++InputByteIdx;
- }
- }
-
- MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
- if (ShufVT != VT)
- V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
- return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
-}
-
-// v8i16 shuffles - Prefer shuffles in the following order:
-// 1. [all] pshuflw, pshufhw, optional move
-// 2. [ssse3] 1 x pshufb
-// 3. [ssse3] 2 x pshufb + 1 x por
-// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
-static SDValue
-LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- SmallVector<int, 8> MaskVals;
-
- // Determine if more than 1 of the words in each of the low and high quadwords
- // of the result come from the same quadword of one of the two inputs. Undef
- // mask values count as coming from any quadword, for better codegen.
- //
- // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
- // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
- unsigned LoQuad[] = { 0, 0, 0, 0 };
- unsigned HiQuad[] = { 0, 0, 0, 0 };
- // Indices of quads used.
- std::bitset<4> InputQuads;
- for (unsigned i = 0; i < 8; ++i) {
- unsigned *Quad = i < 4 ? LoQuad : HiQuad;
- int EltIdx = SVOp->getMaskElt(i);
- MaskVals.push_back(EltIdx);
- if (EltIdx < 0) {
- ++Quad[0];
- ++Quad[1];
- ++Quad[2];
- ++Quad[3];
- continue;
- }
- ++Quad[EltIdx / 4];
- InputQuads.set(EltIdx / 4);
- }
-
- int BestLoQuad = -1;
- unsigned MaxQuad = 1;
- for (unsigned i = 0; i < 4; ++i) {
- if (LoQuad[i] > MaxQuad) {
- BestLoQuad = i;
- MaxQuad = LoQuad[i];
- }
- }
-
- int BestHiQuad = -1;
- MaxQuad = 1;
- for (unsigned i = 0; i < 4; ++i) {
- if (HiQuad[i] > MaxQuad) {
- BestHiQuad = i;
- MaxQuad = HiQuad[i];
- }
- }
-
- // For SSSE3, if all 8 words of the result come from only 1 quadword of each
- // of the two input vectors, shuffle them into one input vector so only a
- // single pshufb instruction is necessary. If there are more than 2 input
- // quads, disable the next transformation since it does not help SSSE3.
- bool V1Used = InputQuads[0] || InputQuads[1];
- bool V2Used = InputQuads[2] || InputQuads[3];
- if (Subtarget->hasSSSE3()) {
- if (InputQuads.count() == 2 && V1Used && V2Used) {
- BestLoQuad = InputQuads[0] ? 0 : 1;
- BestHiQuad = InputQuads[2] ? 2 : 3;
- }
- if (InputQuads.count() > 2) {
- BestLoQuad = -1;
- BestHiQuad = -1;
- }
- }
-
- // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
- // the shuffle mask. If a quad is scored as -1, that means that it contains
- // words from all 4 input quadwords.
- SDValue NewV;
- if (BestLoQuad >= 0 || BestHiQuad >= 0) {
- int MaskV[] = {
- BestLoQuad < 0 ? 0 : BestLoQuad,
- BestHiQuad < 0 ? 1 : BestHiQuad
- };
- NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
- DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
- DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
- NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
-
- // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
- // source words for the shuffle, to aid later transformations.
- bool AllWordsInNewV = true;
- bool InOrder[2] = { true, true };
- for (unsigned i = 0; i != 8; ++i) {
- int idx = MaskVals[i];
- if (idx != (int)i)
- InOrder[i/4] = false;
- if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
- continue;
- AllWordsInNewV = false;
- break;
- }
-
- bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
- if (AllWordsInNewV) {
- for (int i = 0; i != 8; ++i) {
- int idx = MaskVals[i];
- if (idx < 0)
- continue;
- idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
- if ((idx != i) && idx < 4)
- pshufhw = false;
- if ((idx != i) && idx > 3)
- pshuflw = false;
- }
- V1 = NewV;
- V2Used = false;
- BestLoQuad = 0;
- BestHiQuad = 1;
- }
-
- // If we've eliminated the use of V2, and the new mask is a pshuflw or
- // pshufhw, that's as cheap as it gets. Return the new shuffle.
- if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
- unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
- unsigned TargetMask = 0;
- NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
- DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
- TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
- getShufflePSHUFLWImmediate(SVOp);
- V1 = NewV.getOperand(0);
- return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
- }
- }
-
- // Promote splats to a larger type which usually leads to more efficient code.
- // FIXME: Is this true if pshufb is available?
- if (SVOp->isSplat())
- return PromoteSplat(SVOp, DAG);
-
- // If we have SSSE3, and all words of the result are from 1 input vector,
- // case 2 is generated, otherwise case 3 is generated. If no SSSE3
- // is present, fall back to case 4.
- if (Subtarget->hasSSSE3()) {
- SmallVector<SDValue,16> pshufbMask;
-
- // If we have elements from both input vectors, set the high bit of the
- // shuffle mask element to zero out elements that come from V2 in the V1
- // mask, and elements that come from V1 in the V2 mask, so that the two
- // results can be OR'd together.
- bool TwoInputs = V1Used && V2Used;
- V1 = getPSHUFB(MaskVals, V1, dl, DAG);
- if (!TwoInputs)
- return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
-
- // Calculate the shuffle mask for the second input, shuffle it, and
- // OR it with the first shuffled input.
- CommuteVectorShuffleMask(MaskVals, 8);
- V2 = getPSHUFB(MaskVals, V2, dl, DAG);
- V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
- return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
- }
-
- // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
- // and update MaskVals with new element order.
- std::bitset<8> InOrder;
- if (BestLoQuad >= 0) {
- int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
- for (int i = 0; i != 4; ++i) {
- int idx = MaskVals[i];
- if (idx < 0) {
- InOrder.set(i);
- } else if ((idx / 4) == BestLoQuad) {
- MaskV[i] = idx & 3;
- InOrder.set(i);
- }
- }
- NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
- &MaskV[0]);
-
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
- NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
- NewV.getOperand(0),
- getShufflePSHUFLWImmediate(SVOp), DAG);
- }
- }
-
- // If BestHi >= 0, generate a pshufhw to put the high elements in order,
- // and update MaskVals with the new element order.
- if (BestHiQuad >= 0) {
- int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
- for (unsigned i = 4; i != 8; ++i) {
- int idx = MaskVals[i];
- if (idx < 0) {
- InOrder.set(i);
- } else if ((idx / 4) == BestHiQuad) {
- MaskV[i] = (idx & 3) + 4;
- InOrder.set(i);
- }
- }
- NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
- &MaskV[0]);
-
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
- NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
- NewV.getOperand(0),
- getShufflePSHUFHWImmediate(SVOp), DAG);
- }
- }
-
- // In case BestHi & BestLo were both -1, which means each quadword has a word
- // from each of the four input quadwords, calculate the InOrder bitvector now
- // before falling through to the insert/extract cleanup.
- if (BestLoQuad == -1 && BestHiQuad == -1) {
- NewV = V1;
- for (int i = 0; i != 8; ++i)
- if (MaskVals[i] < 0 || MaskVals[i] == i)
- InOrder.set(i);
- }
-
- // The other elements are put in the right place using pextrw and pinsrw.
- for (unsigned i = 0; i != 8; ++i) {
- if (InOrder[i])
- continue;
- int EltIdx = MaskVals[i];
- if (EltIdx < 0)
- continue;
- SDValue ExtOp = (EltIdx < 8) ?
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
- DAG.getIntPtrConstant(EltIdx)) :
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
- DAG.getIntPtrConstant(EltIdx - 8));
- NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
- DAG.getIntPtrConstant(i));
- }
- return NewV;
-}
-
-/// \brief v16i16 shuffles
-///
-/// FIXME: We only support generation of a single pshufb currently. We can
-/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
-/// well (e.g 2 x pshufb + 1 x por).
-static SDValue
-LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
-
- if (V2.getOpcode() != ISD::UNDEF)
- return SDValue();
-
- SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
- return getPSHUFB(MaskVals, V1, dl, DAG);
-}
-
-// v16i8 shuffles - Prefer shuffles in the following order:
-// 1. [ssse3] 1 x pshufb
-// 2. [ssse3] 2 x pshufb + 1 x por
-// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
-static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
- const X86Subtarget* Subtarget,
- SelectionDAG &DAG) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- ArrayRef<int> MaskVals = SVOp->getMask();
-
- // Promote splats to a larger type which usually leads to more efficient code.
- // FIXME: Is this true if pshufb is available?
- if (SVOp->isSplat())
- return PromoteSplat(SVOp, DAG);
-
- // If we have SSSE3, case 1 is generated when all result bytes come from
- // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
- // present, fall back to case 3.
-
- // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
- if (Subtarget->hasSSSE3()) {
- SmallVector<SDValue,16> pshufbMask;
-
- // If all result elements are from one input vector, then only translate
- // undef mask values to 0x80 (zero out result) in the pshufb mask.
- //
- // Otherwise, we have elements from both input vectors, and must zero out
- // elements that come from V2 in the first mask, and V1 in the second mask
- // so that we can OR them together.
- for (unsigned i = 0; i != 16; ++i) {
- int EltIdx = MaskVals[i];
- if (EltIdx < 0 || EltIdx >= 16)
- EltIdx = 0x80;
- pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
- }
- V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, pshufbMask));
-
- // As PSHUFB will zero elements with negative indices, it's safe to ignore
- // the 2nd operand if it's undefined or zero.
- if (V2.getOpcode() == ISD::UNDEF ||
- ISD::isBuildVectorAllZeros(V2.getNode()))
- return V1;
-
- // Calculate the shuffle mask for the second input, shuffle it, and
- // OR it with the first shuffled input.
- pshufbMask.clear();
- for (unsigned i = 0; i != 16; ++i) {
- int EltIdx = MaskVals[i];
- EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
- pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
- }
- V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, pshufbMask));
- return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
- }
-
- // No SSSE3 - Calculate in-place words and then fix all out-of-place words
- // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
- // the 16 different words that comprise the two doublequadword input vectors.
- V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
- V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
- SDValue NewV = V1;
- for (int i = 0; i != 8; ++i) {
- int Elt0 = MaskVals[i*2];
- int Elt1 = MaskVals[i*2+1];
-
- // This word of the result is all undef, skip it.
- if (Elt0 < 0 && Elt1 < 0)
- continue;
-
- // This word of the result is already in the correct place, skip it.
- if ((Elt0 == i*2) && (Elt1 == i*2+1))
- continue;
-
- SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
- SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
- SDValue InsElt;
-
- // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
- // using a single extract together, load it and store it.
- if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
- InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
- DAG.getIntPtrConstant(Elt1 / 2));
- NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
- DAG.getIntPtrConstant(i));
- continue;
- }
-
- // If Elt1 is defined, extract it from the appropriate source. If the
- // source byte is not also odd, shift the extracted word left 8 bits
- // otherwise clear the bottom 8 bits if we need to do an or.
- if (Elt1 >= 0) {
- InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
- DAG.getIntPtrConstant(Elt1 / 2));
- if ((Elt1 & 1) == 0)
- InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
- DAG.getConstant(8,
- TLI.getShiftAmountTy(InsElt.getValueType())));
- else if (Elt0 >= 0)
- InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
- DAG.getConstant(0xFF00, MVT::i16));
- }
- // If Elt0 is defined, extract it from the appropriate source. If the
- // source byte is not also even, shift the extracted word right 8 bits. If
- // Elt1 was also defined, OR the extracted values together before
- // inserting them in the result.
- if (Elt0 >= 0) {
- SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
- Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
- if ((Elt0 & 1) != 0)
- InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
- DAG.getConstant(8,
- TLI.getShiftAmountTy(InsElt0.getValueType())));
- else if (Elt1 >= 0)
- InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
- DAG.getConstant(0x00FF, MVT::i16));
- InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
- : InsElt0;
- }
- NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
- DAG.getIntPtrConstant(i));
- }
- return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
-}
-
-// v32i8 shuffles - Translate to VPSHUFB if possible.
-static
-SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- MVT VT = SVOp->getSimpleValueType(0);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
-
- bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
- bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
-
- // VPSHUFB may be generated if
- // (1) one of the input vectors is undefined or zeroinitializer.
- // The mask value 0x80 puts 0 in the corresponding slot of the vector.
- // And (2) the mask indices don't cross a 128-bit lane boundary.
- if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
- (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
- return SDValue();
-
- if (V1IsAllZero && !V2IsAllZero) {
- CommuteVectorShuffleMask(MaskVals, 32);
- V1 = V2;
- }
- return getPSHUFB(MaskVals, V1, dl, DAG);
-}
-
-/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
-/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
-/// done when every pair / quad of shuffle mask elements point to elements in
-/// the right sequence. e.g.
-/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
-static
-SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
- SelectionDAG &DAG) {
- MVT VT = SVOp->getSimpleValueType(0);
- SDLoc dl(SVOp);
- unsigned NumElems = VT.getVectorNumElements();
- MVT NewVT;
- unsigned Scale;
- switch (VT.SimpleTy) {
- default: llvm_unreachable("Unexpected!");
- case MVT::v2i64:
- case MVT::v2f64:
- return SDValue(SVOp, 0);
- case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
- case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
- case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
- case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
- case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
- case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
- }
-
- SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i != NumElems; i += Scale) {
- int StartIdx = -1;
- for (unsigned j = 0; j != Scale; ++j) {
- int EltIdx = SVOp->getMaskElt(i+j);
- if (EltIdx < 0)
- continue;
- if (StartIdx < 0)
- StartIdx = (EltIdx / Scale);
- if (EltIdx != (int)(StartIdx*Scale + j))
- return SDValue();
- }
- MaskVec.push_back(StartIdx);
- }
-
- SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
- SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
- return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
-}
-
-/// getVZextMovL - Return a zero-extending vector move low node.
-///
-static SDValue getVZextMovL(MVT VT, MVT OpVT,
- SDValue SrcOp, SelectionDAG &DAG,
- const X86Subtarget *Subtarget, SDLoc dl) {
- if (VT == MVT::v2f64 || VT == MVT::v4f32) {
- LoadSDNode *LD = nullptr;
- if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
- LD = dyn_cast<LoadSDNode>(SrcOp);
- if (!LD) {
- // movssrr and movsdrr do not clear top bits. Try to use movd, movq
- // instead.
- MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
- if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
- SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
- SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
- SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
- // PR2108
- OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
- OpVT,
- SrcOp.getOperand(0)
- .getOperand(0))));
- }
- }
- }
-
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
- DAG.getNode(ISD::BITCAST, dl,
- OpVT, SrcOp)));
-}
-
-/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
-/// which could not be matched by any known target specific shuffle
-static SDValue
-LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
-
- SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
- if (NewOp.getNode())
- return NewOp;
-
- MVT VT = SVOp->getSimpleValueType(0);
-
- unsigned NumElems = VT.getVectorNumElements();
- unsigned NumLaneElems = NumElems / 2;
-
- SDLoc dl(SVOp);
- MVT EltVT = VT.getVectorElementType();
- MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
- SDValue Output[2];
-
- SmallVector<int, 16> Mask;
- for (unsigned l = 0; l < 2; ++l) {
- // Build a shuffle mask for the output, discovering on the fly which
- // input vectors to use as shuffle operands (recorded in InputUsed).
- // If building a suitable shuffle vector proves too hard, then bail
- // out with UseBuildVector set.
- bool UseBuildVector = false;
- int InputUsed[2] = { -1, -1 }; // Not yet discovered.
- unsigned LaneStart = l * NumLaneElems;
- for (unsigned i = 0; i != NumLaneElems; ++i) {
- // The mask element. This indexes into the input.
- int Idx = SVOp->getMaskElt(i+LaneStart);
- if (Idx < 0) {
- // the mask element does not index into any input vector.
- Mask.push_back(-1);
- continue;
- }
-
- // The input vector this mask element indexes into.
- int Input = Idx / NumLaneElems;
-
- // Turn the index into an offset from the start of the input vector.
- Idx -= Input * NumLaneElems;
-
- // Find or create a shuffle vector operand to hold this input.
- unsigned OpNo;
- for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
- if (InputUsed[OpNo] == Input)
- // This input vector is already an operand.
- break;
- if (InputUsed[OpNo] < 0) {
- // Create a new operand for this input vector.
- InputUsed[OpNo] = Input;
- break;
- }
- }
-
- if (OpNo >= array_lengthof(InputUsed)) {
- // More than two input vectors used! Give up on trying to create a
- // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
- UseBuildVector = true;
- break;
- }
-
- // Add the mask index for the new shuffle vector.
- Mask.push_back(Idx + OpNo * NumLaneElems);
- }
-
- if (UseBuildVector) {
- SmallVector<SDValue, 16> SVOps;
- for (unsigned i = 0; i != NumLaneElems; ++i) {
- // The mask element. This indexes into the input.
- int Idx = SVOp->getMaskElt(i+LaneStart);
- if (Idx < 0) {
- SVOps.push_back(DAG.getUNDEF(EltVT));
- continue;
- }
-
- // The input vector this mask element indexes into.
- int Input = Idx / NumElems;
-
- // Turn the index into an offset from the start of the input vector.
- Idx -= Input * NumElems;
-
- // Extract the vector element by hand.
- SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
- SVOp->getOperand(Input),
- DAG.getIntPtrConstant(Idx)));
- }
-
- // Construct the output using a BUILD_VECTOR.
- Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
- } else if (InputUsed[0] < 0) {
- // No input vectors were used! The result is undefined.
- Output[l] = DAG.getUNDEF(NVT);
- } else {
- SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
- (InputUsed[0] % 2) * NumLaneElems,
- DAG, dl);
- // If only one input was used, use an undefined vector for the other.
- SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
- Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
- (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
- // At least one input vector was used. Create a new shuffle vector.
- Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
- }
-
- Mask.clear();
- }
-
- // Concatenate the result back
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
-}
-
-/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
-/// 4 elements, and match them with several different shuffle types.
-static SDValue
-LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- MVT VT = SVOp->getSimpleValueType(0);
-
- assert(VT.is128BitVector() && "Unsupported vector size");
-
- std::pair<int, int> Locs[4];
- int Mask1[] = { -1, -1, -1, -1 };
- SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
-
- unsigned NumHi = 0;
- unsigned NumLo = 0;
- for (unsigned i = 0; i != 4; ++i) {
- int Idx = PermMask[i];
- if (Idx < 0) {
- Locs[i] = std::make_pair(-1, -1);
- } else {
- assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
- if (Idx < 4) {
- Locs[i] = std::make_pair(0, NumLo);
- Mask1[NumLo] = Idx;
- NumLo++;
- } else {
- Locs[i] = std::make_pair(1, NumHi);
- if (2+NumHi < 4)
- Mask1[2+NumHi] = Idx;
- NumHi++;
- }
- }
- }
-
- if (NumLo <= 2 && NumHi <= 2) {
- // If no more than two elements come from either vector, this can be
- // implemented with two shuffles. The first shuffle gathers the elements.
- // The second shuffle, which takes the first shuffle as both of its
- // vector operands, puts the elements into the right order.
- V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
-
- int Mask2[] = { -1, -1, -1, -1 };
-
- for (unsigned i = 0; i != 4; ++i)
- if (Locs[i].first != -1) {
- unsigned Idx = (i < 2) ? 0 : 4;
- Idx += Locs[i].first * 2 + Locs[i].second;
- Mask2[i] = Idx;
- }
-
- return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
- }
-
- if (NumLo == 3 || NumHi == 3) {
- // Otherwise, we must have three elements from one vector, call it X, and
- // one element from the other, call it Y. First, use a shufps to build an
- // intermediate vector with the one element from Y and the element from X
- // that will be in the same half in the final destination (the indexes don't
- // matter). Then, use a shufps to build the final vector, taking the half
- // containing the element from Y from the intermediate, and the other half
- // from X.
- if (NumHi == 3) {
- // Normalize it so the 3 elements come from V1.
- CommuteVectorShuffleMask(PermMask, 4);
- std::swap(V1, V2);
- }
-
- // Find the element from V2.
- unsigned HiIndex;
- for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
- int Val = PermMask[HiIndex];
- if (Val < 0)
- continue;
- if (Val >= 4)
- break;
- }
-
- Mask1[0] = PermMask[HiIndex];
- Mask1[1] = -1;
- Mask1[2] = PermMask[HiIndex^1];
- Mask1[3] = -1;
- V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
-
- if (HiIndex >= 2) {
- Mask1[0] = PermMask[0];
- Mask1[1] = PermMask[1];
- Mask1[2] = HiIndex & 1 ? 6 : 4;
- Mask1[3] = HiIndex & 1 ? 4 : 6;
- return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
- }
-
- Mask1[0] = HiIndex & 1 ? 2 : 0;
- Mask1[1] = HiIndex & 1 ? 0 : 2;
- Mask1[2] = PermMask[2];
- Mask1[3] = PermMask[3];
- if (Mask1[2] >= 0)
- Mask1[2] += 4;
- if (Mask1[3] >= 0)
- Mask1[3] += 4;
- return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
- }
-
- // Break it into (shuffle shuffle_hi, shuffle_lo).
- int LoMask[] = { -1, -1, -1, -1 };
- int HiMask[] = { -1, -1, -1, -1 };
-
- int *MaskPtr = LoMask;
- unsigned MaskIdx = 0;
- unsigned LoIdx = 0;
- unsigned HiIdx = 2;
- for (unsigned i = 0; i != 4; ++i) {
- if (i == 2) {
- MaskPtr = HiMask;
- MaskIdx = 1;
- LoIdx = 0;
- HiIdx = 2;
- }
- int Idx = PermMask[i];
- if (Idx < 0) {
- Locs[i] = std::make_pair(-1, -1);
- } else if (Idx < 4) {
- Locs[i] = std::make_pair(MaskIdx, LoIdx);
- MaskPtr[LoIdx] = Idx;
- LoIdx++;
- } else {
- Locs[i] = std::make_pair(MaskIdx, HiIdx);
- MaskPtr[HiIdx] = Idx;
- HiIdx++;
- }
- }
-
- SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
- SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
- int MaskOps[] = { -1, -1, -1, -1 };
- for (unsigned i = 0; i != 4; ++i)
- if (Locs[i].first != -1)
- MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
- return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
-}
-
-static bool MayFoldVectorLoad(SDValue V) {
- while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
-
- if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
- V = V.getOperand(0);
- if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
- V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
- // BUILD_VECTOR (load), undef
- V = V.getOperand(0);
-
- return MayFoldLoad(V);
-}
-
-static
-SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
-
- // Canonicalize to v2f64.
- V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
- return DAG.getNode(ISD::BITCAST, dl, VT,
- getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
- V1, DAG));
-}
-
-static
-SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
- bool HasSSE2) {
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getSimpleValueType();
-
- assert(VT != MVT::v2i64 && "unsupported shuffle type");
-
- if (HasSSE2 && VT == MVT::v2f64)
- return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
-
- // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
- return DAG.getNode(ISD::BITCAST, dl, VT,
- getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
- DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
- DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
-}
-
-static
-SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getSimpleValueType();
-
- assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
- "unsupported shuffle type");
-
- if (V2.getOpcode() == ISD::UNDEF)
- V2 = V1;
-
- // v4i32 or v4f32
- return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
-}
-
-static
-SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getSimpleValueType();
- unsigned NumElems = VT.getVectorNumElements();
-
- // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
- // operand of these instructions is only memory, so check if there's a
- // potential load folding here, otherwise use SHUFPS or MOVSD to match the
- // same masks.
- bool CanFoldLoad = false;
-
- // Trivial case, when V2 comes from a load.
- if (MayFoldVectorLoad(V2))
- CanFoldLoad = true;
-
- // When V1 is a load, it can be folded later into a store in isel, example:
- // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
- // turns into:
- // (MOVLPSmr addr:$src1, VR128:$src2)
- // So, recognize this potential and also use MOVLPS or MOVLPD
- else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
- CanFoldLoad = true;
-
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- if (CanFoldLoad) {
- if (HasSSE2 && NumElems == 2)
- return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
-
- if (NumElems == 4)
- // If we don't care about the second element, proceed to use movss.
- if (SVOp->getMaskElt(1) != -1)
- return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
- }
-
- // movl and movlp will both match v2i64, but v2i64 is never matched by
- // movl earlier because we make it strict to avoid messing with the movlp load
- // folding logic (see the code above getMOVLP call). Match it here then,
- // this is horrible, but will stay like this until we move all shuffle
- // matching to x86 specific nodes. Note that for the 1st condition all
- // types are matched with movsd.
- if (HasSSE2) {
- // FIXME: isMOVLMask should be checked and matched before getMOVLP,
- // as to remove this logic from here, as much as possible
- if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
- return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
- return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
- }
-
- assert(VT != MVT::v4i32 && "unsupported shuffle type");
-
- // Invert the operand order and use SHUFPS to match it.
- return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
- getShuffleSHUFImmediate(SVOp), DAG);
-}
-
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
- SelectionDAG &DAG) {
- SDLoc dl(Load);
- MVT VT = Load->getSimpleValueType(0);
- MVT EVT = VT.getVectorElementType();
- SDValue Addr = Load->getOperand(1);
- SDValue NewAddr = DAG.getNode(
- ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
- DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
-
- SDValue NewLoad =
- DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
- DAG.getMachineFunction().getMachineMemOperand(
- Load->getMemOperand(), 0, EVT.getStoreSize()));
- return NewLoad;
-}
-
-// It is only safe to call this function if isINSERTPSMask is true for
-// this shufflevector mask.
-static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
- SelectionDAG &DAG) {
- // Generate an insertps instruction when inserting an f32 from memory onto a
- // v4f32 or when copying a member from one v4f32 to another.
- // We also use it for transferring i32 from one register to another,
- // since it simply copies the same bits.
- // If we're transferring an i32 from memory to a specific element in a
- // register, we output a generic DAG that will match the PINSRD
- // instruction.
- MVT VT = SVOp->getSimpleValueType(0);
- MVT EVT = VT.getVectorElementType();
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- auto Mask = SVOp->getMask();
- assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
- "unsupported vector type for insertps/pinsrd");
-
- auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
- auto FromV2Predicate = [](const int &i) { return i >= 4; };
- int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
-
- SDValue From;
- SDValue To;
- unsigned DestIndex;
- if (FromV1 == 1) {
- From = V1;
- To = V2;
- DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
- Mask.begin();
-
- // If we have 1 element from each vector, we have to check if we're
- // changing V1's element's place. If so, we're done. Otherwise, we
- // should assume we're changing V2's element's place and behave
- // accordingly.
- int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
- assert(DestIndex <= INT32_MAX && "truncated destination index");
- if (FromV1 == FromV2 &&
- static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
- From = V2;
- To = V1;
- DestIndex =
- std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
- }
- } else {
- assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
- "More than one element from V1 and from V2, or no elements from one "
- "of the vectors. This case should not have returned true from "
- "isINSERTPSMask");
- From = V2;
- To = V1;
- DestIndex =
- std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
- }
-
- // Get an index into the source vector in the range [0,4) (the mask is
- // in the range [0,8) because it can address V1 and V2)
- unsigned SrcIndex = Mask[DestIndex] % 4;
- if (MayFoldLoad(From)) {
- // Trivial case, when From comes from a load and is only used by the
- // shuffle. Make it use insertps from the vector that we need from that
- // load.
- SDValue NewLoad =
- NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
- if (!NewLoad.getNode())
- return SDValue();
-
- if (EVT == MVT::f32) {
- // Create this as a scalar to vector to match the instruction pattern.
- SDValue LoadScalarToVector =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
- SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
- InsertpsMask);
- } else { // EVT == MVT::i32
- // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
- // instruction, to match the PINSRD instruction, which loads an i32 to a
- // certain vector element.
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
- DAG.getConstant(DestIndex, MVT::i32));
- }
- }
-
- // Vector-element-to-vector
- SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
-}
-
-// Reduce a vector shuffle to zext.
-static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- // PMOVZX is only available from SSE41.
- if (!Subtarget->hasSSE41())
- return SDValue();
-
- MVT VT = Op.getSimpleValueType();
-
- // Only AVX2 supports 256-bit vector integer extending.
- if (!Subtarget->hasInt256() && VT.is256BitVector())
- return SDValue();
-
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- SDLoc DL(Op);
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extending is a unary operation and the element type of the source vector
- // won't be equal to or larger than i64.
- if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
- VT.getVectorElementType() == MVT::i64)
- return SDValue();
-
- // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
- unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
- while ((1U << Shift) < NumElems) {
- if (SVOp->getMaskElt(1U << Shift) == 1)
- break;
- Shift += 1;
- // The maximal ratio is 8, i.e. from i8 to i64.
- if (Shift > 3)
- return SDValue();
- }
-
- // Check the shuffle mask.
- unsigned Mask = (1U << Shift) - 1;
- for (unsigned i = 0; i != NumElems; ++i) {
- int EltIdx = SVOp->getMaskElt(i);
- if ((i & Mask) != 0 && EltIdx != -1)
- return SDValue();
- if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
- return SDValue();
- }
-
- unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
- MVT NeVT = MVT::getIntegerVT(NBits);
- MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
-
- if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
- return SDValue();
-
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
-}
-
-static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- MVT VT = Op.getSimpleValueType();
- SDLoc dl(Op);
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
-
- if (isZeroShuffle(SVOp))
- return getZeroVector(VT, Subtarget, DAG, dl);
-
- // Handle splat operations
- if (SVOp->isSplat()) {
- // Use vbroadcast whenever the splat comes from a foldable load
- SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
- if (Broadcast.getNode())
- return Broadcast;
- }
-
- // Check integer expanding shuffles.
- SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
-
- // If the shuffle can be profitably rewritten as a narrower shuffle, then
- // do it!
- if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
- VT == MVT::v32i8) {
- SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
- if (NewOp.getNode())
- return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
- } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
- // FIXME: Figure out a cleaner way to do this.
- if (ISD::isBuildVectorAllZeros(V2.getNode())) {
- SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
- if (NewOp.getNode()) {
- MVT NewVT = NewOp.getSimpleValueType();
- if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
- NewVT, true, false))
- return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
- dl);
- }
- } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
- SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
- if (NewOp.getNode()) {
- MVT NewVT = NewOp.getSimpleValueType();
- if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
- return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
- dl);
- }
- }
- }
- return SDValue();
-}
-
-SDValue
-X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getSimpleValueType();
- SDLoc dl(Op);
- unsigned NumElems = VT.getVectorNumElements();
- bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
- bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
- bool V1IsSplat = false;
- bool V2IsSplat = false;
- bool HasSSE2 = Subtarget->hasSSE2();
- bool HasFp256 = Subtarget->hasFp256();
- bool HasInt256 = Subtarget->hasInt256();
- MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize = MF.getFunction()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
-
- // Check if we should use the experimental vector shuffle lowering. If so,
- // delegate completely to that code path.
- if (ExperimentalVectorShuffleLowering)
- return lowerVectorShuffle(Op, Subtarget, DAG);
-
- assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
-
- if (V1IsUndef && V2IsUndef)
- return DAG.getUNDEF(VT);
-
- // When we create a shuffle node we put the UNDEF node as the second operand,
- // but in some cases the first operand may be transformed to UNDEF.
- // In this case we should just commute the node.
- if (V1IsUndef)
- return DAG.getCommutedVectorShuffle(*SVOp);
-
- // Vector shuffle lowering takes 3 steps:
- //
- // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
- // narrowing and commutation of operands should be handled.
- // 2) Matching of shuffles with known shuffle masks to x86 target specific
- // shuffle nodes.
- // 3) Rewriting of unmatched masks into new generic shuffle operations,
- // so the shuffle can be broken into other shuffles and the legalizer can
- // try the lowering again.
- //
- // The general idea is that no vector_shuffle operation should be left to
- // be matched during isel, all of them must be converted to a target specific
- // node here.
-
- // Normalize the input vectors. Here splats, zeroed vectors, profitable
- // narrowing and commutation of operands should be handled. The actual code
- // doesn't include all of those, work in progress...
- SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
-
- SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
-
- // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
- // unpckh_undef). Only use pshufd if speed is more important than size.
- if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
- if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
- if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
- V2IsUndef && MayFoldVectorLoad(V1))
- return getMOVDDup(Op, dl, V1, DAG);
-
- if (isMOVHLPS_v_undef_Mask(M, VT))
- return getMOVHighToLow(Op, dl, DAG);
-
- // Used to match splats
- if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
- (VT == MVT::v2f64 || VT == MVT::v2i64))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
- if (isPSHUFDMask(M, VT)) {
- // The actual implementation will match the mask in the if above and then
- // during isel it can match several different instructions, not only pshufd
- // as its name says, sad but true, emulate the behavior for now...
- if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
- return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
-
- unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
-
- if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
- return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
-
- if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
- return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
- DAG);
-
- return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
- TargetMask, DAG);
- }
-
- if (isPALIGNRMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
- getShufflePALIGNRImmediate(SVOp),
- DAG);
-
- if (isVALIGNMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
- getShuffleVALIGNImmediate(SVOp),
- DAG);
-
- // Check if this can be converted into a logical shift.
- bool isLeft = false;
- unsigned ShAmt = 0;
- SDValue ShVal;
- bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
- if (isShift && ShVal.hasOneUse()) {
- // If the shifted value has multiple uses, it may be cheaper to use
- // v_set0 + movlhps or movhlps, etc.
- MVT EltVT = VT.getVectorElementType();
- ShAmt *= EltVT.getSizeInBits();
- return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
- }
-
- if (isMOVLMask(M, VT)) {
- if (ISD::isBuildVectorAllZeros(V1.getNode()))
- return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
- if (!isMOVLPMask(M, VT)) {
- if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
- return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
-
- if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
- }
- }
-
- // FIXME: fold these into legal mask.
- if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
- return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
-
- if (isMOVHLPSMask(M, VT))
- return getMOVHighToLow(Op, dl, DAG);
-
- if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
-
- if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
-
- if (isMOVLPMask(M, VT))
- return getMOVLP(Op, dl, DAG, HasSSE2);
-
- if (ShouldXformToMOVHLPS(M, VT) ||
- ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
- return DAG.getCommutedVectorShuffle(*SVOp);
-
- if (isShift) {
- // No better options. Use a vshldq / vsrldq.
- MVT EltVT = VT.getVectorElementType();
- ShAmt *= EltVT.getSizeInBits();
- return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
- }
-
- bool Commuted = false;
- // FIXME: This should also accept a bitcast of a splat? Be careful, not
- // 1,1,1,1 -> v8i16 though.
- BitVector UndefElements;
- if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
- if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
- V1IsSplat = true;
- if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
- if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
- V2IsSplat = true;
-
- // Canonicalize the splat or undef, if present, to be on the RHS.
- if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
- CommuteVectorShuffleMask(M, NumElems);
- std::swap(V1, V2);
- std::swap(V1IsSplat, V2IsSplat);
- Commuted = true;
- }
-
- if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
- // Shuffling low element of v1 into undef, just return v1.
- if (V2IsUndef)
- return V1;
- // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
- // the instruction selector will not match, so get a canonical MOVL with
- // swapped operands to undo the commute.
- return getMOVL(DAG, dl, VT, V2, V1);
- }
-
- if (isUNPCKLMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-
- if (isUNPCKHMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
-
- if (V2IsSplat) {
- // Normalize mask so all entries that point to V2 point to its first
- // element, then try to match unpck{h|l} again. If it matches, return a
- // new vector_shuffle with the corrected mask.
- SmallVector<int, 8> NewMask(M.begin(), M.end());
- NormalizeMask(NewMask, NumElems);
- if (isUNPCKLMask(NewMask, VT, HasInt256, true))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
- if (isUNPCKHMask(NewMask, VT, HasInt256, true))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
- }
-
- if (Commuted) {
- // Commute it back and try unpck* again.
- // FIXME: this seems wrong.
- CommuteVectorShuffleMask(M, NumElems);
- std::swap(V1, V2);
- std::swap(V1IsSplat, V2IsSplat);
-
- if (isUNPCKLMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-
- if (isUNPCKHMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
- }
-
- // Normalize the node to match x86 shuffle ops if needed
- if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
- return DAG.getCommutedVectorShuffle(*SVOp);
-
- // The checks below are all present in isShuffleMaskLegal, but they are
- // inlined here right now to enable us to directly emit target specific
- // nodes, and remove one by one until they don't return Op anymore.
-
- if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
- SVOp->getSplatIndex() == 0 && V2IsUndef) {
- if (VT == MVT::v2f64 || VT == MVT::v2i64)
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
- }
-
- if (isPSHUFHWMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
- getShufflePSHUFHWImmediate(SVOp),
- DAG);
-
- if (isPSHUFLWMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
- getShufflePSHUFLWImmediate(SVOp),
- DAG);
-
- unsigned MaskValue;
- if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
- &MaskValue))
- return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
-
- if (isSHUFPMask(M, VT))
- return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
- getShuffleSHUFImmediate(SVOp), DAG);
-
- if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
- if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
- //===--------------------------------------------------------------------===//
- // Generate target specific nodes for 128 or 256-bit shuffles only
- // supported in the AVX instruction set.
- //
-
- // Handle VMOVDDUPY permutations
- if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
- return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
-
- // Handle VPERMILPS/D* permutations
- if (isVPERMILPMask(M, VT)) {
- if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
- return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
- getShuffleSHUFImmediate(SVOp), DAG);
- return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
- getShuffleSHUFImmediate(SVOp), DAG);
- }
-
- unsigned Idx;
- if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
- return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
- Idx*(NumElems/2), DAG, dl);
-
- // Handle VPERM2F128/VPERM2I128 permutations
- if (isVPERM2X128Mask(M, VT, HasFp256))
- return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
- V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
-
- if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
- return getINSERTPS(SVOp, dl, DAG);
-
- unsigned Imm8;
- if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
- return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
-
- if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
- VT.is512BitVector()) {
- MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
- MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
- SmallVector<SDValue, 16> permclMask;
- for (unsigned i = 0; i != NumElems; ++i) {
- permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
- }
-
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
- if (V2IsUndef)
- // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
- return DAG.getNode(X86ISD::VPERMV, dl, VT,
- DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
- return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
- DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
- }
-
- //===--------------------------------------------------------------------===//
- // Since no target specific shuffle was selected for this generic one,
- // lower it into other known shuffles. FIXME: this isn't true yet, but
- // this is the plan.
- //
-
- // Handle v8i16 specifically since SSE can do byte extraction and insertion.
- if (VT == MVT::v8i16) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
- }
-
- if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
- if (NewOp.getNode())
- return NewOp;
- }
-
- if (VT == MVT::v16i8) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
- }
-
- if (VT == MVT::v32i8) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
- }
-
- // Handle all 128-bit wide vectors with 4 elements, and match them with
- // several different shuffle types.
- if (NumElems == 4 && VT.is128BitVector())
- return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
-
- // Handle general 256-bit shuffles
- if (VT.is256BitVector())
- return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
-
- return SDValue();
-}
-
// This function assumes its argument is a BUILD_VECTOR of constants or
// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
// true.
@@ -12344,48 +10083,29 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
return true;
}
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+/// \brief Try to lower a VSELECT instruction to a vector shuffle.
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- MVT EltVT = VT.getVectorElementType();
- unsigned NumElems = VT.getVectorNumElements();
-
- // There is no blend with immediate in AVX-512.
- if (VT.is512BitVector())
- return SDValue();
-
- if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
- return SDValue();
- if (!Subtarget->hasInt256() && VT == MVT::v16i16)
- return SDValue();
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
+ auto *CondBV = cast<BuildVectorSDNode>(Cond);
- // Check the mask for BLEND and build the value.
- unsigned MaskValue = 0;
- if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
- return SDValue();
-
- // Convert i32 vectors to floating point if it is not AVX2.
- // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
- MVT BlendVT = VT;
- if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
- BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
- NumElems);
- LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
- RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+ // Only non-legal VSELECTs reach this lowering; convert those into generic
+ // shuffles and re-use the shuffle lowering path for blends.
+ SmallVector<int, 32> Mask;
+ for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
+ SDValue CondElt = CondBV->getOperand(i);
+ Mask.push_back(
+ isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
}
-
- SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
- DAG.getConstant(MaskValue, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+ return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
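A minimal scalar sketch of the mask construction above, assuming one boolean condition per lane (buildBlendMask is a hypothetical helper, not part of this patch): lane i picks element i of LHS when the condition is true and element i + Size of the concatenated <LHS, RHS> pair otherwise.

#include <vector>

// Illustrative only: true lanes come from LHS (index i), false lanes from
// RHS (index i + Size), mirroring the shuffle mask built for
// constant-condition VSELECT nodes.
static std::vector<int> buildBlendMask(const std::vector<bool> &Cond) {
  const int Size = static_cast<int>(Cond.size());
  std::vector<int> Mask;
  for (int i = 0; i < Size; ++i)
    Mask.push_back(Cond[i] ? i : i + Size);
  return Mask;
}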
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -12396,28 +10116,41 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
- SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
+ // Try to lower this to a blend-style vector shuffle. This can handle all
+ // constant condition cases.
+ SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG);
if (BlendOp.getNode())
return BlendOp;
- // Some types for vselect were previously set to Expand, not Legal or
- // Custom. Return an empty SDValue so we fall-through to Expand, after
- // the Custom lowering phase.
- MVT VT = Op.getSimpleValueType();
- switch (VT.SimpleTy) {
+ // Variable blends are only legal from SSE4.1 onward.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ // Only some types will be legal on some subtargets. If we can emit a legal
+  // VSELECT-matching blend, return Op, but if we need to expand, return
+ // a null value.
+ switch (Op.getSimpleValueType().SimpleTy) {
default:
- break;
+ // Most of the vector types have blends past SSE4.1.
+ return Op;
+
+ case MVT::v32i8:
+ // The byte blends for AVX vectors were introduced only in AVX2.
+ if (Subtarget->hasAVX2())
+ return Op;
+
+ return SDValue();
+
case MVT::v8i16:
case MVT::v16i16:
+ // AVX-512 BWI and VLX features support VSELECT with i16 elements.
if (Subtarget->hasBWI() && Subtarget->hasVLX())
- break;
+ return Op;
+
+ // FIXME: We should custom lower this by fixing the condition and using i8
+ // blends.
return SDValue();
}
-
- // We couldn't create a "Blend with immediate" node.
- // This node should still be legal, but we'll have to emit a blendv*
- // instruction.
- return Op;
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
@@ -12493,6 +10226,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
MVT EltVT = Op.getSimpleValueType();
assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
+ "Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
// extend vector to VR512
@@ -12506,6 +10241,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
+ rc = getRegClassFor(MVT::v16i1);
unsigned MaxSift = rc->getSize()*8 - 1;
Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, MVT::i8));
@@ -12631,7 +10368,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
/// Insert one bit to mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
-SDValue
+SDValue
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
@@ -12644,7 +10381,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
// insert element and then truncate the result.
MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
- SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+ SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
@@ -12815,27 +10552,47 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- if (Subtarget->hasFp256()) {
- SDLoc dl(Op.getNode());
- SDValue Vec = Op.getNode()->getOperand(0);
- SDValue SubVec = Op.getNode()->getOperand(1);
- SDValue Idx = Op.getNode()->getOperand(2);
-
- if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
- Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
- SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
- }
+ if (!Subtarget->hasAVX())
+ return SDValue();
- if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
- SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = Op.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // Fold two 16-byte subvector loads into one 32-byte load:
+ // (insert_subvector (insert_subvector undef, (load addr), 0),
+ // (load addr + 16), Elts/2)
+ // --> load32 addr
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+ !Subtarget->isUnalignedMem32Slow()) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
+ if (Idx2->getZExtValue() == 0) {
+ SDValue Ops[] = { SubVec2, SubVec };
+ SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+ if (LD.getNode())
+ return LD;
+ }
}
}
+
+ if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+ SubVecVT.is128BitVector())
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+ if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
+ return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
return SDValue();
}
@@ -13392,7 +11149,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
}
return SDValue();
}
-
+
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
@@ -14039,7 +11796,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
}
-
+
SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
SDValue CP = DAG.getConstantPool(C, getPointerTy());
@@ -14233,7 +11990,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
EltVT = VT.getVectorElementType();
NumElts = VT.getVectorNumElements();
}
-
+
unsigned EltBits = EltVT.getSizeInBits();
LLVMContext *Context = DAG.getContext();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
@@ -14260,7 +12017,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
}
-
+
// If not vector, then scalar.
unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
@@ -14290,19 +12047,17 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
- // First get the sign bit of second operand.
- SmallVector<Constant*,4> CV;
- if (SrcVT == MVT::f64) {
- const fltSemantics &Sem = APFloat::IEEEdouble;
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
- } else {
- const fltSemantics &Sem = APFloat::IEEEsingle;
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- }
+ const fltSemantics &Sem =
+ VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
+ const unsigned SizeInBits = VT.getSizeInBits();
+
+ SmallVector<Constant *, 4> CV(
+ VT == MVT::f64 ? 2 : 4,
+ ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
+
+ // First, clear all bits but the sign bit from the second operand (sign).
+ CV[0] = ConstantFP::get(*Context,
+ APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
Constant *C = ConstantVector::get(CV);
SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
@@ -14310,40 +12065,30 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
false, false, false, 16);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
- // Shift sign bit right or left if the two operands have different types.
- if (SrcVT.bitsGT(VT)) {
- // Op0 is MVT::f32, Op1 is MVT::f64.
- SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
- SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
- DAG.getConstant(32, MVT::i32));
- SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
- SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
- DAG.getIntPtrConstant(0));
- }
-
- // Clear first operand sign bit.
- CV.clear();
- if (VT == MVT::f64) {
- const fltSemantics &Sem = APFloat::IEEEdouble;
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
- APInt(64, ~(1ULL << 63)))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+ // Next, clear the sign bit from the first operand (magnitude).
+ // If it's a constant, we can clear it here.
+ if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
+ APFloat APF = Op0CN->getValueAPF();
+ // If the magnitude is a positive zero, the sign bit alone is enough.
+ if (APF.isPosZero())
+ return SignBit;
+ APF.clearSign();
+ CV[0] = ConstantFP::get(*Context, APF);
} else {
- const fltSemantics &Sem = APFloat::IEEEsingle;
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
- APInt(32, ~(1U << 31)))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ CV[0] = ConstantFP::get(
+ *Context,
+ APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
}
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
- SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
- SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
-
- // Or the value with the sign bit.
+ SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, 16);
+ // If the magnitude operand wasn't a constant, we need to AND out the sign.
+ if (!isa<ConstantFPSDNode>(Op0))
+ Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
+
+ // OR the magnitude value with the sign bit.
return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}
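The constant-pool masks above implement the scalar identity copysign(mag, sgn) = (mag & ~signbit) | (sgn & signbit). A self-contained f64 sketch under that assumption (copysign_bits is illustrative, not part of this patch):

#include <cstdint>
#include <cstring>

// Keep only the sign bit of Sign, keep everything but the sign bit of Mag,
// then OR the two halves back together.
static double copysign_bits(double Mag, double Sign) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  const uint64_t SignBit = 1ULL << 63;
  const uint64_t R = (M & ~SignBit) | (S & SignBit);
  double Res;
  std::memcpy(&Res, &R, sizeof(Res));
  return Res;
}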
@@ -14473,11 +12218,11 @@ static bool hasNonFlagsUse(SDValue Op) {
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- // KORTEST instruction should be selected
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
- DAG.getConstant(0, Op.getValueType()));
-
+ if (Op.getValueType() == MVT::i1) {
+ SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
+ DAG.getConstant(0, MVT::i8));
+ }
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
@@ -14697,9 +12442,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
DAG.getConstant(0, Op.getValueType()));
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- SmallVector<SDValue, 4> Ops;
- for (unsigned i = 0; i != NumOperands; ++i)
- Ops.push_back(Op.getOperand(i));
+ SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesWith(Op, New);
@@ -14717,16 +12460,16 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if (Op0.getValueType() == MVT::i1)
llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
}
-
+
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
- // Do the comparison at i32 if it's smaller, besides the Atom case.
- // This avoids subregister aliasing issues. Keep the smaller reference
- // if we're optimizing for size, however, as that'll allow better folding
+ // Do the comparison at i32 if it's smaller, besides the Atom case.
+ // This avoids subregister aliasing issues. Keep the smaller reference
+ // if we're optimizing for size, however, as that'll allow better folding
// of memory operations.
if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
- !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::MinSize) &&
+ !DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::MinSize) &&
!Subtarget->isAtom()) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
@@ -14780,7 +12523,7 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
return SDValue();
EVT VT = Op.getValueType();
-
+
// SSE1 has rsqrtss and rsqrtps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
@@ -14808,9 +12551,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
// significant digits in the divisor.
if (!Subtarget->useReciprocalEst())
return SDValue();
-
+
EVT VT = Op.getValueType();
-
+
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
@@ -15307,8 +13050,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
cast<ConstantSDNode>(Op1)->isNullValue() &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
- if (NewSetCC.getNode())
+ if (NewSetCC.getNode()) {
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
return NewSetCC;
+ }
}
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
@@ -15629,11 +13375,11 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget
((Subtarget->hasDQI() && Subtarget->hasVLX() &&
VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
-
+
((Subtarget->hasDQI() && VT.is512BitVector() &&
VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
-
+
unsigned int NumElts = VT.getVectorNumElements();
if (NumElts != 8 && NumElts != 16)
@@ -15718,6 +13464,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
@@ -15797,9 +13544,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
// Attempt to load the original value using scalar loads.
// Find the largest scalar type that divides the total loaded size.
MVT SclrLoadTy = MVT::i8;
- for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
- tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
- MVT Tp = (MVT::SimpleValueType)tp;
+ for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
}
@@ -16232,7 +13977,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
- bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
+ bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
SplitStack;
SDLoc dl(Op);
@@ -16258,7 +14003,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
@@ -16316,8 +14061,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
@@ -16427,21 +14171,16 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!DAG.getTarget().Options.UseSoftFloat &&
- !(DAG.getMachineFunction()
- .getFunction()->getAttributes()
- .hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoImplicitFloat)) &&
+ !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
// Insert VAARG_64 node into the DAG
// VAARG_64 returns two values: Variable Argument Address, Chain
- SmallVector<SDValue, 11> InstOps;
- InstOps.push_back(Chain);
- InstOps.push_back(SrcPtr);
- InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
- InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
- InstOps.push_back(DAG.getConstant(Align, MVT::i32));
+ SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, MVT::i32),
+ DAG.getConstant(ArgMode, MVT::i8),
+ DAG.getConstant(Align, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
VTs, InstOps, MVT::i64,
@@ -16558,7 +14297,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
SelectionDAG &DAG) {
- assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
+ MVT SVT = ShAmt.getSimpleValueType();
+ assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
// Catch shift-by-constant.
if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
@@ -16573,13 +14313,28 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
}
- // Need to build a vector containing shift amount
- // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0
- SDValue ShOps[4];
- ShOps[0] = ShAmt;
- ShOps[1] = DAG.getConstant(0, MVT::i32);
- ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+ ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
+ // Let the shuffle legalizer expand this shift amount node.
+ SDValue Op0 = ShAmt.getOperand(0);
+ Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
+ ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
+ } else {
+ // Need to build a vector containing shift amount.
+    // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
+ SmallVector<SDValue, 4> ShOps;
+ ShOps.push_back(ShAmt);
+ if (SVT == MVT::i32) {
+ ShOps.push_back(DAG.getConstant(0, SVT));
+ ShOps.push_back(DAG.getUNDEF(SVT));
+ }
+ ShOps.push_back(DAG.getUNDEF(SVT));
+
+ MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
+ }
// The return type has to be a 128-bit type with the same element
// type as the input type.
@@ -16628,52 +14383,28 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
}
-static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- return X86ISD::FMADD;
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- return X86ISD::FMSUB;
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- return X86ISD::FNMADD;
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- return X86ISD::FNMSUB;
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- return X86ISD::FMADDSUB;
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
- return X86ISD::FMSUBADD;
- }
+/// \brief Creates an SDNode for a predicated scalar operation.
+/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
+/// The mask comes in as MVT::i8 and should be truncated
+/// to MVT::i1 while lowering masking intrinsics.
+/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
+/// "X86select" instead of "vselect". We just can't create the "vselect" node for
+/// a scalar instruction.
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ if (isAllOnes(Mask))
+ return Op;
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ // The mask should be of type MVT::i1
+ SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+ if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
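A one-line scalar model of the predicated select produced above, assuming the low bit of the i8 mask is the i1 predicate (scalarMaskingSelect is hypothetical, not part of this patch):

#include <cstdint>

// Low mask bit chooses the fresh result; otherwise the preserved source wins.
template <typename T>
static T scalarMaskingSelect(uint8_t Mask, T Op, T PreservedSrc) {
  return (Mask & 1) ? Op : PreservedSrc;
}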
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
@@ -16701,7 +14432,73 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
RoundingMode),
Mask, Src0, Subtarget, DAG);
}
-
+ case INTR_TYPE_SCALAR_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src0 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ // There are 2 kinds of intrinsics in this group:
+    // (1) With suppress-all-exceptions (sae) - 6 operands
+ // (2) With rounding mode and sae - 7 operands.
+ if (Op.getNumOperands() == 6) {
+ SDValue Sae = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ RoundingMode, Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ case INTR_TYPE_2OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+    // First, we check whether the intrinsic may have a non-default rounding mode
+    // (IntrData->Opc1 != 0); if so, we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1,Src2),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+    // First, we check whether the intrinsic may have a non-default rounding mode
+    // (IntrData->Opc1 != 0); if so, we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, Src1, Subtarget, DAG);
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3),
+ Mask, Src1, Subtarget, DAG);
+ }
case CMP_MASK:
case CMP_MASK_CC: {
// Comparison intrinsics with masks.
@@ -16751,9 +14548,45 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
case VSHIFT_MASK:
- return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
- Op.getOperand(1), Op.getOperand(2), DAG),
- Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);;
+ return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
+ Op.getSimpleValueType(),
+ Op.getOperand(1),
+ Op.getOperand(2), DAG),
+ Op.getOperand(4), Op.getOperand(3), Subtarget,
+ DAG);
+ case COMPRESS_EXPAND_IN_REG: {
+ SDValue Mask = Op.getOperand(3);
+ SDValue DataToCompress = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ if (isAllOnes(Mask)) // return data as is
+ return Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDLoc dl(Op);
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
+ PassThru);
+ }
+ case BLEND: {
+ SDValue Mask = Op.getOperand(3);
+ EVT VT = Op.getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDLoc dl(Op);
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
+ Op.getOperand(2));
+ }
default:
break;
}
@@ -16762,138 +14595,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- // Arithmetic intrinsics.
- case Intrinsic::x86_sse2_pmulu_dq:
- case Intrinsic::x86_avx2_pmulu_dq:
- return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse41_pmuldq:
- case Intrinsic::x86_avx2_pmul_dq:
- return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pmulhu_w:
- case Intrinsic::x86_avx2_pmulhu_w:
- return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pmulh_w:
- case Intrinsic::x86_avx2_pmulh_w:
- return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- // SSE/SSE2/AVX floating point max/min intrinsics.
- case Intrinsic::x86_sse_max_ps:
- case Intrinsic::x86_sse2_max_pd:
- case Intrinsic::x86_avx_max_ps_256:
- case Intrinsic::x86_avx_max_pd_256:
- case Intrinsic::x86_sse_min_ps:
- case Intrinsic::x86_sse2_min_pd:
- case Intrinsic::x86_avx_min_ps_256:
- case Intrinsic::x86_avx_min_pd_256: {
- unsigned Opcode;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_sse_max_ps:
- case Intrinsic::x86_sse2_max_pd:
- case Intrinsic::x86_avx_max_ps_256:
- case Intrinsic::x86_avx_max_pd_256:
- Opcode = X86ISD::FMAX;
- break;
- case Intrinsic::x86_sse_min_ps:
- case Intrinsic::x86_sse2_min_pd:
- case Intrinsic::x86_avx_min_ps_256:
- case Intrinsic::x86_avx_min_pd_256:
- Opcode = X86ISD::FMIN;
- break;
- }
- return DAG.getNode(Opcode, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
- }
-
- // AVX2 variable shift intrinsics
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx2_psllv_q_256:
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx2_psrlv_q_256:
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256: {
- unsigned Opcode;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx2_psllv_q_256:
- Opcode = ISD::SHL;
- break;
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx2_psrlv_q_256:
- Opcode = ISD::SRL;
- break;
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- Opcode = ISD::SRA;
- break;
- }
- return DAG.getNode(Opcode, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
- }
-
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packsswb:
- return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_packuswb_128:
- case Intrinsic::x86_sse41_packusdw:
- case Intrinsic::x86_avx2_packuswb:
- case Intrinsic::x86_avx2_packusdw:
- return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_ssse3_pshuf_b_128:
- case Intrinsic::x86_avx2_pshuf_b:
- return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pshuf_d:
- return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pshufl_w:
- return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pshufh_w:
- return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_ssse3_psign_b_128:
- case Intrinsic::x86_ssse3_psign_w_128:
- case Intrinsic::x86_ssse3_psign_d_128:
- case Intrinsic::x86_avx2_psign_b:
- case Intrinsic::x86_avx2_psign_w:
- case Intrinsic::x86_avx2_psign_d:
- return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_avx2_permd:
- case Intrinsic::x86_avx2_permps:
- // Operands intentionally swapped. Mask is last operand to intrinsic,
- // but second operand for node/instruction.
- return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
- Op.getOperand(2), Op.getOperand(1));
-
case Intrinsic::x86_avx512_mask_valign_q_512:
case Intrinsic::x86_avx512_mask_valign_d_512:
// Vector source operands are swapped.
@@ -17056,58 +14757,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
-
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
- auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
- if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
- return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
- dl, Op.getValueType(),
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3)),
- Op.getOperand(4), Op.getOperand(1),
- Subtarget, DAG);
- else
- return SDValue();
- }
-
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
}
}
@@ -17305,7 +14954,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
switch(IntrData->Type) {
default:
llvm_unreachable("Unknown Intrinsic Type");
- break;
+ break;
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
@@ -17403,6 +15052,58 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
Results.push_back(Store);
return DAG.getMergeValues(Results, dl);
}
+ case COMPRESS_TO_MEM: {
+ SDLoc dl(Op);
+ SDValue Mask = Op.getOperand(4);
+ SDValue DataToCompress = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
+
+ if (isAllOnes(Mask)) // return just a store
+ return DAG.getStore(Chain, dl, DataToCompress, Addr,
+ MachinePointerInfo(), false, false, 0);
+
+ EVT VT = DataToCompress.getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+
+ SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask,
+ DataToCompress, DAG.getUNDEF(VT));
+ return DAG.getStore(Chain, dl, Compressed, Addr,
+ MachinePointerInfo(), false, false, 0);
+ }
+ case EXPAND_FROM_MEM: {
+ SDLoc dl(Op);
+ SDValue Mask = Op.getOperand(4);
+    SDValue PassThru = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+
+ if (isAllOnes(Mask)) // return just a load
+ return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
+ false, 0);
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+
+ SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
+ false, false, false, 0);
+
+ SDValue Results[] = {
+        DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PassThru),
+ Chain};
+ return DAG.getMergeValues(Results, dl);
+ }
}
}
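The compress/expand semantics lowered above can be pictured with a small scalar model, assuming one mask bit per vector lane (compressElts and expandElts are illustrative helpers, not part of this patch): compress packs the selected lanes contiguously, while expand scatters consecutive source elements back into the selected lanes.

#include <cstdint>
#include <vector>

// Compress: selected lanes are packed to the front, in order.
template <typename T>
static std::vector<T> compressElts(const std::vector<T> &Data, uint32_t Mask) {
  std::vector<T> Out;
  for (size_t i = 0; i < Data.size(); ++i)
    if (Mask & (1u << i))
      Out.push_back(Data[i]);
  return Out;
}

// Expand: consecutive source elements land in the selected lanes; the
// unselected lanes keep the pass-through value.
template <typename T>
static std::vector<T> expandElts(const std::vector<T> &Src,
                                 const std::vector<T> &PassThru,
                                 uint32_t Mask) {
  std::vector<T> Out = PassThru;
  size_t j = 0;
  for (size_t i = 0; i < Out.size(); ++i)
    if (Mask & (1u << i))
      Out[i] = Src[j++];
  return Out;
}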
@@ -17420,8 +15121,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT,
@@ -17436,15 +15136,33 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ EVT VT = Op.getValueType();
+
MFI->setFrameAddressIsTaken(true);
- EVT VT = Op.getValueType();
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+ // Depth > 0 makes no sense on targets which use Windows unwind codes. It
+ // is not possible to crawl up the stack without looking at the unwind codes
+ // simultaneously.
+ int FrameAddrIndex = FuncInfo->getFAIndex();
+ if (!FrameAddrIndex) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = RegInfo->getSlotSize();
+ FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
+ SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
+ FuncInfo->setFAIndex(FrameAddrIndex);
+ }
+ return DAG.getFrameIndex(FrameAddrIndex, VT);
+ }
+
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
- unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
@@ -17471,8 +15189,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName,
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
@@ -17483,8 +15200,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl (Op);
EVT PtrVT = getPointerTy();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -17531,7 +15247,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Subtarget->is64Bit()) {
SDValue OutChains[6];
@@ -17694,8 +15410,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
@@ -18090,76 +15805,29 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
DAG);
}
- if (VT == MVT::v16i8) {
- if (Op.getOpcode() == ISD::SHL) {
- // Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
- // Zero out the rightmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRL) {
- // Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
- // Zero out the leftmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRA) {
- if (ShiftAmt == 7) {
- // R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
- }
-
- // R s>> a === ((R u>> a) ^ m) - m
- SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
- Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
- Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
- return Res;
- }
- llvm_unreachable("Unknown shift opcode.");
- }
+ if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
- if (Subtarget->hasInt256() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
@@ -18172,8 +15840,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
// R s>> a === ((R u>> a) ^ m) - m
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(NumElts,
+ DAG.getConstant(128 >> ShiftAmt, MVT::i8));
SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
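The identity used above for byte-element arithmetic shifts, R s>> a == ((R u>> a) ^ m) - m with m = 0x80 >> a, can be checked with a tiny scalar sketch (sra8 is a hypothetical helper, not part of this patch):

#include <cstdint>

// Logical-shift, then XOR and subtract the shifted sign-bit mask so that a
// set sign bit is re-extended through the vacated high bits.
static int8_t sra8(uint8_t R, unsigned a) {
  const uint8_t m = static_cast<uint8_t>(0x80u >> a);
  const uint8_t u = static_cast<uint8_t>(R >> a);
  return static_cast<int8_t>(static_cast<uint8_t>((u ^ m) - m));
}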
@@ -18249,55 +15917,43 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
SDValue BaseShAmt;
EVT EltVT = VT.getVectorElementType();
- if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
- unsigned NumElts = VT.getVectorNumElements();
- unsigned i, j;
- for (i = 0; i != NumElts; ++i) {
- if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
- continue;
- break;
- }
- for (j = i; j != NumElts; ++j) {
- SDValue Arg = Amt.getOperand(j);
- if (Arg.getOpcode() == ISD::UNDEF) continue;
- if (Arg != Amt.getOperand(i))
- break;
- }
- if (i != NumElts && j == NumElts)
- BaseShAmt = Amt.getOperand(i);
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
+ // Check if this build_vector node is doing a splat.
+ // If so, then set BaseShAmt equal to the splat value.
+ BaseShAmt = BV->getSplatValue();
+ if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
+ BaseShAmt = SDValue();
} else {
if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
Amt = Amt.getOperand(0);
- if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
- cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
+
+ ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
+ if (SVN && SVN->isSplat()) {
+ unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
SDValue InVec = Amt.getOperand(0);
if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
- unsigned NumElts = InVec.getValueType().getVectorNumElements();
- unsigned i = 0;
- for (; i != NumElts; ++i) {
- SDValue Arg = InVec.getOperand(i);
- if (Arg.getOpcode() == ISD::UNDEF) continue;
- BaseShAmt = Arg;
- break;
- }
+ assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
+ "Unexpected shuffle index found!");
+ BaseShAmt = InVec.getOperand(SplatIdx);
} else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
if (ConstantSDNode *C =
dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
- unsigned SplatIdx =
- cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
if (C->getZExtValue() == SplatIdx)
BaseShAmt = InVec.getOperand(1);
}
}
- if (!BaseShAmt.getNode())
- BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
- DAG.getIntPtrConstant(0));
+
+ if (!BaseShAmt)
+ // Avoid introducing an extract element from a shuffle.
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
+ DAG.getIntPtrConstant(SplatIdx));
}
}
if (BaseShAmt.getNode()) {
- if (EltVT.bitsGT(MVT::i32))
- BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
+ assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
+ if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
@@ -18415,7 +16071,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// Do this only if the vector shift count is a constant build_vector.
- if (Op.getOpcode() == ISD::SHL &&
+ if (Op.getOpcode() == ISD::SHL &&
(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget->hasInt256() && VT == MVT::v16i16)) &&
ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
@@ -18507,15 +16163,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
CanBeSimplified = Amt2 == Amt->getOperand(j);
}
}
-
+
if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
isa<ConstantSDNode>(Amt2)) {
// Replace this node with two shifts followed by a MOVSS/MOVSD.
EVT CastVT = MVT::v4i32;
- SDValue Splat1 =
+ SDValue Splat1 =
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
- SDValue Splat2 =
+ SDValue Splat2 =
DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
if (TargetOpcode == X86ISD::MOVSD)
@@ -18704,81 +16360,17 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
-// Sign extension of the low part of vector elements. This may be used either
-// when sign extend instructions are not available or if the vector element
-// sizes already match the sign-extended size. If the vector elements are in
-// their pre-extended size and sign extend instructions are available, that will
-// be handled by LowerSIGN_EXTEND.
-SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc dl(Op);
- EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- MVT VT = Op.getSimpleValueType();
-
- if (!Subtarget->hasSSE2() || !VT.isVector())
- return SDValue();
-
- unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
- ExtraVT.getScalarType().getSizeInBits();
-
- switch (VT.SimpleTy) {
- default: return SDValue();
- case MVT::v8i32:
- case MVT::v16i16:
- if (!Subtarget->hasFp256())
- return SDValue();
- if (!Subtarget->hasInt256()) {
- // needs to be split
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- EVT ExtraEltVT = ExtraVT.getVectorElementType();
- unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
- ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
- ExtraNumElems/2);
- SDValue Extra = DAG.getValueType(ExtraVT);
-
- LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
- LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
- }
- // fall through
- case MVT::v4i32:
- case MVT::v8i16: {
- SDValue Op0 = Op.getOperand(0);
-
- // This is a sign extension of some low part of vector elements without
- // changing the size of the vector elements themselves:
- // Shift-Left + Shift-Right-Algebraic.
- SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
- BitsDiff, DAG);
- return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
- DAG);
- }
- }
-}
-
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+ return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
else if (OpWidth == 128)
- return Subtarget.hasCmpxchg16b();
+ return Subtarget->hasCmpxchg16b();
else
return false;
}
@@ -18795,9 +16387,7 @@ bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
}
bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
@@ -18840,9 +16430,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) {
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
@@ -18878,7 +16466,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
- } else if (hasMFENCE(Subtarget)) {
+ } else if (hasMFENCE(*Subtarget)) {
Function *MFence = llvm::Intrinsic::getDeclaration(M,
Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence);
@@ -18997,9 +16585,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
DAG.getIntPtrConstant(i)));
// Explicitly mark the extra elements as Undef.
- SDValue Undef = DAG.getUNDEF(SVT);
- for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
- Elts.push_back(Undef);
+ Elts.append(NumElts, DAG.getUNDEF(SVT));
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
@@ -19025,6 +16611,139 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
return SDValue();
}
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ SDLoc dl(Node);
+
+ Op = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "CTPOP lowering only implemented for 128/256-bit wide vector types");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ unsigned Len = EltVT.getSizeInBits();
+
+ // This is the vectorized version of the "best" algorithm from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ // with a minor tweak to use a series of adds + shifts instead of vector
+ // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
+ //
+ // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
+ // v8i32 => Always profitable
+ //
+  // FIXME: There are a couple of possible improvements:
+ //
+ // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
+ // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
+ //
+ assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
+ "CTPOP not implemented for this vector element type.");
+
+  // X86 canonicalizes ANDs to vXi64; generate the appropriate bitcasts to
+  // avoid extra legalization.
+ bool NeedsBitcast = EltVT == MVT::i32;
+ MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
+
+ SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
+ SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
+ SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
+ SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
+ SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
+ if (NeedsBitcast)
+ Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+
+ SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
+ SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
+ if (NeedsBitcast)
+ M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
+
+ SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
+ if (VT != And.getValueType())
+ And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
+
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
+ SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
+ SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
+ SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
+
+ Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
+ if (NeedsBitcast) {
+ Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+ M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
+ Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
+ }
+
+ SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
+ SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
+ if (VT != AndRHS.getValueType()) {
+ AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
+ AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
+ }
+ SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
+
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
+ SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
+ Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
+ Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+
+ SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
+ SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
+ if (NeedsBitcast) {
+ Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+ M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
+ }
+ And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
+ if (VT != And.getValueType())
+ And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+ // The algorithm mentioned above uses:
+ // v = (v * 0x01010101...) >> (Len - 8)
+ //
+ // Change it to use vector adds + vector shifts which yield faster results on
+ // Haswell than using vector integer multiplication.
+ //
+ // For i32 elements:
+ // v = v + (v >> 8)
+ // v = v + (v >> 16)
+ //
+ // For i64 elements:
+ // v = v + (v >> 8)
+ // v = v + (v >> 16)
+ // v = v + (v >> 32)
+ //
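+  // (After the 0x0F mask above, each byte of v holds the popcount of the
+  // corresponding source byte; these adds fold all byte counts into the low
+  // byte of each element, and the final mask below keeps only the low bits.)
+  //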
+ Add = And;
+ SmallVector<SDValue, 8> Csts;
+ for (unsigned i = 8; i <= Len/2; i *= 2) {
+ Csts.assign(NumElts, DAG.getConstant(i, EltVT));
+ SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
+ Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
+ Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+ Csts.clear();
+ }
+
+  // The result is in the least significant 6 bits for i32 and 7 bits for i64.
+ SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
+ SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
+ SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
+ if (NeedsBitcast) {
+ Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+ M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
+ }
+ And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
+ if (VT != And.getValueType())
+ And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+ return And;
+}
+
static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
SDLoc dl(Node);
@@ -19148,15 +16867,15 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
- case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
+ case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
- case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
@@ -19243,6 +16962,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
+ case X86ISD::FMINC:
+ case X86ISD::FMIN:
+ case X86ISD::FMAXC:
+ case X86ISD::FMAX: {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2f32)
+ llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
+ SDValue UNDEF = DAG.getUNDEF(VT);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
+ return;
+ }
case ISD::SIGN_EXTEND_INREG:
case ISD::ADDC:
case ISD::ADDE:
@@ -19599,6 +17334,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
+ case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
+ case X86ISD::EXPAND: return "X86ISD::EXPAND";
+ case X86ISD::SELECT: return "X86ISD::SELECT";
+ case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
+ case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
}
}
@@ -19747,6 +17492,8 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
@@ -19783,68 +17530,20 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
if (!VT.isSimple())
return false;
- MVT SVT = VT.getSimpleVT();
-
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSizeInBits() == 64)
return false;
- // If this is a single-input shuffle with no 128 bit lane crossings we can
- // lower it into pshufb.
- if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
- (SVT.is256BitVector() && Subtarget->hasInt256())) {
- bool isLegal = true;
- for (unsigned I = 0, E = M.size(); I != E; ++I) {
- if (M[I] >= (int)SVT.getVectorNumElements() ||
- ShuffleCrosses128bitLane(SVT, I, M[I])) {
- isLegal = false;
- break;
- }
- }
- if (isLegal)
- return true;
- }
-
- // FIXME: blends, shifts.
- return (SVT.getVectorNumElements() == 2 ||
- ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
- isMOVLMask(M, SVT) ||
- isMOVHLPSMask(M, SVT) ||
- isSHUFPMask(M, SVT) ||
- isSHUFPMask(M, SVT, /* Commuted */ true) ||
- isPSHUFDMask(M, SVT) ||
- isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
- isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
- isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
- isPALIGNRMask(M, SVT, Subtarget) ||
- isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
- isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
- isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
- isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
- isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
- (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
+ // We only care that the types being shuffled are legal. The lowering can
+ // handle any possible shuffle mask that results.
+ return isTypeLegal(VT.getSimpleVT());
}
bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
EVT VT) const {
- if (!VT.isSimple())
- return false;
-
- MVT SVT = VT.getSimpleVT();
- unsigned NumElts = SVT.getVectorNumElements();
- // FIXME: This collection of masks seems suspect.
- if (NumElts == 2)
- return true;
- if (NumElts == 4 && SVT.is128BitVector()) {
- return (isMOVLMask(Mask, SVT) ||
- isCommutedMOVLMask(Mask, SVT, true) ||
- isSHUFPMask(Mask, SVT) ||
- isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
- isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
- Subtarget->hasInt256()));
- }
- return false;
+  // Just delegate to the generic legality check; clear masks aren't special.
+ return isShuffleMaskLegal(Mask, VT);
}
//===----------------------------------------------------------------------===//
@@ -19982,11 +17681,10 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
return BB;
}
-static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII,
- const X86Subtarget* Subtarget) {
+static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
DebugLoc dl = MI->getDebugLoc();
-
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
@@ -20008,9 +17706,8 @@ static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
}
MachineBasicBlock *
-X86TargetLowering::EmitVAARG64WithCustomInserter(
- MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
@@ -20040,7 +17737,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
// Machine Information
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
@@ -20192,7 +17889,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
.setMemRefs(MMOBegin, MMOEnd);
// Jump to endMBB
- BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
+ BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
.addMBB(endMBB);
}
@@ -20296,7 +17993,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
unsigned CountReg = MI->getOperand(0).getReg();
@@ -20306,7 +18003,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
if (!Subtarget->isTargetWin64()) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
- BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
+ BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
MBB->addSuccessor(EndMBB);
}
@@ -20379,7 +18076,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -20405,8 +18102,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const TargetRegisterInfo *TRI =
- BB->getParent()->getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (!MI->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
@@ -20448,7 +18144,7 @@ MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -20510,7 +18206,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
- BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+ BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
@@ -20518,13 +18214,11 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
.addReg(SPLimitVReg);
- BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
- const uint32_t *RegMask = MF->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
@@ -20557,7 +18251,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
.addReg(IsLP64 ? X86::RAX : X86::EAX);
- BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
BB->addSuccessor(bumpMBB);
@@ -20581,52 +18275,11 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
- assert(!Subtarget->isTargetMacho());
-
- // The lowering is pretty easy: we're just emitting the call to _alloca. The
- // non-trivial part is impdef of ESP.
-
- if (Subtarget->isTargetWin64()) {
- if (Subtarget->isTargetCygMing()) {
- // ___chkstk(Mingw64):
- // Clobbers R10, R11, RAX and EFLAGS.
- // Updates RSP.
- BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
- .addExternalSymbol("___chkstk")
- .addReg(X86::RAX, RegState::Implicit)
- .addReg(X86::RSP, RegState::Implicit)
- .addReg(X86::RAX, RegState::Define | RegState::Implicit)
- .addReg(X86::RSP, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- } else {
- // __chkstk(MSVCRT): does not update stack pointer.
- // Clobbers R10, R11 and EFLAGS.
- BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
- .addExternalSymbol("__chkstk")
- .addReg(X86::RAX, RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- // RAX has the offset to be subtracted from RSP.
- BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
- .addReg(X86::RSP)
- .addReg(X86::RAX);
- }
- } else {
- const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
- Subtarget->isTargetWindowsItanium())
- ? "_chkstk"
- : "_alloca";
+ assert(!Subtarget->isTargetMachO());
- BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
- .addExternalSymbol(StackProbeSymbol)
- .addReg(X86::EAX, RegState::Implicit)
- .addReg(X86::ESP, RegState::Implicit)
- .addReg(X86::EAX, RegState::Define | RegState::Implicit)
- .addReg(X86::ESP, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- }
+ X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
MI->eraseFromParent(); // The pseudo instruction is gone now.
return BB;
@@ -20640,8 +18293,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
@@ -20650,10 +18302,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
- const uint32_t *RegMask = F->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
@@ -20698,7 +18348,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
@@ -20739,6 +18389,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// v = phi(main, restore)
//
// restoreMBB:
+  //  if the base pointer is used, load it from the frame
// v_restore = 1
MachineBasicBlock *thisMBB = MBB;
@@ -20804,8 +18455,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
@@ -20822,8 +18472,20 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
.addReg(restoreDstReg).addMBB(restoreMBB);
// restoreMBB:
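+  // If this function uses a base pointer, reload it from the frame slot set
+  // up via setRestoreBasePointer, since its value cannot be assumed to
+  // survive the longjmp.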
+ if (RegInfo->hasBasePointer(*MF)) {
+ const bool Uses64BitFramePtr =
+ Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ X86FI->setRestoreBasePointer(MF);
+ unsigned FramePtr = RegInfo->getFrameRegister(*MF);
+ unsigned BasePtr = RegInfo->getBaseRegister();
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+ addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+ FramePtr, true, X86FI->getRestoreBasePointerOffset())
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
- BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+ BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
MI->eraseFromParent();
@@ -20835,7 +18497,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
@@ -20850,8 +18512,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
@@ -20895,7 +18556,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
// Replace 213-type (isel default) FMA3 instructions with 231-type for
// accumulator loops. Writing back to the accumulator allows the coalescer
-// to remove extra copies in the loop.
+// to remove extra copies in the loop.
MachineBasicBlock *
X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
MachineBasicBlock *MBB) const {
@@ -20970,7 +18631,7 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
default: llvm_unreachable("Unrecognized FMA variant.");
}
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
MachineInstrBuilder MIB =
BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
.addOperand(MI->getOperand(0))
@@ -20993,6 +18654,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
+ case X86::TAILJMPd64_REX:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
llvm_unreachable("TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
@@ -21035,7 +18699,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
MachineFunction *F = BB->getParent();
- const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// Change the floating point control register to use "round towards zero"
@@ -21119,7 +18783,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRM128MEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
@@ -21132,16 +18796,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRIMEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
- Subtarget);
+ return EmitMonitor(MI, BB, Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -21157,6 +18820,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
+ case TargetOpcode::STATEPOINT:
+ // As an implementation detail, STATEPOINT shares the STACKMAP format at
+ // this point in the process. We diverge later.
+ return emitPatchPoint(MI, BB);
+
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
@@ -22118,9 +19786,9 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
// We're looking for blends between FADD and FSUB nodes. We insist on these
// nodes being lined up in a specific expected pattern.
- if (!(isShuffleEquivalent(Mask, 0, 3) ||
- isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
- isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
+ if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
return SDValue();
// Only specific types are legal at this point, assert so we notice if and
@@ -22176,7 +19844,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
EVT SVT = BC0.getValueType();
unsigned Opcode = BC0.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
-
+
if (BC0.hasOneUse() && SVT.isVector() &&
SVT.getVectorNumElements() * 2 == NumElts &&
TLI.isOperationLegal(Opcode, VT)) {
@@ -22304,7 +19972,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
: InVec.getOperand(1);
// If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+ unsigned AllowedUses = InVec.getNumOperands() > 1 &&
+ InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
if (LdNode.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
@@ -22349,9 +20018,30 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
+/// \brief Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
+/// special and don't usually play with other vector types, it's better to
+/// handle them early to be sure we emit efficient code by avoiding
+/// store-load conversions.
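+/// Roughly, the pattern handled here is
+///   (x86mmx (bitcast (v2i32 build_vector (i32 %x), (i32 0))))
+///     -> (MMX_MOVW2D %x).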
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
+ if (N->getValueType(0) != MVT::x86mmx ||
+ N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
+ N->getOperand(0)->getValueType(0) != MVT::v2i32)
+ return SDValue();
+
+ SDValue V = N->getOperand(0);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
+ N->getValueType(0), V.getOperand(0));
+
+ return SDValue();
+}
+
/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
-/// to a simple store and scalar loads to extract the elements.
+/// into a somewhat faster sequence. For i686, the best sequence is apparently
+/// storing the value and loading scalars back, while for x64 we should
+/// use 64-bit extracts and shifts.
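+/// (E.g. four i32 extracts from one v4i32 become two i64 element extracts
+/// plus shift/truncate pairs when 64-bit shifts are legal.)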
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
@@ -22360,14 +20050,29 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
SDValue InputVector = N->getOperand(0);
- // Detect whether we are trying to convert from mmx to i32 and the bitcast
- // from mmx to v2i32 has a single usage.
- if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
- InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
- InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0),
- InputVector.getNode()->getOperand(0));
+ // Detect mmx to i32 conversion through a v2i32 elt extract.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ N->getValueType(0) == MVT::i32 &&
+ InputVector.getValueType() == MVT::v2i32) {
+
+ // The bitcast source is a direct mmx result.
+ SDValue MMXSrc = InputVector.getNode()->getOperand(0);
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
+
+ // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
+    if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
+        MMXSrc.getValueType() == MVT::i64) {
+      SDValue MMXSrcOp = MMXSrc.getOperand(0);
+      if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
+          MMXSrcOp.getValueType() == MVT::v1i64 &&
+          MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+        return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+                           N->getValueType(0), MMXSrcOp.getOperand(0));
+    }
+ }
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
@@ -22410,36 +20115,61 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Ok, we've now decided to do the transformation.
+ // If 64-bit shifts are legal, use the extract-shift sequence,
+ // otherwise bounce the vector off the cache.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Vals[4];
SDLoc dl(InputVector);
- // Store the value to a temporary stack slot.
- SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
- SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
- MachinePointerInfo(), false, false, 0);
+ if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
+ SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
+ EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
+ SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+ DAG.getConstant(0, VecIdxTy));
+ SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+ DAG.getConstant(1, VecIdxTy));
+
+ SDValue ShAmt = DAG.getConstant(32,
+ DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
+ Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
+ Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
+ Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
+ Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
+ } else {
+ // Store the value to a temporary stack slot.
+ SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+ MachinePointerInfo(), false, false, 0);
- // Replace each use (extract) with a load of the appropriate element.
- for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
- UE = Uses.end(); UI != UE; ++UI) {
- SDNode *Extract = *UI;
+ EVT ElementType = InputVector.getValueType().getVectorElementType();
+ unsigned EltSize = ElementType.getSizeInBits() / 8;
- // cOMpute the element's address.
- SDValue Idx = Extract->getOperand(1);
- unsigned EltSize =
- InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
- uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+ // Replace each use (extract) with a load of the appropriate element.
+ for (unsigned i = 0; i < 4; ++i) {
+ uint64_t Offset = EltSize * i;
+ SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+ SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+ StackPtr, OffsetVal);
+
+ // Load the scalar.
+ Vals[i] = DAG.getLoad(ElementType, dl, Ch,
+ ScalarAddr, MachinePointerInfo(),
+ false, false, false, 0);
- SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
- StackPtr, OffsetVal);
+ }
+ }
- // Load the scalar.
- SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
- ScalarAddr, MachinePointerInfo(),
- false, false, false, 0);
+  // Replace each extract with the value computed for its index.
+ for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+ UE = Uses.end(); UI != UE; ++UI) {
+ SDNode *Extract = *UI;
- // Replace the exact with the load.
- DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+ SDValue Idx = Extract->getOperand(1);
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
}
// The replacement was made in place; don't return anything.
@@ -22456,6 +20186,21 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
bool NeedSplit = false;
switch (VT.getSimpleVT().SimpleTy) {
default: return std::make_pair(0, false);
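+  // The 64-bit-element and 512-bit cases are only matched when the
+  // corresponding AVX-512 feature (VLX, BWI, or AVX512F) is available.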
+ case MVT::v4i64:
+ case MVT::v2i64:
+ if (!Subtarget->hasVLX())
+ return std::make_pair(0, false);
+ break;
+ case MVT::v64i8:
+ case MVT::v32i16:
+ if (!Subtarget->hasBWI())
+ return std::make_pair(0, false);
+ break;
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (!Subtarget->hasAVX512())
+ return std::make_pair(0, false);
+ break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
@@ -22522,7 +20267,7 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
}
static SDValue
-TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDLoc dl(N);
SDValue Cond = N->getOperand(0);
@@ -22535,18 +20280,6 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
Cond = CondSrc->getOperand(0);
}
- MVT VT = N->getSimpleValueType(0);
- MVT EltVT = VT.getVectorElementType();
- unsigned NumElems = VT.getVectorNumElements();
- // There is no blend with immediate in AVX-512.
- if (VT.is512BitVector())
- return SDValue();
-
- if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
- return SDValue();
- if (!Subtarget->hasInt256() && VT == MVT::v16i16)
- return SDValue();
-
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
@@ -22560,6 +20293,8 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
return SDValue();
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
// Be sure we emit undef where we can.
@@ -22569,6 +20304,9 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
}
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
+ return SDValue();
return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
}
@@ -22589,8 +20327,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
+ // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
- VT != MVT::f80 && TLI.isTypeLegal(VT) &&
+ VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget->hasSSE2() ||
(Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -23008,96 +20747,31 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // Try to fold this VSELECT into a MOVSS/MOVSD
- if (N->getOpcode() == ISD::VSELECT &&
- Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
- if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
- (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
- bool CanFold = false;
- unsigned NumElems = Cond.getNumOperands();
- SDValue A = LHS;
- SDValue B = RHS;
-
- if (isZero(Cond.getOperand(0))) {
- CanFold = true;
-
- // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
- // fold (vselect <0,-1> -> (movsd A, B)
- for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
- CanFold = isAllOnes(Cond.getOperand(i));
- } else if (isAllOnes(Cond.getOperand(0))) {
- CanFold = true;
- std::swap(A, B);
-
- // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
- // fold (vselect <-1,0> -> (movsd B, A)
- for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
- CanFold = isZero(Cond.getOperand(i));
- }
-
- if (CanFold) {
- if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
- return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
- }
-
- if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
- // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
- // (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
- // (v2i64 (bitcast B)))))
- //
- // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
- // (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
- // (v2f64 (bitcast B)))))
- //
- // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
- // (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
- // (v2i64 (bitcast A)))))
- //
- // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
- // (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
- // (v2f64 (bitcast A)))))
-
- CanFold = (isZero(Cond.getOperand(0)) &&
- isZero(Cond.getOperand(1)) &&
- isAllOnes(Cond.getOperand(2)) &&
- isAllOnes(Cond.getOperand(3)));
-
- if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
- isAllOnes(Cond.getOperand(1)) &&
- isZero(Cond.getOperand(2)) &&
- isZero(Cond.getOperand(3))) {
- CanFold = true;
- std::swap(LHS, RHS);
- }
-
- if (CanFold) {
- EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
- SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
- SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
- SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
- NewB, DAG);
- return DAG.getNode(ISD::BITCAST, DL, VT, Select);
- }
- }
- }
+ // We should generate an X86ISD::BLENDI from a vselect if its argument
+ // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+ // constants. This specific pattern gets generated when we split a
+ // selector for a 512 bit vector in a machine without AVX512 (but with
+ // 256-bit vectors), during legalization:
+ //
+ // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+ //
+ // Iff we find this pattern and the build_vectors are built from
+ // constants, we translate the vselect into a shuffle_vector that we
+ // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+ if ((N->getOpcode() == ISD::VSELECT ||
+ N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+ !DCI.isBeforeLegalize()) {
+ SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
}
- // If we know that this node is legal then we know that it is going to be
- // matched by one of the SSE/AVX BLEND instructions. These instructions only
- // depend on the highest bit in each word. Try to use SimplifyDemandedBits
- // to simplify previous instructions.
+ // If this is a *dynamic* select (non-constant condition) and we can match
+ // this node with one of the variable blend instructions, restructure the
+ // condition so that the blends can use the high bit of each element and use
+ // SimplifyDemandedBits to simplify the condition operand.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
- // We explicitly check against v8i16 and v16i16 because, although
- // they're marked as Custom, they might only be legal when Cond is a
- // build_vector of constants. This will be taken care in a later
- // condition.
- (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
- VT != MVT::v8i16) &&
- // Don't optimize vector of constants. Those are handled by
- // the generic code and all the bits must be properly set for
- // the generic optimizer.
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
@@ -23105,6 +20779,31 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (BitWidth == 1)
return SDValue();
+ // We can only handle the cases where VSELECT is directly legal on the
+ // subtarget. We custom lower VSELECT nodes with constant conditions and
+ // this makes it hard to see whether a dynamic VSELECT will correctly
+ // lower, so we both check the operation's status and explicitly handle the
+ // cases where a *dynamic* blend will fail even though a constant-condition
+ // blend could be custom lowered.
+ // FIXME: We should find a better way to handle this class of problems.
+ // Potentially, we should combine constant-condition vselect nodes
+ // pre-legalization into shuffles and not mark as many types as custom
+ // lowered.
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+ // FIXME: We don't support i16-element blends currently. We could and
+ // should support them by making *all* the bits in the condition be set
+ // rather than just the high bit and using an i8-element blend.
+ if (VT.getScalarType() == MVT::i16)
+ return SDValue();
+ // Dynamic blending was only available from SSE4.1 onward.
+ if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+ return SDValue();
+  // Byte blends are only available in AVX2.
+ if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
+ !Subtarget->hasAVX2())
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
@@ -23153,25 +20852,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // We should generate an X86ISD::BLENDI from a vselect if its argument
- // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
- // constants. This specific pattern gets generated when we split a
- // selector for a 512 bit vector in a machine without AVX512 (but with
- // 256-bit vectors), during legalization:
- //
- // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
- //
- // Iff we find this pattern and the build_vectors are built from
- // constants, we translate the vselect into a shuffle_vector that we
- // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
- if ((N->getOpcode() == ISD::VSELECT ||
- N->getOpcode() == X86ISD::SHRUNKBLEND) &&
- !DCI.isBeforeLegalize()) {
- SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
- if (Shuffle.getNode())
- return Shuffle;
- }
-
return SDValue();
}
@@ -23524,7 +21204,7 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
// fold (blend A, B, allOnes) -> B
if (ISD::isBuildVectorAllOnes(Mask.getNode()))
return Op1;
-
+
// Simplify the case where the mask is a constant i32 value.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
if (C->isNullValue())
@@ -23590,7 +21270,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = N->getValueType(0);
- if (VT != MVT::i64)
+ if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
@@ -23948,24 +21628,118 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
}
}
+static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ // A vector zext_in_reg may be represented as a shuffle,
+ // feeding into a bitcast (this represents anyext) feeding into
+ // an and with a mask.
+ // We'd like to try to combine that into a shuffle with zero
+ // plus a bitcast, removing the and.
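+  // For example (illustrative only):
+  //   (and (v2i64 bitcast (v4i32 shuffle <0,u,1,u> X, undef)),
+  //        splat 0x00000000FFFFFFFF)
+  //   -> (v2i64 bitcast (v4i32 shuffle <0,4,1,4> X, zero))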
+ if (N0.getOpcode() != ISD::BITCAST ||
+ N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+  // The other side of the AND should be a splat of 2^C - 1 (an all-ones mask
+  // for the low C bits), where C is the number of bits in the source type.
+ if (N1.getOpcode() == ISD::BITCAST)
+ N1 = N1.getOperand(0);
+ if (N1.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+ BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
+
+ ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
+ EVT SrcType = Shuffle->getValueType(0);
+
+ // We expect a single-source shuffle
+ if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ unsigned SrcSize = SrcType.getScalarSizeInBits();
+
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!Vector->isConstantSplat(SplatValue, SplatUndef,
+ SplatBitSize, HasAnyUndefs))
+ return SDValue();
+
+ unsigned ResSize = N1.getValueType().getScalarSizeInBits();
+ // Make sure the splat matches the mask we expect
+ if (SplatBitSize > ResSize ||
+ (SplatValue + 1).exactLogBase2() != (int)SrcSize)
+ return SDValue();
+
+  // Make sure the input and output sizes make sense
+ if (SrcSize >= ResSize || ResSize % SrcSize)
+ return SDValue();
+
+ // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
+ // The number of u's between each two values depends on the ratio between
+ // the source and dest type.
+ unsigned ZextRatio = ResSize / SrcSize;
+ bool IsZext = true;
+ for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
+ if (i % ZextRatio) {
+ if (Shuffle->getMaskElt(i) > 0) {
+ // Expected undef
+ IsZext = false;
+ break;
+ }
+ } else {
+ if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
+ // Expected element number
+ IsZext = false;
+ break;
+ }
+ }
+ }
+
+ if (!IsZext)
+ return SDValue();
+
+ // Ok, perform the transformation - replace the shuffle with
+ // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
+ // (instead of undef) where the k elements come from the zero vector.
+ SmallVector<int, 8> Mask;
+ unsigned NumElems = SrcType.getVectorNumElements();
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (i % ZextRatio)
+ Mask.push_back(NumElems);
+ else
+ Mask.push_back(i / ZextRatio);
+
+ SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
+ Shuffle->getOperand(0), DAG.getConstant(0, SrcType), Mask);
+ return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle);
+}
+
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget);
+ if (Zext.getNode())
+ return Zext;
+
SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
if (R.getNode())
return R;
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
// Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
if (VT == MVT::i32 || VT == MVT::i64) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
// Check for BEXTR.
if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
(N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
@@ -23975,7 +21749,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
uint64_t Mask = MaskNode->getZExtValue();
uint64_t Shift = ShiftNode->getZExtValue();
if (isMask_64(Mask)) {
- uint64_t MaskSize = CountPopulation_64(Mask);
+ uint64_t MaskSize = countPopulation(Mask);
if (Shift + MaskSize <= VT.getSizeInBits())
return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
DAG.getConstant(Shift | (MaskSize << 8), VT));
@@ -23993,10 +21767,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (VT != MVT::v2i64 && VT != MVT::v4i64)
return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
// Check LHS for vnot
if (N0.getOpcode() == ISD::XOR &&
//ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
@@ -24108,8 +21878,8 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize = MF.getFunction()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ bool OptForSize =
+ MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
// SHLD/SHRD instructions have lower register pressure, but on some
// platforms they have higher latency than the equivalent
@@ -24233,11 +22003,12 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // On Sandybridge unaligned 256bit loads are inefficient.
+ // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+ // into two 16-byte operations.
ISD::LoadExtType Ext = Ld->getExtensionType();
unsigned Alignment = Ld->getAlignment();
bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
- if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+ if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
!DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
@@ -24270,6 +22041,166 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// PerformMLOADCombine - Resolve sign-extending masked loads
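+/// by rewriting them as plain (non-extending) masked loads of a wider integer
+/// vector followed by an X86ISD::VSEXT back to the original result type.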
+static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ if (Mld->getExtensionType() != ISD::SEXTLOAD)
+ return SDValue();
+
+ EVT VT = Mld->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT LdVT = Mld->getMemoryVT();
+ SDLoc dl(Mld);
+
+ assert(LdVT != VT && "Cannot extend to the same type");
+ unsigned ToSz = VT.getVectorElementType().getSizeInBits();
+ unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
+  // From/To sizes and ElemCount must be powers of two
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for extending masked load");
+
+ unsigned SizeRatio = ToSz / FromSz;
+ assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ LdVT.getScalarType(), NumElems*SizeRatio);
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ // Convert Src0 value
+ SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
+ if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
+ WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
+ DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+ }
+ // Prepare the new mask
+ SDValue NewMask;
+ SDValue Mask = Mld->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, WideVecVT),
+ &ShuffleVec[0]);
+ }
+ else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
+ Mld->getBasePtr(), NewMask, WideSrc0,
+ Mld->getMemoryVT(), Mld->getMemOperand(),
+ ISD::NON_EXTLOAD);
+ SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+ return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+}
+
+/// PerformMSTORECombine - Resolve truncating masked stores
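+/// by shuffling the truncated elements into the low half of a wider vector of
+/// the narrower element type and emitting a non-truncating masked store with
+/// a widened (zero-padded) mask.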
+static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+ if (!Mst->isTruncatingStore())
+ return SDValue();
+
+ EVT VT = Mst->getValue().getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT StVT = Mst->getMemoryVT();
+ SDLoc dl(Mst);
+
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+  // From/To sizes and ElemCount must be powers of two
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for truncating masked store");
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ assert (((NumElems * FromSz) % ToSz) == 0 &&
+ "Unexpected ratio for truncating masked store");
+
+ unsigned SizeRatio = FromSz / ToSz;
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
+
+ SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ SDValue NewMask;
+ SDValue Mask = Mst->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, WideVecVT),
+ &ShuffleVec[0]);
+ }
+ else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
+ NewMask, StVT, Mst->getMemOperand(), false);
+}
+
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
@@ -24280,13 +22211,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If we are saving a concatenation of two XMM registers, perform two stores.
- // On Sandy Bridge, 256-bit memory operations are executed by two
- // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
- // memory operation.
+ // If we are saving a concatenation of two XMM registers and 32-byte stores
+ // are slow, such as on Sandy Bridge, perform two 16-byte stores.
unsigned Alignment = St->getAlignment();
bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
- if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+ if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
StVT == VT && !IsAligned) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
@@ -24352,9 +22281,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// Find the largest store unit
MVT StoreType = MVT::i8;
- for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
- tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
- MVT Tp = (MVT::SimpleValueType)tp;
+ for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
StoreType = Tp;
}
@@ -24399,8 +22326,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
- bool NoImplicitFloatOps = F->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+ bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
&& Subtarget->hasSSE2();
if ((VT.isVector() ||
@@ -24500,7 +22426,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
@@ -24626,7 +22552,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
return true;
}
-/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+/// Do target-specific dag combines on floating point adds.
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
@@ -24641,7 +22567,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+/// Do target-specific dag combines on floating point subs.
static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
@@ -24656,23 +22582,23 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
-/// X86ISD::FXOR nodes.
+/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
// F[X]OR(0.0, x) -> x
- // F[X]OR(x, 0.0) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
+ // F[X]OR(x, 0.0) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
return SDValue();
}
-/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
-/// X86ISD::FMAX nodes.
+/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
@@ -24693,29 +22619,33 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
N->getOperand(0), N->getOperand(1));
}
-/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
// FAND(0.0, x) -> 0.0
- // FAND(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
+
+ // FAND(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
return SDValue();
}
-/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
+/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
- // FANDN(x, 0.0) -> 0.0
// FANDN(0.0, x) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
+ // FANDN(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
return SDValue();
}
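
As a sanity check on why the two folds above return different operands: FANDN(a, b) computes (~a) & b on the raw bit patterns, so a zero first operand passes the second operand through while a zero second operand forces zero. A tiny integer-domain sketch (uint32_t masks standing in for the FP bit patterns):

    #include <cassert>
    #include <cstdint>

    // FANDN(a, b) = (~a) & b on the raw bit patterns.
    static uint32_t fandn(uint32_t A, uint32_t B) { return ~A & B; }

    int main() {
      const uint32_t X = 0x3f800000; // bit pattern of 1.0f; any value works
      assert(fandn(0, X) == X);      // FANDN(0.0, x) -> x
      assert(fandn(X, 0) == 0);      // FANDN(x, 0.0) -> 0.0
      return 0;
    }
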
@@ -24978,6 +22908,23 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+ SelectionDAG &DAG) {
+ SDLoc dl(Load);
+ MVT VT = Load->getSimpleValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ SDValue Addr = Load->getOperand(1);
+ SDValue NewAddr = DAG.getNode(
+ ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+ DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+ SDValue NewLoad =
+ DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Load->getMemOperand(), 0, EVT.getStoreSize()));
+ return NewLoad;
+}
+
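
The helper above only rewrites the load address: the new scalar load reads from the original base plus Index scaled by the element store size. A minimal sketch of that offset arithmetic (elementAddress is an illustrative name, byte units assumed):

    #include <cassert>
    #include <cstdint>

    // Sketch of the address math in NarrowVectorLoadToElement:
    // NewAddr = Addr + Index * element-store-size (all in bytes).
    static uint64_t elementAddress(uint64_t BaseAddr, unsigned Index,
                                   unsigned EltStoreSize) {
      return BaseAddr + uint64_t(Index) * EltStoreSize;
    }

    int main() {
      // For a v4f32 load (4-byte elements), element 3 lives 12 bytes past the base.
      assert(elementAddress(0x1000, 3, 4) == 0x100c);
      return 0;
    }
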
static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDLoc dl(N);
@@ -24989,20 +22936,47 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
if (MayFoldLoad(Ld)) {
// Extract the countS bits from the immediate so we can get the proper
// address when narrowing the vector load to a specific element.
- // When the second source op is a memory address, interps doesn't use
+ // When the second source op is a memory address, insertps doesn't use
// countS and just gets an f32 from that address.
unsigned DestIndex =
cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+
Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
- } else
- return SDValue();
- // Create this as a scalar to vector to match the instruction pattern.
- SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
- // countS bits are ignored when loading from memory on insertps, which
- // means we don't need to explicitly set them to 0.
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
- LoadScalarToVector, N->getOperand(2));
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+ // countS bits are ignored when loading from memory on insertps, which
+ // means we don't need to explicitly set them to 0.
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+ LoadScalarToVector, N->getOperand(2));
+ }
+ return SDValue();
+}
+
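
The DestIndex computation above (immediate >> 6) relies on the SSE4.1 insertps immediate layout: bits [7:6] select the source element (countS), bits [5:4] the destination lane (countD), and bits [3:0] the zero mask. A small decoding sketch; the struct and helper names are illustrative, not LLVM API:

    #include <cassert>
    #include <cstdint>

    // INSERTPS immediate layout (SSE4.1): bits [7:6] = CountS (source element),
    // bits [5:4] = CountD (destination lane), bits [3:0] = zero mask.
    struct InsertPSImm {
      unsigned CountS, CountD, ZMask;
    };

    static InsertPSImm decodeInsertPSImm(uint8_t Imm) {
      return { (Imm >> 6) & 0x3u, (Imm >> 4) & 0x3u, Imm & 0xFu };
    }

    int main() {
      // 0x90 decodes to CountS 2, CountD 1, zero mask 0; DestIndex above is Imm >> 6 = 2.
      InsertPSImm I = decodeInsertPSImm(0x90);
      assert(I.CountS == 2 && I.CountD == 1 && I.ZMask == 0);
      return 0;
    }
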
+static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = N->getOperand(1);
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+ // operands and changing the mask to 1. This saves us a bunch of
+ // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+ // x86InstrInfo knows how to commute this back after instruction selection
+ // if it would help register allocation.
+
+ // TODO: If optimizing for size or a processor that doesn't suffer from
+ // partial register update stalls, this should be transformed into a MOVSD
+ // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+ if (VT == MVT::v2f64)
+ if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+ SDValue NewMask = DAG.getConstant(1, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+ }
+
+ return SDValue();
}
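
To see why swapping the operands and changing the mask from 2 to 1 preserves behavior for v2f64: blendi takes lane i from the second operand when mask bit i is set and from the first operand otherwise, so {V0[0], V1[1]} is produced either way. A scalar sketch of that equivalence (plain doubles standing in for the v2f64 lanes):

    #include <array>
    #include <cassert>

    // Scalar model of X86ISD::BLENDI on v2f64: lane i of the result comes from
    // the second operand when mask bit i is set, and from the first otherwise.
    static std::array<double, 2> blendi(const std::array<double, 2> &Op0,
                                        const std::array<double, 2> &Op1,
                                        unsigned Mask) {
      return { (Mask & 1) ? Op1[0] : Op0[0], (Mask & 2) ? Op1[1] : Op0[1] };
    }

    int main() {
      std::array<double, 2> A = {1.0, 2.0};
      std::array<double, 2> B = {3.0, 4.0};
      // blendi(A, B, 2) and blendi(B, A, 1) select the same lanes: {A[0], B[1]}.
      assert(blendi(A, B, 2) == blendi(B, A, 1));
      return 0;
    }
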
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
@@ -25134,7 +23108,7 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
}
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
- const X86TargetLowering *XTLI) {
+ const X86Subtarget *Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
@@ -25160,10 +23134,9 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
EVT VT = Ld->getValueType(0);
if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !XTLI->getSubtarget()->is64Bit() &&
- VT == MVT::i64) {
- SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
- Ld->getChain(), Op0, DAG);
+ !Subtarget->is64Bit() && VT == MVT::i64) {
+ SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
+ SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
return FILDChain;
}
@@ -25362,6 +23335,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SELECT:
case X86ISD::SHRUNKBLEND:
return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+ case ISD::BITCAST: return PerformBITCASTCombine(N, DAG);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
@@ -25374,8 +23348,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
+ case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
- case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -25414,8 +23390,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
- case X86ISD::INSERTPS:
- return PerformINSERTPSCombine(N, DAG, Subtarget);
+ case X86ISD::INSERTPS: {
+ if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
+ return PerformINSERTPSCombine(N, DAG, Subtarget);
+ break;
+ }
+ case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
}
@@ -25841,6 +23821,23 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
}
return;
+ case 'L':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
+ (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'M':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 3) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
@@ -25849,6 +23846,14 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
}
return;
+ case 'O':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 127) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
@@ -25938,8 +23943,9 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
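
The new 'L', 'M', and 'O' cases above appear to follow the classic x86 inline-asm immediate constraints: masks usable with zero-extending moves (0xff, 0xffff, and 0xffffffff in 64-bit mode), shift counts 0-3, and constants 0-127. A standalone sketch of just the accepted ranges (satisfiesConstraint is an illustrative helper; Is64Bit stands in for Subtarget->is64Bit(), and the TargetConstant emission is omitted):

    #include <cassert>
    #include <cstdint>

    // Range checks matching the 'L', 'M', and 'O' cases added above.
    static bool satisfiesConstraint(char Constraint, uint64_t V, bool Is64Bit) {
      switch (Constraint) {
      case 'L':
        return V == 0xff || V == 0xffff || (Is64Bit && V == 0xffffffff);
      case 'M':
        return V <= 3;
      case 'O':
        return V <= 127;
      default:
        return false;
      }
    }

    int main() {
      assert(satisfiesConstraint('L', 0xffff, false));
      assert(!satisfiesConstraint('L', 0xffffffff, false)); // 0xffffffff needs 64-bit mode
      assert(satisfiesConstraint('M', 3, false) && !satisfiesConstraint('M', 4, false));
      assert(satisfiesConstraint('O', 127, true) && !satisfiesConstraint('O', 128, true));
      return 0;
    }
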
-std::pair<unsigned, const TargetRegisterClass*>
-X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+std::pair<unsigned, const TargetRegisterClass *>
+X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ const std::string &Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
@@ -26045,7 +24051,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
- Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+ Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
@@ -26193,7 +24199,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
// "load" ports instead of the dedicated "store" port.
// E.g., on Haswell:
// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
- // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+ // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
if (isLegalAddressingMode(AM, Ty))
// Scale represents reg2 * scale, thus account for 1
// as soon as we use a second register.