aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/X86
diff options
context:
space:
mode:
authorElena Demikhovsky <elena.demikhovsky@intel.com>2013-08-11 07:55:09 +0000
committerElena Demikhovsky <elena.demikhovsky@intel.com>2013-08-11 07:55:09 +0000
commitfac4a4eb7dfbfc90ae1d5c7d6c39a2d89a33c30e (patch)
treedd79127c979855b250e8b8651917f7142792cd48 /lib/Target/X86
parent5b854f1ea55601790d9191c9720e77da35095340 (diff)
downloadexternal_llvm-fac4a4eb7dfbfc90ae1d5c7d6c39a2d89a33c30e.zip
external_llvm-fac4a4eb7dfbfc90ae1d5c7d6c39a2d89a33c30e.tar.gz
external_llvm-fac4a4eb7dfbfc90ae1d5c7d6c39a2d89a33c30e.tar.bz2
AVX-512: Added VPERM* instructions and MOV* zmm-to-zmm instructions.
Added a test for shuffles using VPERM. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188147 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp152
-rw-r--r--lib/Target/X86/X86ISelLowering.h1
-rw-r--r--lib/Target/X86/X86InstrAVX512.td237
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td40
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp49
5 files changed, 432 insertions, 47 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 00b4976..dfd41b7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3872,11 +3872,13 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
bool HasInt256, bool V2IsSplat = false) {
- unsigned NumElts = VT.getVectorNumElements();
+ if (VT.is512BitVector())
+ return false;
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
+ unsigned NumElts = VT.getVectorNumElements();
if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
(!HasInt256 || (NumElts != 16 && NumElts != 32)))
return false;
@@ -3911,6 +3913,8 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
bool HasInt256, bool V2IsSplat = false) {
unsigned NumElts = VT.getVectorNumElements();
+ if (VT.is512BitVector())
+ return false;
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
@@ -3948,6 +3952,8 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
unsigned NumElts = VT.getVectorNumElements();
bool Is256BitVec = VT.is256BitVector();
+ if (VT.is512BitVector())
+ return false;
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
@@ -3988,6 +3994,9 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
unsigned NumElts = VT.getVectorNumElements();
+ if (VT.is512BitVector())
+ return false;
+
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
@@ -4093,6 +4102,44 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
return (FstHalf | (SndHalf << 4));
}
+/// isPermImmMask - Return true if the specified VECTOR_SHUFFLE mask can be
+/// encoded as the 8-bit immediate of an immediate-controlled permute, and
+/// store that immediate in Imm8.
+///
+/// The immediate holds one 2-bit source index per destination element, so
+/// only element types of at least 32 bits are accepted.
+///  * 128-bit vectors, and 256-bit vectors of 64-bit elements, are encoded
+///    directly: element i of the mask lands in bits [2*i+1 : 2*i].
+///  * Every other type is processed in lanes of four elements. Each lane may
+///    only read from itself, and all lanes must use the same (symmetric)
+///    in-lane pattern, which is what gets encoded.
+/// Undef (negative) mask entries leave their 2-bit field at zero.
+/// Imm8 is only meaningful when the function returns true.
+static bool isPermImmMask(ArrayRef<int> Mask, EVT VT, unsigned& Imm8) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize < 32)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  Imm8 = 0;
+  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
+    for (unsigned i = 0; i != NumElts; ++i) {
+      // Every index has to fit its 2-bit field. A larger value would spill
+      // into the neighbouring fields and silently corrupt the immediate.
+      if (!isUndefOrInRange(Mask[i], 0, NumElts))
+        return false;
+      if (Mask[i] < 0)
+        continue;
+      Imm8 |= Mask[i] << (i*2);
+    }
+    return true;
+  }
+
+  unsigned LaneSize = 4;
+  // The lane walk below advances in steps of LaneSize and stops on equality,
+  // so a vector that is not a whole number of lanes can never match (and
+  // would otherwise index past the end of Mask).
+  if (NumElts % LaneSize != 0)
+    return false;
+
+  // In-lane index chosen for each of the four positions; -1 until some lane
+  // supplies a defined value for that position.
+  SmallVector<int, 4> MaskVal(LaneSize, -1);
+
+  for (unsigned l = 0; l != NumElts; l += LaneSize) {
+    for (unsigned i = 0; i != LaneSize; ++i) {
+      // Elements must not cross lanes.
+      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+        return false;
+      if (Mask[i+l] < 0)
+        continue;
+      if (MaskVal[i] < 0) {
+        // First defined value for this position: record and encode it.
+        MaskVal[i] = Mask[i+l] - l;
+        Imm8 |= MaskVal[i] << (i*2);
+        continue;
+      }
+      // Later lanes must repeat the pattern recorded for this position.
+      if (Mask[i+l] != (signed)(MaskVal[i]+l))
+        return false;
+    }
+  }
+  return true;
+}
+
/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
/// Note that VPERMIL mask matching is different depending whether theunderlying
@@ -4163,7 +4210,8 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
unsigned NumElems = VT.getVectorNumElements();
if ((VT.is128BitVector() && NumElems != 4) ||
- (VT.is256BitVector() && NumElems != 8))
+ (VT.is256BitVector() && NumElems != 8) ||
+ (VT.is512BitVector() && NumElems != 16))
return false;
// "i+1" is the value the indexed mask element must have
@@ -4186,7 +4234,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
unsigned NumElems = VT.getVectorNumElements();
if ((VT.is128BitVector() && NumElems != 4) ||
- (VT.is256BitVector() && NumElems != 8))
+ (VT.is256BitVector() && NumElems != 8) ||
+ (VT.is512BitVector() && NumElems != 16))
return false;
// "i" is the value the indexed mask element must have
@@ -4449,27 +4498,6 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
return getInsertVINSERTImmediate(N, 256);
}
-/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
-/// Handles 256-bit.
-static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getValueType(0).getSimpleVT();
-
- unsigned NumElts = VT.getVectorNumElements();
-
- assert((VT.is256BitVector() && NumElts == 4) &&
- "Unsupported vector type for VPERMQ/VPERMPD");
-
- unsigned Mask = 0;
- for (unsigned i = 0; i != NumElts; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt < 0)
- continue;
- Mask |= Elt << (i*2);
- }
-
- return Mask;
-}
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
@@ -5288,7 +5316,10 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, false, 0);
- SmallVector<int, 8> Mask(NumElems, EltNo);
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0; i != NumElems; ++i)
+ Mask.push_back(EltNo);
+
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
}
@@ -5720,7 +5751,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
// and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
- if (VT == MVT::v4i32 || VT == MVT::v8i32)
+ if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
return getZeroVector(VT, Subtarget, DAG, dl);
@@ -7413,21 +7444,30 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (BlendOp.getNode())
return BlendOp;
- if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
- SmallVector<SDValue, 8> permclMask;
- for (unsigned i = 0; i != 8; ++i) {
- permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
+ unsigned Imm8;
+ if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
+ return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
+
+ if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
+ VT.is512BitVector()) {
+ EVT MaskEltVT = EVT::getIntegerVT(*DAG.getContext(),
+ VT.getVectorElementType().getSizeInBits());
+ EVT MaskVectorVT =
+ EVT::getVectorVT(*DAG.getContext(),MaskEltVT, NumElems);
+ SmallVector<SDValue, 16> permclMask;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
}
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
- &permclMask[0], 8);
- // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
- return DAG.getNode(X86ISD::VPERMV, dl, VT,
- DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
- }
- if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
- return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
- getShuffleCLImmediate(SVOp), DAG);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
+ &permclMask[0], NumElems);
+ if (V2IsUndef)
+ // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+ return DAG.getNode(X86ISD::VPERMV, dl, VT,
+ DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+ return DAG.getNode(X86ISD::VPERMV3, dl, VT,
+ DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
+ }
//===--------------------------------------------------------------------===//
// Since no target specific shuffle was selected for this generic one,
@@ -10149,6 +10189,36 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
}
+/// LowerSIGN_EXTEND_AVX512 - Lower a vector SIGN_EXTEND to an AVX-512 target
+/// node. Two shapes are handled, both requiring result elements of at least
+/// 32 bits:
+///  * source elements of 8 bits or more become a single X86ISD::VSEXT;
+///  * an i1 source vector (8 or 16 elements) becomes X86ISD::VBROADCASTM of
+///    the source and an all-ones scalar loaded from the constant pool.
+/// Anything else yields an empty SDValue.
+SDValue X86TargetLowering::LowerSIGN_EXTEND_AVX512(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op->getValueType(0);
+  SDValue In = Op->getOperand(0);
+  EVT InVT = In.getValueType();
+  EVT InEltVT = InVT.getVectorElementType();
+
+  // Byte-or-wider sources extend directly.
+  if (InEltVT.getSizeInBits() >= 8 &&
+      VT.getVectorElementType().getSizeInBits() >= 32)
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+  // The only remaining case handled here starts from a vector of i1.
+  if (InEltVT != MVT::i1)
+    return SDValue();
+
+  unsigned int NumElts = InVT.getVectorNumElements();
+  assert ((NumElts == 8 || NumElts == 16) &&
+          "Unsupported SIGN_EXTEND operation");
+  if (VT.getVectorElementType().getSizeInBits() < 32)
+    return SDValue();
+
+  // All-ones scalar: 64 bits wide for 8 elements, 32 bits wide for 16.
+  APInt AllOnes = (NumElts == 8) ? APInt(64, ~0ULL) : APInt(32, ~0U);
+  Constant *C = ConstantInt::get(*DAG.getContext(), AllOnes);
+  SDValue CP = DAG.getConstantPool(C, getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+  SDValue Ld = DAG.getLoad(VT.getScalarType(), dl, DAG.getEntryNode(), CP,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, Alignment);
+  return DAG.getNode(X86ISD::VBROADCASTM, dl, VT, In, Ld);
+}
+
SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
MVT VT = Op->getValueType(0).getSimpleVT();
@@ -10156,6 +10226,9 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
MVT InVT = In.getValueType().getSimpleVT();
SDLoc dl(Op);
+ if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+ return LowerSIGN_EXTEND_AVX512(Op, DAG);
+
if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
(VT != MVT::v8i32 || InVT != MVT::v8i16))
return SDValue();
@@ -13239,6 +13312,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
+ case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index e3db491..c931b9b 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -318,6 +318,7 @@ namespace llvm {
UNPCKH,
VPERMILP,
VPERMV,
+ VPERMV3,
VPERMI,
VPERM2X128,
VBROADCAST,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 8abae14..7fed783 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -473,6 +473,98 @@ defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERM
+//
+// -- immediate form: OpNode applied to one source vector and an i8 immediate ($src2); defines an 'ri' and an 'mi' variant --
+multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ SDNode OpNode, PatFrag mem_frag,
+ X86MemOperand x86memop, ValueType OpVT> {
+ def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst), // 'ri': source vector $src1 comes from a register
+ (ins RC:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+ EVEX;
+ def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst), // 'mi': source vector $src1 is loaded through mem_frag
+ (ins x86memop:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (OpNode (mem_frag addr:$src1),
+ (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64,
+ i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64,
+ f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERM - register form: selects X86VPermv on two vector operands ($src1, $src2); defines an 'rr' and an 'rm' variant --
+multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> {
+
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), // 'rr': both operands come from registers
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V;
+
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), // 'rm': $src2 is loaded through mem_frag and then bitconvert'ed
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (X86VPermv RC:$src1,
+ (bitconvert (mem_frag addr:$src2)))))]>, EVEX_4V;
+}
+
+defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv8i64, i512mem,
+ v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem,
+ v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv8f64, f512mem,
+ v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem,
+ v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERMI2 - 3 source operands form: selects X86VPermv3 on $src1, $src2, $src3; defines an 'rr' and an 'rm' variant --
+multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ PatFrag mem_frag, X86MemOperand x86memop,
+ ValueType OpVT> {
+let Constraints = "$src1 = $dst" in { // $src1 is tied to $dst, which is why the asm strings print only $dst, $src2 and $src3
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), // 'rr': all three sources come from registers
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (X86VPermv3 RC:$src1, RC:$src2, RC:$src3)))]>,
+ EVEX_4V;
+
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), // 'rm': $src3 is loaded through mem_frag and then bitconvert'ed
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (X86VPermv3 RC:$src1, RC:$src2,
+ (bitconvert (mem_frag addr:$src3)))))]>, EVEX_4V;
+ }
+}
+defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
+ v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem,
+ v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem,
+ v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem,
+ v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+
// Mask register copy, including
// - copy between mask registers
// - load/store mask registers
@@ -713,3 +805,148 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
(v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Aligned and unaligned load and store
+//
+
+multiclass avx512_mov_packed<bits<8> opc, RegisterClass RC, RegisterClass KRC, // Vector move family: defines 'rr', 'rm', 'rrk' and 'rmk' variants for opcode opc
+ X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, Domain d> {
+let neverHasSideEffects = 1 in // 'rr': register-to-register move; its pattern list is empty
+ def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
+ EVEX;
+let canFoldAsLoad = 1 in // 'rm': load form, selected for ld_frag
+ def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ld_frag addr:$src))], d>, EVEX;
+let Constraints = "$src1 = $dst" in { // 'rrk'/'rmk': take a KRC:$mask operand, with $src1 tied to $dst; pattern lists are empty
+ def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2),
+ !strconcat(asm,
+ "\t{$src2, ${dst}{${mask}}|${dst}{${mask}}, $src2}"), [], d>,
+ EVEX, EVEX_K;
+ def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, x86memop:$src2),
+ !strconcat(asm,
+ "\t{$src2, ${dst}{${mask}}|${dst}{${mask}}, $src2}"),
+ [], d>, EVEX, EVEX_K;
+}
+}
+
+defm VMOVAPSZ : avx512_mov_packed<0x28, VR512, VK16WM, f512mem, alignedloadv16f32,
+ "vmovaps", SSEPackedSingle>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVAPDZ : avx512_mov_packed<0x28, VR512, VK8WM, f512mem, alignedloadv8f64,
+ "vmovapd", SSEPackedDouble>,
+ OpSize, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+defm VMOVUPSZ : avx512_mov_packed<0x10, VR512, VK16WM, f512mem, loadv16f32,
+ "vmovups", SSEPackedSingle>,
+ TB, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVUPDZ : avx512_mov_packed<0x10, VR512, VK8WM, f512mem, loadv8f64,
+ "vmovupd", SSEPackedDouble>,
+ OpSize, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+def VMOVAPSZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+ "vmovaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore512 (v16f32 VR512:$src), addr:$dst)],
+ SSEPackedSingle>, EVEX, EVEX_V512, TB,
+ EVEX_CD8<32, CD8VF>;
+def VMOVAPDZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+ "vmovapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore512 (v8f64 VR512:$src), addr:$dst)],
+ SSEPackedDouble>, EVEX, EVEX_V512,
+ OpSize, TB, VEX_W, EVEX_CD8<64, CD8VF>;
+def VMOVUPSZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+ "vmovups\t{$src, $dst|$dst, $src}",
+ [(store (v16f32 VR512:$src), addr:$dst)],
+ SSEPackedSingle>, EVEX, EVEX_V512, TB,
+ EVEX_CD8<32, CD8VF>;
+def VMOVUPDZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+ "vmovupd\t{$src, $dst|$dst, $src}",
+ [(store (v8f64 VR512:$src), addr:$dst)],
+ SSEPackedDouble>, EVEX, EVEX_V512,
+ OpSize, TB, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Use vmovaps/vmovups for AVX-512 integer load/store.
+// 512-bit load/store
+def : Pat<(alignedloadv8i64 addr:$src),
+ (VMOVAPSZrm addr:$src)>;
+def : Pat<(loadv8i64 addr:$src),
+ (VMOVUPSZrm addr:$src)>;
+
+def : Pat<(alignedstore512 (v8i64 VR512:$src), addr:$dst),
+ (VMOVAPSZmr addr:$dst, VR512:$src)>;
+def : Pat<(alignedstore512 (v16i32 VR512:$src), addr:$dst),
+ (VMOVAPSZmr addr:$dst, VR512:$src)>;
+
+def : Pat<(store (v8i64 VR512:$src), addr:$dst),
+ (VMOVUPDZmr addr:$dst, VR512:$src)>;
+def : Pat<(store (v16i32 VR512:$src), addr:$dst),
+ (VMOVUPSZmr addr:$dst, VR512:$src)>;
+
+let neverHasSideEffects = 1 in {
+ def VMOVDQA32rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src),
+ "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512;
+ def VMOVDQA64rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src),
+ "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in {
+ def VMOVDQA32mr : AVX512BI<0x7F, MRMDestMem, (outs),
+ (ins i512mem:$dst, VR512:$src),
+ "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ def VMOVDQA64mr : AVX512BI<0x7F, MRMDestMem, (outs),
+ (ins i512mem:$dst, VR512:$src),
+ "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+let mayLoad = 1 in {
+def VMOVDQA32rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src),
+ "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVDQA64rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src),
+ "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+}
+
+multiclass avx512_mov_int<bits<8> opc, string asm, RegisterClass RC, // Vector move family built on AVX512XSI: defines 'rr', 'rm', 'rrk' and 'rmk' variants
+ RegisterClass KRC,
+ PatFrag ld_frag, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in // 'rr': register-to-register move; its pattern list is empty
+ def rr : AVX512XSI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>,
+ EVEX;
+let canFoldAsLoad = 1 in // 'rm': load form, selected for ld_frag
+ def rm : AVX512XSI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ld_frag addr:$src))]>,
+ EVEX;
+let Constraints = "$src1 = $dst" in { // 'rrk'/'rmk': take a KRC:$mask operand, with $src1 tied to $dst; pattern lists are empty
+ def rrk : AVX512XSI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2),
+ !strconcat(asm,
+ "\t{$src2, ${dst}{${mask}}|${dst}{${mask}}, $src2}"), []>,
+ EVEX, EVEX_K;
+ def rmk : AVX512XSI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, x86memop:$src2),
+ !strconcat(asm,
+ "\t{$src2, ${dst}{${mask}}|${dst}{${mask}}, $src2}"),
+ []>, EVEX, EVEX_K;
+}
+}
+
+defm VMOVDQU32 : avx512_mov_int<0x6F, "vmovdqu32", VR512, VK16WM, memopv16i32, i512mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQU64 : avx512_mov_int<0x6F, "vmovdqu64", VR512, VK8WM, memopv8i64, i512mem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 0b51521..8587d38 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -151,6 +151,8 @@ def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>;
+def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisInt<2>]>;
@@ -194,6 +196,7 @@ def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;
def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
+def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
@@ -262,9 +265,16 @@ def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
-// 128-/256-bit extload pattern fragments
+// 512-bit load pattern fragments
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+
+// 128-/256-/512-bit extload pattern fragments
def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
// Like 'store', but always requires 128-bit vector alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
@@ -278,6 +288,12 @@ def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
return cast<StoreSDNode>(N)->getAlignment() >= 32;
}]>;
+// Like 'store', but always requires 512-bit vector alignment: matches only when the store node's alignment is at least 64.
+def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
// Like 'load', but always requires 128-bit vector alignment.
def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 16;
@@ -293,6 +309,11 @@ def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 32;
}]>;
+// Like 'load', but always requires 512-bit vector alignment: matches only when the load node's alignment is at least 64.
+def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
def alignedloadfsf32 : PatFrag<(ops node:$ptr),
(f32 (alignedload node:$ptr))>;
def alignedloadfsf64 : PatFrag<(ops node:$ptr),
@@ -316,6 +337,16 @@ def alignedloadv4f64 : PatFrag<(ops node:$ptr),
def alignedloadv4i64 : PatFrag<(ops node:$ptr),
(v4i64 (alignedload256 node:$ptr))>;
+// 512-bit aligned load pattern fragments
+def alignedloadv16f32 : PatFrag<(ops node:$ptr),
+ (v16f32 (alignedload512 node:$ptr))>;
+def alignedloadv8f64 : PatFrag<(ops node:$ptr),
+ (v8f64 (alignedload512 node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+ (v16i32 (alignedload512 node:$ptr))>;
+def alignedloadv8i64 : PatFrag<(ops node:$ptr),
+ (v8i64 (alignedload512 node:$ptr))>;
+
// Like 'load', but uses special alignment checks suitable for use in
// memory operands in most SSE instructions, which are required to
// be naturally aligned on some targets but not on others. If the subtarget
@@ -339,9 +370,16 @@ def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
// 256-bit memop pattern fragments
// NOTE: all 256-bit integer vector loads are promoted to v4i64
def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
+def memopv8i32 : PatFrag<(ops node:$ptr), (v8i32 (memop node:$ptr))>;
def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
+// 512-bit memop pattern fragments
+def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop node:$ptr))>;
+def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop node:$ptr))>;
+def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop node:$ptr))>;
+def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop node:$ptr))>;
+
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
// 16-byte boundary.
// FIXME: 8 byte alignment for mmx reads is not required
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 0443a93..b773768 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -81,6 +81,7 @@ enum {
TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
};
@@ -1177,6 +1178,14 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::PDEP64rr, X86::PDEP64rm, 0 },
{ X86::PEXT32rr, X86::PEXT32rm, 0 },
{ X86::PEXT64rr, X86::PEXT64rm, 0 },
+
+ // AVX-512 foldable instructions
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
+ { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
+ { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
+ { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
@@ -1454,6 +1463,8 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::VMOVDQAYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
+ case X86::VMOVDQA32rm:
+ case X86::VMOVDQA64rm:
return true;
}
}
@@ -2890,12 +2901,15 @@ static bool isHReg(unsigned Reg) {
// Try and copy between VR128/VR64 and GR64 registers.
static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
- bool HasAVX) {
+ const X86Subtarget& Subtarget) {
+
+
// SrcReg(VR128) -> DestReg(GR64)
// SrcReg(VR64) -> DestReg(GR64)
// SrcReg(GR64) -> DestReg(VR128)
// SrcReg(GR64) -> DestReg(VR64)
+ bool HasAVX = Subtarget.hasAVX();
if (X86::GR64RegClass.contains(DestReg)) {
if (X86::VR128RegClass.contains(SrcReg))
// Copy from a VR128 register to a GR64 register.
@@ -2926,13 +2940,31 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
return 0;
}
+/// copyPhysRegOpcode_AVX512 - Choose the opcode for a physical register copy
+/// on an AVX-512 target.
+///  * When both registers belong to VR128X, to VR256X, or to VR512, DestReg
+///    and SrcReg are rewritten in place with the result of
+///    get512BitSuperRegister and VMOVAPSZrr is returned.
+///  * When both registers are mask registers (VK8 or VK16, in any mix),
+///    KMOVWkk is returned and the registers are left unchanged.
+///  * Otherwise 0 is returned.
+static
+unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
+  const bool IsVectorCopy =
+    X86::VR128XRegClass.contains(DestReg, SrcReg) ||
+    X86::VR256XRegClass.contains(DestReg, SrcReg) ||
+    X86::VR512RegClass.contains(DestReg, SrcReg);
+  if (IsVectorCopy) {
+    DestReg = get512BitSuperRegister(DestReg);
+    SrcReg = get512BitSuperRegister(SrcReg);
+    return X86::VMOVAPSZrr;
+  }
+
+  // Not a vector copy: the only other case handled is mask-to-mask.
+  if (!X86::VK8RegClass.contains(DestReg) &&
+      !X86::VK16RegClass.contains(DestReg))
+    return 0;
+  if (!X86::VK8RegClass.contains(SrcReg) &&
+      !X86::VK16RegClass.contains(SrcReg))
+    return 0;
+  return X86::KMOVWkk;
+}
+
void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
// First deal with the normal symmetric copies.
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- unsigned Opc;
+ bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ unsigned Opc = 0;
if (X86::GR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV64rr;
else if (X86::GR32RegClass.contains(DestReg, SrcReg))
@@ -2950,14 +2982,17 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
"8-bit H register can not be copied outside GR8_NOREX");
} else
Opc = X86::MOV8rr;
- } else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+ }
+ else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MMX_MOVQ64rr;
+ else if (HasAVX512)
+ Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
+ else if (X86::VR128RegClass.contains(DestReg, SrcReg))
Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
Opc = X86::VMOVAPSYrr;
- else if (X86::VR64RegClass.contains(DestReg, SrcReg))
- Opc = X86::MMX_MOVQ64rr;
- else
- Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX);
+ if (!Opc)
+ Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget<X86Subtarget>());
if (Opc) {
BuildMI(MBB, MI, DL, get(Opc), DestReg)