aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp11
-rw-r--r--lib/Target/X86/X86InstrAVX512.td247
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td18
3 files changed, 249 insertions, 27 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4e71f36..3c3f09f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1390,6 +1390,9 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::AND, MVT::v8i64, Legal);
setOperationAction(ISD::OR, MVT::v8i64, Legal);
setOperationAction(ISD::XOR, MVT::v8i64, Legal);
+ setOperationAction(ISD::AND, MVT::v16i32, Legal);
+ setOperationAction(ISD::OR, MVT::v16i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v16i32, Legal);
// Custom lower several nodes.
for (int i = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1409,14 +1412,6 @@ void X86TargetLowering::resetOperationActions() {
if (!VT.is512BitVector())
continue;
- if (VT != MVT::v8i64) {
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v8i64);
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v8i64);
- }
if ( EltSize >= 32) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 760b4ed..a6035af 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -80,6 +80,21 @@ let Predicates = [HasAVX512] in {
def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>;
}
+//
+// AVX-512: VPXOR instruction writes zero to its upper part, it's safe build zeros.
+//
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+ [(set VR512:$dst, (v16f32 immAllZerosV))]>;
+}
+
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
+
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
@@ -518,16 +533,16 @@ multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (OpVT (X86VPermv RC:$src1,
- (bitconvert (mem_frag addr:$src2)))))]>, EVEX_4V;
+ (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>,
+ EVEX_4V;
}
-defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv8i64, i512mem,
+defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem,
v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem,
v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
let ExeDomain = SSEPackedSingle in
-defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv8f64, f512mem,
+defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem,
v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
let ExeDomain = SSEPackedDouble in
defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem,
@@ -552,7 +567,7 @@ let Constraints = "$src1 = $dst" in {
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpVT (X86VPermv3 RC:$src1, RC:$src2,
- (bitconvert (mem_frag addr:$src3)))))]>, EVEX_4V;
+ (mem_frag addr:$src3))))]>, EVEX_4V;
}
}
defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
@@ -631,18 +646,17 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC,
def rm : AVX512BI<opc, MRMSrcMem,
(outs KRC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set KRC:$dst, (OpNode (vt RC:$src1),
- (bitconvert (memop_frag addr:$src2))))],
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))],
IIC_SSE_CMPP_RM>, EVEX_4V;
}
defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem,
- memopv8i64, X86pcmpeqm, v16i32>, EVEX_V512;
+ memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512;
defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem,
memopv8i64, X86pcmpeqm, v8i64>, T8, EVEX_V512, VEX_W;
defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem,
- memopv8i64, X86pcmpgtm, v16i32>, EVEX_V512;
+ memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512;
defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem,
memopv8i64, X86pcmpgtm, v8i64>, T8, EVEX_V512, VEX_W;
@@ -656,7 +670,6 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
-
multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
SDNode OpNode, ValueType vt, Operand CC, string asm,
@@ -667,9 +680,8 @@ multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
IIC_SSE_CMPP_RR>, EVEX_4V;
def rmi : AVX512AIi8<opc, MRMSrcMem,
(outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
- [(set KRC:$dst, (OpNode (vt RC:$src1),
- (bitconvert (memop_frag addr:$src2)), imm:$cc))],
- IIC_SSE_CMPP_RM>, EVEX_4V;
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
+ imm:$cc))], IIC_SSE_CMPP_RM>, EVEX_4V;
// Accept explicit immediate argument form instead of comparison code.
let neverHasSideEffects = 1 in {
def rri_alt : AVX512AIi8<opc, MRMSrcReg,
@@ -681,12 +693,12 @@ multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
}
}
-defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv8i64,
+defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32,
X86cmpm, v16i32, AVXCC,
"vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv8i64,
+defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32,
X86cmpmu, v16i32, AVXCC,
"vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
@@ -1415,3 +1427,208 @@ let AddedComplexity = 20 in {
(VMOVZPQILo2PQIZrr VR128X:$src)>;
}
+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, PatFrag scalar_mfrag,
+ X86MemOperand x86scalar_mop, string BrdcstStr,
+ OpndItins itins, bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
+ def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
+ itins.rr>, EVEX_4V;
+ def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))],
+ itins.rm>, EVEX_4V;
+ def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+ [(set RC:$dst, (OpNode RC:$src1,
+ (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+ itins.rm>, EVEX_4V, EVEX_B;
+}
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
+ def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V, VEX_W;
+ def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V, VEX_W;
+}
+
+defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 0>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+ T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 1>,
+ EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W;
+
+defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32,
+ VR512, memopv8i64, i512mem, SSE_INTALU_ITINS_P, 1>, T8,
+ EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32,
+ VR512, memopv8i64, i512mem, SSE_INTMUL_ITINS_P, 1>, EVEX_V512,
+ EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))),
+ (VPMULUDQZrr VR512:$src1, VR512:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Logical Instructions
+//===----------------------------------------------------------------------===//
+
+defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VR512,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 0>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 FP arithmetic
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins> {
+ defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss{z}"), OpNode, FR32X,
+ f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd{z}"), OpNode, FR64X,
+ f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
+}
+
+let isCommutable = 1 in {
+defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>;
+defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>;
+defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>;
+defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>;
+}
+let isCommutable = 0 in {
+defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>;
+defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>;
+}
+
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_frag,
+ X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+ string BrdcstStr,
+ Domain d, OpndItins itins, bit commutable> {
+ let isCommutable = commutable in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+ EVEX_4V;
+ let mayLoad = 1 in {
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+ itins.rm, d>, EVEX_4V;
+ def rmb : PI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+ [(set RC:$dst, (OpNode RC:$src1,
+ (vt (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+ itins.rm, d>, EVEX_4V, EVEX_B;
+ }
+}
+
+defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 1>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 1>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 1>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 1>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 0>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 0>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index e6460e9..9f1c999 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -366,6 +366,16 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
|| cast<LoadSDNode>(N)->getAlignment() >= 16;
}]>;
+def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return Subtarget->hasVectorUAMem()
+ || cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return Subtarget->hasVectorUAMem()
+ || cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
@@ -382,10 +392,10 @@ def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
// 512-bit memop pattern fragments
-def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop node:$ptr))>;
-def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop node:$ptr))>;
-def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop node:$ptr))>;
-def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop node:$ptr))>;
+def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>;
+def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop8 node:$ptr))>;
+def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>;
+def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop8 node:$ptr))>;
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
// 16-byte boundary.