diff options
Diffstat (limited to 'lib/Target/X86/X86InstrSSE.td')
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 627 |
1 files changed, 451 insertions, 176 deletions
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index f9a5ae1..cc896f0 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -181,6 +181,7 @@ def SSE_MPSADBW_ITINS : OpndItins< IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM >; +let Sched = WriteVecIMul in def SSE_PMULLD_ITINS : OpndItins< IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM >; @@ -218,11 +219,21 @@ def DEFAULT_ITINS_BLENDSCHED : OpndItins< IIC_ALU_NONMEM, IIC_ALU_MEM >; +let Sched = WriteVarBlend in +def DEFAULT_ITINS_VARBLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + let Sched = WriteFBlend in def SSE_INTALU_ITINS_FBLEND_P : OpndItins< IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM >; +let Sched = WriteBlend in +def SSE_INTALU_ITINS_BLEND_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 Instructions Classes //===----------------------------------------------------------------------===// @@ -601,29 +612,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // Patterns let Predicates = [UseAVX] in { - let AddedComplexity = 15 in { - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVS{S,D} to the lower bits. - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; - - // Move low f32 and clear high bits. 
- def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4f32 (V_SET0)), - (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>; - def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>; - } - let AddedComplexity = 20 in { // MOVSSrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 @@ -659,31 +647,10 @@ let Predicates = [UseAVX] in { (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; } - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), - (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), - sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), - (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), - sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>; - // Move low f64 and clear high bits. - def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2f64 (V_SET0)), - (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>; - - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2i64 (V_SET0)), - (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>; - // Extract and store. 
def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), addr:$dst), @@ -734,7 +701,6 @@ let Predicates = [UseAVX] in { (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), sub_xmm)>; - // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold cause // it has two uses through a bitcast. One use disappears at isel time and the @@ -750,7 +716,7 @@ let Predicates = [UseAVX] in { } let Predicates = [UseSSE1] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSS to the lower bits. def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), @@ -784,7 +750,7 @@ let Predicates = [UseSSE1] in { } let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSD to the lower bits. 
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), @@ -854,6 +820,7 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in Sched<[WriteLoad]>; } +let Predicates = [HasAVX, NoVLX] in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS, VEX; @@ -879,20 +846,26 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD, VEX, VEX_L; +} + +let Predicates = [UseSSE1] in { defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS; -defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, - "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - PD; defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, PS; +} +let Predicates = [UseSSE2] in { +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD; defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD; +} -let SchedRW = [WriteStore] in { +let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)], @@ -1006,7 +979,7 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, - SchedRW = [WriteMove] in { + SchedRW = [WriteFShuffle] in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -1036,7 +1009,7 @@ let Predicates = [UseSSE2] in (MOVUPDmr addr:$dst, VR128:$src)>; // Use vmovaps/vmovups for AVX integer 
load/store. -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // 128-bit load/store def : Pat<(alignedloadv2i64 addr:$src), (VMOVAPSrm addr:$src)>; @@ -1251,6 +1224,9 @@ let Predicates = [HasAVX] in { (VMOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), @@ -1298,6 +1274,9 @@ let Predicates = [UseSSE2] in { (MOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), @@ -1360,6 +1339,11 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (VMOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; } let Predicates = [UseSSE1] in { @@ -1380,6 +1364,11 @@ let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. 
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (MOVHPDrm VR128:$src1, addr:$src2)>; } //===----------------------------------------------------------------------===// @@ -2577,18 +2566,17 @@ def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), /// sse12_shuffle - sse 1 & 2 fp shuffle instructions multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, - Domain d, bit IsConvertibleToThreeAddress = 0> { + Domain d> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, Sched<[WriteFShuffleLd, ReadAfterLd]>; - let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in - def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), asm, - [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, - Sched<[WriteFShuffle]>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, + Sched<[WriteFShuffle]>; } defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -2607,10 +2595,10 @@ defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS; + memopv4f32, SSEPackedSingle>, PS; defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD; + memopv2f64, SSEPackedDouble>, PD; } let Predicates = [HasAVX] in { @@ -3136,7 +3124,6 @@ let Predicates = [UseSSE1] in { let Predicates = [UseSSE2] in { // SSE2 patterns to 
select scalar double-precision fp arithmetic instructions - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), @@ -3156,10 +3143,10 @@ let Predicates = [UseSSE2] in { } let Predicates = [UseSSE41] in { - // If the subtarget has SSE4.1 but not AVX, the vector insert - // instruction is lowered into a X86insertps rather than a X86Movss. - // When selecting SSE scalar single-precision fp arithmetic instructions, - // make sure that we correctly match the X86insertps. + // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is + // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When + // selecting SSE scalar single-precision fp arithmetic instructions, make + // sure that we correctly match them. def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), @@ -3177,6 +3164,57 @@ let Predicates = [UseSSE41] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + 
FR32:$src))), (i8 1))), + (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } let Predicates = [HasAVX] in { @@ -3215,6 +3253,57 @@ let Predicates = [HasAVX] 
in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def 
: Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } // Patterns used to select SSE scalar fp arithmetic instructions from @@ -3269,6 +3358,49 @@ let Predicates = [UseSSE2] in { (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; } +let Predicates = [UseSSE41] in { + // With SSE4.1 we may see these operations using X86Blendi rather than + // X86Movs{s,d}. 
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (MULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; +} + let Predicates = [HasAVX] in { // The following patterns select AVX Scalar single/double precision fp // arithmetic instructions from a packed single precision fp instruction @@ -3298,6 
+3430,46 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + // Also handle X86Blendi-based patterns. + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + 
(VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; } /// Unop Arithmetic @@ -3326,6 +3498,16 @@ def SSE_SQRTSD : OpndItins< >; } +let Sched = WriteFRsqrt in { +def SSE_RSQRTPS : OpndItins< + IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM +>; + +def SSE_RSQRTSS : OpndItins< + IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM +>; +} + let Sched = WriteFRcp in { def SSE_RCPP : OpndItins< IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM @@ -3604,10 +3786,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>, +defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, - int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>; + int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>; defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, @@ -3686,6 +3868,7 @@ let Predicates = [UseSSE1] in { let AddedComplexity = 400 in { // Prefer non-temporal versions let SchedRW = [WriteStore] in { +let Predicates = [HasAVX, NoVLX] in { def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3726,6 +3909,7 @@ def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), [(alignednontemporalstore (v4i64 VR256:$src), addr:$dst)], IIC_SSE_MOVNT>, VEX, VEX_L; +} def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3755,6 +3939,14 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), PS, Requires<[HasSSE2]>; } // SchedRW = [WriteStore] +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + 
(VMOVNTPSmr addr:$dst, VR128:$src)>; +} + +def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (MOVNTPSmr addr:$dst, VR128:$src)>; + } // AddedComplexity //===----------------------------------------------------------------------===// @@ -5277,6 +5469,13 @@ let Predicates = [HasAVX] in { (VMOVDDUPYrr VR256:$src)>; } +let Predicates = [UseAVX, OptForSize] in { + def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +} + let Predicates = [UseSSE3] in { def : Pat<(X86Movddup (memopv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; @@ -5357,56 +5556,34 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { // Patterns used to select 'addsub' instructions. let Predicates = [HasAVX] in { - // Constant 170 corresponds to the binary mask '10101010'. - // When used as a blend mask, it allows selecting eight elements from two - // input vectors as follow: - // - Even-numbered values in the destination are copied from - // the corresponding elements in the first input vector; - // - Odd-numbered values in the destination are copied from - // the corresponding elements in the second input vector. - - def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)), - (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))), - (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; - - // Constant 10 corresponds to the binary mask '1010'. - // In the two pattens below, constant 10 is used as a blend mask to select - // - the 1st and 3rd element from the first input vector (the 'fsub' node); - // - the 2nd and 4th element from the second input vector (the 'fadd' node). 
- - def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), - (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), - (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), - (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), - (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), - (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), - (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), - (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), - (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; + + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))), + (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))), + (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; } let Predicates = [UseSSE3] in { - // Constant 10 corresponds to the binary mask '1010'. - // In the pattern below, it is used as a blend mask to select: - // - the 1st and 3rd element from the first input vector (the fsub node); - // - the 2nd and 4th element from the second input vector (the fadd node). 
- - def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), - (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - - def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), - (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), - (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), - (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; } //===---------------------------------------------------------------------===// @@ -6692,7 +6869,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, u32u8imm:$src3), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6701,7 +6878,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, Sched<[WriteFShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3), + (ins VR128:$src1, f32mem:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -7308,7 +7485,7 @@ let Constraints = "$src1 = $dst" in { let Predicates = [HasAVX] in { defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, - memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + memopv2i64, i128mem, 0, 
SSE_PMULLD_ITINS>, VEX_4V; defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, @@ -7316,7 +7493,7 @@ let Predicates = [HasAVX] in { } let Predicates = [HasAVX2] in { defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + memopv4i64, i256mem, 0, SSE_PMULLD_ITINS>, VEX_4V, VEX_L; defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, @@ -7337,7 +7514,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u32u8imm:$src3), + (ins RC:$src1, RC:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7346,7 +7523,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, Sched<[itins.Sched]>; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + (ins RC:$src1, x86memop:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7360,31 +7537,33 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let isCommutable = 0 in { - let ExeDomain = SSEPackedSingle in { - defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, loadv4f32, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, loadv8f32, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; - } - let ExeDomain = SSEPackedDouble in { - defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, loadv2f64, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPDY : 
SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256,VR256, loadv4f64, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; - } + defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V; + } + + let ExeDomain = SSEPackedSingle in { + defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, + VR128, loadv4f32, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", + int_x86_avx_blend_ps_256, VR256, loadv8f32, + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, + VR128, loadv2f64, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", + int_x86_avx_blend_pd_256,VR256, loadv4f64, + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; + } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, VR128, loadv2i64, i128mem, 0, DEFAULT_ITINS_BLENDSCHED>, VEX_4V; - defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_MPSADSCHED>, VEX_4V; - } + let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, loadv4f32, f128mem, 0, @@ -7412,6 +7591,10 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { + defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv2i64, i128mem, + 1, SSE_MPSADBW_ITINS>; + } let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, VR128, memopv4f32, f128mem, @@ -7422,11 +7605,7 @@ let Constraints = "$src1 = $dst" in { 1, SSE_INTALU_ITINS_FBLEND_P>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, 
VR128, memopv2i64, i128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; - defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv2i64, i128mem, - 1, SSE_MPSADBW_ITINS>; - } + 1, SSE_INTALU_ITINS_BLEND_P>; let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv4f32, f128mem, 1, @@ -7545,6 +7724,57 @@ let Predicates = [HasAVX2] in { (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; } +// Patterns +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VBLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; + } + + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), + sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), + sub_xmm)>; + + // Move low f64 and clear high bits. 
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>; +} + +let Predicates = [UseSSE41] in { + // With SSE41 we can use blends for these patterns. + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>; +} + + /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, @@ -7555,7 +7785,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], - itins.rr>; + itins.rr>, Sched<[itins.Sched]>; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2), @@ -7564,18 +7794,21 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { [(set VR128:$dst, (IntId VR128:$src1, (bitconvert (mem_frag addr:$src2)), XMM0))], - itins.rm>; + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } let ExeDomain = SSEPackedDouble in defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, - int_x86_sse41_blendvpd>; + int_x86_sse41_blendvpd, + DEFAULT_ITINS_FBLENDSCHED>; let ExeDomain = SSEPackedSingle in defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, - int_x86_sse41_blendvps>; + int_x86_sse41_blendvps, + DEFAULT_ITINS_FBLENDSCHED>; defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, - int_x86_sse41_pblendvb>; + int_x86_sse41_pblendvb, + DEFAULT_ITINS_VARBLENDSCHED>; // Aliases with the implicit xmm0 argument def : InstAlias<"blendvpd\t{%xmm0, $src2, 
$dst|$dst, $src2, xmm0}", @@ -8393,13 +8626,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX, + [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX, + (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; } @@ -8417,19 +8650,37 @@ let ExeDomain = SSEPackedDouble in { } let Predicates = [HasAVX] in { -def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), + (VPERMILPSYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPERMILPSYrm VR256:$src1, addr:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), + (VPERMILPDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), + (VPERMILPDYrm VR256:$src1, addr:$src2)>; + +def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)), +def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), (i8 imm:$imm))), (VPERMILPSYmi addr:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDYmi 
addr:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))), +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), + (VPERMILPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPERMILPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), + (VPERMILPDrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), + (VPERMILPDrm VR128:$src1, addr:$src2)>; + +def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))), (VPERMILPDri VR128:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDmi addr:$src1, imm:$imm)>; } @@ -8540,15 +8791,15 @@ let Predicates = [HasF16C] in { // Patterns for matching conversions from float to half-float and vice versa. let Predicates = [HasF16C] in { - def : Pat<(f32_to_f16 FR32:$src), + def : Pat<(fp_to_f16 FR32:$src), (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; - def : Pat<(f16_to_f32 GR16:$src), + def : Pat<(f16_to_fp GR16:$src), (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; - def : Pat<(f16_to_f32 (i16 (f32_to_f16 FR32:$src))), + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; } @@ -8563,13 +8814,13 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> { let isCommutable = 1 in def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u32u8imm:$src3), + (ins RC:$src1, RC:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, Sched<[WriteBlend]>, VEX_4V; def rmi : 
AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + (ins RC:$src1, x86memop:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, @@ -8578,12 +8829,10 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; } -let isCommutable = 0 in { defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, VR128, loadv2i64, i128mem>; defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, VR256, loadv4i64, i256mem>, VEX_L; -} def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$mask)), @@ -8675,6 +8924,27 @@ let Predicates = [HasAVX2] in { def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), (VBROADCASTSDYrr VR128:$src)>; + // Provide aliases for broadcast from the same register class that + // automatically does the extract. + def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), + (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), + sub_xmm)))>; + def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), + (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), + sub_xmm)))>; + def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), + (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), + (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), + sub_xmm)))>; + def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), + (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), + (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), + sub_xmm)))>; + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
let AddedComplexity = 20 in { @@ -8756,6 +9026,9 @@ let Predicates = [HasAVX] in { (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; } + + def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; } //===----------------------------------------------------------------------===// @@ -8763,14 +9036,14 @@ let Predicates = [HasAVX] in { // multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT> { + ValueType OpVT, X86FoldableSchedWrite Sched> { def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, - Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; + Sched<[Sched]>, VEX_4V, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, @@ -8778,22 +9051,22 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, (bitconvert (mem_frag addr:$src2)))))]>, - Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; } -defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>; +defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>; let ExeDomain = SSEPackedSingle in -defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>; +defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>; multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT> { + ValueType OpVT, X86FoldableSchedWrite Sched> { def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, - Sched<[WriteShuffle256]>, 
VEX, VEX_L; + Sched<[Sched]>, VEX, VEX_L; def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, @@ -8801,12 +9074,14 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermi (mem_frag addr:$src1), (i8 imm:$src2))))]>, - Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L; + Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L; } -defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W; +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, + WriteShuffle256>, VEX_W; let ExeDomain = SSEPackedDouble in -defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W; +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, + WriteFShuffle256>, VEX_W; //===----------------------------------------------------------------------===// // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks |