Diffstat (limited to 'lib/Target/X86')
46 files changed, 2046 insertions, 1309 deletions
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt index 94aca7a..47489bb 100644 --- a/lib/Target/X86/AsmParser/CMakeLists.txt +++ b/lib/Target/X86/AsmParser/CMakeLists.txt @@ -5,12 +5,4 @@ add_llvm_library(LLVMX86AsmParser X86AsmParser.cpp ) -add_llvm_library_dependencies(LLVMX86AsmParser - LLVMMC - LLVMMCParser - LLVMSupport - LLVMX86Desc - LLVMX86Info - ) - add_dependencies(LLVMX86AsmParser X86CommonTableGen) diff --git a/lib/Target/X86/AsmParser/LLVMBuild.txt b/lib/Target/X86/AsmParser/LLVMBuild.txt index 6c2405a..9f94d5d 100644 --- a/lib/Target/X86/AsmParser/LLVMBuild.txt +++ b/lib/Target/X86/AsmParser/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86AsmParser parent = X86 required_libraries = MC MCParser Support X86Desc X86Info add_to_library_groups = X86 - diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 4542d4b..be15899 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -51,21 +51,6 @@ endif() add_llvm_target(X86CodeGen ${sources}) -add_llvm_library_dependencies(LLVMX86CodeGen - LLVMAnalysis - LLVMAsmPrinter - LLVMCodeGen - LLVMCore - LLVMMC - LLVMSelectionDAG - LLVMSupport - LLVMTarget - LLVMX86AsmPrinter - LLVMX86Desc - LLVMX86Info - LLVMX86Utils - ) - add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt index 4f570d5..0cd6db9 100644 --- a/lib/Target/X86/Disassembler/CMakeLists.txt +++ b/lib/Target/X86/Disassembler/CMakeLists.txt @@ -5,12 +5,6 @@ add_llvm_library(LLVMX86Disassembler X86DisassemblerDecoder.c ) -add_llvm_library_dependencies(LLVMX86Disassembler - LLVMMC - LLVMSupport - LLVMX86Info - ) - # workaround for hanging compilation on MSVC9 and 10 if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) set_property( diff --git a/lib/Target/X86/Disassembler/LLVMBuild.txt b/lib/Target/X86/Disassembler/LLVMBuild.txt index cd748cf..cac7adf 100644 --- a/lib/Target/X86/Disassembler/LLVMBuild.txt +++ b/lib/Target/X86/Disassembler/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86Disassembler parent = X86 required_libraries = MC Support X86Info add_to_library_groups = X86 - diff --git a/lib/Target/X86/InstPrinter/CMakeLists.txt b/lib/Target/X86/InstPrinter/CMakeLists.txt index 2a2b5db..28e2460 100644 --- a/lib/Target/X86/InstPrinter/CMakeLists.txt +++ b/lib/Target/X86/InstPrinter/CMakeLists.txt @@ -6,10 +6,4 @@ add_llvm_library(LLVMX86AsmPrinter X86InstComments.cpp ) -add_llvm_library_dependencies(LLVMX86AsmPrinter - LLVMMC - LLVMSupport - LLVMX86Utils - ) - add_dependencies(LLVMX86AsmPrinter X86CommonTableGen) diff --git a/lib/Target/X86/InstPrinter/LLVMBuild.txt b/lib/Target/X86/InstPrinter/LLVMBuild.txt index fb01323..6868dde 100644 --- a/lib/Target/X86/InstPrinter/LLVMBuild.txt +++ b/lib/Target/X86/InstPrinter/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86AsmPrinter parent = X86 required_libraries = MC Support X86Utils add_to_library_groups = X86 - diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 6e87efa..6e4b1b9 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -106,28 +106,92 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. 
case X86::PUNPCKHBWrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(16, ShuffleMask); + DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKHBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHBWrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKHBWYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHBWYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); break; case X86::PUNPCKHWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKHWDrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(8, ShuffleMask); + DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKHWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHWDrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKHWDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHWDYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); break; case X86::PUNPCKHDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKHDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(4, ShuffleMask); + DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKHDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKHDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); break; case X86::PUNPCKHQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKHQDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(2, ShuffleMask); + DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKHQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHQDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKHQDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHQDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); break; case X86::PUNPCKLBWrr: @@ -135,42 +199,117 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. 
case X86::PUNPCKLBWrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLBWMask(16, ShuffleMask); + DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKLBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLBWrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKLBWYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLBWYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); break; case X86::PUNPCKLWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLWDrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLWDMask(8, ShuffleMask); + DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKLWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLWDrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKLWDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLWDYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); break; case X86::PUNPCKLDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLDQMask(4, ShuffleMask); + DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKLDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKLDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); break; case X86::PUNPCKLQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLQDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLQDQMask(2, ShuffleMask); + DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKLQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLQDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKLQDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLQDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); break; case X86::SHUFPDrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
case X86::SHUFPDrmi: - DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VSHUFPDrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VSHUFPDrmi: - DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VSHUFPDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPDYrmi: + DecodeSHUFPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -179,14 +318,25 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::SHUFPSrmi: - DecodeSHUFPSMask(4, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VSHUFPSrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VSHUFPSrmi: - DecodeSHUFPSMask(4, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VSHUFPSYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPSYrmi: + DecodeSHUFPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -195,14 +345,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPDrm: - DecodeUNPCKLPMask(MVT::v2f64, ShuffleMask); + DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPDrm: - DecodeUNPCKLPMask(MVT::v2f64, ShuffleMask); + DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -210,7 +360,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPDYrm: - DecodeUNPCKLPMask(MVT::v4f64, ShuffleMask); + DecodeUNPCKLMask(MVT::v4f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -218,14 +368,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPSrm: - DecodeUNPCKLPMask(MVT::v4f32, ShuffleMask); + DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
case X86::VUNPCKLPSrm: - DecodeUNPCKLPMask(MVT::v4f32, ShuffleMask); + DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -233,7 +383,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPSYrm: - DecodeUNPCKLPMask(MVT::v8f32, ShuffleMask); + DecodeUNPCKLMask(MVT::v8f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -241,14 +391,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKHPDrm: - DecodeUNPCKHPMask(MVT::v2f64, ShuffleMask); + DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKHPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKHPDrm: - DecodeUNPCKHPMask(MVT::v2f64, ShuffleMask); + DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -256,7 +406,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKHPDYrm: - DecodeUNPCKLPMask(MVT::v4f64, ShuffleMask); + DecodeUNPCKHMask(MVT::v4f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -264,14 +414,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKHPSrm: - DecodeUNPCKHPMask(MVT::v4f32, ShuffleMask); + DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKHPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKHPSrm: - DecodeUNPCKHPMask(MVT::v4f32, ShuffleMask); + DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -279,34 +429,52 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKHPSYrm: - DecodeUNPCKHPMask(MVT::v8f32, ShuffleMask); + DecodeUNPCKHMask(MVT::v8f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPSri: - DecodeVPERMILPSMask(4, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPSmi: + DecodeVPERMILPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPSYri: - DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
+ case X86::VPERMILPSYmi: + DecodeVPERMILPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPDri: - DecodeVPERMILPDMask(2, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPDmi: + DecodeVPERMILPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPDYri: - DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPDYmi: + DecodeVPERMILPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERM2F128rr: - DecodeVPERM2F128Mask(MI->getOperand(3).getImm(), ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::VPERM2I128rr: Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPERM2F128rm: + case X86::VPERM2I128rm: + DecodeVPERM2F128Mask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; } diff --git a/lib/Target/X86/LLVMBuild.txt b/lib/Target/X86/LLVMBuild.txt index 514566c..87305e0 100644 --- a/lib/Target/X86/LLVMBuild.txt +++ b/lib/Target/X86/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils + [component_0] type = TargetGroup name = X86 @@ -30,4 +33,3 @@ name = X86CodeGen parent = X86 required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target X86AsmPrinter X86Desc X86Info X86Utils add_to_library_groups = X86 - diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt index 264e791..ab2ebb4 100644 --- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt @@ -6,13 +6,6 @@ add_llvm_library(LLVMX86Desc X86MachObjectWriter.cpp ) -add_llvm_library_dependencies(LLVMX86Desc - LLVMMC - LLVMSupport - LLVMX86AsmPrinter - LLVMX86Info - ) - add_dependencies(LLVMX86Desc X86CommonTableGen) # Hack: we need to include 'main' target directory to grab private headers diff --git a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt index 3d09301..9e1d29c 100644 --- a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86Desc parent = X86 required_libraries = MC Support X86AsmPrinter X86Info add_to_library_groups = X86 - diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 69ad7d7..87b2b05 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -107,6 +107,11 @@ public: bool MayNeedRelaxation(const MCInst &Inst) const; + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const; + void RelaxInstruction(const MCInst &Inst, MCInst &Res) const; bool WriteNopData(uint64_t Count, MCObjectWriter *OW) 
const; @@ -244,6 +249,14 @@ bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const { return hasExp && !hasRIP; } +bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const { + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + // FIXME: Can tblgen help at all here to verify there aren't other instructions // we can relax? void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index c50f785..662ac1d 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -304,6 +304,12 @@ namespace X86II { // TAXD - Prefix before and after 0x0F. Combination of TA and XD. TAXD = 19 << Op0Shift, + // XOP8 - Prefix to include use of imm byte. + XOP8 = 20 << Op0Shift, + + // XOP9 - Prefix to exclude use of imm byte. + XOP9 = 21 << Op0Shift, + //===------------------------------------------------------------------===// // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. // They are used to specify GPRs and SSE registers, 64-bit operand size, @@ -418,7 +424,16 @@ namespace X86II { /// storing a classifier in the imm8 field. To simplify our implementation, /// we handle this by storeing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. - Has3DNow0F0FOpcode = 1U << 7 + Has3DNow0F0FOpcode = 1U << 7, + + /// XOP_W - Same bit as VEX_W. Used to indicate swapping of + /// operand 3 and 4 to be encoded in ModRM or I8IMM. This is used + /// for FMA4 and XOP instructions. + XOP_W = 1U << 8, + + /// XOP - Opcode prefix used by XOP instructions. + XOP = 1U << 9 + }; // getBaseOpcodeFor - This function returns the "base" X86 opcode for the @@ -488,9 +503,12 @@ namespace X86II { return 0; case X86II::MRMSrcMem: { bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasXOP_W = (TSFlags >> X86II::VEXShift) & X86II::XOP_W; unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). + if (HasXOP_W) + ++FirstMemOp;// Skip the register source (which is encoded in I8IMM). // FIXME: Maybe lea should have its own form? This is a horrible hack. 
//if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 2703100..eb64ad1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -125,7 +125,19 @@ getNonexecutableStackSection(MCContext &Ctx) const { 0, SectionKind::getMetadata()); } -X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) { +X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { + if (Triple.getArch() == Triple::x86_64) { + GlobalPrefix = ""; + PrivateGlobalPrefix = ".L"; + } + + AsmTransCBE = x86_asm_table; + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; +} + +X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { if (Triple.getArch() == Triple::x86_64) { GlobalPrefix = ""; PrivateGlobalPrefix = ".L"; diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index 2cd4c8e..5d619e8 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -38,8 +38,12 @@ namespace llvm { virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const; }; - struct X86MCAsmInfoCOFF : public MCAsmInfoCOFF { - explicit X86MCAsmInfoCOFF(const Triple &Triple); + struct X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); + }; + + struct X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); }; } // namespace llvm diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 1ab469c..8e14cb1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -169,23 +169,36 @@ static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { return false; } -/// StartsWithGlobalOffsetTable - Return true for the simple cases where this -/// expression starts with _GLOBAL_OFFSET_TABLE_. This is a needed to support -/// PIC on ELF i386 as that symbol is magic. We check only simple case that +/// StartsWithGlobalOffsetTable - Check if this expression starts with +/// _GLOBAL_OFFSET_TABLE_ and if it is of the form +/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF +/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that /// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start /// of a binary expression. -static bool StartsWithGlobalOffsetTable(const MCExpr *Expr) { +enum GlobalOffsetTableExprKind { + GOT_None, + GOT_Normal, + GOT_SymDiff +}; +static GlobalOffsetTableExprKind +StartsWithGlobalOffsetTable(const MCExpr *Expr) { + const MCExpr *RHS = 0; if (Expr->getKind() == MCExpr::Binary) { const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr); Expr = BE->getLHS(); + RHS = BE->getRHS(); } if (Expr->getKind() != MCExpr::SymbolRef) - return false; + return GOT_None; const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); const MCSymbol &S = Ref->getSymbol(); - return S.getName() == "_GLOBAL_OFFSET_TABLE_"; + if (S.getName() != "_GLOBAL_OFFSET_TABLE_") + return GOT_None; + if (RHS && RHS->getKind() == MCExpr::SymbolRef) + return GOT_SymDiff; + return GOT_Normal; } void X86MCCodeEmitter:: @@ -209,12 +222,15 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind, // If we have an immoffset, add it to the expression. 
if ((FixupKind == FK_Data_4 || - FixupKind == MCFixupKind(X86::reloc_signed_4byte)) && - StartsWithGlobalOffsetTable(Expr)) { - assert(ImmOffset == 0); - - FixupKind = MCFixupKind(X86::reloc_global_offset_table); - ImmOffset = CurByte; + FixupKind == MCFixupKind(X86::reloc_signed_4byte))) { + GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr); + if (Kind != GOT_None) { + assert(ImmOffset == 0); + + FixupKind = MCFixupKind(X86::reloc_global_offset_table); + if (Kind == GOT_Normal) + ImmOffset = CurByte; + } } // If the fixup is pc-relative, we need to bias the value to be relative to @@ -415,6 +431,13 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // opcode extension, or ignored, depending on the opcode byte) unsigned char VEX_W = 0; + // XOP_W: opcode specific, same bit as VEX_W, but used to + // swap operand 3 and 4 for FMA4 and XOP instructions + unsigned char XOP_W = 0; + + // XOP: Use XOP prefix byte 0x8f instead of VEX. + unsigned char XOP = 0; + // VEX_5M (VEX m-mmmmm field): // // 0b00000: Reserved for future use @@ -422,7 +445,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 0b00010: implied 0F 38 leading opcode bytes // 0b00011: implied 0F 3A leading opcode bytes // 0b00100-0b11111: Reserved for future use - // + // 0b01000: XOP map select - 08h instructions with imm byte + // 0b10001: XOP map select - 09h instructions with no imm byte unsigned char VEX_5M = 0x1; // VEX_4V (VEX vvvv field): a register specifier @@ -453,6 +477,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) VEX_W = 1; + if ((TSFlags >> X86II::VEXShift) & X86II::XOP_W) + XOP_W = 1; + + if ((TSFlags >> X86II::VEXShift) & X86II::XOP) + XOP = 1; + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) VEX_L = 1; @@ -482,6 +512,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, case X86II::XD: // F2 0F VEX_PP = 0x3; break; + case X86II::XOP8: + VEX_5M = 0x8; + break; + case X86II::XOP9: + VEX_5M = 0x9; + break; case X86II::A6: // Bypass: Not used by VEX case X86II::A7: // Bypass: Not used by VEX case X86II::TB: // Bypass: Not used by VEX @@ -489,6 +525,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, break; // No prefix! } + // Set the vector length to 256-bit if YMM0-YMM15 is used for (unsigned i = 0; i != MI.getNumOperands(); ++i) { if (!MI.getOperand(i).isReg()) @@ -529,6 +566,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // src1(ModR/M), MemAddr, imm8 // src1(ModR/M), MemAddr, src2(VEX_I8IMM) // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) VEX_R = 0x0; @@ -620,16 +660,16 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); - if (VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { // 2 byte VEX prefix + if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix EmitByte(0xC5, CurByte, OS); EmitByte(LastByte | (VEX_R << 7), CurByte, OS); return; } // 3 byte VEX prefix - EmitByte(0xC4, CurByte, OS); + EmitByte(XOP ? 
0x8F : 0xC4, CurByte, OS); EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); - EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + EmitByte(LastByte | ((VEX_W | XOP_W) << 7), CurByte, OS); } /// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 @@ -889,6 +929,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // It uses the VEX.VVVV field? bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; + bool HasXOP_W = (TSFlags >> X86II::VEXShift) & X86II::XOP_W; + unsigned XOP_W_I8IMMOperand = 2; // Determine where the memory operand starts, if present. int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); @@ -961,9 +1003,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) SrcRegNum++; + if(HasXOP_W) // Skip 2nd src (which is encoded in I8IMM) + SrcRegNum++; + EmitRegModRMByte(MI.getOperand(SrcRegNum), GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); - CurOp = SrcRegNum + 1; + + // 2 operands skipped with HasXOP_W, comensate accordingly + CurOp = HasXOP_W ? SrcRegNum : SrcRegNum + 1; if (HasVEX_4VOp3) ++CurOp; break; @@ -975,6 +1022,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++AddrOperands; ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). } + if(HasXOP_W) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; EmitByte(BaseOpcode, CurByte, OS); @@ -1062,12 +1111,24 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // according to the right size for the instruction. if (CurOp != NumOps) { // The last source register of a 4 operand instruction in AVX is encoded - // in bits[7:4] of a immediate byte, and bits[3:0] are ignored. + // in bits[7:4] of a immediate byte. if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { - const MCOperand &MO = MI.getOperand(CurOp++); + const MCOperand &MO = MI.getOperand(HasXOP_W ? XOP_W_I8IMMOperand + : CurOp); + CurOp++; bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg()); unsigned RegNum = (IsExtReg ? 
(1 << 7) : 0); RegNum |= GetX86RegNum(MO) << 4; + // If there is an additional 5th operand it must be an immediate, which + // is encoded in bits[3:0] + if(CurOp != NumOps) { + const MCOperand &MIMM = MI.getOperand(CurOp++); + if(MIMM.isImm()) { + unsigned Val = MIMM.getImm(); + assert(Val < 16 && "Immediate operand value out of range"); + RegNum |= Val; + } + } EmitImmediate(MCOperand::CreateImm(RegNum), 1, FK_Data_1, CurByte, OS, Fixups); } else { diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index a843515..f2a34ed 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -361,8 +361,10 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { MAI = new X86_64MCAsmInfoDarwin(TheTriple); else MAI = new X86MCAsmInfoDarwin(TheTriple); - } else if (TheTriple.isOSWindows()) { - MAI = new X86MCAsmInfoCOFF(TheTriple); + } else if (TheTriple.getOS() == Triple::Win32) { + MAI = new X86MCAsmInfoMicrosoft(TheTriple); + } else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) { + MAI = new X86MCAsmInfoGNUCOFF(TheTriple); } else { MAI = new X86ELFMCAsmInfo(TheTriple); } diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 7d901af..a581993 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -922,16 +922,3 @@ _test2: ## @test2 The insertps's of $0 are pointless complex copies. //===---------------------------------------------------------------------===// - -If SSE4.1 is available we should inline rounding functions instead of emitting -a libcall. - -floor: roundsd $0x01, %xmm, %xmm -ceil: roundsd $0x02, %xmm, %xmm - -and likewise for the single precision versions. - -Currently, SelectionDAGBuilder doesn't turn calls to these functions into the -corresponding nodes and some targets (including X86) aren't ready for them. 
- -//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/TargetInfo/CMakeLists.txt b/lib/Target/X86/TargetInfo/CMakeLists.txt index 4da00fa..b1d0b9f 100644 --- a/lib/Target/X86/TargetInfo/CMakeLists.txt +++ b/lib/Target/X86/TargetInfo/CMakeLists.txt @@ -4,10 +4,4 @@ add_llvm_library(LLVMX86Info X86TargetInfo.cpp ) -add_llvm_library_dependencies(LLVMX86Info - LLVMMC - LLVMSupport - LLVMTarget - ) - add_dependencies(LLVMX86Info X86CommonTableGen) diff --git a/lib/Target/X86/TargetInfo/LLVMBuild.txt b/lib/Target/X86/TargetInfo/LLVMBuild.txt index ee015bd..3c64a22 100644 --- a/lib/Target/X86/TargetInfo/LLVMBuild.txt +++ b/lib/Target/X86/TargetInfo/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86Info parent = X86 required_libraries = MC Support Target add_to_library_groups = X86 - diff --git a/lib/Target/X86/Utils/CMakeLists.txt b/lib/Target/X86/Utils/CMakeLists.txt index caffd8b..2e72c34 100644 --- a/lib/Target/X86/Utils/CMakeLists.txt +++ b/lib/Target/X86/Utils/CMakeLists.txt @@ -4,9 +4,4 @@ add_llvm_library(LLVMX86Utils X86ShuffleDecode.cpp ) -add_llvm_library_dependencies(LLVMX86Utils - LLVMCore - LLVMSupport - ) - add_dependencies(LLVMX86Utils X86CommonTableGen) diff --git a/lib/Target/X86/Utils/LLVMBuild.txt b/lib/Target/X86/Utils/LLVMBuild.txt index 3ee441e..de0a30f 100644 --- a/lib/Target/X86/Utils/LLVMBuild.txt +++ b/lib/Target/X86/Utils/LLVMBuild.txt @@ -21,4 +21,3 @@ name = X86Utils parent = X86 required_libraries = Core Support add_to_library_groups = X86 - diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index f6c9d7b..e7631b6 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -95,54 +95,31 @@ void DecodePSHUFLWMask(unsigned Imm, ShuffleMask.push_back(7); } -void DecodePUNPCKLBWMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i8, NElts), ShuffleMask); -} - -void DecodePUNPCKLWDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i16, NElts), ShuffleMask); -} - -void DecodePUNPCKLDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); -} - -void DecodePUNPCKLQDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); -} - -void DecodePUNPCKLMask(EVT VT, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(VT, ShuffleMask); -} +void DecodeSHUFPMask(EVT VT, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); -void DecodePUNPCKHMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i+NElts/2); - ShuffleMask.push_back(i+NElts+NElts/2); - } -} + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; -void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - // Part that reads from dest. - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(Imm % NElts); - Imm /= NElts; - } - // Part that reads from src. - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(Imm % NElts + NElts); - Imm /= NElts; + int NewImm = Imm; + for (unsigned l = 0; l < NumLanes; ++l) { + unsigned LaneStart = l * NumLaneElts; + // Part that reads from dest. 
+ for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + LaneStart); + NewImm /= NumLaneElts; + } + // Part that reads from src. + for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + NumElts + LaneStart); + NewImm /= NumLaneElts; + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm } } -void DecodeUNPCKHPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -161,10 +138,10 @@ void DecodeUNPCKHPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { } } -/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// etc. VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. -void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -183,36 +160,23 @@ void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { } } -// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes and the mask of the first lane must -// be the same of the second. -void DecodeVPERMILPSMask(unsigned NumElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - unsigned NumLanes = (NumElts*32)/128; - unsigned LaneSize = NumElts/NumLanes; - - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = 0; i != LaneSize; ++i) { - unsigned Idx = (Imm >> (i*2)) & 0x3 ; - ShuffleMask.push_back(Idx+(l*LaneSize)); - } - } -} +// DecodeVPERMILPMask - Decodes VPERMILPS/ VPERMILPD permutes for any 128-bit +// 32-bit or 64-bit elements. For 256-bit vectors, it's considered as two 128 +// lanes. For VPERMILPS, referenced elements can't cross lanes and the mask of +// the first lane must be the same of the second. +void DecodeVPERMILPMask(EVT VT, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); -// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes but the mask of the first lane can -// be the different of the second (not like VPERMILPS). -void DecodeVPERMILPDMask(unsigned NumElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - unsigned NumLanes = (NumElts*64)/128; - unsigned LaneSize = NumElts/NumLanes; + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; - for (unsigned l = 0; l < NumLanes; ++l) { - for (unsigned i = l*LaneSize; i < LaneSize*(l+1); ++i) { - unsigned Idx = (Imm >> i) & 0x1; - ShuffleMask.push_back(Idx+(l*LaneSize)); + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneStart = l*NumLaneElts; + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Idx = NumLaneElts == 4 ? 
(Imm >> (i*2)) & 0x3 + : (Imm >> (i+LaneStart)) & 0x1; + ShuffleMask.push_back(Idx+LaneStart); } } } diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 35f6530..243728f 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -46,50 +46,25 @@ void DecodePSHUFHWMask(unsigned Imm, void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); -void DecodePUNPCKLBWMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKLWDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKLDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKLQDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKLMask(EVT VT, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKHMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeSHUFPMask(EVT VT, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); -/// DecodeUNPCKHPMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd /// etc. VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. -void DecodeUNPCKHPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); -/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// etc. VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. -void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); - +void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); -// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes and the mask of the first lane must -// be the same of the second. -void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); -// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes but the mask of the first lane can -// be the different of the second (not like VPERMILPS). -void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm, +// DecodeVPERMILPMask - Decodes VPERMILPS/ VPERMILPD permutes for any 128-bit +// 32-bit or 64-bit elements. For 256-bit vectors, it's considered as two 128 +// lanes. For VPERMILPS, referenced elements can't cross lanes and the mask of +// the first lane must be the same of the second. 
+void DecodeVPERMILPMask(EVT VT, unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); void DecodeVPERM2F128Mask(unsigned Imm, diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 62a7016..8229ca5 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -91,6 +91,8 @@ def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true", "Enable three-operand fused multiple-add">; def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", "Enable four-operand fused multiple-add">; +def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", + "Enable XOP instructions">; def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem", "HasVectorUAMem", "true", "Allow unaligned memory operands on vector/SIMD instructions">; @@ -194,14 +196,16 @@ def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, + Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, FeatureSlowBTMem]>; -def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem]>; -def : Proc<"istanbul", [Feature3DNowA, FeatureCMPXCHG16B, - FeatureSSE4A]>; -def : Proc<"shanghai", [Feature3DNowA, FeatureCMPXCHG16B, FeatureSSE4A]>; +// FIXME: Disabling AVX for now since it's not ready. +def : Proc<"bdver1", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureAES, FeatureCLMUL, FeatureFMA4, + FeatureXOP, FeatureLZCNT]>; +def : Proc<"bdver2", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureAES, FeatureCLMUL, FeatureFMA4, + FeatureXOP, FeatureF16C, FeatureLZCNT, + FeatureBMI]>; def : Proc<"winchip-c6", [FeatureMMX]>; def : Proc<"winchip2", [Feature3DNow]>; diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 77b9905..aab2a05 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -158,10 +158,15 @@ def CC_X86_64_C : CallingConv<[ CCIfSubtarget<"hasXMM()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, - // The first 8 256-bit vector arguments are passed in YMM registers. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasAVX()", - CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]>>>, + // The first 8 256-bit vector arguments are passed in YMM registers, unless + // this is a vararg function. + // FIXME: This isn't precisely correct; the x86-64 ABI document says that + // fixed arguments to vararg functions are supposed to be passed in + // registers. Actually modeling that would be a lot of work, though. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, + YMM4, YMM5, YMM6, YMM7]>>>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. 
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index ba615a8..ed16e88 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -1004,7 +1004,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, break; } - if (!Desc->isVariadic() && CurOp != NumOps) { + if (!MI.isVariadic() && CurOp != NumOps) { #ifndef NDEBUG dbgs() << "Cannot encode all operands of: " << MI << "\n"; #endif diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 32f1770..1589439 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -728,7 +728,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && GuaranteedTailCallOpt) + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; // Let SDISel handle vararg functions. @@ -1529,7 +1529,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && GuaranteedTailCallOpt) + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); @@ -1543,7 +1543,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Fast-isel doesn't know about callee-pop yet. if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg, - GuaranteedTailCallOpt)) + TM.Options.GuaranteedTailCallOpt)) return false; // Check whether the function can return without sret-demotion. @@ -2121,7 +2121,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { default: return false; case MVT::f32: if (X86ScalarSSEf32) { - Opc = Subtarget->hasAVX() ? X86::VFsFLD0SS : X86::FsFLD0SS; + Opc = X86::FsFLD0SS; RC = X86::FR32RegisterClass; } else { Opc = X86::LD_Fp032; @@ -2130,7 +2130,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { break; case MVT::f64: if (X86ScalarSSEf64) { - Opc = Subtarget->hasAVX() ? X86::VFsFLD0SD : X86::FsFLD0SD; + Opc = X86::FsFLD0SD; RC = X86::FR64RegisterClass; } else { Opc = X86::LD_Fp064; diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 819d242..6a40cc1 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -47,7 +47,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineModuleInfo &MMI = MF.getMMI(); const TargetRegisterInfo *RI = TM.getRegisterInfo(); - return (DisableFramePointerElim(MF) || + return (MF.getTarget().Options.DisableFramePointerElim(MF) || RI->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || @@ -210,7 +210,7 @@ static void mergeSPUpdatesDown(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, uint64_t *NumBytes = NULL) { - // FIXME: THIS ISN'T RUN!!! + // FIXME: THIS ISN'T RUN!!! return; if (MBBI == MBB.end()) return; @@ -351,20 +351,22 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, /// register. The number corresponds to the enum lists in /// compact_unwind_encoding.h. 
static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) { - int Idx = 1; - for (; *CURegs; ++CURegs, ++Idx) + for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; return -1; } +// Number of registers that can be saved in a compact unwind encoding. +#define CU_NUM_SAVED_REGS 6 + /// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding /// used with frameless stacks. It is passed the number of registers to be saved /// and an array of the registers saved. -static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], - unsigned RegCount, - bool Is64Bit) { +static uint32_t +encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], + unsigned RegCount, bool Is64Bit) { // The saved registers are numbered from 1 to 6. In order to encode the order // in which they were saved, we re-number them according to their place in the // register order. The re-numbering is relative to the last re-numbered @@ -385,14 +387,21 @@ static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], }; const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); - uint32_t RenumRegs[6]; - for (unsigned i = 6 - RegCount; i < 6; ++i) { + for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]); if (CUReg == -1) return ~0U; SavedRegs[i] = CUReg; + } + + // Reverse the list. + std::swap(SavedRegs[0], SavedRegs[5]); + std::swap(SavedRegs[1], SavedRegs[4]); + std::swap(SavedRegs[2], SavedRegs[3]); + uint32_t RenumRegs[CU_NUM_SAVED_REGS]; + for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) { unsigned Countless = 0; - for (unsigned j = 6 - RegCount; j < i; ++j) + for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) if (SavedRegs[j] < SavedRegs[i]) ++Countless; @@ -435,8 +444,9 @@ static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], /// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a /// compact encoding with a frame pointer. -static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[6], - bool Is64Bit) { +static uint32_t +encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], + bool Is64Bit) { static const unsigned CU32BitRegs[] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; @@ -448,13 +458,16 @@ static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[6], // Encode the registers in the order they were saved, 3-bits per register. The // registers are numbered from 1 to 6. uint32_t RegEnc = 0; - for (int I = 5; I >= 0; --I) { + for (int I = 0; I != 6; --I) { unsigned Reg = SavedRegs[I]; if (Reg == 0) break; int CURegNum = getCompactUnwindRegNum(CURegs, Reg); if (CURegNum == -1) return ~0U; - RegEnc |= (CURegNum & 0x7) << (5 - I); + + // Encode the 3-bit register number in order, skipping over 3-bits for each + // register. 
+ RegEnc |= (CURegNum & 0x7) << ((5 - I) * 3); } assert((RegEnc & 0x7FFF) == RegEnc && "Invalid compact register encoding!"); @@ -466,14 +479,11 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned StackPtr = RegInfo->getStackRegister(); - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); - bool Is64Bit = STI.is64Bit(); bool HasFP = hasFP(MF); - unsigned SavedRegs[6] = { 0, 0, 0, 0, 0, 0 }; - int SavedRegIdx = 6; + unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 }; + unsigned SavedRegIdx = 0; unsigned OffsetSize = (Is64Bit ? 8 : 4); @@ -481,14 +491,13 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { unsigned PushInstrSize = 1; unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); unsigned MoveInstrSize = (Is64Bit ? 3 : 2); - unsigned SubtractInstr = getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta); unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2); unsigned StackDivide = (Is64Bit ? 8 : 4); unsigned InstrOffset = 0; - unsigned CFAOffset = 0; unsigned StackAdjust = 0; + unsigned StackSize = 0; MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB. bool ExpectEnd = false; @@ -504,10 +513,10 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { if (Opc == PushInstr) { // If there are too many saved registers, we cannot use compact encoding. - if (--SavedRegIdx < 0) return 0; + if (SavedRegIdx >= CU_NUM_SAVED_REGS) return 0; - SavedRegs[SavedRegIdx] = MI.getOperand(0).getReg(); - CFAOffset += OffsetSize; + SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg(); + StackAdjust += OffsetSize; InstrOffset += PushInstrSize; } else if (Opc == MoveInstr) { unsigned SrcReg = MI.getOperand(1).getReg(); @@ -516,13 +525,14 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { if (DstReg != FramePtr || SrcReg != StackPtr) return 0; - CFAOffset = 0; + StackAdjust = 0; memset(SavedRegs, 0, sizeof(SavedRegs)); - SavedRegIdx = 6; + SavedRegIdx = 0; InstrOffset += MoveInstrSize; - } else if (Opc == SubtractInstr) { - if (StackAdjust) - // We all ready have a stack pointer adjustment. + } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) { + if (StackSize) + // We already have a stack size. return 0; if (!MI.getOperand(0).isReg() || @@ -533,7 +543,7 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // %RSP<def> = SUB64ri8 %RSP, 48 return 0; - StackAdjust = MI.getOperand(2).getImm() / StackDivide; + StackSize = MI.getOperand(2).getImm() / StackDivide; SubtractInstrIdx += InstrOffset; ExpectEnd = true; } @@ -541,28 +551,30 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // Encode that we are using EBP/RBP as the frame pointer. uint32_t CompactUnwindEncoding = 0; - CFAOffset /= StackDivide; + StackAdjust /= StackDivide; if (HasFP) { - if ((CFAOffset & 0xFF) != CFAOffset) + if ((StackAdjust & 0xFF) != StackAdjust) // Offset was too big for compact encoding. return 0; // Get the encoding of the saved registers when we have a frame pointer. 
uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit); - if (RegEnc == ~0U) - return 0; + if (RegEnc == ~0U) return 0; CompactUnwindEncoding |= 0x01000000; - CompactUnwindEncoding |= (CFAOffset & 0xFF) << 16; + CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16; CompactUnwindEncoding |= RegEnc & 0x7FFF; } else { - unsigned FullOffset = CFAOffset + StackAdjust; - if ((FullOffset & 0xFF) == FullOffset) { - // Frameless stack. + ++StackAdjust; + uint32_t TotalStackSize = StackAdjust + StackSize; + if ((TotalStackSize & 0xFF) == TotalStackSize) { + // Frameless stack with a small stack size. CompactUnwindEncoding |= 0x02000000; - CompactUnwindEncoding |= (FullOffset & 0xFF) << 16; + + // Encode the stack size. + CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16; } else { - if ((CFAOffset & 0x7) != CFAOffset) + if ((StackAdjust & 0x7) != StackAdjust) // The extra stack adjustments are too big for us to handle. return 0; @@ -573,16 +585,21 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // instruction. CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16; - // Encode any extra stack stack changes (done via push instructions). - CompactUnwindEncoding |= (CFAOffset & 0x7) << 13; + // Encode any extra stack stack adjustments (done via push instructions). + CompactUnwindEncoding |= (StackAdjust & 0x7) << 13; } + // Encode the number of registers saved. + CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10; + // Get the encoding of the saved registers when we don't have a frame // pointer. - uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegs, - 6 - SavedRegIdx, - Is64Bit); + uint32_t RegEnc = + encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx, + Is64Bit); if (RegEnc == ~0U) return 0; + + // Encode the register encoding. CompactUnwindEncoding |= RegEnc & 0x3FF; } @@ -638,10 +655,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // stack pointer (we fit in the Red Zone). if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) && !RegInfo->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64 && // Win64 has no Red Zone - !EnableSegmentedStacks) { // Regular stack + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64 && // Win64 has no Red Zone + !MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -978,7 +995,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned Opc = PI->getOpcode(); if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && - !PI->getDesc().isTerminator()) + !PI->isTerminator()) break; --MBBI; @@ -1306,6 +1323,10 @@ GetScratchRegister(bool Is64Bit, const MachineFunction &MF) { } } +// The stack limit in the TCB is set to this many bytes above the actual stack +// limit. 
+static const uint64_t kSplitStackAvailable = 256; + void X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MachineBasicBlock &prologueMBB = MF.front(); @@ -1360,16 +1381,24 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { TlsReg = X86::FS; TlsOffset = 0x70; - BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) - .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + if (StackSize < kSplitStackAvailable) + ScratchReg = X86::RSP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) + .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else { TlsReg = X86::GS; TlsOffset = 0x30; - BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) - .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + if (StackSize < kSplitStackAvailable) + ScratchReg = X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) + .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } @@ -1394,9 +1423,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MF.getRegInfo().setPhysRegUsed(X86::R10); MF.getRegInfo().setPhysRegUsed(X86::R11); } else { - // Since we'll call __morestack, stack alignment needs to be preserved. - BuildMI(allocMBB, DL, TII.get(X86::SUB32ri), X86::ESP).addReg(X86::ESP) - .addImm(8); BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) @@ -1411,11 +1437,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("__morestack"); - // __morestack only seems to remove 8 bytes off the stack. Add back the - // additional 8 bytes we added before pushing the arguments. - if (!Is64Bit) - BuildMI(allocMBB, DL, TII.get(X86::ADD32ri), X86::ESP).addReg(X86::ESP) - .addImm(8); if (IsNested) BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); else diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 96c6f41..03727a2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -256,7 +256,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->is64Bit()) { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. 
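
kSplitStackAvailable leans on the comment just above it: the limit stored in the TCB is biased 256 bytes above the real limit, so frames needing less than that can compare the stack pointer itself against %fs:0x70 / %gs:0x30 and skip the LEA that computes SP - StackSize. A standalone model of that decision; readStackLimitFromTLS is a stub, and the >= direction follows the usual split-stack convention (the branch opcode itself is outside the quoted hunk).

    #include <cstdint>

    static const uint64_t kSplitStackAvailable = 256;

    // Stand-in for the value the runtime keeps in the TCB slot.
    static uint64_t readStackLimitFromTLS() { return 0; }

    // True when the function may run on the current segment without __morestack.
    bool stackCheckPasses(uint64_t SP, uint64_t StackSize) {
      uint64_t Probe = (StackSize < kSplitStackAvailable)
                           ? SP                // compare SP directly, no LEA needed
                           : SP - StackSize;   // what the LEA64r/LEA32r materializes
      return Probe >= readStackLimitFromTLS();
    }
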
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); @@ -270,7 +270,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - if (!UseSoftFloat) { + if (!TM.Options.UseSoftFloat) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); @@ -313,7 +313,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->is64Bit()) { setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. @@ -378,6 +378,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i16 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i64 , Expand); if (Subtarget->hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i8 , Promote); } else { @@ -388,6 +392,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::CTTZ , MVT::i64 , Custom); } + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i8 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i16 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i64 , Expand); if (Subtarget->hasLZCNT()) { setOperationAction(ISD::CTLZ , MVT::i8 , Promote); } else { @@ -537,14 +545,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); - else if (EnableSegmentedStacks) + else if (TM.Options.EnableSegmentedStacks) setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Expand); - if (!UseSoftFloat && X86ScalarSSEf64) { + if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -576,7 +584,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps - } else if (!UseSoftFloat && X86ScalarSSEf32) { + } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -605,11 +613,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. 
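
The recurring change in these hunks swaps the old file-scope flags (UseSoftFloat, UnsafeFPMath, EnableSegmentedStacks, and later GuaranteedTailCallOpt) for reads of TM.Options, making them per-TargetMachine state rather than process-wide globals. A minimal sketch of that shape; MyOptions, MyTargetMachine and Lowering are invented stand-ins, not LLVM types.

    struct MyOptions {
      bool UseSoftFloat = false;
      bool UnsafeFPMath = false;
      bool EnableSegmentedStacks = false;
      bool GuaranteedTailCallOpt = false;
    };

    struct MyTargetMachine {
      MyOptions Options;   // one copy per target machine, not one per process
    };

    class Lowering {
      MyTargetMachine &TM;
    public:
      explicit Lowering(MyTargetMachine &M) : TM(M) {}
      // Mirrors the "if (!TM.Options.UseSoftFloat)" checks above.
      bool useHardFloatLowering() const { return !TM.Options.UseSoftFloat; }
    };
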
addRegisterClass(MVT::f64, X86::RFP64RegisterClass); @@ -620,7 +628,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } @@ -639,7 +647,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87. - if (!UseSoftFloat) { + if (!TM.Options.UseSoftFloat) { addRegisterClass(MVT::f80, X86::RFP80RegisterClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -658,11 +666,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f80 , Expand); setOperationAction(ISD::FCOS , MVT::f80 , Expand); } + setOperationAction(ISD::FFLOOR, MVT::f80, Expand); + setOperationAction(ISD::FCEIL, MVT::f80, Expand); + setOperationAction(ISD::FTRUNC, MVT::f80, Expand); + setOperationAction(ISD::FRINT, MVT::f80, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } @@ -714,7 +727,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); @@ -748,7 +763,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. - if (!UseSoftFloat && Subtarget->hasMMX()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); // No operations on x86mmx supported, everything uses intrinsics. 
} @@ -785,7 +800,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); - if (!UseSoftFloat && Subtarget->hasXMM()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasXMM()) { addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); @@ -802,7 +817,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC, MVT::v4f32, Custom); } - if (!UseSoftFloat && Subtarget->hasXMMInt()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasXMMInt()) { addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM @@ -983,7 +998,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->hasSSE42orAVX()) setOperationAction(ISD::SETCC, MVT::v2i64, Custom); - if (!UseSoftFloat && Subtarget->hasAVX()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); @@ -1211,10 +1226,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; - setPrefLoopAlignment(16); + setPrefLoopAlignment(4); // 2^4 bytes. benefitFromCodePlacementOpt = true; - setPrefFunctionAlignment(4); + setPrefFunctionAlignment(4); // 2^4 bytes. } @@ -1709,7 +1724,8 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { /// FuncIsMadeTailCallSafe - Return true if the function is being made into /// a tailcall target by changing its ABI. -static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { +static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, + bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && IsTailCallConvention(CC); } @@ -1723,7 +1739,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); + bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, + getTargetMachine().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -1873,7 +1890,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. 
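
The setPrefLoopAlignment change above (16 becomes 4, with the new "2^4 bytes" comment) indicates the alignment setters take a log2 exponent, so the old call was asking for 65536-byte loop alignment rather than 16 bytes. A tiny standalone helper under that assumption; log2AlignFromBytes is an invented name.

    #include <cassert>
    #include <cstdint>

    // Convert a byte alignment into the power-of-two exponent a log2-based
    // setter expects.
    unsigned log2AlignFromBytes(uint64_t Bytes) {
      assert(Bytes != 0 && (Bytes & (Bytes - 1)) == 0 && "not a power of two");
      unsigned Log = 0;
      while ((uint64_t(1) << Log) < Bytes)
        ++Log;
      return Log;
    }
    // log2AlignFromBytes(16) == 4, matching setPrefLoopAlignment(4) above.
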
- if (FuncIsMadeTailCallSafe(CallConv)) + if (FuncIsMadeTailCallSafe(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for @@ -1918,9 +1936,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); assert(!(NumXMMRegs && !Subtarget->hasXMM()) && "SSE register cannot be used when SSE is disabled!"); - assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && + assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && + NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); - if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) + if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || + !Subtarget->hasXMM()) // Kernel mode asks for SSE to be disabled, so don't push them // on the stack. TotalNumXMMRegs = 0; @@ -1998,7 +2018,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } // Some CCs need callee pop. - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. @@ -2098,7 +2119,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Sibcalls are automatically detected tailcalls which do not require // ABI changes. - if (!GuaranteedTailCallOpt && isTailCall) + if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) @@ -2126,7 +2147,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) + else if (getTargetMachine().Options.GuaranteedTailCallOpt && + IsTailCallConvention(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -2305,7 +2327,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, int FI = 0; // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); - if (GuaranteedTailCallOpt) { + if (getTargetMachine().Options.GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (VA.isRegLoc()) @@ -2485,7 +2507,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPush; - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + getTargetMachine().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) // If this is a call to a struct-return function, the callee @@ -2643,7 +2666,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; - if (GuaranteedTailCallOpt) { + if (getTargetMachine().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; @@ -2843,23 +2866,10 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: return true; } return false; @@ -2885,10 +2895,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: + case X86ISD::VPERMILP: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } @@ -2902,7 +2909,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PALIGN: case X86ISD::SHUFPD: case X86ISD::SHUFPS: - case X86ISD::VPERM2F128: + case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, DAG.getConstant(TargetMask, MVT::i8)); } @@ -2920,18 +2927,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVLPD: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: return DAG.getNode(Opc, dl, VT, V1, V2); } return SDValue(); @@ -3231,7 +3228,7 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, bool hasSSSE3OrAVX) { int i, e = VT.getVectorNumElements(); - if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64) + if (VT.getSizeInBits() != 128) return false; // Do not handle v2i64 / v2f64 shuffles with palignr. @@ -3261,17 +3258,17 @@ static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, return true; } -/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand +/// isVSHUFPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// VSHUFPSY. 
-static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { +static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX, bool Commuted = false) { int NumElems = VT.getVectorNumElements(); - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) + if (!HasAVX || VT.getSizeInBits() != 256) return false; - if (NumElems != 8) + if (NumElems != 4 && NumElems != 8) return false; // VSHUFPSY divides the resulting vector into 4 chunks. @@ -3284,124 +3281,63 @@ static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT, // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, // Y3..Y0, Y3..Y0, X3..X0, X3..X0 // - int QuarterSize = NumElems/4; - int HalfSize = QuarterSize*2; - for (int i = 0; i < QuarterSize; ++i) - if (!isUndefOrInRange(Mask[i], 0, HalfSize)) - return false; - for (int i = QuarterSize; i < QuarterSize*2; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) - return false; - - // The mask of the second half must be the same as the first but with - // the appropriate offsets. This works in the same way as VPERMILPS - // works with masks. - for (int i = QuarterSize*2; i < QuarterSize*3; ++i) { - if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) - return false; - int FstHalfIdx = i-HalfSize; - if (Mask[FstHalfIdx] < 0) - continue; - if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) - return false; - } - for (int i = QuarterSize*3; i < NumElems; ++i) { - if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) - return false; - int FstHalfIdx = i-HalfSize; - if (Mask[FstHalfIdx] < 0) - continue; - if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) - return false; - - } - - return true; -} - -/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VSHUFPSY instruction. -static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - - assert(NumElems == 8 && VT.getSizeInBits() == 256 && - "Only supports v8i32 and v8f32 types"); - - int HalfSize = NumElems/2; - unsigned Mask = 0; - for (int i = 0; i != NumElems ; ++i) { - if (SVOp->getMaskElt(i) < 0) - continue; - // The mask of the first half must be equal to the second one. - unsigned Shamt = (i%HalfSize)*2; - unsigned Elt = SVOp->getMaskElt(i) % HalfSize; - Mask |= Elt << Shamt; - } - - return Mask; -} - -/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 256-bit -/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS -/// version and the mask of the second half isn't binded with the first -/// one. -static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - int NumElems = VT.getVectorNumElements(); - - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) - return false; - - if (NumElems != 4) - return false; - - // VSHUFPSY divides the resulting vector into 4 chunks. + // VSHUFPDY divides the resulting vector into 4 chunks. // The sources are also splitted into 4 chunks, and each destination // chunk must come from a different source chunk. 
// // SRC1 => X3 X2 X1 X0 // SRC2 => Y3 Y2 Y1 Y0 // - // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0 + // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 // - int QuarterSize = NumElems/4; - int HalfSize = QuarterSize*2; - for (int i = 0; i < QuarterSize; ++i) - if (!isUndefOrInRange(Mask[i], 0, HalfSize)) - return false; - for (int i = QuarterSize; i < QuarterSize*2; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) - return false; - for (int i = QuarterSize*2; i < QuarterSize*3; ++i) - if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) - return false; - for (int i = QuarterSize*3; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) - return false; + unsigned QuarterSize = NumElems/4; + unsigned HalfSize = QuarterSize*2; + for (unsigned l = 0; l != 2; ++l) { + unsigned LaneStart = l*HalfSize; + for (unsigned s = 0; s != 2; ++s) { + unsigned QuarterStart = s*QuarterSize; + unsigned Src = (Commuted) ? (1-s) : s; + unsigned SrcStart = Src*NumElems + LaneStart; + for (unsigned i = 0; i != QuarterSize; ++i) { + int Idx = Mask[i+QuarterStart+LaneStart]; + if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize)) + return false; + // For VSHUFPSY, the mask of the second half must be the same as the first + // but with the appropriate offsets. This works in the same way as + // VPERMILPS works with masks. + if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0) + continue; + if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize)) + return false; + } + } + } return true; } -/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VSHUFPDY instruction. -static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) { +/// getShuffleVSHUFPYImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VSHUFPSY/VSHUFPDY instructions. +static unsigned getShuffleVSHUFPYImmediate(SDNode *N) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); EVT VT = SVOp->getValueType(0); int NumElems = VT.getVectorNumElements(); - assert(NumElems == 4 && VT.getSizeInBits() == 256 && - "Only supports v4i64 and v4f64 types"); + assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types"); + assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types"); int HalfSize = NumElems/2; + unsigned Mul = (NumElems == 8) ? 2 : 1; unsigned Mask = 0; - for (int i = 0; i != NumElems ; ++i) { - if (SVOp->getMaskElt(i) < 0) + for (int i = 0; i != NumElems; ++i) { + int Elt = SVOp->getMaskElt(i); + if (Elt < 0) continue; - int Elt = SVOp->getMaskElt(i) % HalfSize; - Mask |= Elt << i; + Elt %= HalfSize; + unsigned Shamt = i; + // For VSHUFPSY, the mask of the first half must be equal to the second one. + if (NumElems == 8) Shamt %= HalfSize; + Mask |= Elt << (Shamt*Mul); } return Mask; @@ -3409,8 +3345,8 @@ static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) { /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. 
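
The merged isVSHUFPYMask above covers both VSHUFPSY and VSHUFPDY: each quarter of the result must come from one half of one source (with the sources swapped when Commuted), and only the 8-element PS form additionally requires the upper 128-bit lane to repeat the lower lane's pattern offset by HalfSize. A standalone version over a plain mask vector, with isUndefOrInRange/isUndefOrEqual re-implemented here under the usual "-1 means undef" convention.

    #include <vector>

    static bool isUndefOrInRange(int Val, unsigned Low, unsigned Hi) {
      return Val < 0 || (unsigned(Val) >= Low && unsigned(Val) < Hi);
    }
    static bool isUndefOrEqual(int Val, int Cmp) { return Val < 0 || Val == Cmp; }

    // Indices 0..NumElems-1 name source 0, NumElems..2*NumElems-1 name source 1.
    // NumElems is 4 (v4f64/v4i64) or 8 (v8f32/v8i32).
    bool isVSHUFPYMask(const std::vector<int> &Mask, unsigned NumElems,
                       bool Commuted = false) {
      if (NumElems != 4 && NumElems != 8)
        return false;
      unsigned QuarterSize = NumElems / 4;
      unsigned HalfSize = QuarterSize * 2;
      for (unsigned l = 0; l != 2; ++l) {          // the two 128-bit lanes
        unsigned LaneStart = l * HalfSize;
        for (unsigned s = 0; s != 2; ++s) {        // low quarter then high quarter
          unsigned QuarterStart = s * QuarterSize;
          unsigned Src = Commuted ? (1 - s) : s;   // which source feeds this quarter
          unsigned SrcStart = Src * NumElems + LaneStart;
          for (unsigned i = 0; i != QuarterSize; ++i) {
            int Idx = Mask[i + QuarterStart + LaneStart];
            if (!isUndefOrInRange(Idx, SrcStart, SrcStart + HalfSize))
              return false;
            // PS only: lane 1 must repeat lane 0's selection, moved up a half.
            if (NumElems == 4 || l == 0 || Mask[i + QuarterStart] < 0)
              continue;
            if (!isUndefOrEqual(Idx, Mask[i + QuarterStart] + HalfSize))
              return false;
          }
        }
      }
      return true;
    }
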
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); +static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, + unsigned NumElems) { for (unsigned i = 0; i != NumElems; ++i) { int idx = Mask[i]; if (idx < 0) @@ -3422,31 +3358,13 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { } } -/// isCommutedVSHUFP() - Return true if swapping operands will -/// allow to use the "vshufpd" or "vshufps" instruction -/// for 256-bit vectors -static bool isCommutedVSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - - unsigned NumElems = VT.getVectorNumElements(); - if ((VT.getSizeInBits() != 256) || ((NumElems != 4) && (NumElems != 8))) - return false; - - SmallVector<int, 8> CommutedMask; - for (unsigned i = 0; i < NumElems; ++i) - CommutedMask.push_back(Mask[i]); - - CommuteVectorShuffleMask(CommutedMask, VT); - return (NumElems == 4) ? isVSHUFPDYMask(CommutedMask, VT, Subtarget): - isVSHUFPSYMask(CommutedMask, VT, Subtarget); -} - - /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit -/// SHUFPS and SHUFPD. -static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); +/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be +/// reverse of what x86 shuffles want. +static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool Commuted = false) { + unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) return false; @@ -3454,12 +3372,14 @@ static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { if (NumElems != 2 && NumElems != 4) return false; - int Half = NumElems / 2; - for (int i = 0; i < Half; ++i) - if (!isUndefOrInRange(Mask[i], 0, NumElems)) + unsigned Half = NumElems / 2; + unsigned SrcStart = Commuted ? NumElems : 0; + for (unsigned i = 0; i != Half; ++i) + if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) return false; - for (int i = Half; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) + SrcStart = Commuted ? 0 : NumElems; + for (unsigned i = Half; i != NumElems; ++i) + if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) return false; return true; @@ -3471,32 +3391,6 @@ bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { return ::isSHUFPMask(M, N->getValueType(0)); } -/// isCommutedSHUFP - Returns true if the shuffle mask is exactly -/// the reverse of what x86 shuffles want. x86 shuffles requires the lower -/// half elements to come from vector 1 (which would equal the dest.) and -/// the upper half to come from vector 2. -static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - - int Half = NumElems / 2; - for (int i = 0; i < Half; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) - return false; - for (int i = Half; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], 0, NumElems)) - return false; - return true; -} - -static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return isCommutedSHUFPMask(M, N->getValueType(0)); -} - /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
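
CommuteVectorShuffleMask now takes the element count directly instead of the EVT, and the old commuted-SHUFP checker folds into isSHUFPMask via a Commuted flag that simply swaps which source each half is expected to use. The body of the commute loop past the undef check is outside the quoted context, so the version below is the standard reconstruction: flip every defined index to the other source.

    #include <vector>

    // Indices below NumElems refer to source 0, the rest to source 1; -1 is undef.
    void commuteVectorShuffleMask(std::vector<int> &Mask, unsigned NumElems) {
      for (unsigned i = 0; i != NumElems; ++i) {
        int Idx = Mask[i];
        if (Idx < 0)
          continue;
        Mask[i] = Idx < int(NumElems) ? Idx + int(NumElems) : Idx - int(NumElems);
      }
    }
    // Example with NumElems = 4: {0, 4, 1, 5} becomes {4, 0, 5, 1}.
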
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { @@ -3765,15 +3659,15 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) { return ::isMOVLMask(M, N->getValueType(0)); } -/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered +/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered /// as permutations between 128-bit chunks or halves. As an example: this /// shuffle bellow: /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. -static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) +static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX) { + if (!HasAVX || VT.getSizeInBits() != 256) return false; // The shuffle result is divided into half A and half B. In total the two @@ -3801,10 +3695,9 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, return MatchA && MatchB; } -/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERM2F128 instructions. -static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. +static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); int HalfSize = VT.getVectorNumElements()/2; @@ -3826,81 +3719,47 @@ static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { return (FstHalf | (SndHalf << 4)); } -/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand +/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying /// type is 32 or 64. In the VPERMILPS the high half of the mask should point /// to the same elements of the low, but to the higher half of the source. /// In VPERMILPD the two lanes could be shuffled independently of each other /// with the same restriction that lanes can't be crossed. -static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { +static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits()/128; - if (!Subtarget->hasAVX()) + if (!HasAVX) return false; - // Only match 256-bit with 64-bit types - if (VT.getSizeInBits() != 256 || NumElts != 4) + // Only match 256-bit with 32/64-bit types + if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8)) return false; - // The mask on the high lane is independent of the low. Both can match - // any element in inside its own lane, but can't cross. 
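
getShuffleVPERM2X128Immediate above packs two 4-bit selectors, the low nibble for the destination's low 128 bits and the next nibble for its high 128 bits; only the "FstHalf | (SndHalf << 4)" return is visible here, so the selector computation below is an illustrative reconstruction (first defined index of each destination half, divided by the half size).

    #include <vector>

    // Indices 0..NumElems-1 name source 0 and NumElems..2*NumElems-1 name source 1,
    // so Idx / HalfSize yields a 0-3 "which 128-bit half" selector.
    unsigned getVPERM2X128Immediate(const std::vector<int> &Mask, unsigned NumElems) {
      unsigned HalfSize = NumElems / 2;
      unsigned FstHalf = 0, SndHalf = 0;
      for (unsigned i = 0; i != HalfSize; ++i)
        if (Mask[i] >= 0) { FstHalf = unsigned(Mask[i]) / HalfSize; break; }
      for (unsigned i = HalfSize; i != NumElems; ++i)
        if (Mask[i] >= 0) { SndHalf = unsigned(Mask[i]) / HalfSize; break; }
      return FstHalf | (SndHalf << 4);
    }
    // The example mask from the isVPERM2X128Mask comment, <4,5,6,7,12,13,14,15>
    // with NumElems = 8, gives FstHalf = 1 and SndHalf = 3, i.e. immediate 0x31.
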
int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int LaneStart = l*LaneSize; - if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) + for (int l = 0; l != NumLanes; ++l) { + int LaneStart = l*LaneSize; + for (int i = 0; i != LaneSize; ++i) { + if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize)) + return false; + if (NumElts == 4 || l == 0) + continue; + // VPERMILPS handling + if (Mask[i] < 0) + continue; + if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize)) return false; } - - return true; -} - -/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to VPERMILPS*. -/// Note that VPERMIL mask matching is different depending whether theunderlying -/// type is 32 or 64. In the VPERMILPS the high half of the mask should point -/// to the same elements of the low, but to the higher half of the source. -/// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. -static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - - if (!Subtarget->hasAVX()) - return false; - - // Only match 256-bit with 32-bit types - if (VT.getSizeInBits() != 256 || NumElts != 8) - return false; - - // The mask on the high lane should be the same as the low. Actually, - // they can differ if any of the corresponding index in a lane is undef - // and the other stays in range. - int LaneSize = NumElts/NumLanes; - for (int i = 0; i < LaneSize; ++i) { - int HighElt = i+LaneSize; - bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); - bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); - - if (!HighValid || !LowValid) - return false; - if (Mask[i] < 0 || Mask[HighElt] < 0) - continue; - if (Mask[HighElt]-Mask[i] != LaneSize) - return false; } return true; } -/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPS* instructions. -static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions. +static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); int NumElts = VT.getVectorNumElements(); @@ -3911,43 +3770,22 @@ static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { // where a mask will match because the same mask element is undef on the // first half but valid on the second. This would get pathological cases // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. + unsigned Shift = (LaneSize == 4) ? 
2 : 1; unsigned Mask = 0; - for (int l = 0; l < NumLanes; ++l) { - for (int i = 0; i < LaneSize; ++i) { - int MaskElt = SVOp->getMaskElt(i+(l*LaneSize)); - if (MaskElt < 0) - continue; - if (MaskElt >= LaneSize) - MaskElt -= LaneSize; - Mask |= MaskElt << (i*2); - } + for (int i = 0; i != NumElts; ++i) { + int MaskElt = SVOp->getMaskElt(i); + if (MaskElt < 0) + continue; + MaskElt %= LaneSize; + unsigned Shamt = i; + // VPERMILPSY, the mask of the first half must be equal to the second one + if (NumElts == 8) Shamt %= LaneSize; + Mask |= MaskElt << (Shamt*Shift); } return Mask; } -/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPD* instructions. -static unsigned getShuffleVPERMILPDImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - - unsigned Mask = 0; - int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int MaskElt = SVOp->getMaskElt(i); - if (MaskElt < 0) - continue; - Mask |= (MaskElt-l*LaneSize) << i; - } - - return Mask; -} - /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse /// of what x86 movss want. X86 movs requires the lowest element to be lowest /// element of vector 2 and the other elements to come from vector 1 in order. @@ -4035,21 +3873,18 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. -static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); +static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool HasAVX) { int NumElts = VT.getVectorNumElements(); - bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF; - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 || - !V2IsUndef || NumElts != 4) + if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) return false; for (int i = 0; i != NumElts/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), 0)) + if (!isUndefOrEqual(Mask[i], 0)) return false; for (int i = NumElts/2; i != NumElts; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2)) + if (!isUndefOrEqual(Mask[i], NumElts/2)) return false; return true; } @@ -4164,14 +3999,13 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 
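
The merged getShuffleVPERMILPImmediate above encodes one field per element: two bits each for the 8-element (PS) form, one bit each for the 4-element (PD) form, and in the PS case the upper lane re-encodes the lower lane's bits, which the matching isVPERMILPMask already forced to agree. A standalone model over a plain mask vector for the 256-bit case.

    #include <vector>

    // NumElts is 8 (v8f32/v8i32) or 4 (v4f64/v4i64); lanes are 128 bits, -1 is undef.
    unsigned getVPERMILPImmediate(const std::vector<int> &Mask, unsigned NumElts) {
      unsigned NumLanes = 2;
      unsigned LaneSize = NumElts / NumLanes;
      unsigned Shift = (LaneSize == 4) ? 2 : 1;    // bits per encoded element
      unsigned Imm = 0;
      for (unsigned i = 0; i != NumElts; ++i) {
        int MaskElt = Mask[i];
        if (MaskElt < 0)
          continue;
        MaskElt %= int(LaneSize);     // position within the element's own lane
        unsigned Shamt = i;
        if (NumElts == 8)
          Shamt %= LaneSize;          // PS: both lanes write the same 8-bit field
        Imm |= unsigned(MaskElt) << (Shamt * Shift);
      }
      return Imm;
    }
    // E.g. the lane-reversing v8f32 mask <3,2,1,0,7,6,5,4> packs to 0x1B.
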
-unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VVT = N->getValueType(0); - unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; +static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; int Val = 0; unsigned i, e; - for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { + for (i = 0, e = VT.getVectorNumElements(); i != e; ++i) { Val = SVOp->getMaskElt(i); if (Val >= 0) break; @@ -4631,29 +4465,14 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, case X86ISD::SHUFPS: case X86ISD::SHUFPD: ImmN = N->getOperand(N->getNumOperands()-1); - DecodeSHUFPSMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - DecodePUNPCKHMask(NumElems, ShuffleMask); - break; - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - DecodeUNPCKHPMask(VT, ShuffleMask); + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); break; - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - DecodePUNPCKLMask(VT, ShuffleMask); + case X86ISD::UNPCKH: + DecodeUNPCKHMask(VT, ShuffleMask); break; - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - DecodeUNPCKLPMask(VT, ShuffleMask); + case X86ISD::UNPCKL: + DecodeUNPCKLMask(VT, ShuffleMask); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, ShuffleMask); @@ -4686,27 +4505,12 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, Depth+1); } - case X86ISD::VPERMILPS: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPSY: + case X86ISD::VPERMILP: ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), + DecodeVPERMILPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), ShuffleMask); break; - case X86ISD::VPERMILPD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPDY: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERM2F128: + case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), ShuffleMask); @@ -5334,8 +5138,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG); } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); - EVT MiddleVT = MVT::v4i32; + unsigned NumBits = VT.getSizeInBits(); + assert((NumBits == 128 || NumBits == 256) && + "Expected an SSE or AVX value type!"); + EVT MiddleVT = NumBits == 128 ? MVT::v4i32 : MVT::v8i32; Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasXMMInt(), DAG); @@ -6256,7 +6062,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // from X. 
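
With the per-type PUNPCKH*/UNPCKHP* opcodes folded into X86ISD::UNPCKH, the decode above is keyed on the value type alone. For reference, a standalone generator of the unpck-high shuffle mask written from the instruction semantics (interleave the high halves of the two sources within each 128-bit lane); it is meant to match what DecodeUNPCKHMask produces but is not copied from the shuffle-decode utility.

    #include <vector>

    // Indices >= NumElts refer to the second source.
    std::vector<int> decodeUNPCKHMask(unsigned NumElts, unsigned VectorBits) {
      std::vector<int> Mask;
      unsigned NumLanes = VectorBits / 128;
      unsigned NumLaneElts = NumElts / NumLanes;
      for (unsigned l = 0; l != NumLanes; ++l)
        for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
          unsigned Hi = l * NumLaneElts + NumLaneElts / 2 + i;
          Mask.push_back(int(Hi));            // element from source 0
          Mask.push_back(int(Hi + NumElts));  // matching element from source 1
        }
      return Mask;
    }
    // decodeUNPCKHMask(4, 128) -> {2,6,3,7} (punpckhdq / unpckhps);
    // decodeUNPCKHMask(8, 256) -> {2,10,3,11, 6,14,7,15}, staying inside each lane.
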
if (NumHi == 3) { // Normalize it so the 3 elements come from V1. - CommuteVectorShuffleMask(PermMask, VT); + CommuteVectorShuffleMask(PermMask, 4); std::swap(V1, V2); } @@ -6566,70 +6372,6 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { X86::getShuffleSHUFImmediate(SVOp), DAG); } -static inline unsigned getUNPCKLOpcode(EVT VT, bool HasAVX2) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKLDQ; - case MVT::v2i64: return X86ISD::PUNPCKLQDQ; - case MVT::v8i32: - if (HasAVX2) return X86ISD::PUNPCKLDQ; - // else use fp unit for int unpack. - case MVT::v8f32: - case MVT::v4f32: return X86ISD::UNPCKLPS; - case MVT::v4i64: - if (HasAVX2) return X86ISD::PUNPCKLQDQ; - // else use fp unit for int unpack. - case MVT::v4f64: - case MVT::v2f64: return X86ISD::UNPCKLPD; - case MVT::v32i8: - case MVT::v16i8: return X86ISD::PUNPCKLBW; - case MVT::v16i16: - case MVT::v8i16: return X86ISD::PUNPCKLWD; - default: - llvm_unreachable("Unknown type for unpckl"); - } - return 0; -} - -static inline unsigned getUNPCKHOpcode(EVT VT, bool HasAVX2) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKHDQ; - case MVT::v2i64: return X86ISD::PUNPCKHQDQ; - case MVT::v8i32: - if (HasAVX2) return X86ISD::PUNPCKHDQ; - // else use fp unit for int unpack. - case MVT::v8f32: - case MVT::v4f32: return X86ISD::UNPCKHPS; - case MVT::v4i64: - if (HasAVX2) return X86ISD::PUNPCKHQDQ; - // else use fp unit for int unpack. - case MVT::v4f64: - case MVT::v2f64: return X86ISD::UNPCKHPD; - case MVT::v32i8: - case MVT::v16i8: return X86ISD::PUNPCKHBW; - case MVT::v16i16: - case MVT::v8i16: return X86ISD::PUNPCKHWD; - default: - llvm_unreachable("Unknown type for unpckh"); - } - return 0; -} - -static inline unsigned getVPERMILOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: - case MVT::v4f32: return X86ISD::VPERMILPS; - case MVT::v2i64: - case MVT::v2f64: return X86ISD::VPERMILPD; - case MVT::v8i32: - case MVT::v8f32: return X86ISD::VPERMILPSY; - case MVT::v4i64: - case MVT::v4f64: return X86ISD::VPERMILPDY; - default: - llvm_unreachable("Unknown type for vpermil"); - } - return 0; -} - static SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI, @@ -6703,17 +6445,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); - bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; bool HasXMMInt = Subtarget->hasXMMInt(); + bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + assert(V1.getOpcode() != ISD::UNDEF && "Op 1 of shuffle should not be undef"); + // Vector shuffle lowering takes 3 steps: // // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable @@ -6738,11 +6482,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. 
if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() && V2IsUndef && RelaxedMayFoldVectorLoad(V1)) @@ -6754,8 +6496,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Use to match splats if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); if (X86::isPSHUFDMask(SVOp)) { // The actual implementation will match the mask in the if above and then @@ -6787,8 +6528,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } if (X86::isMOVLMask(SVOp)) { - if (V1IsUndef) - return V2; if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); if (!X86::isMOVLPMask(SVOp)) { @@ -6834,17 +6573,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { V2IsSplat = isSplatVector(V2.getNode()); // Canonicalize the splat or undef, if present, to be on the RHS. - if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { + if (V1IsSplat && !V2IsSplat) { Op = CommuteVectorShuffle(SVOp, DAG); SVOp = cast<ShuffleVectorSDNode>(Op); V1 = SVOp->getOperand(0); V2 = SVOp->getOperand(1); std::swap(V1IsSplat, V2IsSplat); - std::swap(V1IsUndef, V2IsUndef); Commuted = true; } - if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { + SmallVector<int, 32> M; + SVOp->getMask(M); + + if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { // Shuffling low element of v1 into undef, just return v1. 
if (V2IsUndef) return V1; @@ -6854,13 +6595,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (X86::isUNPCKLMask(SVOp, HasAVX2)) - return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V2, - DAG); + if (isUNPCKLMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (X86::isUNPCKHMask(SVOp, HasAVX2)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V2, - DAG); + if (isUNPCKHMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first @@ -6884,35 +6623,30 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); if (X86::isUNPCKLMask(NewSVOp, HasAVX2)) - return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V2, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG); if (X86::isUNPCKHMask(NewSVOp, HasAVX2)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V2, V1, - DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG); } // Normalize the node to match x86 shuffle ops if needed - if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) + if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) || + isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true))) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore. - SmallVector<int, 16> M; - SVOp->getMask(M); if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX())) return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, - X86::getShufflePALIGNRImmediate(SVOp), + getShufflePALIGNRImmediate(SVOp), DAG); if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); - if (VT == MVT::v2i64) - return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } if (isPSHUFHWMask(M, VT)) @@ -6929,12 +6663,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, X86::getShuffleSHUFImmediate(SVOp), DAG); - if (X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); - if (X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1, - DAG); + if (isUNPCKL_v_undef_Mask(M, VT)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (isUNPCKH_v_undef_Mask(M, VT)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only @@ -6942,44 +6674,23 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // // Handle VMOVDDUPY permutations - if (isMOVDDUPYMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); - // Handle VPERMILPS* 
permutations - if (isVPERMILPSMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPSImmediate(SVOp), DAG); - - // Handle VPERMILPD* permutations - if (isVPERMILPDMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPDImmediate(SVOp), DAG); + // Handle VPERMILPS/D* permutations + if (isVPERMILPMask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + getShuffleVPERMILPImmediate(SVOp), DAG); - // Handle VPERM2F128 permutations - if (isVPERM2F128Mask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, - getShuffleVPERM2F128Immediate(SVOp), DAG); + // Handle VPERM2F128/VPERM2I128 permutations + if (isVPERM2X128Mask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, + V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - // Handle VSHUFPSY permutations - if (isVSHUFPSYMask(M, VT, Subtarget)) + // Handle VSHUFPS/DY permutations + if (isVSHUFPYMask(M, VT, HasAVX)) return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPSYImmediate(SVOp), DAG); - - // Handle VSHUFPDY permutations - if (isVSHUFPDYMask(M, VT, Subtarget)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPDYImmediate(SVOp), DAG); - - // Try to swap operands in the node to match x86 shuffle ops - if (isCommutedVSHUFPMask(M, VT, Subtarget)) { - // Now we need to commute operands. - SVOp = cast<ShuffleVectorSDNode>(CommuteVectorShuffle(SVOp, DAG)); - V1 = SVOp->getOperand(0); - V2 = SVOp->getOperand(1); - unsigned Immediate = (NumElems == 4) ? getShuffleVSHUFPDYImmediate(SVOp): - getShuffleVSHUFPSYImmediate(SVOp); - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, Immediate, DAG); - } + getShuffleVSHUFPYImmediate(SVOp), DAG); //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, @@ -7888,7 +7599,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, LLVMContext *Context = DAG.getContext(); // Build some magic constants. 
- std::vector<Constant*> CV0; + SmallVector<Constant*,4> CV0; CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); @@ -7896,7 +7607,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, Constant *C0 = ConstantVector::get(CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); - std::vector<Constant*> CV1; + SmallVector<Constant*,2> CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); CV1.push_back( @@ -8176,17 +7887,13 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, EVT EltVT = VT; if (VT.isVector()) EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; + SmallVector<Constant*,4> CV; if (EltVT == MVT::f64) { Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); - CV.push_back(C); - CV.push_back(C); + CV.assign(2, C); } else { Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + CV.assign(4, C); } Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8201,19 +7908,18 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; + NumElts = VT.getVectorNumElements(); + } + SmallVector<Constant*,8> CV; if (EltVT == MVT::f64) { Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); - CV.push_back(C); - CV.push_back(C); + CV.assign(NumElts, C); } else { Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + CV.assign(NumElts, C); } Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8221,11 +7927,12 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo::getConstantPool(), false, false, false, 16); if (VT.isVector()) { + MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::XOR, dl, MVT::v2i64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + DAG.getNode(ISD::XOR, dl, XORVT, + DAG.getNode(ISD::BITCAST, dl, XORVT, Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); + DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); } else { return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } @@ -8254,7 +7961,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // type, and that won't be f80 since that is not custom lowered. // First get the sign bit of second operand. 
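
LowerFABS and LowerFNEG above build splat constants of sign-bit masks (AND with ~(1ULL << 63), XOR with 1ULL << 63 for f64, the 32-bit analogues for f32), and the hunks switch the temporaries to SmallVector with assign(N, C). A quick standalone check that those bit patterns really implement absolute value and negation on IEEE doubles; memcpy stands in for the BITCAST nodes.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static double bitsToDouble(uint64_t B) { double D; std::memcpy(&D, &B, 8); return D; }
    static uint64_t doubleToBits(double D) { uint64_t B; std::memcpy(&B, &D, 8); return B; }

    int main() {
      const uint64_t SignBit = 1ULL << 63;     // FNEG: flip the sign bit
      const uint64_t AbsMask = ~(1ULL << 63);  // FABS: clear the sign bit
      double X = -3.5;
      double Neg = bitsToDouble(doubleToBits(X) ^ SignBit);
      double Abs = bitsToDouble(doubleToBits(X) & AbsMask);
      std::printf("x=%g  -x=%g  |x|=%g\n", X, Neg, Abs);  // x=-3.5  -x=3.5  |x|=3.5
      return 0;
    }
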
- std::vector<Constant*> CV; + SmallVector<Constant*,4> CV; if (SrcVT == MVT::f64) { CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); @@ -9253,7 +8960,7 @@ SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || - EnableSegmentedStacks) && + getTargetMachine().Options.EnableSegmentedStacks) && "This should be used only on Windows targets or when segmented stacks " "are being used"); assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); @@ -9267,7 +8974,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (EnableSegmentedStacks) { + if (getTargetMachine().Options.EnableSegmentedStacks) { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -9403,7 +9110,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. - assert(!UseSoftFloat && + assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && Subtarget->hasXMM()); @@ -10472,7 +10179,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(4, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); @@ -10487,13 +10194,13 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(2, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); // return pblendv(r, r+r, a); R = DAG.getNode(ISD::VSELECT, dl, VT, Op, - R, DAG.getNode(ISD::ADD, dl, VT, R, R)); + DAG.getNode(ISD::ADD, dl, VT, R, R), R); return R; } @@ -11194,6 +10901,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; + case X86ISD::HADD: return "X86ISD::HADD"; + case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::FMAX: return "X86ISD::FMAX"; @@ -11266,24 +10975,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; - case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; - case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; - case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; - case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; - case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; - case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; - case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; - case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; - case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; - case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; - case 
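
The LowerShift hunks above swap the last two VSELECT operands so the node reads (Cond, TrueValue, FalseValue): lanes whose condition is set take the shifted value M, the others keep R. A scalar-per-element model of that operand order, with invented variable names standing in for the DAG values.

    #include <cstdio>
    #include <vector>

    // Per-element select with VSELECT's operand order: condition, then the value
    // taken where the condition holds, then the fallback.
    std::vector<int> vselect(const std::vector<bool> &Cond,
                             const std::vector<int> &TrueVal,
                             const std::vector<int> &FalseVal) {
      std::vector<int> R(Cond.size());
      for (size_t i = 0; i != Cond.size(); ++i)
        R[i] = Cond[i] ? TrueVal[i] : FalseVal[i];
      return R;
    }

    int main() {
      std::vector<bool> Cond      = {true, false, true, false};
      std::vector<int>  Shifted   = {16, 32, 48, 64};  // plays the role of M
      std::vector<int>  Unshifted = { 1,  2,  3,  4};  // plays the role of R
      std::vector<int> Out = vselect(Cond, Shifted, Unshifted);  // 16 2 48 4
      for (int V : Out) std::printf("%d ", V);
      std::printf("\n");
      return 0;
    }
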
X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; - case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; + case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; + case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; - case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; - case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; - case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; - case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; - case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; + case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -11391,7 +11087,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const { // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) - return isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()); + return false; // FIXME: pshufb, blends, shifts. return (VT.getVectorNumElements() == 2 || @@ -11419,7 +11115,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || isSHUFPMask(Mask, VT) || - isCommutedSHUFPMask(Mask, VT)); + isSHUFPMask(Mask, VT, /* Commuted */ true)); } return false; } @@ -12289,7 +11985,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(EnableSegmentedStacks); + assert(getTargetMachine().Options.EnableSegmentedStacks); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -13169,7 +12865,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -13179,7 +12875,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; @@ -13197,7 +12893,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; @@ -13207,7 +12903,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. 
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -13233,7 +12929,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -13243,7 +12939,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; @@ -13268,7 +12964,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -14048,7 +13744,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, X, Y); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } } @@ -14232,7 +13928,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we are saving a concatination of two XMM registers, perform two stores. + // If we are saving a concatenation of two XMM registers, perform two stores. // This is better in Sandy Bridge cause one 256-bit mem op is done via two // 128-bit ones. If in the future the cost becomes only one memory access the // first version would be better. @@ -14342,7 +14038,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps + bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps && Subtarget->hasXMMInt(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && @@ -14458,7 +14154,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, /// set to A, RHS to B, and the routine returns 'true'. /// Note that the binary operation should have the property that if one of the /// operands is UNDEF then the result is UNDEF. 
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // Look for the following pattern: if // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > @@ -14474,7 +14170,18 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { return false; EVT VT = LHS.getValueType(); - unsigned N = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for horizontal add/sub"); + + // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to + // operate independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts / NumLanes; + assert((NumLaneElts % 2 == 0) && + "Vector type should have an even number of elements in each lane"); + unsigned HalfLaneElts = NumLaneElts/2; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask @@ -14483,7 +14190,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // NOTE: in what follows a default initialized SDValue represents an UNDEF of // type VT. SDValue A, B; - SmallVector<int, 8> LMask(N); + SmallVector<int, 16> LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) A = LHS.getOperand(0); @@ -14493,14 +14200,14 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { } else { if (LHS.getOpcode() != ISD::UNDEF) A = LHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; - SmallVector<int, 8> RMask(N); + SmallVector<int, 16> RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) C = RHS.getOperand(0); @@ -14510,7 +14217,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { } else { if (RHS.getOpcode() != ISD::UNDEF) C = RHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } @@ -14525,30 +14232,28 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) - for (unsigned i = 0; i != N; ++i) { - unsigned Idx = RMask[i]; - if (Idx < N) - RMask[i] += N; - else if (Idx < 2*N) - RMask[i] -= N; - } + CommuteVectorShuffleMask(RMask, NumElts); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. - for (unsigned i = 0; i != N; ++i) { - unsigned LIdx = LMask[i], RIdx = RMask[i]; + for (unsigned i = 0; i != NumElts; ++i) { + int LIdx = LMask[i], RIdx = RMask[i]; // Ignore any UNDEF components. - if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N)) - || (!B.getNode() && (LIdx >= N || RIdx >= N))) + if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. 
- if (!(LIdx == 2*i && RIdx == 2*i + 1) && - !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i)) + unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs + unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; + int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } @@ -14565,7 +14270,8 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. - if (Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -14579,7 +14285,8 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal subs from subs of shuffles. - if (Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -14783,7 +14490,8 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. - if ((Subtarget->hasSSSE3orAVX()) && (VT == MVT::v8i16 || VT == MVT::v4i32) && + if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1); @@ -14815,8 +14523,9 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal adds from adds of shuffles. 
EVT VT = N->getValueType(0); - if ((Subtarget->hasSSSE3orAVX()) && (VT == MVT::v8i16 || VT == MVT::v4i32) && - isHorizontalBinOp(Op0, Op1, false)) + if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); @@ -14857,18 +14566,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::SHUFPS: // Handle all target specific shuffles case X86ISD::SHUFPD: case X86ISD::PALIGN: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: + case X86ISD::UNPCKH: + case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFD: @@ -14876,11 +14575,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index ccff3a5..cfc1f88 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -273,23 +273,10 @@ namespace llvm { MOVLPD, MOVSD, MOVSS, - UNPCKLPS, - UNPCKLPD, - UNPCKHPS, - UNPCKHPD, - PUNPCKLBW, - PUNPCKLWD, - PUNPCKLDQ, - PUNPCKLQDQ, - PUNPCKHBW, - PUNPCKHWD, - PUNPCKHDQ, - PUNPCKHQDQ, - VPERMILPS, - VPERMILPSY, - VPERMILPD, - VPERMILPDY, - VPERM2F128, + UNPCKL, + UNPCKH, + VPERMILP, + VPERM2X128, VBROADCAST, // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, @@ -468,10 +455,6 @@ namespace llvm { /// the specified VECTOR_SHUFFLE mask with PSHUFLW instruction. unsigned getShufflePSHUFLWImmediate(SDNode *N); - /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle - /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. - unsigned getShufflePALIGNRImmediate(SDNode *N); - /// getExtractVEXTRACTF128Immediate - Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index /// with VEXTRACTF128 instructions. 
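The reworked isHorizontalBinOp above encodes the fact that the AVX horizontal add/sub instructions operate independently on each 128-bit lane, with the low half of every result lane taken from the first source and the high half from the second. A small standalone sketch of the index arithmetic (not LLVM code) that prints the shuffle-mask pairs the check expects for v8f32, i.e. vhaddps ymm:

#include <cstdio>

int main() {
  const unsigned NumElts = 8, NumLaneElts = 4, HalfLaneElts = NumLaneElts / 2;
  for (unsigned i = 0; i != NumElts; ++i) {
    unsigned Src = (i / HalfLaneElts) % 2;             // low half of lane from LHS, high half from RHS
    unsigned LaneStart = (i / NumLaneElts) * NumLaneElts;
    int Index = 2 * (i % HalfLaneElts) + NumElts * Src + LaneStart;
    std::printf("result[%u] expects LMask=%d RMask=%d\n", i, Index, Index + 1);
  }
  return 0;
}

For i = 2, for example, Src = 1 and Index = 8, so the check requires LMask[2] = 8 and RMask[2] = 9 (elements b0 and b1), which matches dst = { a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 }.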
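The ISD::VSELECT operand swaps in the LowerShift and PerformOrCombine hunks above follow the generic node's semantics, where a true condition lane selects the first value operand. A minimal scalar model, shown only to make the intended operand order explicit:

#include <array>
#include <cstddef>
#include <cstdint>

// result[i] = Cond[i] ? TrueVal[i] : FalseVal[i], i.e. a set condition
// lane picks the FIRST value operand, a clear lane picks the second.
template <std::size_t N>
std::array<uint8_t, N> vselect(const std::array<bool, N> &Cond,
                               const std::array<uint8_t, N> &TrueVal,
                               const std::array<uint8_t, N> &FalseVal) {
  std::array<uint8_t, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = Cond[i] ? TrueVal[i] : FalseVal[i];
  return R;
}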
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index d868773..f443088 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -58,3 +58,391 @@ let isAsmParserOnly = 1 in { defm VFNMSUBPS : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">; defm VFNMSUBPD : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W; } + +//===----------------------------------------------------------------------===// +// FMA4 - AMD 4 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + + +multiclass fma4s<bits<8> opc, string OpcodeStr> { + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>; + +} + +multiclass fma4p<bits<8> opc, string OpcodeStr> { + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>; + def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>; +} + +let isAsmParserOnly = 1 in { + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss">; + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd">; + defm VFMADDPS4 : fma4p<0x68, "vfmaddps">; + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd">; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss">; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd">; + defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps">; + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd">; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss">; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd">; + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps">; + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd">; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss">; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd">; + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps">; + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd">; + defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps">; + defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd">; + defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps">; + defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd">; +} + +// FMA4 Intrinsics patterns + +// VFMADD +def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2, VR128:$src3), + 
(VFMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMADDSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFMSUB +def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMSUBSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBPS4rr VR128:$src1, VR128:$src2, 
VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFNMADD +def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMADDSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMADDPS4rrY VR256:$src1, VR256:$src2, 
VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFNMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFNMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFNMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFNMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFNMSUB +def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMSUBSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFNMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFNMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFNMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFNMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFNMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFNMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFNMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFNMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFNMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFNMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFMADDSUB +def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2, 
VR128:$src3), + (VFMADDSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMADDSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMADDSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMADDSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMADDSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMADDSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMADDSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMADDSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMADDSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + VR256:$src3), + (VFMADDSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +// VFMSUBADD +def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2, + (alignedloadv4f32 addr:$src3)), + (VFMSUBADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2), + VR128:$src3), + (VFMSUBADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMSUBADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMSUBADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMSUBADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>; + +def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2, + (alignedloadv8f32 addr:$src3)), + (VFMSUBADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, + (alignedloadv8f32 addr:$src2), + VR256:$src3), + (VFMSUBADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>; + +def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3), + (VFMSUBADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2, + (alignedloadv4f64 addr:$src3)), + (VFMSUBADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, + (alignedloadv4f64 addr:$src2), + 
VR256:$src3), + (VFMSUBADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>; diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index ecd6a93..7ba3639 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -110,6 +110,8 @@ class A7 { bits<5> Prefix = 16; } class T8XD { bits<5> Prefix = 17; } class T8XS { bits<5> Prefix = 18; } class TAXD { bits<5> Prefix = 19; } +class XOP8 { bits<5> Prefix = 20; } +class XOP9 { bits<5> Prefix = 21; } class VEX { bit hasVEXPrefix = 1; } class VEX_W { bit hasVEX_WPrefix = 1; } class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; } @@ -118,7 +120,8 @@ class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } - +class XOP_W { bit hasXOP_WPrefix = 1; } +class XOP { bit hasXOP_Prefix = 1; } class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, Domain d = GenericDomain> : Instruction { @@ -158,6 +161,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? + bit hasXOP_WPrefix = 0; // Same bit as VEX_W, but used for swapping operands + bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix? // TSFlags layout should be kept in sync with X86InstrInfo.h. let TSFlags{5-0} = FormBits; @@ -179,6 +184,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{38} = hasVEX_L; let TSFlags{39} = ignoresVEX_L; let TSFlags{40} = has3DNow0F0FOpcode; + let TSFlags{41} = hasXOP_WPrefix; + let TSFlags{42} = hasXOP_Prefix; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -332,6 +339,10 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, TB, Requires<[HasAVX]>; +class VoPSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + Requires<[HasXMM]>; // SSE2 Instruction Templates: // @@ -496,6 +507,30 @@ class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, OpSize, VEX_4V, Requires<[HasFMA3]>; +// FMA4 Instruction Templates +class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>; + +// XOP 2, 3 and 4 Operand Instruction Template +class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, + XOP, XOP9, Requires<[HasXOP]>; + +// XOP 2, 3 and 4 Operand Instruction Templates with imm byte +class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, + XOP, XOP8, Requires<[HasXOP]>; + +// XOP 5 operand instruction (VEX encoding!) +class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>; + // X86-64 Instruction templates... 
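The new TSFlags{41}/{42} assignments above reserve two more bits of the 64-bit TSFlags word for the XOP attributes. A hedged sketch of how such single-bit attributes are typically queried by an encoder; the helper names here are illustrative only, not the actual X86II accessors:

#include <cstdint>

// Bit positions mirror the "let TSFlags{41} = hasXOP_WPrefix" and
// "let TSFlags{42} = hasXOP_Prefix" lines above (illustrative helpers).
constexpr unsigned XOP_WShift = 41;
constexpr unsigned XOPShift   = 42;

constexpr bool hasXOP_WPrefix(uint64_t TSFlags) { return (TSFlags >> XOP_WShift) & 1; }
constexpr bool hasXOPPrefix(uint64_t TSFlags)   { return (TSFlags >> XOPShift) & 1; }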
// diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 791bbe6..cd13bc4 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -130,28 +130,12 @@ def X86Movhlpd : SDNode<"X86ISD::MOVHLPD", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; -def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; -def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; +def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; +def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; -def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; -def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; +def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>; -def X86Punpcklbw : SDNode<"X86ISD::PUNPCKLBW", SDTShuff2Op>; -def X86Punpcklwd : SDNode<"X86ISD::PUNPCKLWD", SDTShuff2Op>; -def X86Punpckldq : SDNode<"X86ISD::PUNPCKLDQ", SDTShuff2Op>; -def X86Punpcklqdq : SDNode<"X86ISD::PUNPCKLQDQ", SDTShuff2Op>; - -def X86Punpckhbw : SDNode<"X86ISD::PUNPCKHBW", SDTShuff2Op>; -def X86Punpckhwd : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>; -def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>; -def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>; - -def X86VPermilps : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>; -def X86VPermilpsy : SDNode<"X86ISD::VPERMILPSY", SDTShuff2OpI>; -def X86VPermilpd : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>; -def X86VPermilpdy : SDNode<"X86ISD::VPERMILPDY", SDTShuff2OpI>; - -def X86VPerm2f128 : SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>; +def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; @@ -363,12 +347,6 @@ def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{ return getI8Imm(X86::getShufflePSHUFLWImmediate(N)); }]>; -// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to -// a PALIGNR imm. -def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{ - return getI8Imm(X86::getShufflePALIGNRImmediate(N)); -}]>; - // EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index // to VEXTRACTF128 imm. def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{ diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 24c4a53..7d1b9a1 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1528,9 +1528,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // Build and insert into an implicit UNDEF value. This is OK because // well be shifting and then extracting the lower 16-bits. - BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2); + BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2); InsMI2 = - BuildMI(*MFI, MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) + BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) .addReg(leaInReg2, RegState::Define, X86::sub_16bit) .addReg(Src2, getKillRegState(isKill2)); addRegReg(MIB, leaInReg, true, leaInReg2, true); @@ -2040,13 +2040,12 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { } bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.isTerminator()) return false; + if (!MI->isTerminator()) return false; // Conditional branch is a special case. 
- if (MCID.isBranch() && !MCID.isBarrier()) + if (MI->isBranch() && !MI->isBarrier()) return true; - if (!MCID.isPredicable()) + if (!MI->isPredicable()) return true; return !isPredicated(MI); } @@ -2072,7 +2071,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // A terminator that isn't a branch can't easily be handled by this // analysis. - if (!I->getDesc().isBranch()) + if (!I->isBranch()) return true; // Handle unconditional branches. @@ -2556,6 +2555,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); switch (MI->getOpcode()) { case X86::V_SET0: + case X86::FsFLD0SS: + case X86::FsFLD0SD: return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); @@ -2771,7 +2772,9 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::RCPSSr: case X86::RCPSSr_Int: case X86::ROUNDSDr: + case X86::ROUNDSDr_Int: case X86::ROUNDSSr: + case X86::ROUNDSSr_Int: case X86::RSQRTSSr: case X86::RSQRTSSr_Int: case X86::SQRTSSr: @@ -2783,7 +2786,9 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::Int_VCVTSS2SDrr: case X86::VRCPSSr: case X86::VROUNDSDr: + case X86::VROUNDSDr_Int: case X86::VROUNDSSr: + case X86::VROUNDSSr_Int: case X86::VRSQRTSSr: case X86::VSQRTSSr: return true; @@ -2911,11 +2916,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 16; break; case X86::FsFLD0SD: - case X86::VFsFLD0SD: Alignment = 8; break; case X86::FsFLD0SS: - case X86::VFsFLD0SS: Alignment = 4; break; default: @@ -2950,9 +2953,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, case X86::AVX_SETALLONES: case X86::AVX2_SETALLONES: case X86::FsFLD0SD: - case X86::FsFLD0SS: - case X86::VFsFLD0SD: - case X86::VFsFLD0SS: { + case X86::FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. 
@@ -2978,9 +2979,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineConstantPool &MCP = *MF.getConstantPool(); Type *Ty; unsigned Opc = LoadMI->getOpcode(); - if (Opc == X86::FsFLD0SS || Opc == X86::VFsFLD0SS) + if (Opc == X86::FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction()->getContext()); - else if (Opc == X86::FsFLD0SD || Opc == X86::VFsFLD0SD) + else if (Opc == X86::FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY) Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8); @@ -3569,7 +3570,13 @@ static const unsigned ReplaceableInstrsAVX2[][3] = { { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm }, { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm }, - { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr } + { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, + { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, + { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, + { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, + { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, + { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr } }; // FIXME: Some shuffle and unpack instructions have equivalents in different diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 35631d5..0bc3afa 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -473,6 +473,7 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; +def HasXMM : Predicate<"Subtarget->hasXMM()">; def HasXMMInt : Predicate<"Subtarget->hasXMMInt()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; @@ -480,6 +481,7 @@ def HasAES : Predicate<"Subtarget->hasAES()">; def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">; def HasFMA3 : Predicate<"Subtarget->hasFMA3()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; def HasF16C : Predicate<"Subtarget->hasF16C()">; @@ -1502,6 +1504,9 @@ include "X86InstrFragmentsSIMD.td" // FMA - Fused Multiply-Add support (requires FMA) include "X86InstrFMA.td" +// XOP +include "X86InstrXOP.td" + // SSE, MMX and 3DNow! vector support. include "X86InstrSSE.td" include "X86InstrMMX.td" diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7cadac1..345f606 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -240,21 +240,13 @@ let Predicates = [HasAVX] in { } // Alias instructions that map fld0 to pxor for sse. -// FIXME: Set encoding to pseudo! 
-let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1, - canFoldAsLoad = 1 in { - def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", - [(set FR32:$dst, fp32imm0)]>, - Requires<[HasSSE1]>, TB, OpSize; - def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, - Requires<[HasSSE2]>, TB, OpSize; - def VFsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", - [(set FR32:$dst, fp32imm0)]>, - Requires<[HasAVX]>, TB, OpSize, VEX_4V; - def VFsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, - Requires<[HasAVX]>, TB, OpSize, VEX_4V; +// This is expanded by ExpandPostRAPseudos. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1 in { + def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, Requires<[HasXMM]>; + def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, Requires<[HasXMMInt]>; } //===----------------------------------------------------------------------===// @@ -569,6 +561,16 @@ let Predicates = [HasAVX] in { (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (v4f32 (V_SET0)), + (EXTRACT_SUBREG (v8f32 VR256:$src), sub_ss)), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v8i32 VR256:$src), sub_ss)), sub_xmm)>; } let AddedComplexity = 20 in { @@ -596,6 +598,9 @@ let Predicates = [HasAVX] in { // Represent the same patterns above but in the form they appear for // 256-bit types + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; @@ -613,6 +618,15 @@ let Predicates = [HasAVX] in { (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (v2f64 (V_SET0)), + (EXTRACT_SUBREG (v4f64 VR256:$src), sub_sd)), sub_xmm)>; // Extract and store. 
def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), @@ -634,6 +648,16 @@ let Predicates = [HasAVX] in { (VMOVSSrr (v4f32 VR128:$src1), (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; + // 256-bit variants + def : Pat<(v8i32 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_ss), + (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_ss)), sub_xmm)>; + def : Pat<(v8f32 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_ss), + (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_ss)), sub_xmm)>; + // Shuffle with VMOVSD def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), (VMOVSDrr VR128:$src1, FR64:$src2)>; @@ -650,6 +674,17 @@ let Predicates = [HasAVX] in { (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>; + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_sd), + (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_sd)), sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_sd), + (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_sd)), sub_xmm)>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold cause // it has two uses through a bitcast. One use disappears at isel time and the @@ -657,6 +692,9 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2), + sub_sd))>; def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>; @@ -761,6 +799,22 @@ let isCodeGenOnly = 1 in { "movupd\t{$src, $dst|$dst, $src}", []>, VEX; } +let Predicates = [HasAVX] in { +def : Pat<(v8i32 (X86vzmovl + (insert_subvector undef, (v4i32 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v4i64 (X86vzmovl + (insert_subvector undef, (v2i64 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v8f32 (X86vzmovl + (insert_subvector undef, (v4f32 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v4f64 (X86vzmovl + (insert_subvector undef, (v2f64 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +} + + def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>; def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), (VMOVUPSYmr addr:$dst, VR256:$src)>; @@ -1156,14 +1210,17 @@ let Predicates = [HasAVX] in { (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), (VMOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), (VMOVHPSrm VR128:$src1, addr:$src2)>; - // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold cause // it has two uses 
through a bitcast. One use disappears at isel time and the // fold opportunity reappears. - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (VMOVHPDrm VR128:$src1, addr:$src2)>; @@ -1174,10 +1231,10 @@ let Predicates = [HasAVX] in { // Store patterns def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst), + (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst), (VMOVHPSmr addr:$dst, VR128:$src)>; def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))), addr:$dst), + (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst), (VMOVHPDmr addr:$dst, VR128:$src)>; } @@ -1189,21 +1246,24 @@ let Predicates = [HasSSE1] in { (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), (MOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (MOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), (MOVHPSrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst), + (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst), (MOVHPSmr addr:$dst, VR128:$src)>; } let Predicates = [HasSSE2] in { - // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold cause // it has two uses through a bitcast. One use disappears at isel time and the // fold opportunity reappears. - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; @@ -1214,7 +1274,7 @@ let Predicates = [HasSSE2] in { // Store patterns def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst), + (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))),addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; } @@ -1943,7 +2003,7 @@ def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // whenever possible to avoid declaring two versions of each one. 
def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), (VCVTDQ2PSYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvtdq2_ps_256 (memopv8i32 addr:$src)), +def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))), (VCVTDQ2PSYrm addr:$src)>; def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src), @@ -2430,27 +2490,27 @@ let AddedComplexity = 10 in { } // AddedComplexity let Predicates = [HasSSE1] in { - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), + def : Pat<(v4f32 (X86Unpckl VR128:$src1, (memopv4f32 addr:$src2))), (UNPCKLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), + def : Pat<(v4f32 (X86Unpckl VR128:$src1, VR128:$src2)), (UNPCKLPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), + def : Pat<(v4f32 (X86Unpckh VR128:$src1, (memopv4f32 addr:$src2))), (UNPCKHPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), + def : Pat<(v4f32 (X86Unpckh VR128:$src1, VR128:$src2)), (UNPCKHPSrr VR128:$src1, VR128:$src2)>; } let Predicates = [HasSSE2] in { - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (memopv2f64 addr:$src2))), (UNPCKLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), + def : Pat<(v2f64 (X86Unpckl VR128:$src1, VR128:$src2)), (UNPCKLPDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), + def : Pat<(v2f64 (X86Unpckh VR128:$src1, (memopv2f64 addr:$src2))), (UNPCKHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), + def : Pat<(v2f64 (X86Unpckh VR128:$src1, VR128:$src2)), (UNPCKHPDrr VR128:$src1, VR128:$src2)>; - // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the + // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. One use disappears at isel // time and the fold opportunity reappears. 
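The patterns above fold the per-type X86Unpcklps/X86Unpcklpd/X86Punpck* nodes into the single X86Unpckl/X86Unpckh pair; the interleaving is the same for every element width, only the element count changes, and the 256-bit AVX forms repeat it per 128-bit lane. A generic reference model of the 128-bit behaviour (sketch, not LLVM code):

#include <array>
#include <cstddef>

// UNPCKL: interleave the low halves of the two sources.
// For N = 4 (unpcklps): { A0, B0, A1, B1 }.
template <typename T, std::size_t N>
std::array<T, N> unpcklo(const std::array<T, N> &A, const std::array<T, N> &B) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i != N / 2; ++i) {
    R[2 * i]     = A[i];
    R[2 * i + 1] = B[i];
  }
  return R;
}

// UNPCKH: interleave the high halves of the two sources.
// For N = 4 (unpckhps): { A2, B2, A3, B3 }.
template <typename T, std::size_t N>
std::array<T, N> unpckhi(const std::array<T, N> &A, const std::array<T, N> &B) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i != N / 2; ++i) {
    R[2 * i]     = A[N / 2 + i];
    R[2 * i + 1] = B[N / 2 + i];
  }
  return R;
}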
@@ -2463,59 +2523,43 @@ let Predicates = [HasSSE2] in { } let Predicates = [HasAVX] in { - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), + def : Pat<(v4f32 (X86Unpckl VR128:$src1, (memopv4f32 addr:$src2))), (VUNPCKLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), + def : Pat<(v4f32 (X86Unpckl VR128:$src1, VR128:$src2)), (VUNPCKLPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), + def : Pat<(v4f32 (X86Unpckh VR128:$src1, (memopv4f32 addr:$src2))), (VUNPCKHPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), + def : Pat<(v4f32 (X86Unpckh VR128:$src1, VR128:$src2)), (VUNPCKHPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8f32 (X86Unpcklps VR256:$src1, (memopv8f32 addr:$src2))), + def : Pat<(v8f32 (X86Unpckl VR256:$src1, (memopv8f32 addr:$src2))), (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpcklps VR256:$src1, VR256:$src2)), - (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpcklps VR256:$src1, VR256:$src2)), + def : Pat<(v8f32 (X86Unpckl VR256:$src1, VR256:$src2)), (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpcklps VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), - (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpckhps VR256:$src1, (memopv8f32 addr:$src2))), - (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpckhps VR256:$src1, VR256:$src2)), - (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpckhps VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + def : Pat<(v8f32 (X86Unpckh VR256:$src1, (memopv8f32 addr:$src2))), (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (X86Unpckhps VR256:$src1, VR256:$src2)), + def : Pat<(v8f32 (X86Unpckh VR256:$src1, VR256:$src2)), (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (memopv2f64 addr:$src2))), (VUNPCKLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), + def : Pat<(v2f64 (X86Unpckl VR128:$src1, VR128:$src2)), (VUNPCKLPDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), + def : Pat<(v2f64 (X86Unpckh VR128:$src1, (memopv2f64 addr:$src2))), (VUNPCKHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), + def : Pat<(v2f64 (X86Unpckh VR128:$src1, VR128:$src2)), (VUNPCKHPDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4f64 (X86Unpcklpd VR256:$src1, (memopv4f64 addr:$src2))), + def : Pat<(v4f64 (X86Unpckl VR256:$src1, (memopv4f64 addr:$src2))), (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4f64 (X86Unpcklpd VR256:$src1, VR256:$src2)), + def : Pat<(v4f64 (X86Unpckl VR256:$src1, VR256:$src2)), (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpcklpd VR256:$src1, (memopv4i64 addr:$src2))), - (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpcklpd VR256:$src1, VR256:$src2)), - (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4f64 (X86Unpckhpd VR256:$src1, (memopv4f64 addr:$src2))), - (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4f64 (X86Unpckhpd VR256:$src1, VR256:$src2)), - (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpckhpd VR256:$src1, (memopv4i64 addr:$src2))), + def : Pat<(v4f64 (X86Unpckh VR256:$src1, (memopv4f64 addr:$src2))), (VUNPCKHPDYrm VR256:$src1, 
addr:$src2)>; - def : Pat<(v4i64 (X86Unpckhpd VR256:$src1, VR256:$src2)), + def : Pat<(v4f64 (X86Unpckh VR256:$src1, VR256:$src2)), (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; - // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the + // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. One use disappears at isel // time and the fold opportunity reappears. @@ -2869,7 +2913,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins ssmem:$src1, VR128:$src2), + (ins VR128:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; } @@ -3198,13 +3242,13 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), //===----------------------------------------------------------------------===// // Prefetch intrinsic. -def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src), +def PREFETCHT0 : VoPSI<0x18, MRM1m, (outs), (ins i8mem:$src), "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>; -def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src), +def PREFETCHT1 : VoPSI<0x18, MRM2m, (outs), (ins i8mem:$src), "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>; -def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src), +def PREFETCHT2 : VoPSI<0x18, MRM3m, (outs), (ins i8mem:$src), "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>; -def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), +def PREFETCHNTA : VoPSI<0x18, MRM0m, (outs), (ins i8mem:$src), "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>; // Flush cache @@ -3652,6 +3696,8 @@ defm VPOR : PDI_binop_rm<0xEB, "vpor" , or, v2i64, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; defm VPXOR : PDI_binop_rm<0xEF, "vpxor", xor, v2i64, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; +defm VPANDN : PDI_binop_rm<0xDF, "vpandn", X86andnp, v2i64, VR128, memopv2i64, + i128mem, 0, 0>, VEX_4V; let ExeDomain = SSEPackedInt in { let neverHasSideEffects = 1 in { @@ -3666,17 +3712,6 @@ let ExeDomain = SSEPackedInt in { VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. } - def VPANDNrr : PDI<0xDF, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, - (v2i64 (X86andnp VR128:$src1, VR128:$src2)))]>,VEX_4V; - - def VPANDNrm : PDI<0xDF, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (X86andnp VR128:$src1, - (memopv2i64 addr:$src2)))]>, VEX_4V; } } @@ -3714,6 +3749,8 @@ defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64, i256mem, 1, 0>, VEX_4V; defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64, i256mem, 1, 0>, VEX_4V; +defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64, + i256mem, 0, 0>, VEX_4V; let ExeDomain = SSEPackedInt in { let neverHasSideEffects = 1 in { @@ -3728,17 +3765,6 @@ let ExeDomain = SSEPackedInt in { VEX_4V; // PSRADQYri doesn't exist in SSE[1-3]. 
} - def VPANDNYrr : PDI<0xDF, MRMSrcReg, - (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, - (v4i64 (X86andnp VR256:$src1, VR256:$src2)))]>,VEX_4V; - - def VPANDNYrm : PDI<0xDF, MRMSrcMem, - (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, (X86andnp VR256:$src1, - (memopv4i64 addr:$src2)))]>, VEX_4V; } } @@ -3776,6 +3802,8 @@ defm POR : PDI_binop_rm<0xEB, "por" , or, v2i64, VR128, memopv2i64, i128mem, 1>; defm PXOR : PDI_binop_rm<0xEF, "pxor", xor, v2i64, VR128, memopv2i64, i128mem, 1>; +defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64, + i128mem, 0>; let ExeDomain = SSEPackedInt in { let neverHasSideEffects = 1 in { @@ -3787,14 +3815,6 @@ let ExeDomain = SSEPackedInt in { (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), "psrldq\t{$src2, $dst|$dst, $src2}", []>; // PSRADQri doesn't exist in SSE[1-3]. - def PANDNrr : PDI<0xDF, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", []>; - - let mayLoad = 1 in - def PANDNrm : PDI<0xDF, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", []>; } } } // Constraints = "$src1 = $dst" @@ -4198,66 +4218,88 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, } let Predicates = [HasAVX] in { - defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Punpcklbw, + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, bc_v16i8, 0>, VEX_4V; - defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Punpcklwd, + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, bc_v8i16, 0>, VEX_4V; - defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Punpckldq, + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, bc_v4i32, 0>, VEX_4V; - defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Punpcklqdq, + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, bc_v2i64, 0>, VEX_4V; - defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Punpckhbw, + defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, bc_v16i8, 0>, VEX_4V; - defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Punpckhwd, + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, bc_v8i16, 0>, VEX_4V; - defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Punpckhdq, + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, bc_v4i32, 0>, VEX_4V; - defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Punpckhqdq, + defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, bc_v2i64, 0>, VEX_4V; } let Predicates = [HasAVX2] in { - defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Punpcklbw, + defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, bc_v32i8>, VEX_4V; - defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Punpcklwd, + defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, bc_v16i16>, VEX_4V; - defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Punpckldq, + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, bc_v8i32>, VEX_4V; - defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Punpcklqdq, + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, bc_v4i64>, VEX_4V; - defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, 
X86Punpckhbw, + defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, bc_v32i8>, VEX_4V; - defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Punpckhwd, + defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, bc_v16i16>, VEX_4V; - defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Punpckhdq, + defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, bc_v8i32>, VEX_4V; - defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Punpckhqdq, + defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, bc_v4i64>, VEX_4V; } let Constraints = "$src1 = $dst" in { - defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, bc_v16i8>; - defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, bc_v8i16>; - defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Punpckldq, + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, bc_v4i32>; - defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Punpcklqdq, + defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, bc_v2i64>; - defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Punpckhbw, + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, bc_v16i8>; - defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Punpckhwd, + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, bc_v8i16>; - defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Punpckhdq, + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, bc_v4i32>; - defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Punpckhqdq, + defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, bc_v2i64>; } } // ExeDomain = SSEPackedInt +// Patterns for using AVX1 instructions with integer vectors +// Here to give AVX2 priority +let Predicates = [HasAVX] in { + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; +} + // Splat v2f64 / v2i64 let AddedComplexity = 10 in { def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), @@ -4784,7 +4826,7 @@ def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // AVX 256-bit register conversion intrinsics def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src), (VCVTDQ2PDYrr VR128:$src)>; -def : Pat<(int_x86_avx_cvtdq2_pd_256 (memopv4i32 addr:$src)), +def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))), (VCVTDQ2PDYrm addr:$src)>; def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src), @@ -4794,7 +4836,7 @@ def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)), def : Pat<(v4f64 
(sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PDYrr VR128:$src)>; -def : Pat<(v4f64 (sint_to_fp (memopv4i32 addr:$src))), +def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), (VCVTDQ2PDYrm addr:$src)>; //===---------------------------------------------------------------------===// @@ -5085,7 +5127,7 @@ let Constraints = "$src1 = $dst" in { /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, - PatFrag mem_frag128, Intrinsic IntId128> { + Intrinsic IntId128> { def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -5097,12 +5139,12 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 - (bitconvert (mem_frag128 addr:$src))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src))))]>, OpSize; } /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, - PatFrag mem_frag256, Intrinsic IntId256> { + Intrinsic IntId256> { def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -5114,32 +5156,32 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId256 - (bitconvert (mem_frag256 addr:$src))))]>, OpSize; + (bitconvert (memopv4i64 addr:$src))))]>, OpSize; } let Predicates = [HasAVX] in { - defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8, + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128>, VEX; - defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", memopv8i16, + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128>, VEX; - defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", memopv4i32, + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128>, VEX; } let Predicates = [HasAVX2] in { - defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", memopv32i8, + defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", int_x86_avx2_pabs_b>, VEX; - defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", memopv16i16, + defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", int_x86_avx2_pabs_w>, VEX; - defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", memopv8i32, + defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", int_x86_avx2_pabs_d>, VEX; } -defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", memopv16i8, +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128>; -defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", memopv8i16, +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128>; -defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv4i32, +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128>; //===---------------------------------------------------------------------===// @@ -5148,8 +5190,7 @@ defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv4i32, /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, - PatFrag mem_frag128, Intrinsic IntId128, - bit Is2Addr = 1> { + Intrinsic IntId128, bit Is2Addr = 1> { let isCommutable = 1 in def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -5165,11 +5206,11 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (mem_frag128 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; } multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, - PatFrag mem_frag256, Intrinsic IntId256> { + Intrinsic IntId256> { let isCommutable = 1 in def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), @@ -5181,94 +5222,94 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntId256 VR256:$src1, - (bitconvert (mem_frag256 addr:$src2))))]>, OpSize; + (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; } let ImmT = NoImm, Predicates = [HasAVX] in { let isCommutable = 0 in { - defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16, + defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", int_x86_ssse3_phadd_w_128, 0>, VEX_4V; - defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd", memopv4i32, + defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd", int_x86_ssse3_phadd_d_128, 0>, VEX_4V; - defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", memopv8i16, + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, 0>, VEX_4V; - defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw", memopv8i16, + defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw", int_x86_ssse3_phsub_w_128, 0>, VEX_4V; - defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd", memopv4i32, + defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd", int_x86_ssse3_phsub_d_128, 0>, VEX_4V; - defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", memopv8i16, + defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, 0>, VEX_4V; - defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", memopv16i8, + defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", int_x86_ssse3_pmadd_ub_sw_128, 0>, VEX_4V; - defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb", memopv16i8, + defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb", int_x86_ssse3_pshuf_b_128, 0>, VEX_4V; - defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", memopv16i8, + defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", int_x86_ssse3_psign_b_128, 0>, VEX_4V; - defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", memopv8i16, + defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", int_x86_ssse3_psign_w_128, 0>, VEX_4V; - defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", memopv4i32, + defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", int_x86_ssse3_psign_d_128, 0>, VEX_4V; } -defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", memopv8i16, +defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", int_x86_ssse3_pmul_hr_sw_128, 0>, VEX_4V; } let ImmT = NoImm, Predicates = [HasAVX2] in { let isCommutable = 0 in { - defm VPHADDW : SS3I_binop_rm_int_y<0x01, "vphaddw", memopv16i16, + defm VPHADDW : SS3I_binop_rm_int_y<0x01, "vphaddw", int_x86_avx2_phadd_w>, VEX_4V; - defm VPHADDD : SS3I_binop_rm_int_y<0x02, "vphaddd", memopv8i32, + defm VPHADDD : SS3I_binop_rm_int_y<0x02, "vphaddd", int_x86_avx2_phadd_d>, VEX_4V; - defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", memopv16i16, + defm VPHADDSW : 
SS3I_binop_rm_int_y<0x03, "vphaddsw", int_x86_avx2_phadd_sw>, VEX_4V; - defm VPHSUBW : SS3I_binop_rm_int_y<0x05, "vphsubw", memopv16i16, + defm VPHSUBW : SS3I_binop_rm_int_y<0x05, "vphsubw", int_x86_avx2_phsub_w>, VEX_4V; - defm VPHSUBD : SS3I_binop_rm_int_y<0x06, "vphsubd", memopv8i32, + defm VPHSUBD : SS3I_binop_rm_int_y<0x06, "vphsubd", int_x86_avx2_phsub_d>, VEX_4V; - defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", memopv16i16, + defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", int_x86_avx2_phsub_sw>, VEX_4V; - defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", memopv32i8, + defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", int_x86_avx2_pmadd_ub_sw>, VEX_4V; - defm VPSHUFB : SS3I_binop_rm_int_y<0x00, "vpshufb", memopv32i8, + defm VPSHUFB : SS3I_binop_rm_int_y<0x00, "vpshufb", int_x86_avx2_pshuf_b>, VEX_4V; - defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", memopv32i8, + defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b>, VEX_4V; - defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", memopv16i16, + defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w>, VEX_4V; - defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", memopv8i32, + defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d>, VEX_4V; } -defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", memopv16i16, +defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", int_x86_avx2_pmul_hr_sw>, VEX_4V; } // None of these have i8 immediate fields. let ImmT = NoImm, Constraints = "$src1 = $dst" in { let isCommutable = 0 in { - defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw", memopv8i16, + defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw", int_x86_ssse3_phadd_w_128>; - defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd", memopv4i32, + defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd", int_x86_ssse3_phadd_d_128>; - defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", memopv8i16, + defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", int_x86_ssse3_phadd_sw_128>; - defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw", memopv8i16, + defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw", int_x86_ssse3_phsub_w_128>; - defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd", memopv4i32, + defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd", int_x86_ssse3_phsub_d_128>; - defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", memopv8i16, + defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", int_x86_ssse3_phsub_sw_128>; - defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", memopv16i8, + defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", int_x86_ssse3_pmadd_ub_sw_128>; - defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb", memopv16i8, + defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb", int_x86_ssse3_pshuf_b_128>; - defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", memopv16i8, + defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128>; - defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", memopv8i16, + defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128>; - defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", memopv4i32, + defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128>; } -defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv8i16, +defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", int_x86_ssse3_pmul_hr_sw_128>; } @@ -6017,8 +6058,18 @@ multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, Intrinsic F32Int, Intrinsic F64Int, bit Is2Addr = 1> { let ExeDomain = GenericDomain in { - // Intrinsic operation, reg. + // Operation, reg. 
def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; + + // Intrinsic operation, reg. + def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -6040,8 +6091,18 @@ let ExeDomain = GenericDomain in { (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, OpSize; - // Intrinsic operation, reg. + // Operation, reg. def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; + + // Intrinsic operation, reg. + def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -6079,6 +6140,27 @@ let Predicates = [HasAVX] in { defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", int_x86_sse41_round_ss, int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + + def : Pat<(ffloor FR32:$src), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + def : Pat<(f64 (ffloor FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + def : Pat<(f64 (fceil FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + def : Pat<(f32 (frint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6088,6 +6170,27 @@ let Constraints = "$src1 = $dst" in defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; +def : Pat<(ffloor FR32:$src), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; +def : Pat<(f64 (ffloor FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; +def : Pat<(f32 (fnearbyint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; +def : Pat<(f64 (fnearbyint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; +def : Pat<(f32 (fceil FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; +def : Pat<(f64 (fceil FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; +def : Pat<(f32 (frint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; +def : Pat<(f64 (frint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; +def : Pat<(f32 (ftrunc FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; +def : Pat<(f64 (ftrunc FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test 
//===----------------------------------------------------------------------===// @@ -6195,7 +6298,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 - (bitconvert (memopv8i16 addr:$src))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src))))]>, OpSize; } let Predicates = [HasAVX] in @@ -6221,7 +6324,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; } /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator @@ -6237,7 +6340,7 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntId256 VR256:$src1, - (bitconvert (memopv32i8 addr:$src2))))]>, OpSize; + (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; } let Predicates = [HasAVX] in { @@ -6400,38 +6503,38 @@ let Predicates = [HasAVX] in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv4f32, i128mem, 0>, VEX_4V; defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + int_x86_avx_blend_ps_256, VR256, memopv8f32, i256mem, 0>, VEX_4V; } let ExeDomain = SSEPackedDouble in { defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2f64, i128mem, 0>, VEX_4V; defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + int_x86_avx_blend_pd_256, VR256, memopv4f64, i256mem, 0>, VEX_4V; } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2i64, i128mem, 0>, VEX_4V; defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2i64, i128mem, 0>, VEX_4V; } let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv4f32, i128mem, 0>, VEX_4V; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2f64, i128mem, 0>, VEX_4V; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, - VR256, memopv32i8, i256mem, 0>, VEX_4V; + VR256, memopv8f32, i256mem, 0>, VEX_4V; } let Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, - VR256, memopv32i8, i256mem, 0>, VEX_4V; + VR256, memopv4i64, i256mem, 0>, VEX_4V; defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, - VR256, memopv32i8, i256mem, 0>, VEX_4V; + VR256, memopv4i64, i256mem, 0>, VEX_4V; } } @@ -6439,35 +6542,35 @@ let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, - VR128, memopv16i8, i128mem>; + VR128, memopv4f32, i128mem>; let ExeDomain = SSEPackedDouble in defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", 
int_x86_sse41_blendpd, - VR128, memopv16i8, i128mem>; + VR128, memopv2f64, i128mem>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, - VR128, memopv16i8, i128mem>; + VR128, memopv2i64, i128mem>; defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv16i8, i128mem>; + VR128, memopv2i64, i128mem>; } let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, - VR128, memopv16i8, i128mem>; + VR128, memopv4f32, i128mem>; let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, - VR128, memopv16i8, i128mem>; + VR128, memopv2f64, i128mem>; } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, Intrinsic IntId> { - def rr : I<opc, MRMSrcReg, (outs RC:$dst), + def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; - def rm : I<opc, MRMSrcMem, (outs RC:$dst), + def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), @@ -6480,23 +6583,23 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem, - memopv16i8, int_x86_sse41_blendvpd>; + memopv2f64, int_x86_sse41_blendvpd>; defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem, - memopv32i8, int_x86_avx_blendv_pd_256>; + memopv4f64, int_x86_avx_blendv_pd_256>; } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, - memopv16i8, int_x86_sse41_blendvps>; + memopv4f32, int_x86_sse41_blendvps>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem, - memopv32i8, int_x86_avx_blendv_ps_256>; + memopv8f32, int_x86_avx_blendv_ps_256>; } // ExeDomain = SSEPackedSingle defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - memopv16i8, int_x86_sse41_pblendvb>; + memopv2i64, int_x86_sse41_pblendvb>; } let Predicates = [HasAVX2] in { defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, - memopv32i8, int_x86_avx2_pblendvb>; + memopv4i64, int_x86_avx2_pblendvb>; } let Predicates = [HasAVX] in { @@ -6537,7 +6640,8 @@ let Predicates = [HasAVX2] in { /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { - multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + Intrinsic IntId> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, @@ -6551,15 +6655,18 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, - (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize; + (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize; } } let ExeDomain = SSEPackedDouble in -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>; +defm BLENDVPD : 
SS41I_ternary_int<0x15, "blendvpd", memopv2f64, + int_x86_sse41_blendvpd>; let ExeDomain = SSEPackedSingle in -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>; -defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, + int_x86_sse41_blendvps>; +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, + int_x86_sse41_pblendvb>; let Predicates = [HasSSE41] in { def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), @@ -6614,8 +6721,7 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize; } /// SS42I_binop_rm_int - Simple SSE 4.2 binary operator @@ -6630,8 +6736,7 @@ multiclass SS42I_binop_rm_int_y<bits<8> opc, string OpcodeStr, (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (IntId256 VR256:$src1, - (bitconvert (memopv32i8 addr:$src2))))]>, OpSize; + (IntId256 VR256:$src1, (memopv4i64 addr:$src2)))]>, OpSize; } let Predicates = [HasAVX] in { @@ -6913,7 +7018,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; } // Perform One Round of an AES Encryption/Decryption Flow @@ -7144,7 +7249,7 @@ def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), //===----------------------------------------------------------------------===// // VINSERTF128 - Insert packed floating-point values // -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -7163,35 +7268,10 @@ def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3), (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), - (i32 imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; - 
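In the blend hunks just above, SS41I_ternary_int gains a mem_frag parameter so BLENDVPD, BLENDVPS and PBLENDVB load through memopv2f64, memopv4f32 and memopv2i64 respectively, matching each instruction's execution domain instead of the generic memopv16i8. A hedged sketch of what the BLENDVPS pattern selects, using the SSE4.1 intrinsic for context only (not code from the patch; compile with -msse4.1):

#include <smmintrin.h>
#include <cstdio>

int main() {
  __m128 a    = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
  __m128 b    = _mm_setr_ps(10.f, 20.f, 30.f, 40.f);
  __m128 mask = _mm_setr_ps(-0.f, 0.f, -0.f, 0.f); // sign bits 1,0,1,0
  __m128 r    = _mm_blendv_ps(a, b, mask);         // picks {10, 2, 30, 4}
  float out[4];
  _mm_storeu_ps(out, r);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}

Each result element comes from the second source where the mask element's sign bit is set, so treating the operands as packed single-precision data keeps the instruction in its natural domain. The AVX1 vinsertf128_insert patterns deleted at the end of the hunk above are not lost; they reappear further down, after the AVX2 VINSERTI128 patterns, so the integer forms win when AVX2 is available.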
//===----------------------------------------------------------------------===// // VEXTRACTF128 - Extract packed floating-point values // -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), (ins VR256:$src1, i8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -7210,31 +7290,6 @@ def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), (VEXTRACTF128rr VR256:$src1, imm:$src2)>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v4f32 (VEXTRACTF128rr - (v8f32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v2f64 (VEXTRACTF128rr - (v4f64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v4i32 (VEXTRACTF128rr - (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v2i64 (VEXTRACTF128rr - (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v8i16 (VEXTRACTF128rr - (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v16i8 (VEXTRACTF128rr - (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; - //===----------------------------------------------------------------------===// // VMASKMOV - Conditional SIMD Packed Loads and Stores // @@ -7288,7 +7343,8 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop_i:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (IntVar RC:$src1, (i_frag addr:$src2)))]>, VEX_4V; + [(set RC:$dst, (IntVar RC:$src1, + (bitconvert (i_frag addr:$src2))))]>, VEX_4V; def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), @@ -7302,11 +7358,11 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, let ExeDomain = SSEPackedSingle in { defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, - memopv4f32, memopv4i32, + memopv4f32, memopv2i64, int_x86_avx_vpermilvar_ps, int_x86_avx_vpermil_ps>; defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, - memopv8f32, memopv8i32, + memopv8f32, memopv4i64, int_x86_avx_vpermilvar_ps_256, int_x86_avx_vpermil_ps_256>; } @@ -7321,19 +7377,28 @@ let ExeDomain = SSEPackedDouble in { int_x86_avx_vpermil_pd_256>; } -def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))), +def : Pat<(v8f32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4f64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4f64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))), +def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; +def : Pat<(v8f32 (X86VPermilp (memopv8f32 addr:$src1), (i8 imm:$imm))), + (VPERMILPSYmi addr:$src1, imm:$imm)>; +def : Pat<(v4f64 
(X86VPermilp (memopv4f64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDYmi addr:$src1, imm:$imm)>; +def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)), + (i8 imm:$imm))), + (VPERMILPSYmi addr:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDYmi addr:$src1, imm:$imm)>; //===----------------------------------------------------------------------===// // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks // -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -7359,22 +7424,9 @@ def : Pat<(int_x86_avx_vperm2f128_pd_256 VR256:$src1, (memopv4f64 addr:$src2), imm:$src3), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; def : Pat<(int_x86_avx_vperm2f128_si_256 - VR256:$src1, (memopv8i32 addr:$src2), imm:$src3), + VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), imm:$src3), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; -def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v4f64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; - //===----------------------------------------------------------------------===// // VZERO - Zero YMM registers // @@ -7451,9 +7503,9 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, let isCommutable = 0 in { defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, - VR128, memopv16i8, i128mem>; + VR128, memopv2i64, i128mem>; defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, - VR256, memopv32i8, i256mem>; + VR256, memopv4i64, i256mem>; } //===----------------------------------------------------------------------===// @@ -7541,11 +7593,12 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (Int VR256:$src1, (mem_frag addr:$src2)))]>, + [(set VR256:$dst, (Int VR256:$src1, + (bitconvert (mem_frag addr:$src2))))]>, VEX_4V; } -defm VPERMD : avx2_perm<0x36, "vpermd", memopv8i32, int_x86_avx2_permd>; +defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, int_x86_avx2_permd>; let ExeDomain = SSEPackedSingle in defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, int_x86_avx2_permps>; @@ -7571,7 +7624,7 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, int_x86_avx2_permpd>, VEX_W; //===----------------------------------------------------------------------===// -// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks +// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks // def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), @@ -7587,6 
+7640,64 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), imm:$src3))]>, VEX_4V; +let Predicates = [HasAVX2] in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +// AVX1 patterns +def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1, + (memopv8f32 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, + (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, + (memopv4i64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, + (memopv4f64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, + (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; + + //===----------------------------------------------------------------------===// // VINSERTI128 - Insert packed integer values // @@ -7603,6 +7714,51 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), (int_x86_avx2_vinserti128 VR256:$src1, (memopv2i64 addr:$src2), imm:$src3))]>, VEX_4V; +let Predicates = [HasAVX2] in { +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 
VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +} + +// AVX1 patterns +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; + //===----------------------------------------------------------------------===// // VEXTRACTI128 - Extract packed integer values // @@ -7617,6 +7773,51 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), (ins i128mem:$dst, VR256:$src1, i8imm:$src2), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX; +let Predicates = [HasAVX2] in { +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2i64 (VEXTRACTI128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i32 (VEXTRACTI128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v8i16 (VEXTRACTI128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v16i8 (VEXTRACTI128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +} + +// AVX1 patterns +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4f32 (VEXTRACTF128rr + (v8f32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2f64 (VEXTRACTF128rr + (v4f64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2i64 (VEXTRACTF128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i32 (VEXTRACTF128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v8i16 (VEXTRACTF128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v16i8 (VEXTRACTF128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; 
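The extract patterns above give the new VEXTRACTI128 forms priority for integer vector types under AVX2, with the re-added AVX1 VEXTRACTF128 patterns acting as the fallback (their previous copies were removed from the VEXTRACTF128 section earlier in this diff). For orientation, a hedged C++ sketch of the 128-bit lane extract these patterns select (requires -mavx; not part of the patch):

#include <immintrin.h>
#include <cstdio>

int main() {
  __m256 v  = _mm256_setr_ps(0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f);
  __m128 hi = _mm256_extractf128_ps(v, 1); // upper 128-bit lane: {4, 5, 6, 7}
  float out[4];
  _mm_storeu_ps(out, hi);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}

Under AVX2 the equivalent extraction of an integer lane selects VEXTRACTI128 instead, keeping the operation in the integer domain.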
+ //===----------------------------------------------------------------------===// // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores // diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td new file mode 100644 index 0000000..64cc44d --- /dev/null +++ b/lib/Target/X86/X86InstrXOP.td @@ -0,0 +1,243 @@ +//====- X86InstrXOP.td - Describe the X86 Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes XOP (eXtended OPerations) +// +//===----------------------------------------------------------------------===// + +multiclass xop2op<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", f128mem>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", f128mem>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", f128mem>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", f128mem>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", f128mem>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", f128mem>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", f128mem>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", f128mem>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", f128mem>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", f128mem>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", f128mem>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", f128mem>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", f128mem>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", f128mem>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", f128mem>; + defm VFRCZSS : xop2op<0x82, "vfrczss", f32mem>; + defm VFRCZSD : xop2op<0x83, "vfrczsd", f64mem>; + defm VFRCZPS : xop2op<0x80, "vfrczps", f128mem>; + defm VFRCZPD : xop2op<0x81, "vfrczpd", f128mem>; +} + +multiclass xop2op256<bits<8> opc, string OpcodeStr> { + def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, VEX, VEX_L; + def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VFRCZPS : xop2op256<0x80, "vfrczps">; + defm VFRCZPD : xop2op256<0x81, "vfrczpd">; +} + +multiclass xop3op<bits<8> opc, string OpcodeStr> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4VOp3; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_W; + def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4VOp3; +} + +let isAsmParserOnly = 1 in { + defm VPSHLW : xop3op<0x95, "vpshlw">; + defm VPSHLQ : xop3op<0x97, "vpshlq">; + defm VPSHLD : xop3op<0x96, "vpshld">; + defm VPSHLB : xop3op<0x94, "vpshlb">; + defm VPSHAW : xop3op<0x99, "vpshaw">; + defm VPSHAQ : xop3op<0x9B, "vpshaq">; + defm VPSHAD : xop3op<0x9A, 
"vpshad">; + defm VPSHAB : xop3op<0x98, "vpshab">; + defm VPROTW : xop3op<0x91, "vprotw">; + defm VPROTQ : xop3op<0x93, "vprotq">; + defm VPROTD : xop3op<0x92, "vprotd">; + defm VPROTB : xop3op<0x90, "vprotb">; +} + +multiclass xop3opimm<bits<8> opc, string OpcodeStr> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VPROTW : xop3opimm<0xC1, "vprotw">; + defm VPROTQ : xop3opimm<0xC3, "vprotq">; + defm VPROTD : xop3opimm<0xC2, "vprotd">; + defm VPROTB : xop3opimm<0xC0, "vprotb">; +} + +// Instruction where second source can be memory, but third must be register +multiclass xop4opm2<bits<8> opc, string OpcodeStr> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd">; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd">; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww">; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd">; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww">; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd">; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql">; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh">; + defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd">; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql">; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh">; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd">; +} + +// Instruction where second source can be memory, third must be imm8 +multiclass xop4opimm<bits<8> opc, string OpcodeStr> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V; +} + +let isAsmParserOnly = 1 in { + defm VPCOMW : xop4opimm<0xCD, "vpcomw">; + defm VPCOMUW : xop4opimm<0xED, "vpcomuw">; + defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq">; + defm VPCOMUD : xop4opimm<0xEE, "vpcomud">; + defm VPCOMUB : xop4opimm<0xEC, "vpcomub">; + defm VPCOMQ : xop4opimm<0xCF, "vpcomq">; + defm VPCOMD : xop4opimm<0xCE, "vpcomd">; + defm VPCOMB : xop4opimm<0xCC, "vpcomb">; +} + +// Instruction where either second or third source can be memory +multiclass xop4op<bits<8> opc, string OpcodeStr> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM, XOP_W; + def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, 
f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPPERM : xop4op<0xA3, "vpperm">; + defm VPCMOV : xop4op<0xA2, "vpcmov">; +} + +multiclass xop4op256<bits<8> opc, string OpcodeStr> { + def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; + def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM, XOP_W; + def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPCMOV : xop4op256<0xA2, "vpcmov">; +} + +multiclass xop5op<bits<8> opc, string OpcodeStr> { + def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>; + def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>, XOP_W; + def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>; + def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>; + def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>, XOP_W; + def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>; +} + +let isAsmParserOnly = 1 in { + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd">; + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps">; +} diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index 3f88fa6..2145a33 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -424,7 +424,9 @@ X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) { TargetJITInfo::LazyResolverFn X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { + TsanIgnoreWritesBegin(); JITCompilerFunction = F; + TsanIgnoreWritesEnd(); #if defined (X86_32_JIT) && !defined (_MSC_VER) if (Subtarget->hasSSE1()) diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 81ee665..9232196 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -368,10 +368,6 @@ ReSimplify: case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break; case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break; case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; - case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, 
X86::PXORrr); break; - case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::VFsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; - case X86::VFsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break; case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index c1ac9f3..4e80432 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -452,7 +452,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - return (RealignStack && + return (MF.getTarget().Options.RealignStack && !MFI->hasVarSizedObjects()); } @@ -583,7 +583,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // sure we restore the stack pointer immediately after the call, there may // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. MachineBasicBlock::iterator B = MBB.begin(); - while (I != B && !llvm::prior(I)->getDesc().isCall()) + while (I != B && !llvm::prior(I)->isCall()) --I; MBB.insert(I, New); } @@ -665,7 +665,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { case MVT::i8: if (High) { switch (Reg) { - default: return 0; + default: return getX86SubSuperRegister(Reg, MVT::i64, High); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AH; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -785,6 +785,22 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { return X86::R15D; } case MVT::i64: + // For 64-bit mode if we've requested a "high" register and the + // Q or r constraints we want one of these high registers or + // just the register name otherwise. + if (High) { + switch (Reg) { + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + // Fallthrough. 
+      }
+    }
    switch (Reg) {
    default: return Reg;
    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index e7bcbf8..6e092c7 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -273,6 +273,8 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
     if (IsAMD && ((ECX >> 16) & 0x1)) {
       HasFMA4 = true;
       ToggleFeature(X86::FeatureFMA4);
+      HasXOP = true;
+      ToggleFeature(X86::FeatureXOP);
     }
   }
 }
@@ -317,6 +319,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
   , HasCLMUL(false)
   , HasFMA3(false)
   , HasFMA4(false)
+  , HasXOP(false)
   , HasMOVBE(false)
   , HasRDRAND(false)
   , HasF16C(false)
@@ -387,9 +390,6 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
   assert((!In64BitMode || HasX86_64) &&
          "64-bit code requested on a subtarget that doesn't support it!");
-  if(EnableSegmentedStacks && !isTargetELF())
-    report_fatal_error("Segmented stacks are only implemented on ELF.");
-
   // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both
   // 32 and 64 bit) and for all 64-bit targets.
   if (StackAlignOverride)
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index e93f8e9..ccb9be0 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -93,6 +93,9 @@ protected:
   /// HasFMA4 - Target has 4-operand fused multiply-add
   bool HasFMA4;
+  /// HasXOP - Target has XOP instructions
+  bool HasXOP;
+
   /// HasMOVBE - True if the processor has the MOVBE instruction.
   bool HasMOVBE;
@@ -198,6 +201,7 @@ public:
   bool hasCLMUL() const { return HasCLMUL; }
   bool hasFMA3() const { return HasFMA3; }
   bool hasFMA4() const { return HasFMA4; }
+  bool hasXOP() const { return HasXOP; }
   bool hasMOVBE() const { return HasMOVBE; }
   bool hasRDRAND() const { return HasRDRAND; }
   bool hasF16C() const { return HasF16C; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 1c9f3bd..126042e 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -31,9 +31,10 @@ extern "C" void LLVMInitializeX86Target() {
 X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
                                          StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
                                          Reloc::Model RM, CodeModel::Model CM,
                                          CodeGenOpt::Level OL)
-  : X86TargetMachine(T, TT, CPU, FS, RM, CM, OL, false),
+  : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false),
    DataLayout(getSubtargetImpl()->isTargetDarwin() ?
               "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
               "n8:16:32-S128" :
@@ -52,9 +53,10 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
 X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
                                          StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
                                          Reloc::Model RM, CodeModel::Model CM,
                                          CodeGenOpt::Level OL)
-  : X86TargetMachine(T, TT, CPU, FS, RM, CM, OL, true),
+  : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true),
    DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
               "n8:16:32:64-S128"),
    InstrInfo(*this),
@@ -67,11 +69,12 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
 ///
 X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
                                    StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
                                    Reloc::Model RM, CodeModel::Model CM,
                                    CodeGenOpt::Level OL,
                                    bool is64Bit)
-  : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL),
-    Subtarget(TT, CPU, FS, StackAlignmentOverride, is64Bit),
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit),
    FrameLowering(*this, Subtarget),
    ELFWriterInfo(is64Bit, true) {
   // Determine the PICStyle based on the target selected.
@@ -95,8 +98,11 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
   }
   // default to hard float ABI
-  if (FloatABIType == FloatABI::Default)
-    FloatABIType = FloatABI::Hard;
+  if (Options.FloatABIType == FloatABI::Default)
+    this->Options.FloatABIType = FloatABI::Hard;
+
+  if (Options.EnableSegmentedStacks && !Subtarget.isTargetELF())
+    report_fatal_error("Segmented stacks are only implemented on ELF.");
 }
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 64be458..3ac1769 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -38,7 +38,7 @@ class X86TargetMachine : public LLVMTargetMachine {
 public:
   X86TargetMachine(const Target &T, StringRef TT,
-                   StringRef CPU, StringRef FS,
+                   StringRef CPU, StringRef FS, const TargetOptions &Options,
                    Reloc::Model RM, CodeModel::Model CM,
                    CodeGenOpt::Level OL,
                    bool is64Bit);
@@ -85,7 +85,7 @@ class X86_32TargetMachine : public X86TargetMachine {
   X86JITInfo JITInfo;
 public:
   X86_32TargetMachine(const Target &T, StringRef TT,
-                      StringRef CPU, StringRef FS,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
                       Reloc::Model RM, CodeModel::Model CM,
                       CodeGenOpt::Level OL);
   virtual const TargetData *getTargetData() const { return &DataLayout; }
@@ -113,7 +113,7 @@ class X86_64TargetMachine : public X86TargetMachine {
   X86JITInfo JITInfo;
 public:
   X86_64TargetMachine(const Target &T, StringRef TT,
-                      StringRef CPU, StringRef FS,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
                       Reloc::Model RM, CodeModel::Model CM,
                       CodeGenOpt::Level OL);
   virtual const TargetData *getTargetData() const { return &DataLayout; }
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 9bb54a8..f8c30eb 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -220,7 +220,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
     MachineInstr *MI = I;
     DebugLoc dl = I->getDebugLoc();
-    bool isControlFlow = MI->getDesc().isCall() || MI->getDesc().isReturn();
+    bool isControlFlow = MI->isCall() || MI->isReturn();
     // Shortcut: don't need to check regular instructions in dirty state.
     if (!isControlFlow && CurState == ST_DIRTY)
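// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the patch above): it shows how the
// getX86SubSuperRegister() change in the X86RegisterInfo.cpp hunk behaves
// after this commit. Assumptions: built inside an LLVM tree of this era, with
// the function declared in X86RegisterInfo.h and the X86 register enums
// available through the generated register info; the demo function name and
// the include paths are hypothetical.
#include "X86RegisterInfo.h"          // assumed in-tree include path
#include "llvm/CodeGen/ValueTypes.h"  // for MVT / EVT
using namespace llvm;

static void demoSubSuperRegister() {
  // Requesting the "high" form of RSI at MVT::i64 now maps to the 16-bit
  // name SI instead of simply returning the 64-bit register.
  unsigned A = getX86SubSuperRegister(X86::RSI, MVT::i64, /*High=*/true);
  (void)A;  // A == X86::SI after this patch.

  // Registers not listed in the new switch fall through to the ordinary
  // mapping and still return their 64-bit super-register.
  unsigned B = getX86SubSuperRegister(X86::EAX, MVT::i64, /*High=*/true);
  (void)B;  // B == X86::RAX, unchanged behaviour.
}
// ---------------------------------------------------------------------------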