author    Dan Gohman <gohman@apple.com>  2008-10-17 01:23:35 +0000
committer Dan Gohman <gohman@apple.com>  2008-10-17 01:23:35 +0000
commit    74feef261a43392bc85280f66c75fbd4e2ccf73d (patch)
tree      c3632ba600f38a66d23a0d10aadb7f53ca87f058
parent    a1fcd77ccfc61087dfad4fad69752a414179836c (diff)
Define patterns for shld and shrd that match immediate shift counts, and
patterns that match dynamic shift counts when the subtract is obscured by
a truncate node.

Add DAGCombiner support for recognizing rotate patterns when the shift
counts are defined by truncate nodes.

Fix and simplify the code for commuting shld and shrd instructions to work
even when the given instruction doesn't have a parent, and when the caller
needs a new instruction.

These changes allow LLVM to use the shld, shrd, rol, and ror instructions
on x86 to replace equivalent code using two shifts and an or in many more
cases.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@57662 91177308-0d34-0410-b5e6-96231b3b80d8
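For context, a minimal C++ sketch (not part of the commit; the function name
is illustrative) of the source-level idiom this change targets:

#include <cstdint>

// Rotate idiom: two shifts of the same value joined by an or. On x86 the
// variable shift count lives in CL, so legalization inserts i8 truncate
// nodes around the count and around the (sub 32, n) feeding the second
// shift; with this change DAGCombiner still recognizes the pair as
// ISD::ROTL, and a single rol is emitted.
uint32_t rotl32(uint32_t x, uint32_t n) {
  return (x << n) | (x >> (32 - n));  // n must stay in 1..31 to avoid UB in C++
}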
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14
 lib/Target/X86/X86Instr64bit.td          | 32
 lib/Target/X86/X86InstrInfo.cpp          | 32
 lib/Target/X86/X86InstrInfo.td           | 92
 test/CodeGen/X86/rot16.ll                | 73
 test/CodeGen/X86/rot32.ll                | 73
 test/CodeGen/X86/rot64.ll                | 73
 7 files changed, 364 insertions(+), 25 deletions(-)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f546ed4..4b1945e 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2053,13 +2053,15 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS) {
}
}
- // Look for sign/zext/any-extended cases:
+ // Look for sign/zext/any-extended or truncate cases:
if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND
|| LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND
- || LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND) &&
+ || LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND
+ || LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
(RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND
|| RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND
- || RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND)) {
+ || RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND
+ || RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
SDValue LExtOp0 = LHSShiftAmt.getOperand(0);
SDValue RExtOp0 = RHSShiftAmt.getOperand(0);
if (RExtOp0.getOpcode() == ISD::SUB &&
@@ -2068,7 +2070,8 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS) {
// (rotl x, y)
// fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
// (rotr x, (sub 32, y))
- if (ConstantSDNode *SUBC = cast<ConstantSDNode>(RExtOp0.getOperand(0))) {
+ if (ConstantSDNode *SUBC =
+ dyn_cast<ConstantSDNode>(RExtOp0.getOperand(0))) {
if (SUBC->getAPIntValue() == OpSizeInBits) {
return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, VT, LHSShiftArg,
HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
@@ -2080,7 +2083,8 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS) {
// (rotr x, y)
// fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
// (rotl x, (sub 32, y))
- if (ConstantSDNode *SUBC = cast<ConstantSDNode>(LExtOp0.getOperand(0))) {
+ if (ConstantSDNode *SUBC =
+ dyn_cast<ConstantSDNode>(LExtOp0.getOperand(0))) {
if (SUBC->getAPIntValue() == OpSizeInBits) {
return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, VT, LHSShiftArg,
HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
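Why the cast<> in the hunks above had to become dyn_cast<>: with truncate-based
shift amounts, the first operand of the sub is no longer guaranteed to be a
constant; cast<> asserts on a type mismatch, while dyn_cast<> returns null so
the fold is simply skipped. A self-contained C++ analogue (SDNodeStub and the
helpers are made-up stand-ins, not the real LLVM casting API):

#include <cassert>

struct SDNodeStub { virtual ~SDNodeStub() = default; };
struct ConstantSDNodeStub : SDNodeStub { int Value = 32; };

// cast<>-like helper: asserts when the node is not a constant.
ConstantSDNodeStub *castToConstant(SDNodeStub *N) {
  ConstantSDNodeStub *C = dynamic_cast<ConstantSDNodeStub *>(N);
  assert(C && "castToConstant on a non-constant node!");
  return C;
}

// dyn_cast<>-like helper: returns null when the node is not a constant.
ConstantSDNodeStub *dynCastToConstant(SDNodeStub *N) {
  return dynamic_cast<ConstantSDNodeStub *>(N);
}

int main() {
  SDNodeStub NonConstant;  // e.g. (sub x, y) with a non-constant first operand
  if (ConstantSDNodeStub *C = dynCastToConstant(&NonConstant))
    return C->Value;       // fold to a rotate only for a real constant
  return 0;                // the cast<> version would have asserted instead
}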
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
index 026c359..5085f54 100644
--- a/lib/Target/X86/X86Instr64bit.td
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -1397,6 +1397,22 @@ def : Pat<(store (or (srl (loadi64 addr:$dst), CL:$amt),
(shl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
(SHRD64mrCL addr:$dst, GR64:$src2)>;
+def : Pat<(or (srl GR64:$src1, (i8 (trunc RCX:$amt))),
+ (shl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ (SHRD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (srl (loadi64 addr:$dst), (i8 (trunc RCX:$amt))),
+ (shl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ addr:$dst),
+ (SHRD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(shrd GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+ (SHRD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi64 addr:$dst), (i8 imm:$amt1),
+ GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHRD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
+
// (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
def : Pat<(or (shl GR64:$src1, CL:$amt),
(srl GR64:$src2, (sub 64, CL:$amt))),
@@ -1406,6 +1422,22 @@ def : Pat<(store (or (shl (loadi64 addr:$dst), CL:$amt),
(srl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
(SHLD64mrCL addr:$dst, GR64:$src2)>;
+def : Pat<(or (shl GR64:$src1, (i8 (trunc RCX:$amt))),
+ (srl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (shl (loadi64 addr:$dst), (i8 (trunc RCX:$amt))),
+ (srl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ addr:$dst),
+ (SHLD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(shld GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+ (SHLD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi64 addr:$dst), (i8 imm:$amt1),
+ GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHLD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
+
// X86 specific add which produces a flag.
def : Pat<(addc GR64:$src1, GR64:$src2),
(ADD64rr GR64:$src1, GR64:$src2)>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index e105b0f..b19c8b9 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1248,26 +1248,14 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
}
unsigned Amt = MI->getOperand(3).getImm();
- unsigned A = MI->getOperand(0).getReg();
- unsigned B = MI->getOperand(1).getReg();
- unsigned C = MI->getOperand(2).getReg();
- bool AisDead = MI->getOperand(0).isDead();
- bool BisKill = MI->getOperand(1).isKill();
- bool CisKill = MI->getOperand(2).isKill();
- // If machine instrs are no longer in two-address forms, update
- // destination register as well.
- if (A == B) {
- // Must be two address instruction!
- assert(MI->getDesc().getOperandConstraint(0, TOI::TIED_TO) &&
- "Expecting a two-address instruction!");
- A = C;
- CisKill = false;
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
}
- MachineFunction &MF = *MI->getParent()->getParent();
- return BuildMI(MF, get(Opc))
- .addReg(A, true, false, false, AisDead)
- .addReg(C, false, false, CisKill)
- .addReg(B, false, false, BisKill).addImm(Size-Amt);
+ MI->setDesc(get(Opc));
+ MI->getOperand(3).setImm(Size-Amt);
+ return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
}
case X86::CMOVB16rr:
case X86::CMOVB32rr:
@@ -1357,7 +1345,11 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
}
-
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
MI->setDesc(get(Opc));
// Fallthrough intended.
}
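The in-place rewrite above leans on an algebraic identity: commuting a
double-shift means swapping the two source registers and replacing the
immediate with Size - Amt. A standalone C++ check of that identity
(shld32/shrd32 are made-up helpers mirroring the x86 instruction semantics):

#include <cassert>
#include <cstdint>

// SHLD: x shifted left, low bits filled from the top of y.
uint32_t shld32(uint32_t x, uint32_t y, unsigned c) {
  return (x << c) | (y >> (32 - c));  // c in 1..31
}

// SHRD: x shifted right, high bits filled from the bottom of y.
uint32_t shrd32(uint32_t x, uint32_t y, unsigned c) {
  return (x >> c) | (y << (32 - c));  // c in 1..31
}

int main() {
  uint32_t x = 0xDEADBEEF, y = 0x12345678;
  // Exactly what the rewritten commuteInstruction does for SHLD32rri8:
  // swap the two sources and set the immediate to Size - Amt.
  for (unsigned c = 1; c < 32; ++c)
    assert(shld32(x, y, c) == shrd32(y, x, 32 - c));
  return 0;
}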
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 7a78e03..16dc366 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -325,6 +325,34 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
return N->hasOneUse();
}]>;
+// 'shld' and 'shrd' instruction patterns. Note that even though these have
+// the srl and shl in their patterns, the C++ code must still check for them,
+// because predicates are tested before child nodes are explored.
+
+def shrd : PatFrag<(ops node:$src1, node:$amt1, node:$src2, node:$amt2),
+ (or (srl node:$src1, node:$amt1),
+ (shl node:$src2, node:$amt2)), [{
+ assert(N->getOpcode() == ISD::OR);
+ return N->getOperand(0).getOpcode() == ISD::SRL &&
+ N->getOperand(1).getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(N->getOperand(0).getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(1).getOperand(1)) &&
+ N->getOperand(0).getConstantOperandVal(1) ==
+ N->getValueSizeInBits(0) - N->getOperand(1).getConstantOperandVal(1);
+}]>;
+
+def shld : PatFrag<(ops node:$src1, node:$amt1, node:$src2, node:$amt2),
+ (or (shl node:$src1, node:$amt1),
+ (srl node:$src2, node:$amt2)), [{
+ assert(N->getOpcode() == ISD::OR);
+ return N->getOperand(0).getOpcode() == ISD::SHL &&
+ N->getOperand(1).getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(N->getOperand(0).getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(1).getOperand(1)) &&
+ N->getOperand(0).getConstantOperandVal(1) ==
+ N->getValueSizeInBits(0) - N->getOperand(1).getConstantOperandVal(1);
+}]>;
+
//===----------------------------------------------------------------------===//
// Instruction list...
//
@@ -2973,6 +3001,22 @@ def : Pat<(store (or (srl (loadi32 addr:$dst), CL:$amt),
(shl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
(SHRD32mrCL addr:$dst, GR32:$src2)>;
+def : Pat<(or (srl GR32:$src1, (i8 (trunc ECX:$amt))),
+ (shl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
+ (shl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ addr:$dst),
+ (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(shrd GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+ (SHRD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi32 addr:$dst), (i8 imm:$amt1),
+ GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHRD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
+
// (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c)
def : Pat<(or (shl GR32:$src1, CL:$amt),
(srl GR32:$src2, (sub 32, CL:$amt))),
@@ -2982,6 +3026,22 @@ def : Pat<(store (or (shl (loadi32 addr:$dst), CL:$amt),
(srl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
(SHLD32mrCL addr:$dst, GR32:$src2)>;
+def : Pat<(or (shl GR32:$src1, (i8 (trunc ECX:$amt))),
+ (srl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
+ (srl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ addr:$dst),
+ (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(shld GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+ (SHLD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi32 addr:$dst), (i8 imm:$amt1),
+ GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHLD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
+
// (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c)
def : Pat<(or (srl GR16:$src1, CL:$amt),
(shl GR16:$src2, (sub 16, CL:$amt))),
@@ -2991,6 +3051,22 @@ def : Pat<(store (or (srl (loadi16 addr:$dst), CL:$amt),
(shl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
(SHRD16mrCL addr:$dst, GR16:$src2)>;
+def : Pat<(or (srl GR16:$src1, (i8 (trunc CX:$amt))),
+ (shl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
+ (shl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ addr:$dst),
+ (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(shrd GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+ (SHRD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi16 addr:$dst), (i8 imm:$amt1),
+ GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHRD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
+
// (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c)
def : Pat<(or (shl GR16:$src1, CL:$amt),
(srl GR16:$src2, (sub 16, CL:$amt))),
@@ -3000,6 +3076,22 @@ def : Pat<(store (or (shl (loadi16 addr:$dst), CL:$amt),
(srl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
(SHLD16mrCL addr:$dst, GR16:$src2)>;
+def : Pat<(or (shl GR16:$src1, (i8 (trunc CX:$amt))),
+ (srl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
+ (srl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ addr:$dst),
+ (SHLD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(shld GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+ (SHLD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi16 addr:$dst), (i8 imm:$amt1),
+ GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHLD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
+
//===----------------------------------------------------------------------===//
// Floating Point Stack Support
//===----------------------------------------------------------------------===//
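To tie the immediate patterns above to the tests that follow: when the two
shifted values are distinct, the shld/shrd PatFrags match and a double-shift
instruction is selected; when they are the same value, DAGCombiner has
already folded the or into a rotate. A rough C++ rendering (assumed mapping)
of the xfoo/xbar functions from rot32.ll below:

#include <cstdint>

uint32_t xfoo(uint32_t x)             { return (x >> 25) | (x << 7); }  // same input    -> rol $7
uint32_t xbar(uint32_t x, uint32_t y) { return (y << 7) | (x >> 25); }  // distinct inputs -> shld $7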
diff --git a/test/CodeGen/X86/rot16.ll b/test/CodeGen/X86/rot16.ll
new file mode 100644
index 0000000..c196ce2
--- /dev/null
+++ b/test/CodeGen/X86/rot16.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-as < %s | llc -march=x86 > %t
+; RUN: grep rol %t | count 3
+; RUN: grep ror %t | count 1
+; RUN: grep shld %t | count 2
+; RUN: grep shrd %t | count 2
+
+define i16 @foo(i16 %x, i16 %y, i16 %z) nounwind readnone {
+entry:
+ %0 = shl i16 %x, %z
+ %1 = sub i16 16, %z
+ %2 = lshr i16 %x, %1
+ %3 = or i16 %2, %0
+ ret i16 %3
+}
+
+define i16 @bar(i16 %x, i16 %y, i16 %z) nounwind readnone {
+entry:
+ %0 = shl i16 %y, %z
+ %1 = sub i16 16, %z
+ %2 = lshr i16 %x, %1
+ %3 = or i16 %2, %0
+ ret i16 %3
+}
+
+define i16 @un(i16 %x, i16 %y, i16 %z) nounwind readnone {
+entry:
+ %0 = lshr i16 %x, %z
+ %1 = sub i16 16, %z
+ %2 = shl i16 %x, %1
+ %3 = or i16 %2, %0
+ ret i16 %3
+}
+
+define i16 @bu(i16 %x, i16 %y, i16 %z) nounwind readnone {
+entry:
+ %0 = lshr i16 %y, %z
+ %1 = sub i16 16, %z
+ %2 = shl i16 %x, %1
+ %3 = or i16 %2, %0
+ ret i16 %3
+}
+
+define i16 @xfoo(i16 %x, i16 %y, i16 %z) nounwind readnone {
+entry:
+ %0 = lshr i16 %x, 11
+ %1 = shl i16 %x, 5
+ %2 = or i16 %0, %1
+ ret i16 %2
+}
+
+define i16 @xbar(i16 %x, i16 %y, i16 %z) nounwind readnone {
+entry:
+ %0 = shl i16 %y, 5
+ %1 = lshr i16 %x, 11
+ %2 = or i16 %0, %1
+ ret i16 %2
+}
+
+define i16 @xun(i16 %x, i16 %y, i16 %z) nounwind readnone {
+entry:
+ %0 = lshr i16 %x, 5
+ %1 = shl i16 %x, 11
+ %2 = or i16 %0, %1
+ ret i16 %2
+}
+
+define i16 @xbu(i16 %x, i16 %y, i16 %z) nounwind readnone {
+entry:
+ %0 = lshr i16 %y, 5
+ %1 = shl i16 %x, 11
+ %2 = or i16 %0, %1
+ ret i16 %2
+}
diff --git a/test/CodeGen/X86/rot32.ll b/test/CodeGen/X86/rot32.ll
new file mode 100644
index 0000000..7cebcb8
--- /dev/null
+++ b/test/CodeGen/X86/rot32.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-as < %s | llc -march=x86 > %t
+; RUN: grep rol %t | count 3
+; RUN: grep ror %t | count 1
+; RUN: grep shld %t | count 2
+; RUN: grep shrd %t | count 2
+
+define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
+entry:
+ %0 = shl i32 %x, %z
+ %1 = sub i32 32, %z
+ %2 = lshr i32 %x, %1
+ %3 = or i32 %2, %0
+ ret i32 %3
+}
+
+define i32 @bar(i32 %x, i32 %y, i32 %z) nounwind readnone {
+entry:
+ %0 = shl i32 %y, %z
+ %1 = sub i32 32, %z
+ %2 = lshr i32 %x, %1
+ %3 = or i32 %2, %0
+ ret i32 %3
+}
+
+define i32 @un(i32 %x, i32 %y, i32 %z) nounwind readnone {
+entry:
+ %0 = lshr i32 %x, %z
+ %1 = sub i32 32, %z
+ %2 = shl i32 %x, %1
+ %3 = or i32 %2, %0
+ ret i32 %3
+}
+
+define i32 @bu(i32 %x, i32 %y, i32 %z) nounwind readnone {
+entry:
+ %0 = lshr i32 %y, %z
+ %1 = sub i32 32, %z
+ %2 = shl i32 %x, %1
+ %3 = or i32 %2, %0
+ ret i32 %3
+}
+
+define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {
+entry:
+ %0 = lshr i32 %x, 25
+ %1 = shl i32 %x, 7
+ %2 = or i32 %0, %1
+ ret i32 %2
+}
+
+define i32 @xbar(i32 %x, i32 %y, i32 %z) nounwind readnone {
+entry:
+ %0 = shl i32 %y, 7
+ %1 = lshr i32 %x, 25
+ %2 = or i32 %0, %1
+ ret i32 %2
+}
+
+define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {
+entry:
+ %0 = lshr i32 %x, 7
+ %1 = shl i32 %x, 25
+ %2 = or i32 %0, %1
+ ret i32 %2
+}
+
+define i32 @xbu(i32 %x, i32 %y, i32 %z) nounwind readnone {
+entry:
+ %0 = lshr i32 %y, 7
+ %1 = shl i32 %x, 25
+ %2 = or i32 %0, %1
+ ret i32 %2
+}
diff --git a/test/CodeGen/X86/rot64.ll b/test/CodeGen/X86/rot64.ll
new file mode 100644
index 0000000..2408359
--- /dev/null
+++ b/test/CodeGen/X86/rot64.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-as < %s | llc -march=x86-64 > %t
+; RUN: grep rol %t | count 3
+; RUN: grep ror %t | count 1
+; RUN: grep shld %t | count 2
+; RUN: grep shrd %t | count 2
+
+define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
+entry:
+ %0 = shl i64 %x, %z
+ %1 = sub i64 64, %z
+ %2 = lshr i64 %x, %1
+ %3 = or i64 %2, %0
+ ret i64 %3
+}
+
+define i64 @bar(i64 %x, i64 %y, i64 %z) nounwind readnone {
+entry:
+ %0 = shl i64 %y, %z
+ %1 = sub i64 64, %z
+ %2 = lshr i64 %x, %1
+ %3 = or i64 %2, %0
+ ret i64 %3
+}
+
+define i64 @un(i64 %x, i64 %y, i64 %z) nounwind readnone {
+entry:
+ %0 = lshr i64 %x, %z
+ %1 = sub i64 64, %z
+ %2 = shl i64 %x, %1
+ %3 = or i64 %2, %0
+ ret i64 %3
+}
+
+define i64 @bu(i64 %x, i64 %y, i64 %z) nounwind readnone {
+entry:
+ %0 = lshr i64 %y, %z
+ %1 = sub i64 64, %z
+ %2 = shl i64 %x, %1
+ %3 = or i64 %2, %0
+ ret i64 %3
+}
+
+define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
+entry:
+ %0 = lshr i64 %x, 57
+ %1 = shl i64 %x, 7
+ %2 = or i64 %0, %1
+ ret i64 %2
+}
+
+define i64 @xbar(i64 %x, i64 %y, i64 %z) nounwind readnone {
+entry:
+ %0 = shl i64 %y, 7
+ %1 = lshr i64 %x, 57
+ %2 = or i64 %0, %1
+ ret i64 %2
+}
+
+define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
+entry:
+ %0 = lshr i64 %x, 7
+ %1 = shl i64 %x, 57
+ %2 = or i64 %0, %1
+ ret i64 %2
+}
+
+define i64 @xbu(i64 %x, i64 %y, i64 %z) nounwind readnone {
+entry:
+ %0 = lshr i64 %y, 7
+ %1 = shl i64 %x, 57
+ %2 = or i64 %0, %1
+ ret i64 %2
+}