author     Benjamin Kramer <benny.kra@googlemail.com>    2011-04-22 15:30:40 +0000
committer  Benjamin Kramer <benny.kra@googlemail.com>    2011-04-22 15:30:40 +0000
commit     b20a8fc8a6bf57dbde0e9238cf535abb4326dc80
tree       5ae0148f9eb1ca78f9934cdc6a0d618f89a8946c
parent     eab631362d676c0113e052cc7e877eef4da544b8
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) << C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
With this patch we can fold the immediate into the 'and':
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
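
For reference, the reason 'andl' would be legal here: writes to a 32-bit register on x86-64 implicitly zero-extend to 64 bits, and dropping the REX.W prefix is where the extra byte goes. A rough C-level sketch (not part of the patch) of the equivalence for this example:

#include <cstdint>

// What the patch currently emits for foo(): a 64-bit and, then the shift.
uint64_t foo_64bit_and(uint64_t x) {
  return (x & UINT64_C(1)) << 42;               // andq $1, ... / shlq $42, ...
}

// The hypothetical andl variant: mask only the low 32 bits. Because the
// mask fits in 32 bits, the implicit zero-extension of a 32-bit write
// yields the same value.
uint64_t foo_32bit_and(uint64_t x) {
  uint32_t low = static_cast<uint32_t>(x) & 1u; // would map to andl $1, ...
  return static_cast<uint64_t>(low) << 42;      // zero-extend, then shlq
}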
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@129990 91177308-0d34-0410-b5e6-96231b3b80d8
 lib/Target/X86/X86ISelDAGToDAG.cpp |  75
 test/CodeGen/X86/narrow-shl-cst.ll | 101
2 files changed, 176 insertions, 0 deletions
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 9b0ec6e..e156cef 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1580,6 +1580,81 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     return RetVal;
     break;
   }
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    // For operations of the form (x << C1) op C2, check if we can use a smaller
+    // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+      break;
+
+    // i8 is unshrinkable, i16 should be promoted to i32.
+    if (NVT != MVT::i32 && NVT != MVT::i64)
+      break;
+
+    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+    ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (!Cst || !ShlCst)
+      break;
+
+    int64_t Val = Cst->getSExtValue();
+    uint64_t ShlVal = ShlCst->getZExtValue();
+
+    // Make sure that we don't change the operation by removing bits.
+    // This only matters for OR and XOR, AND is unaffected.
+    if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val)
+      break;
+
+    unsigned ShlOp, Op;
+    EVT CstVT = NVT;
+
+    // Check the minimum bitwidth for the new constant.
+    // TODO: AND32ri is the same as AND64ri32 with zext imm.
+    // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
+    // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+    if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
+      CstVT = MVT::i8;
+    else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
+      CstVT = MVT::i32;
+
+    // Bail if there is no smaller encoding.
+    if (NVT == CstVT)
+      break;
+
+    switch (NVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i32:
+      assert(CstVT == MVT::i8);
+      ShlOp = X86::SHL32ri;
+
+      switch (Opcode) {
+      case ISD::AND: Op = X86::AND32ri8; break;
+      case ISD::OR:  Op = X86::OR32ri8; break;
+      case ISD::XOR: Op = X86::XOR32ri8; break;
+      }
+      break;
+    case MVT::i64:
+      assert(CstVT == MVT::i8 || CstVT == MVT::i32);
+      ShlOp = X86::SHL64ri;
+
+      switch (Opcode) {
+      case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
+      case ISD::OR:  Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
+      case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+      }
+      break;
+    }
+
+    // Emit the smaller op and the shift.
+    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT);
+    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+    return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+                                getI8Imm(ShlVal));
+    break;
+  }
   case X86ISD::UMUL: {
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
diff --git a/test/CodeGen/X86/narrow-shl-cst.ll b/test/CodeGen/X86/narrow-shl-cst.ll
new file mode 100644
index 0000000..a404f34
--- /dev/null
+++ b/test/CodeGen/X86/narrow-shl-cst.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+; PR5039
+
+define i32 @test1(i32 %x) nounwind {
+  %and = shl i32 %x, 10
+  %shl = and i32 %and, 31744
+  ret i32 %shl
+; CHECK: test1:
+; CHECK: andl $31
+; CHECK: shll $10
+}
+
+define i32 @test2(i32 %x) nounwind {
+  %or = shl i32 %x, 10
+  %shl = or i32 %or, 31744
+  ret i32 %shl
+; CHECK: test2:
+; CHECK: orl $31
+; CHECK: shll $10
+}
+
+define i32 @test3(i32 %x) nounwind {
+  %xor = shl i32 %x, 10
+  %shl = xor i32 %xor, 31744
+  ret i32 %shl
+; CHECK: test3:
+; CHECK: xorl $31
+; CHECK: shll $10
+}
+
+define i64 @test4(i64 %x) nounwind {
+  %and = shl i64 %x, 40
+  %shl = and i64 %and, 264982302294016
+  ret i64 %shl
+; CHECK: test4:
+; CHECK: andq $241
+; CHECK: shlq $40
+}
+
+define i64 @test5(i64 %x) nounwind {
+  %and = shl i64 %x, 40
+  %shl = and i64 %and, 34084860461056
+  ret i64 %shl
+; CHECK: test5:
+; CHECK: andq $31
+; CHECK: shlq $40
+}
+
+define i64 @test6(i64 %x) nounwind {
+  %and = shl i64 %x, 32
+  %shl = and i64 %and, -281474976710656
+  ret i64 %shl
+; CHECK: test6:
+; CHECK: andq $-65536
+; CHECK: shlq $32
+}
+
+define i64 @test7(i64 %x) nounwind {
+  %or = shl i64 %x, 40
+  %shl = or i64 %or, 264982302294016
+  ret i64 %shl
+; CHECK: test7:
+; CHECK: orq $241
+; CHECK: shlq $40
+}
+
+define i64 @test8(i64 %x) nounwind {
+  %or = shl i64 %x, 40
+  %shl = or i64 %or, 34084860461056
+  ret i64 %shl
+; CHECK: test8:
+; CHECK: orq $31
+; CHECK: shlq $40
+}
+
+define i64 @test9(i64 %x) nounwind {
+  %xor = shl i64 %x, 40
+  %shl = xor i64 %xor, 264982302294016
+  ret i64 %shl
+; CHECK: test9:
+; CHECK: orq $241
+; CHECK: shlq $40
+}
+
+define i64 @test10(i64 %x) nounwind {
+  %xor = shl i64 %x, 40
+  %shl = xor i64 %xor, 34084860461056
+  ret i64 %shl
+; CHECK: test10:
+; CHECK: xorq $31
+; CHECK: shlq $40
+}
+
+define i64 @test11(i64 %x) nounwind {
+  %xor = shl i64 %x, 33
+  %shl = xor i64 %xor, -562949953421312
+  ret i64 %shl
+; CHECK: test11:
+; CHECK: xorq $-65536
+; CHECK: shlq $33
+}
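
Viewed outside the SelectionDAG machinery, the immediate-narrowing decision is plain integer arithmetic. Below is a self-contained C++ sketch of the same width selection; the helper and parameter names are illustrative only (the real code uses isInt<8>/isInt<32> and MVT/EVT types):

#include <cstdint>
#include <optional>

// Rough stand-ins for LLVM's isInt<8>() / isInt<32>() templates.
static bool fitsIn8(int64_t v)  { return v >= INT8_MIN  && v <= INT8_MAX; }
static bool fitsIn32(int64_t v) { return v >= INT32_MIN && v <= INT32_MAX; }

// Given the logic-op constant Val, the shift amount ShlVal, the operation
// width NVTBits (32 or 64), and whether the op is AND, return the narrower
// immediate width (8 or 32) the rewrite enables, or nothing where the patch
// would bail out.
std::optional<unsigned> narrowedImmWidth(int64_t Val, uint64_t ShlVal,
                                         unsigned NVTBits, bool IsAnd) {
  if (ShlVal == 0 || ShlVal >= NVTBits)
    return std::nullopt;                 // nothing to fold / degenerate shift

  // For OR/XOR the constant must have no set bits below the shift amount,
  // otherwise shifting it right and back left would change the operation.
  // (The patch writes this as ((Val >> ShlVal) << ShlVal) != Val.)
  if (!IsAnd &&
      (static_cast<uint64_t>(Val) & ((UINT64_C(1) << ShlVal) - 1)) != 0)
    return std::nullopt;

  const int64_t Narrow = Val >> ShlVal;  // the constant used after the rewrite
  unsigned CstBits = NVTBits;
  if (!fitsIn8(Val) && fitsIn8(Narrow))
    CstBits = 8;                         // e.g. the AND64ri8 / OR32ri8 forms
  else if (!fitsIn32(Val) && fitsIn32(Narrow))
    CstBits = 32;                        // e.g. the AND64ri32 form

  if (CstBits == NVTBits)                // no smaller encoding, don't rewrite
    return std::nullopt;
  return CstBits;
}

For foo() above this gives Val = 4398046511104, ShlVal = 42, Narrow = 1, so an 8-bit immediate is chosen and AND64ri8 plus SHL64ri is selected, matching the 4-byte andq $1 encoding shown in the commit message.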